From d9878379c849833a0f58918a2c6b8357ba54d2c1 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 21 Oct 2022 14:22:20 +0000 Subject: [PATCH 0001/1922] Fix typo under torch directory (#87274) This PR fixes typo in .md files under torch directory Pull Request resolved: https://github.com/pytorch/pytorch/pull/87274 Approved by: https://github.com/albanD --- .../activation_sparsifier/README.md | 2 +- .../_experimental/data_sparsifier/README.md | 2 +- torch/ao/quantization/fx/README.md | 2 +- .../ao/quantization/fx/_model_report/README.md | 18 +++++++++--------- torch/csrc/jit/OVERVIEW.md | 2 +- torch/csrc/jit/codegen/cuda/README.md | 4 ++-- .../jit/codegen/cuda/python_frontend/README.md | 4 ++-- torch/csrc/jit/docs/serialization.md | 2 +- torch/csrc/jit/operator_upgraders/README.md | 2 +- torch/csrc/jit/runtime/static/README.md | 4 ++-- torch/distributed/benchmarks/README.md | 2 +- torch/fx/passes/README.md | 2 +- 12 files changed, 23 insertions(+), 23 deletions(-) diff --git a/torch/ao/pruning/_experimental/activation_sparsifier/README.md b/torch/ao/pruning/_experimental/activation_sparsifier/README.md index 3c2514c2f116b..810b053d92221 100644 --- a/torch/ao/pruning/_experimental/activation_sparsifier/README.md +++ b/torch/ao/pruning/_experimental/activation_sparsifier/README.md @@ -60,7 +60,7 @@ def mask_fn(tensor, threshold): # threshold is the sparse config here ``` ## API Design -`ActivationSparsifier`: Attaches itself to a model layer and sparsifies the activation flowing through that layer. The user can pass in the default `aggregate_fn`, `reduce_fn` and `mask_fn`. Additionaly, `features` and `feature_dim` are also accepted. +`ActivationSparsifier`: Attaches itself to a model layer and sparsifies the activation flowing through that layer. The user can pass in the default `aggregate_fn`, `reduce_fn` and `mask_fn`. Additionally, `features` and `feature_dim` are also accepted. `register_layer`: Registers a layer for sparsification. 
Specifically, registers `forward_pre_hook()` that performs aggregation. diff --git a/torch/ao/pruning/_experimental/data_sparsifier/README.md b/torch/ao/pruning/_experimental/data_sparsifier/README.md index c6fc99b36c8c4..faea74355360a 100644 --- a/torch/ao/pruning/_experimental/data_sparsifier/README.md +++ b/torch/ao/pruning/_experimental/data_sparsifier/README.md @@ -3,7 +3,7 @@ The data sparsifier inherits from the `BaseSparsifier` class. It attempts to sparsify data tensors in general (trainable and non-trainable). ## Implementation Details -The data sparsifier does not receive a model or a layer to sparsify. Hence, the mask needs to be owned by the data sparsifier. This is acheived by introducing a private container model that registers the data as a parametrized buffer. +The data sparsifier does not receive a model or a layer to sparsify. Hence, the mask needs to be owned by the data sparsifier. This is achieved by introducing a private container model that registers the data as a parametrized buffer. The BaseDataSparsifier handles all the housekeeping while allowing the user to just implement the `update_mask` logic in their implementation. 
diff --git a/torch/ao/quantization/fx/README.md b/torch/ao/quantization/fx/README.md index 389a5e428627d..0ee5c5ec7e3f5 100644 --- a/torch/ao/quantization/fx/README.md +++ b/torch/ao/quantization/fx/README.md @@ -248,7 +248,7 @@ Note: weight + FakeQuantize is a part of qat_linear_relu `backend_config` configurations used in this step: ``` BackendConfig(nniqat.LinearReLU) - .set_observation_type(ObservationType.OUTPUT_USE_DIFFFERENT_OBSERVER_AS_INPUT) + .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) .set_dtype_configs([ DTypeConfig(input_dtype=torch.quint8, output_dtype = torch.quint8, weight_dtype = torch.qint8, bias_dtype = torch.float32)] ) diff --git a/torch/ao/quantization/fx/_model_report/README.md b/torch/ao/quantization/fx/_model_report/README.md index 0c4943ad6a755..dc11510f6c9ed 100644 --- a/torch/ao/quantization/fx/_model_report/README.md +++ b/torch/ao/quantization/fx/_model_report/README.md @@ -32,7 +32,7 @@ model_report = ModelReport(model, detector_set) ready_for_callibrate = model_report.prepare_detailed_callibration() # callibrate model and generate report -ready_for_callibrate(example_input) # TODO run callibration of model with relavent data +ready_for_callibrate(example_input) # TODO run callibration of model with relevant data reports = model_report.generate_model_report(remove_inserted_observers=True) for report_name in report.keys(): text_report, report_dict = reports[report_name] @@ -61,8 +61,8 @@ This is so that we can keep track of where we want to insert observers on a dete - `prepare_detailed_calibration(self)` → `GraphModule` inserts observers into the locations specified by each detector in the model. It then returns the GraphModule with the detectors inserted into both the regular module structure as well as the node structure. 
- `generate_model_report(self, remove_inserted_observers: bool)` → `Dict[str, Tuple[str, Dict]]` uses callibrated GraphModule to optionally removes inserted observers, and generate, for each detector the ModelReport instance was initialized with: - - A string-based report that is easily digestable and actionable explaining the data collected by relavent observers for that detector - - A dictionary containing statistics collected by the relavent observers and values calculated by the detector for futher analysis or plotting + - A string-based report that is easily digestable and actionable explaining the data collected by relevant observers for that detector + - A dictionary containing statistics collected by the relevant observers and values calculated by the detector for futher analysis or plotting ## ModelReportVisualizer Overview @@ -127,21 +127,21 @@ return_dict = { "[unique_observer_fqn_of_insert_location]" : { "target_node" -> the node we are trying to observe with this observer (torch.fx.node.Node), - "insert_observer" -> the intialized observer we wish to insert (ObserverBase), + "insert_observer" -> the initialized observer we wish to insert (ObserverBase), "insert_post" -> True if this is meant to be a post-observer for target_node, False if pre-observer, "observer_args" -> The arguments that are meant to be passed into the observer, } } ``` - `get_detector_name(self)` -> `str`: returns the name of the detector. -You should give your detector a unique name different from exisiting detectors. +You should give your detector a unique name different from existing detectors. - `generate_detector_report(self, model)` -> `Tuple[str, Dict[str, Any]]`: generates a report based on the information the detector is trying to collect. This report consists of both a text-based report as well as a dictionary of collected and calculated statistics. 
This report is returned to the `ModelReport` instance, which will then compile all the reports of all the Detectors requested by the user. ## ModelReportObserver Overview -As seen in the [requirments to implement a detector section](#requirements-to-implement-a-detector), one of the key parts of implementing a detector is to specify what `Observer` we are trying to insert. +As seen in the [requirements to implement a detector section](#requirements-to-implement-a-detector), one of the key parts of implementing a detector is to specify what `Observer` we are trying to insert. All the detectors in the ModelReport API use the [`ModelReportObserver`](https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/fx/_model_report/model_report_observer.py). While the core purpose of many observers in PyTorch's Quantization API is to collect min / max information to help determine quantization parameters, the `ModelReportObserver` collects additional statistics. @@ -152,7 +152,7 @@ The statistics collected by the `ModelReportObserver` include: - Ratio of 100th percentile to some *n*th percentile - Number of constant value batches to pass through each channel -After the `ModelReportObserver` collects the statistics above during the callibration process, the detectors then extract the information they need to generate their reports from the relavent observers. +After the `ModelReportObserver` collects the statistics above during the callibration process, the detectors then extract the information they need to generate their reports from the relevant observers. 
### Using Your Own Observer @@ -187,7 +187,7 @@ Since you are also implementing your own detector in this case, it is up to you - A line plot (for both per-tensor and per-channel statistics) - A histogram (for both per-tensor and per-channel statistics) - `model_report.py`: File containing the `ModelReport` class - - Main class users are interacting with to go through the ModelReport worflow + - Main class users are interacting with to go through the ModelReport workflow - API described in detail in [Overview section](#modelreport-overview) # Tests @@ -200,7 +200,7 @@ These tests include: - Test class for the `ModelReportVisualizer` class - Test class for **each** of the implemented Detectors -If you wish to add a Detector, make sure to create a test class modeled after one of the exisiting classes and test your detector. +If you wish to add a Detector, make sure to create a test class modeled after one of the existing classes and test your detector. Because users will be interacting with the Detectors through the `ModelReport` class and not directly, ensure that the tests follow this as well. # Future Tasks and Improvements diff --git a/torch/csrc/jit/OVERVIEW.md b/torch/csrc/jit/OVERVIEW.md index c1bcd57c73a5f..638cbf883bf71 100644 --- a/torch/csrc/jit/OVERVIEW.md +++ b/torch/csrc/jit/OVERVIEW.md @@ -1408,7 +1408,7 @@ TODO: differentiation, symbolic autograd, fusion, operators We attempt to reduce the number of `prim::Guard` nodes as these nodes may interfere with optimizations. * First, `GuardElimination::moveGuardsToDefs` tries to move `prim::Guards` to their definitions, so the guards guarding the same `Tensor` follow the definition directly or another guard on the same `Tensor`. * This ordering allows us to **coalesce** (done in `GuardElimination::coalesceGuards`) multiple guards into a single one. -* After guards are **coaslesced** , `GuardElimination::eliminateGuards` attempts to eliminate more guards as follows: it inspects each operation and its inputs. 
It checks if inputs to the operation are guarded and also if the operation produces the consistent shapes given the guarded inputs. For example, if two inputs to `add` are guaranteed to be of shape `(2, 3)`, the output shape will also always be `(2, 3)`. If this property holds, we are allowed to remove the guard guarding operation's output. +* After guards are **coalesced** , `GuardElimination::eliminateGuards` attempts to eliminate more guards as follows: it inspects each operation and its inputs. It checks if inputs to the operation are guarded and also if the operation produces the consistent shapes given the guarded inputs. For example, if two inputs to `add` are guaranteed to be of shape `(2, 3)`, the output shape will also always be `(2, 3)`. If this property holds, we are allowed to remove the guard guarding operation's output. Lastly, we need to be handle cases when the assumptions about `Tensor` shapes fail at runtime. To handle guard failures, we need to be able to run the original code i.e. the code that doesn't rely on assumptions about shapes. As guards can be inserted and moved (by Optimizer) at/to arbitrary points in a computational graph, we need to be able to resume execution starting from those arbitrary points onward. diff --git a/torch/csrc/jit/codegen/cuda/README.md b/torch/csrc/jit/codegen/cuda/README.md index be8aed6c5ce44..284fd14111962 100644 --- a/torch/csrc/jit/codegen/cuda/README.md +++ b/torch/csrc/jit/codegen/cuda/README.md @@ -197,8 +197,8 @@ First thing is to check that you have fusion kernel running properly. 
Try to run If turning on NVFuser produces unexpected outputs, set the `PYTORCH_NVFUSER_DISABLE` environment variable to disable some of the optional features, e.g.: - `fma`: disable using FMA instructions -- `index_hoist`: disble optimization to hoist comon index expressions -- `predicate_elimination`: disble optimization to eliminate redundant predicates +- `index_hoist`: disable optimization to hoist common index expressions +- `predicate_elimination`: disable optimization to eliminate redundant predicates - `unroll_with_rng`: disable unrolling when RNG is used For example, `export PYTORCH_NVFUSER_DISABLE=fma,index_hoist` would disable FMA and index hoisting. diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/README.md b/torch/csrc/jit/codegen/cuda/python_frontend/README.md index 7f3364e05c69b..d519e69bcda3c 100644 --- a/torch/csrc/jit/codegen/cuda/python_frontend/README.md +++ b/torch/csrc/jit/codegen/cuda/python_frontend/README.md @@ -51,7 +51,7 @@ nvf_out = fs.execute([input1, input2])[0] * `id()`: Returns the fusion id for a given `Fusion`. * `print()`: Prints the low level IR for the currently defined fusion. -### `FusionDefiniton` Context Manager - Interface for Defining Fusions +### `FusionDefinition` Context Manager - Interface for Defining Fusions #### Defining Input Tensors _All intermediate tensors are created by operations. Constant tensors do not exist._ @@ -108,7 +108,7 @@ python -c "from torch._C._nvfuser import FusionDefinition; help(FusionDefinition ``` #### Notating Outputs -The `FusionDefintion` `add_output` method is used to indicate an intermediate is an output to the fusion. +The `FusionDefinition` `add_output` method is used to indicate an intermediate is an output to the fusion. 
```python add_output(output: Tensor) diff --git a/torch/csrc/jit/docs/serialization.md b/torch/csrc/jit/docs/serialization.md index 8c3461a9abe83..a374f5bed40ba 100644 --- a/torch/csrc/jit/docs/serialization.md +++ b/torch/csrc/jit/docs/serialization.md @@ -127,7 +127,7 @@ its methods or attributes. **Uses of tensor constants**. Most constants are inlined as literals, like strings or ints. But since tensors are potentially very large, when -`PythonPrint` encouters a constant tensor it will emit a reference to a +`PythonPrint` encounters a constant tensor it will emit a reference to a global `CONSTANTS` table (like `foo = CONSTANTS.c0`). When importing, the importer will know how to resolve this reference into an diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md index 084e6688f148e..a4061bf17921a 100644 --- a/torch/csrc/jit/operator_upgraders/README.md +++ b/torch/csrc/jit/operator_upgraders/README.md @@ -1,6 +1,6 @@ # Guidance for Operator Developer -PyTorch’s operators sometimes require changes for different reasons (e.g. from improving their usability to fixing bugs). These changes can be backward compatibility (BC) breaking, where older programs will no longer run as expected (or at all) on the latest version of PyTorch (an old program / new runtime problem), or forward compatibility (FC) breaking, where new programs will not run on older versions of PyTorch (a new program / old runtime problem). This guidance focuses on the requirements for maintaining backwards comatibility when making changes to an operator. +PyTorch’s operators sometimes require changes for different reasons (e.g. from improving their usability to fixing bugs). 
These changes can be backward compatibility (BC) breaking, where older programs will no longer run as expected (or at all) on the latest version of PyTorch (an old program / new runtime problem), or forward compatibility (FC) breaking, where new programs will not run on older versions of PyTorch (a new program / old runtime problem). This guidance focuses on the requirements for maintaining backwards compatibility when making changes to an operator. In order to do this we introduce the concept of the *upgrader*: a method to adapt the new operator to mimic the old operator behavior. When a new runtime reads an old program containing the old operator definition, the upgrader will adapt the old operator definition to comply with the new operator implementation. As you would expect, an upgrader is only applied when an old operation definition is encountered (i.e. if there are no "old" operators in the program, no upgrader would be used). For more details on the reasoning behind this new requirement please refer to the [PyTorch Operator Versioning RFC](https://github.com/pytorch/rfcs/blob/master/RFC-0017-PyTorch-Operator-Versioning.md). diff --git a/torch/csrc/jit/runtime/static/README.md b/torch/csrc/jit/runtime/static/README.md index 82d42d4b9f4c7..03e5ee6d75dc4 100644 --- a/torch/csrc/jit/runtime/static/README.md +++ b/torch/csrc/jit/runtime/static/README.md @@ -141,9 +141,9 @@ is selected instead. When loading a model, ops are selected for each `torch::jit::Node` in the graph as follows: -1) If an out variant is registered, pass the node to the function that prodcues the `SROperator`. If +1) If an out variant is registered, pass the node to the function that produces the `SROperator`. If the result is not `nulltpr`, use that op. -2) If a native function is registered, pass the node to the function that prodcues the `SROperator`. If +2) If a native function is registered, pass the node to the function that produces the `SROperator`. 
If the result is not `nulltpr`, use that op. 3) Use the JIT implementation. Static runtime will throw an exception if it does not exist. diff --git a/torch/distributed/benchmarks/README.md b/torch/distributed/benchmarks/README.md index 082ab87af623c..f5b1ec6bff2de 100644 --- a/torch/distributed/benchmarks/README.md +++ b/torch/distributed/benchmarks/README.md @@ -11,7 +11,7 @@ There are different training paradigms where combining these two techniques migh 2) Enable hybrid parallelism as described in the [PipeDream](https://arxiv.org/abs/1806.03377) paper. We can use the [Distributed RPC framework](https://pytorch.org/docs/master/rpc.html) to pipeline stages of the model across multiple workers and replicate each stage (if needed) using [DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel). ## Training Process -This benchmark focuses on the first paradime above. The training process is executed as follows: +This benchmark focuses on the first paradigm above. The training process is executed as follows: 1) The master creates embedding tables on each of the 8 Parameter Servers and holds an [RRef](https://pytorch.org/docs/master/rpc.html#rref) to it. 2) The master, then kicks off the training loop on the 8 trainers and passes the embedding table RRef to the trainers. diff --git a/torch/fx/passes/README.md b/torch/fx/passes/README.md index a2996848713e5..e972234f20824 100644 --- a/torch/fx/passes/README.md +++ b/torch/fx/passes/README.md @@ -1,5 +1,5 @@ ## FX Pass Infrastructure -This folder contains the pass infarstructure and passes for transforming fx.Graph. +This folder contains the pass infrastructure and passes for transforming fx.Graph. 
## Code Structure From 1287a8968754ac63db965827bdc357ac274ce1ef Mon Sep 17 00:00:00 2001 From: Antonio Kim Date: Fri, 21 Oct 2022 14:28:14 +0000 Subject: [PATCH 0002/1922] Make LazyGraphExecutor extensible (#87218) Add `LazyGraphExecutor` to backend interface so that it is extensible by a vendor backend. I've made some preliminary methods virtual. Not sure if we want to make all methods in `LazyGraphExecutor` virtual. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87218 Approved by: https://github.com/wconstab, https://github.com/alanwaketan --- .github/ci_commit_pins/xla.txt | 2 +- torch/csrc/lazy/backend/backend_interface.cpp | 5 ----- torch/csrc/lazy/backend/backend_interface.h | 3 +-- torch/csrc/lazy/core/lazy_graph_executor.cpp | 13 +++++++++++-- torch/csrc/lazy/core/lazy_graph_executor.h | 14 ++++++++++++-- torch/csrc/lazy/ts_backend/ts_backend_impl.cpp | 4 ++++ 6 files changed, 29 insertions(+), 12 deletions(-) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 2ca663bacdea0..e7375040708bd 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -e1f5a49664b904e3ec1ddb9095ca75b6bbb5c10d +eff277e81fcfdeccba71e75ff40b6e2f3e29e27b diff --git a/torch/csrc/lazy/backend/backend_interface.cpp b/torch/csrc/lazy/backend/backend_interface.cpp index cbcd92b6a9924..250a8847351c6 100644 --- a/torch/csrc/lazy/backend/backend_interface.cpp +++ b/torch/csrc/lazy/backend/backend_interface.cpp @@ -18,11 +18,6 @@ const BackendImplInterface* getBackend() { return interface; } -// default implementation -bool BackendImplInterface::ShouldSyncTensor(const LazyTensorPtr tensor) const { - return tensor->GetIrValue()->op() != ltc_not_supported; -} - BackendRegistrar::BackendRegistrar( const BackendImplInterface* backend_impl_interface) { backend_impl_registry.store(backend_impl_interface); diff --git a/torch/csrc/lazy/backend/backend_interface.h b/torch/csrc/lazy/backend/backend_interface.h index 
2936105dc6a3d..a70591c2a19c8 100644 --- a/torch/csrc/lazy/backend/backend_interface.h +++ b/torch/csrc/lazy/backend/backend_interface.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -41,8 +42,6 @@ class TORCH_API BackendImplInterface { virtual const IrBuilder* GetIrBuilder() const = 0; - virtual bool ShouldSyncTensor(const LazyTensorPtr tensor) const; - /** * Data Transfer * */ diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 96476e4a9663b..06b37797d3fa6 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -390,10 +390,15 @@ bool TensorsHaveIR(const std::vector& tensors) { return false; } +std::atomic lazy_graph_executor_registry; } // namespace +void LazyGraphExecutor::Register(LazyGraphExecutor* executor) { + lazy_graph_executor_registry.store(executor); +} LazyGraphExecutor* LazyGraphExecutor::Get() { - static LazyGraphExecutor* executor = new LazyGraphExecutor(); + auto* executor = lazy_graph_executor_registry.load(); + TORCH_CHECK(executor, "Lazy graph executor not registered."); return executor; } @@ -604,6 +609,10 @@ void LazyGraphExecutor::Async::Wait() { } } +bool LazyGraphExecutor::ShouldSyncTensor(const LazyTensorPtr tensor) const { + return tensor->GetIrValue()->op() != ltc_not_supported; +} + LazyGraphExecutor::SyncTensorCollection LazyGraphExecutor::CollectSyncTensors( const std::vector& tensors, const SyncTensorsConfig& config) { @@ -635,7 +644,7 @@ LazyGraphExecutor::SyncTensorCollection LazyGraphExecutor::CollectSyncTensors( tensors[i]->CurrentDataHandle() == nullptr) { Value ir_value = tensors[i]->CurrentIrValue(); if (ir_value) { - if (getBackend()->ShouldSyncTensor(tensors[i])) { + if (ShouldSyncTensor(tensors[i])) { // Add only tensors which need to be synced. 
coll.hash = HashCombine(coll.hash, ir_value.hash()); coll.indices.push_back(i); diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h index 8116ad23ff068..7a4498d85fc0f 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.h +++ b/torch/csrc/lazy/core/lazy_graph_executor.h @@ -21,10 +21,18 @@ class TORCH_API LazyGraphExecutor { bool read_only = false; }; + // Register a lazy graph executor instance that can be retrieved using Get() + static void Register(LazyGraphExecutor*); static LazyGraphExecutor* Get(); - void RegisterTensor(std::shared_ptr data); - void UnregisterTensor(LazyTensor::Data* data); + virtual ~LazyGraphExecutor() = default; + + // Override these methods to perform custom tensor registration and + // unregistration Note: It is vital that the parent implementations are also + // called + // in order for the tensors to show up in the live tensor list + virtual void RegisterTensor(std::shared_ptr data); + virtual void UnregisterTensor(LazyTensor::Data* data); // Seed for random generator Value GetRngSeed(const BackendDevice& device); @@ -181,6 +189,8 @@ class TORCH_API LazyGraphExecutor { std::vector tensors_data; }; + virtual bool ShouldSyncTensor(const LazyTensorPtr tensor) const; + SyncTensorCollection CollectSyncTensors( const std::vector& tensors, const SyncTensorsConfig& config); diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp index a390ac76c1260..4003a005fbfab 100644 --- a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp +++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -273,6 +274,9 @@ void InitTorchScriptBackend() { register_ts_ltc_eager_fallback(); static std::unique_ptr s_registrar; s_registrar = std::make_unique(GetTSBackendImpl()); + + static LazyGraphExecutor* executor = new LazyGraphExecutor(); + LazyGraphExecutor::Register(executor); } } // namespace lazy 
From 98f127692ee7893266dc82f91592165652cd24a3 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Fri, 21 Oct 2022 15:05:36 +0000 Subject: [PATCH 0003/1922] Reenable `isinstance` with `torch.distributed.ReduceOp` (#87303) tentatively marking as draft as I haven't gotten a comprehensive list of side effects... Ref: https://stackoverflow.com/questions/40244413/python-static-class-attribute-of-the-class-itself Rel: https://github.com/pytorch/pytorch/issues/87191 cc @kwen2501 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87303 Approved by: https://github.com/wanchaol --- test/distributed/test_c10d_common.py | 12 +++++++++++ torch/_C/_distributed_c10d.pyi | 3 +-- torch/csrc/distributed/c10d/Types.hpp | 5 ++++- torch/csrc/distributed/c10d/init.cpp | 29 +++++++++++++++++++-------- torch/distributed/distributed_c10d.py | 11 ++++++++++ 5 files changed, 49 insertions(+), 11 deletions(-) diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 68c760beacbbf..454595f85735c 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -1622,6 +1622,18 @@ def comm_fn(tensor, group=None): self._test_work_wait(tensor, comm_fn=comm_fn) +class ReduceOpTest(TestCase): + + def test_op_isinstance_of_reduceop(self): + for reduce_op in ( + c10d.ReduceOp.SUM, c10d.ReduceOp.AVG, c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX, + c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR, + ): + self.assertTrue(isinstance(reduce_op, c10d.ReduceOp)) + for scale in ([torch.tensor(1.0)], 2.0): + self.assertTrue(isinstance(dist._make_nccl_premul_sum(scale), c10d.ReduceOp)) + + if __name__ == "__main__": assert ( not torch.cuda._initialized diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index aad37d6a8c5ae..bdf0166b8daa9 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -63,7 +63,6 @@ class DebugLevel(Enum): class ReduceOp: - # 
note(crcrpar): These values are populated from Kind SUM = ... PRODUCT = ... MIN = ... @@ -74,7 +73,7 @@ class ReduceOp: PREMUL_SUM = ... UNUSED = ... - class Kind(Enum): ... + class RedOpType(Enum): ... class BroadcastOptions: rootRank: int diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp index 4d928976d87ee..64fbc45c6588c 100644 --- a/torch/csrc/distributed/c10d/Types.hpp +++ b/torch/csrc/distributed/c10d/Types.hpp @@ -29,6 +29,7 @@ struct NCCLPreMulSumSupplement : _SupplementBase { // Other ReduceOps that need different supplementary data can also // derive from _SupplementBase. struct TORCH_API ReduceOp : torch::CustomClassHolder { + // note(crcrpar): RedOpType could be defined outside of `ReduceOp` enum RedOpType : uint8_t { SUM = 0, AVG = 1, @@ -46,7 +47,9 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder { ReduceOp(RedOpType op) : op_(op) { TORCH_INTERNAL_ASSERT( - op_ != PREMUL_SUM, "PREMUL_SUM requires a scale factor tensor or scalar argument"); + op_ != PREMUL_SUM, + "Use `torch.distributed._make_nccl_premul_sum` to create an instance of ReduceOp with PREMUL_SUM" + ); } ReduceOp(RedOpType op, c10::intrusive_ptr<_SupplementBase> optional_supplement) { diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 327c041357266..6515a3d9a87d4 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -515,10 +515,14 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO R"(Sets the debug level of the torch.distributed package from the ``TORCH_DISTRIBUTED_DEBUG`` environment variable.)"); + // TODO(crcrpar): Hardening `ReduceOp`. + // While keeping most op types as enum value, + // making `PREMUL_SUM` callable, i.e., allowing for + // `ReduceOp.PREMUL_SUM(scale)` might be better as per @wanchaol. 
// https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types py::class_<::c10d::ReduceOp> reduce_op(module, "ReduceOp", R"( An enum-like class for available reduction operations: ``SUM``, ``PRODUCT``, -``MIN``, ``MAX``, ``BAND``, ``BOR``, and ``BXOR``. +``MIN``, ``MAX``, ``BAND``, ``BOR``, ``BXOR``, and ``PREMUL_SUM``. ``BAND``, ``BOR``, and ``BXOR`` reductions are not available when using the ``NCCL`` backend. @@ -529,13 +533,16 @@ and only for NCCL versions 2.10 or later. ``PREMUL_SUM`` multiplies inputs by a given scalar locally before reduction. ``PREMUL_SUM`` is only available with the ``NCCL`` backend, -and only available for NCCL versions 2.11 or later. +and only available for NCCL versions 2.11 or later. Users are supposed to +use ``torch.distributed._make_nccl_premul_sum``. Additionally, ``MAX``, ``MIN`` and ``PRODUCT`` are not supported for complex tensors. The values of this class can be accessed as attributes, e.g., ``ReduceOp.SUM``. They are used in specifying strategies for reduction collectives, e.g., -:func:`reduce`, :func:`all_reduce_multigpu`, etc.)"); +:func:`reduce`, :func:`all_reduce_multigpu`, etc. + +This class does not support ``__members__`` property.)"); reduce_op.def(py::init<::c10d::ReduceOp::RedOpType>()) .def_readwrite("op", &::c10d::ReduceOp::op_); @@ -555,8 +562,14 @@ They are used in specifying strategies for reduction collectives, e.g., [](const ::c10d::ReduceOp& self, const ::c10d::ReduceOp& other) { return self == other.op_; }) - .def("__hash__", [](const ::c10d::ReduceOp& self) { return self.op_; }); - + .def("__hash__", [](const ::c10d::ReduceOp& self) { + return static_cast(self.op_); + }); + + // note(crcrpar): Deliberately skip + // [`export_values`](https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types) + // here and manually set values in Python side. 
See note "ReduceOp static + // class attributes to support `isinstance`" py::enum_<::c10d::ReduceOp::RedOpType>(reduce_op, "RedOpType") .value("SUM", ::c10d::ReduceOp::RedOpType::SUM) .value("AVG", ::c10d::ReduceOp::RedOpType::AVG) @@ -566,10 +579,10 @@ They are used in specifying strategies for reduction collectives, e.g., .value("BAND", ::c10d::ReduceOp::RedOpType::BAND) .value("BOR", ::c10d::ReduceOp::RedOpType::BOR) .value("BXOR", ::c10d::ReduceOp::RedOpType::BXOR) - .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM) - .export_values(); + .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM); - // Ref: [Implicit + // note(crcrpar): This could be removed because users will not pass + // `RedOpType` to reduce collective ops Ref: [Implicit // conversions](https://pybind11.readthedocs.io/en/stable/advanced/classes.html#implicit-conversions) // Let us skip the explicit construction of `c10d::ReduceOp` from // `c10d::ReduceOp::RedOpType` in Python. diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 5c49c6b821687..7de47876b5664 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -233,6 +233,17 @@ def register_backend(cls, name, func, extended_api=False): dist_backend = Backend +# NOTE(crcrpar): [ReduceOp static class attributes to support `isinstance`] +# A ReduceOp instance of `PREMUL_SUM` is supposed to be created via `_make_nccl_premul_sum` +# while the other `op`s (meaning RedOpType members) can be directly passed to c10d reduce collectives. 
+# I changed `ReduceOp` to struct from enum class and introduced RedOpType enum class for PREMUL_SUM, +# which broke an implicit contract of ReduceOp being enum-like with which users apply isinstance to +# `op`, for example, `isinstance(ReduceOp.SUM, ReduceOp)`: https://github.com/pytorch/pytorch/issues/87191 +DENY_LIST = ("PREMUL_SUM", ) +for _red_op_name, _red_op_value in ReduceOp.RedOpType.__members__.items(): + setattr(ReduceOp, _red_op_name, _red_op_value if _red_op_name in DENY_LIST else ReduceOp(_red_op_value)) + + class _reduce_op(object): r""" Deprecated enum-like class for reduction operations: ``SUM``, ``PRODUCT``, From 5bfe34ab4e33d3239f2781030eeb85806d118542 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Fri, 21 Oct 2022 13:29:31 +0100 Subject: [PATCH 0004/1922] OpInfo: Add test that sample_inputs_func returns a generator (#84567) This also includes a small list exception for single element lists since none of the memory usage or performance implications of lists apply there. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84567 Approved by: https://github.com/lezcano, https://github.com/mruberry --- test/test_testing.py | 38 ++++++++++++++++++- .../_internal/common_methods_invocations.py | 12 +----- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index e31872f7da6fd..fad72ab91de0a 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -12,7 +12,7 @@ import subprocess import sys import unittest.mock -from typing import Any, Callable, Iterator, List, Tuple +from typing import Any, Callable, Iterator, List, Tuple, Generator, Sequence import torch @@ -23,7 +23,7 @@ from torch.testing._internal.common_device_type import \ (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes, get_device_type_test_bases, instantiate_device_type_tests, onlyCUDA, onlyNativeDeviceTypes, - deviceCountAtLeast, ops, expectedFailureMeta) + deviceCountAtLeast, ops, expectedFailureMeta, OpDTypes) from torch.testing._internal.common_methods_invocations import op_db from torch.testing._internal import opinfo from torch.testing._internal.common_dtype import all_types_and_complex_and @@ -1881,5 +1881,39 @@ def test_sample_input_metadata(self) -> None: self.assertEqual(s2.name, "foo") +# Tests that validate the various sample generating functions on each OpInfo. 
+class TestOpInfoSampleFunctions(TestCase): + + def _assert_is_generator_or_singleton(self, item, property_name): + if isinstance(item, Sequence): + msg = ( + "{property_name} may only return lists for single items" + ", please use a coroutine which yields items instead") + self.assertTrue(len(item) <= 1, msg=msg) + else: + self.assertIsInstance(item, Generator) + + @ops(op_db, dtypes=OpDTypes.any_one) + def test_opinfo_sample_generators(self, device, dtype, op): + # Test op.sample_inputs doesn't generate multiple samples when called + samples = op.sample_inputs(device, dtype) + self._assert_is_generator_or_singleton(samples, "sample_inputs_func") + + @ops([op for op in op_db if op.reference_inputs_func is not None], dtypes=OpDTypes.any_one) + def test_opinfo_reference_generators(self, device, dtype, op): + # Test op.reference_inputs doesn't generate multiple samples when called + samples = op.reference_inputs(device, dtype) + self._assert_is_generator_or_singleton(samples, "reference_inputs_func") + + @ops([op for op in op_db if op.error_inputs_func is not None], dtypes=OpDTypes.none) + def test_opinfo_error_generators(self, device, op): + # Test op.error_inputs doesn't generate multiple inputs when called + samples = op.error_inputs(device) + self._assert_is_generator_or_singleton(samples, "error_inputs_func") + + +instantiate_device_type_tests(TestOpInfoSampleFunctions, globals()) + + if __name__ == '__main__': run_tests() diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 47c01caaecab6..e5d6e6efe18a9 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1234,22 +1234,14 @@ def get_independent_tensor(tensor): return tensor.clone().requires_grad_(tensor.requires_grad) def sample_inputs_randint(self, device, dtype, requires_grad, **kwargs): - samples = [] low = 2 high = 10 for sample in 
sample_inputs_like_fns(self, device, dtype, requires_grad, **kwargs): # With high - samples.append(SampleInput( - high, - args=(sample.input.shape,) + sample.args, - kwargs=sample.kwargs)) + yield SampleInput(high, sample.input.shape, *sample.args, **sample.kwargs) # With low and high - samples.append(SampleInput( - low, - args=(high, sample.input.shape) + sample.args, - kwargs=sample.kwargs)) - return tuple(samples) + yield SampleInput(low, high, sample.input.shape, *sample.args, **sample.kwargs) def sample_inputs_randint_like(self, device, dtype, requires_grad, **kwargs): low = 2 From 3095b59fdb12e183f1d21dec4e951cb8697b4b6e Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Fri, 21 Oct 2022 05:54:15 -0700 Subject: [PATCH 0005/1922] Audit for error prone isinstance int/float and add lint (#87345) We recently fixed a bug on symbolic-shapes branch where an isinstance(x, int) test failed when passed a SymIntNode. To prevent this, I've added a lint for all the codepaths where we may pass SymInt/SymFloat directly to reject direct isinstance int/float tests, and instead use one of the aliases. The lint rule explains the options. I then go and fix all of them. Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87345 Approved by: https://github.com/bdhirsh, https://github.com/albanD --- .lintrunner.toml | 29 +++++++++++++++++ torch/_C/__init__.pyi.in | 2 ++ torch/_decomp/decompositions.py | 4 +-- torch/_meta_registrations.py | 16 +++++----- torch/_prims/__init__.py | 8 +++-- torch/_prims_common/__init__.py | 21 +++++++++---- torch/_refs/__init__.py | 56 ++++++++++++++++++--------------- torch/_refs/linalg/__init__.py | 7 +++-- 8 files changed, 96 insertions(+), 47 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 56ecfc7295f4c..70e2a423edcc1 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -420,6 +420,35 @@ command = [ '@{{PATHSFILE}}' ] +[[linter]] +code = 'ERROR_PRONE_ISINSTANCE' +include_patterns = [ + 'torch/_refs/**/*.py', + 'torch/_prims/**/*.py', + 'torch/_prims_common/**/*.py', + 'torch/_decomp/**/*.py', + 'torch/_meta_registrations.py', +] +command = [ + 'python3', + 'tools/linter/adapters/grep_linter.py', + '--pattern=isinstance\([^)]+(int|float)\)', + '--linter-name=ERROR_PRONE_ISINSTANCE', + '--error-name=error prone isinstance', + """--error-description=\ + This line has an isinstance call that directly refers to \ + int or float. This is error-prone because you may also \ + have wanted to allow SymIntNode or SymFloatNode in your test. \ + To suppress this lint, use an appropriate type alias defined \ + in torch._prims_common; use IntLike/FloatLike when you would accept \ + both regular and symbolic numbers, Dim for ints representing \ + dimensions, or IntWithoutSymInt/FloatWithoutSymFloat if you really \ + meant to exclude symbolic numbers. 
+ """, + '--', + '@{{PATHSFILE}}' +] + [[linter]] code = 'PYBIND11_SPECIALIZATION' include_patterns = [ diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 70248d1325274..3c81b63721ccd 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -181,6 +181,8 @@ class SymFloatNode(object): @staticmethod def new_symfloat(obj) -> SymFloatNode: ... + def __ceil__(self) -> SymIntNode: ... + # Defined in torch/csrc/jit/passes/xnnpack_rewrite.h class MobileOptimizerType: ... diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 4f61dc9b26f8a..9e9c36104ddc5 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -11,7 +11,7 @@ import torch.nn.functional as F from torch import Tensor from torch._decomp import register_decomposition -from torch._prims_common import NumberType, TensorLike, TensorSequenceType +from torch._prims_common import IntLike, NumberType, TensorLike, TensorSequenceType from torch._prims_common.wrappers import _maybe_resize_out, _safe_copy_out, out_wrapper from torch.utils._pytree import tree_flatten, tree_map @@ -1740,7 +1740,7 @@ def compute_idx(in_size, out_size): return torch.mean(vals, dim=(-3, -1)) def maybe_mask(vals, length, range_max, adaptive, dim): - if isinstance(length, int): + if isinstance(length, IntLike): return vals, length else: # zero-out the things we didn't really want to select diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index 2e1c728c582dc..c17aa091120cc 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -11,6 +11,8 @@ corresponding_real_dtype, elementwise_dtypes, ELEMENTWISE_TYPE_PROMOTION_KIND, + FloatLike, + IntLike, ) from torch._prims_common.wrappers import out_wrapper @@ -361,24 +363,24 @@ def calc_conv_nd_return_shape( output_padding: Optional[Union[List[int], int]] = None, ): ret_shape = [] - if isinstance(stride, int): + if isinstance(stride, IntLike): stride = [stride] * 
len(dims) elif len(stride) == 1: stride = [stride[0]] * len(dims) - if isinstance(padding, int): + if isinstance(padding, IntLike): padding = [padding] * len(dims) elif len(padding) == 1: padding = [padding[0]] * len(dims) - if isinstance(dilation, int): + if isinstance(dilation, IntLike): dilation = [dilation] * len(dims) elif len(dilation) == 1: dilation = [dilation[0]] * len(dims) output_padding_list: Optional[List[int]] = None if output_padding: - if isinstance(output_padding, int): + if isinstance(output_padding, IntLike): output_padding_list = [output_padding] * len(dims) elif len(output_padding) == 1: output_padding_list = [output_padding[0]] * len(dims) @@ -1393,11 +1395,11 @@ def meta_like(self, *args, **kwargs): # hacky: Please remove after math.ceil works with arange @register_meta(aten.arange.default) def arange(end, **kwargs): - if isinstance(end, float): - end = math.ceil(end) + if isinstance(end, FloatLike): + end = math.ceil(end) # type: ignore[arg-type] def is_integral(x): - return isinstance(x, int) or isinstance(x, bool) + return isinstance(x, IntLike) or isinstance(x, bool) set_to_integral_dtype = kwargs.get("dtype", None) is None and is_integral(end) if set_to_integral_dtype: diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py index 8ea992894cf5e..d724ac50e2839 100644 --- a/torch/_prims/__init__.py +++ b/torch/_prims/__init__.py @@ -16,8 +16,10 @@ from torch._prims.nvfuser_prims import register_nvprims from torch._prims_common import ( check, + Dim, DimsSequenceType, DimsType, + IntLike, Number, NumberType, RETURN_TYPE, @@ -929,7 +931,7 @@ def _fill_aten(a: Tensor, value: NumberType) -> Tensor: # div prim performs truncation division on integer inputs # and true division for floating and complex inputs def _div_aten(a, b): - is_integral = isinstance(a, (bool, int)) or ( + is_integral = isinstance(a, (bool, int, torch.SymIntNode)) or ( isinstance(a, torch.Tensor) and utils.is_integer_dtype(a.dtype) ) @@ -1198,7 +1200,7 @@ def 
_broadcast_in_dim_meta( # (no relative reordering of dims) of integers and # each dimension must be within the new shape def _greater_than_reduce(acc, x): - assert isinstance(x, int) + assert isinstance(x, Dim) assert x > acc assert x < len(shape) @@ -2319,7 +2321,7 @@ def _arange_meta( ) if dtype is not None: pass - elif all(isinstance(arg, int) for arg in (start, end, step)): + elif all(isinstance(arg, IntLike) for arg in (start, end, step)): dtype = torch.int64 else: dtype = torch.get_default_dtype() diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py index 72a01a85359c8..d8321ac9a47c7 100644 --- a/torch/_prims_common/__init__.py +++ b/torch/_prims_common/__init__.py @@ -47,7 +47,15 @@ def getnvFuserDtype(dtype: Union[torch.dtype, NumberTypeType]): # TODO: This needs a lot more type annotations # NumberType = Union[bool, int, float, complex, torch.SymIntNode, torch.SymFloatNode] NumberType = Union[bool, int, float, complex] + Number = (bool, int, float, complex, torch.SymIntNode, torch.SymFloatNode) +# I don't call it Integral because numbers.Integral includes bool, but IntLike +# does not +Dim = int +IntLike = (int, torch.SymIntNode) +FloatLike = (float, torch.SymFloatNode) +IntWithoutSymInt = int +FloatWithoutSymFloat = float DeviceLikeType = Union[str, torch.device] Tensor = torch.Tensor @@ -433,8 +441,8 @@ def validate_idx(rank: int, idx: int): Assumes the index is already canonicalized. """ - assert isinstance(idx, int) - assert isinstance(rank, int) + assert isinstance(idx, Dim) + assert isinstance(rank, Dim) assert idx >= 0 and idx < rank or idx == 0 @@ -450,8 +458,8 @@ def validate_exclusive_idx(rank: int, ex_idx: int): for the given shape. 
""" - assert isinstance(ex_idx, int) - assert isinstance(rank, int) + assert isinstance(ex_idx, Dim) + assert isinstance(rank, Dim) assert ex_idx > 0 and ex_idx <= rank @@ -500,7 +508,7 @@ def canonicalize_dims(rank: int, indices: int) -> int: def canonicalize_dims(rank, indices): - if isinstance(indices, int): + if isinstance(indices, Dim): return canonicalize_dim(rank, indices) return tuple(canonicalize_dim(rank, x) for x in indices) @@ -1439,7 +1447,8 @@ def set_correction( correction = 1 elif correction is None and unbiased is not None: correction = 0 if unbiased is False else 1 - if not isinstance(correction, int): + # NB: we don't actually support symint here, but it's harmless to accept + if not isinstance(correction, IntLike): raise ValueError("correction argument should be integer") if correction < 0: raise ValueError("correction argument should be non-negative") diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index a37673afb72af..08e1361c76220 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -16,10 +16,13 @@ from torch._prims_common import ( check, DeviceLikeType, + Dim, DimsSequenceType, DimsType, dtype_to_type, ELEMENTWISE_TYPE_PROMOTION_KIND, + FloatLike, + IntLike, is_weakly_lesser_type, Number, NumberType, @@ -39,6 +42,7 @@ elementwise_unary_scalar_wrapper, out_wrapper, ) +from torch.fx.experimental.symbolic_shapes import sym_float, sym_int # Experimental module containing prototype Python references for existing # PyTorch operations. 
@@ -298,7 +302,7 @@ def _broadcast_shapes(*_shapes): shapes = tuple( - (x,) if isinstance(x, int) else x + (x,) if isinstance(x, IntLike) else x for x in filter(lambda x: x is not None, _shapes) ) @@ -1939,8 +1943,8 @@ def _reduction( "dtype argument and out dtype must match in reduction" ) if not accepts_dim_tuple: - assert dims is None or isinstance(dims, int) - if isinstance(dims, int): + assert dims is None or isinstance(dims, Dim) + if isinstance(dims, Dim): dims = (dims,) # type: ignore[assignment] dims = utils.reduction_dims(a.shape, dims) if not has_identity: @@ -1986,7 +1990,7 @@ def all( keepdim: bool = False, ) -> TensorLikeType: # Computes nelem - if isinstance(dim, int): + if isinstance(dim, Dim): dim = (dim,) # type: ignore[assignment] a_ = _maybe_convert_to_dtype(a, torch.bool) @@ -2246,7 +2250,7 @@ def mean( ) if utils.is_integer_dtype(dtype): raise RuntimeError("result type should be floating point or complex") - if isinstance(dim, int): + if isinstance(dim, Dim): dim = (dim,) # type: ignore[assignment] dims = utils.reduction_dims(a.shape, dim) # type: ignore[arg-type] nelem = 1 if a.ndim == 0 else reduce(operator.mul, (a.shape[i] for i in dims), 1) @@ -3299,7 +3303,7 @@ def tensor_split( raise ValueError(msg) # Case 0 -- indices_or_sections is an integer or a scalar tensor n and a is split along dim into n parts of equal-ish length - if isinstance(indices_or_sections, int) or ( + if isinstance(indices_or_sections, IntLike) or ( isinstance(indices_or_sections, TensorLike) and indices_or_sections.ndim == 0 ): sections: int = ( @@ -3365,7 +3369,7 @@ def hsplit( ), ) dim = 0 if a.ndim == 1 else 1 - if isinstance(indices_or_sections, int): + if isinstance(indices_or_sections, IntLike): split_size = indices_or_sections check( (split_size != 0 and a.shape[dim] % split_size == 0), @@ -3407,7 +3411,7 @@ def vsplit( + " dimensions!" 
), ) - if isinstance(indices_or_sections, int): + if isinstance(indices_or_sections, IntLike): split_size = indices_or_sections check( (split_size != 0 and a.shape[0] % split_size == 0), @@ -3538,7 +3542,7 @@ def dsplit(a: TensorLikeType, sections: DimsType) -> TensorSequenceType: raise RuntimeError( f"torch.dsplit requires a tensor with at least 3 dimension, but got a tensor with {a.ndim} dimensions!" ) - if isinstance(sections, int) and (sections == 0 or a.shape[2] % sections != 0): + if isinstance(sections, IntLike) and (sections == 0 or a.shape[2] % sections != 0): raise RuntimeError( "torch._refs.dsplit attempted to split along dimension 2, " + f"but the size of the dimension {a.shape[2]} is not divisible by the split_size {sections}!" @@ -3983,21 +3987,21 @@ def linspace( # cast than not, because it allows us to always go into the precise path # if dtype is integral and not worry about whether start/end are float if prims.utils.is_integer_dtype(dtype): - if isinstance(start, float): - start = int(start) - if isinstance(end, float): - end = int(end) + if isinstance(start, FloatLike): + start = sym_int(start) + if isinstance(end, FloatLike): + end = sym_int(end) if py_any(isinstance(arg, complex) for arg in (start, end, steps)): raise NotImplementedError assert not isinstance(start, complex) and not isinstance(end, complex) # for mypy check( - isinstance(steps, int), + isinstance(steps, IntLike), lambda: "steps must be int, not float", exc_type=TypeError, ) - assert isinstance(steps, int) # for mypy + assert isinstance(steps, IntLike) # for mypy check(steps >= 0, lambda: "number of steps must be non-negative") factory_kwargs = { @@ -4016,7 +4020,7 @@ def linspace( if prims.utils.is_integer_dtype(dtype): # We need to cast to int, so to avoid off-by-one issues # do the entire computation with ints when we can - assert isinstance(start, int) and isinstance(end, int) + assert isinstance(start, IntLike) and isinstance(end, IntLike) step_size_x_denom = end - start 
eps = 1 if end > start else -1 denom = steps - 1 @@ -4063,10 +4067,10 @@ def logspace( # NB: NumPy doesn't have this cast if prims.utils.is_integer_dtype(dtype): - if isinstance(start, float): - start = int(start) - if isinstance(end, float): - end = int(end) + if isinstance(start, FloatLike): + start = sym_int(start) + if isinstance(end, FloatLike): + end = sym_int(end) assert not isinstance(base, complex) # for mypy if base < 0: @@ -4402,10 +4406,10 @@ def uniform( ) -> TensorLikeType: utils.validate_shape(shape) - assert isinstance(low, (bool, int, float)) - assert isinstance(high, (bool, int, float)) - low = float(low) - high = float(high) + assert isinstance(low, Number) + assert isinstance(high, Number) + low = sym_float(low) + high = sym_float(high) assert isinstance(dtype, torch.dtype) device = utils.canonicalize_device(device) @@ -4505,10 +4509,10 @@ def norm( ) -> TensorLikeType: # In these cases we compute the "Frobenius norm" if ( - p == "fro" and (dim is None or isinstance(dim, int) or len(dim) <= 2) + p == "fro" and (dim is None or isinstance(dim, Dim) or len(dim) <= 2) ) or p is None: p = 2 - if isinstance(dim, int): + if isinstance(dim, Dim): dim = [dim] if isinstance(p, str): # Here we either call the nuclear norm, or we call matrix_norm with some arguments diff --git a/torch/_refs/linalg/__init__.py b/torch/_refs/linalg/__init__.py index c3b8a3c603524..c8c8f84570d8e 100644 --- a/torch/_refs/linalg/__init__.py +++ b/torch/_refs/linalg/__init__.py @@ -14,6 +14,7 @@ check, check_fp_or_complex, check_is_matrix, + Dim, DimsType, NumberType, TensorLikeType, @@ -69,7 +70,7 @@ def vector_norm( # Checks check_fp_or_complex(x.dtype, "linalg.vector_norm") - if isinstance(dim, int): + if isinstance(dim, Dim): dim = [dim] # type: ignore[assignment] elif not isinstance(dim, List) and dim is not None: # refs.amin just accepts List rather than DimType (Tuple) @@ -142,7 +143,7 @@ def matrix_norm( check_is_matrix(A, "linalg.matrix_norm") # dim dim = 
utils.canonicalize_dims(A.ndim, dim) - if isinstance(dim, int): + if isinstance(dim, Dim): dim = (dim,) # type: ignore[assignment] check(len(dim) == 2, lambda: "linalg.matrix_norm: dim must be a 2-tuple. Got {dim}") check( @@ -219,7 +220,7 @@ def norm( dtype: Optional[torch.dtype] = None, ) -> TensorLikeType: if dim is not None: - if isinstance(dim, int): + if isinstance(dim, Dim): dim = (dim,) # type: ignore[assignment] check( len(dim) in (1, 2), From 0cad1bbd9ae735eac5dfc064648fc3437915b423 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 21 Oct 2022 16:03:00 +0000 Subject: [PATCH 0006/1922] Revert "Back out "Revert D40198461: [pytorch][PR] Backport currently dont work with some models if:" (#87124)" This reverts commit a42fbfa0cb467b582799a5132561c82a3d33b1b7. Reverted https://github.com/pytorch/pytorch/pull/87124 on behalf of https://github.com/ZainRizvi due to This is causing periodic jobs to fail --- buckbuild.bzl | 9 ++++----- test/cpp/jit/test_flatbuffer.cpp | 12 ++++-------- .../jit/mobile/compatibility/backport_manager.cpp | 2 -- torch/csrc/jit/mobile/flatbuffer_loader.cpp | 6 ++---- .../csrc/jit/serialization/flatbuffer_serializer.cpp | 6 ++---- 5 files changed, 12 insertions(+), 23 deletions(-) diff --git a/buckbuild.bzl b/buckbuild.bzl index d0185aa313a47..24302e64c92f1 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -1697,7 +1697,7 @@ def define_buck_targets( "torch/csrc/jit/serialization/mobile_bytecode.fbs", ], outs = { - "mobile_bytecode_generated_fbsource.h": ["mobile_bytecode_generated.h"], + "mobile_bytecode_generated.h": ["mobile_bytecode_generated.h"], }, cmd = "$(exe {})".format(third_party("flatc")) + " --cpp --gen-mutable --scoped-enums -o ${OUT} ${SRCS}", @@ -1713,7 +1713,7 @@ def define_buck_targets( name = "mobile_bytecode", header_namespace = "", exported_headers = { - "torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h": ":mobile_bytecode_header[mobile_bytecode_generated_fbsource.h]", + 
"torch/csrc/jit/serialization/mobile_bytecode_generated.h": ":mobile_bytecode_header[mobile_bytecode_generated.h]", }, # Avoid leaking implementation details by only exposing this header to # the internals of the loader/serializer layer. @@ -1721,9 +1721,6 @@ def define_buck_targets( "{}:flatbuffer_loader".format(ROOT), "{}:flatbuffer_serializer_mobile".format(ROOT), ], - exported_deps = [ - third_party("flatbuffers-api"), - ], ) fb_xplat_cxx_library( @@ -1744,6 +1741,7 @@ def define_buck_targets( ":mobile_bytecode", ":torch_mobile_module", C10, + third_party("flatbuffers-api"), ], exported_deps = [ ":torch_mobile_train", @@ -1781,6 +1779,7 @@ def define_buck_targets( visibility = ["PUBLIC"], deps = [ ":mobile_bytecode", + third_party("flatbuffers-api"), ], exported_deps = [ ":torch_mobile_deserialize", diff --git a/test/cpp/jit/test_flatbuffer.cpp b/test/cpp/jit/test_flatbuffer.cpp index 89efcf7390179..de49838fc9ab6 100644 --- a/test/cpp/jit/test_flatbuffer.cpp +++ b/test/cpp/jit/test_flatbuffer.cpp @@ -27,14 +27,6 @@ #include #include #include - -#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2) -#include // NOLINT -namespace flatbuffers = flatbuffers_fbsource; -#define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT -#else -#include // NOLINT -#endif // Tests go in torch::jit namespace torch { namespace jit { @@ -1804,9 +1796,13 @@ TEST(FlatbufferUpgraderTest, DivScalarInplaceIntV2) { } // namespace jit } // namespace torch +#include namespace torch { namespace jit { +#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD) +namespace flatbuffers = flatbuffers_fbsource; +#endif /** * An Allocator that can only deallocate (using delete []), counting * the number of times that it has been asked to deallocate. 
diff --git a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp index 489084912445f..2bad08c0765a2 100644 --- a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp +++ b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -504,7 +503,6 @@ std::stringstream backport_v7_to_v6(std::stringstream& input_model_stream) { std::stringstream backport_v9_to_v8(std::stringstream& input_model_stream) { ExtraFilesMap extra_files; - register_flatbuffer_all(); Module torch_script = torch::jit::load(input_model_stream, c10::nullopt, extra_files); std::stringstream intermediate_model_stream; diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.cpp b/torch/csrc/jit/mobile/flatbuffer_loader.cpp index 45e31fb5e1747..fb23e7ee97753 100644 --- a/torch/csrc/jit/mobile/flatbuffer_loader.cpp +++ b/torch/csrc/jit/mobile/flatbuffer_loader.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #ifndef DISABLE_UPGRADER @@ -49,12 +50,9 @@ #include #endif -#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2) -#include // NOLINT +#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD) namespace flatbuffers = flatbuffers_fbsource; #define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT -#else -#include // NOLINT #endif namespace torch { diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp index 54ec7c7b6ed3e..690541450a441 100644 --- a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp +++ b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp @@ -20,13 +20,11 @@ #include #include #include +#include // NOLINT -#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2) -#include // NOLINT +#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD) namespace flatbuffers = flatbuffers_fbsource; #define FLATBUFFERS_MAX_ALIGNMENT 
FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT -#else -#include // NOLINT #endif namespace torch { From 786500c480d58bbfdee178babf19902329953bc7 Mon Sep 17 00:00:00 2001 From: jyx-su <108294040+jyx-su@users.noreply.github.com> Date: Fri, 21 Oct 2022 16:28:29 +0000 Subject: [PATCH 0007/1922] Fix input dimension issue in RNN, LSTM, GRU error message (#87442) Fixes #86576 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87442 Approved by: https://github.com/albanD --- torch/nn/modules/rnn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py index 4d6fd9c959ebc..f94728653b0f6 100644 --- a/torch/nn/modules/rnn.py +++ b/torch/nn/modules/rnn.py @@ -441,6 +441,7 @@ def forward(self, input, hx=None): # noqa: F811 max_batch_size = int(batch_sizes[0]) else: batch_sizes = None + assert (input.dim() in (2, 3)), f"RNN: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor" is_batched = input.dim() == 3 batch_dim = 0 if self.batch_first else 1 if not is_batched: @@ -733,6 +734,7 @@ def forward(self, input, hx=None): # noqa: F811 max_batch_size = int(max_batch_size) else: batch_sizes = None + assert (input.dim() in (2, 3)), f"LSTM: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor" is_batched = input.dim() == 3 batch_dim = 0 if self.batch_first else 1 if not is_batched: @@ -923,6 +925,7 @@ def forward(self, input, hx=None): # noqa: F811 max_batch_size = int(max_batch_size) else: batch_sizes = None + assert (input.dim() in (2, 3)), f"GRU: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor" is_batched = input.dim() == 3 batch_dim = 0 if self.batch_first else 1 if not is_batched: From edc78a47985fad6cdf3e5c45dadbcc8606557747 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Fri, 21 Oct 2022 06:21:41 -0700 Subject: [PATCH 0008/1922] Reland "add an API for external backends to register custom device names (#86992)" (#87453) Re-land of https://github.com/pytorch/pytorch/pull/86992 This 
reverts commit a895af92506f206889610251624590798d0deabd. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87453 Approved by: https://github.com/ezyang, https://github.com/albanD --- aten/src/ATen/core/dispatch/OperatorEntry.cpp | 18 ++++++- c10/core/Device.cpp | 3 ++ c10/core/DeviceType.cpp | 48 ++++++++++++++++++- c10/core/DeviceType.h | 3 ++ torch/_C/__init__.pyi.in | 3 ++ torch/csrc/Module.cpp | 18 +++++++ torch/utils/__init__.py | 1 + torch/utils/backend_registration.py | 30 ++++++++++++ 8 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 torch/utils/backend_registration.py diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 5d53500e7dfe0..822924a602533 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -495,6 +495,22 @@ void OperatorEntry::reportSignatureError(const CppSignature& call_signature, con ); }; +std::string post_process_dispatch_key_str(std::string dispatch_key) { + const std::string substr = "PrivateUse1"; + if (substr.size() <= dispatch_key.size() && std::equal(substr.rbegin(), substr.rend(), dispatch_key.rbegin())) { + auto privateuse1_backend = get_privateuse1_backend(); + if (privateuse1_backend != "privateuseone") { + // remove trailing "*PrivateUse1" + dispatch_key.erase(dispatch_key.length() - substr.length()); + // append the registered backend's name. + // AutogradPrivateUse1 -> AutogradFoo + auto backend_name = c10::get_privateuse1_backend(); + dispatch_key = dispatch_key + backend_name; + } + } + return dispatch_key; +} + void OperatorEntry::reportError(DispatchKey dispatchKey) const { // If there is an invariant problem, report it now. checkInvariants(); @@ -509,7 +525,7 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const { } TORCH_CHECK_NOT_IMPLEMENTED(false, "Could not run '", name_, "' with arguments", - " from the '", toString(dispatchKey), "' backend. 
This could be because " + " from the '", post_process_dispatch_key_str(toString(dispatchKey)), "' backend. This could be because " "the operator doesn't exist for this backend, or was omitted during ", "the selective/custom build process (if using custom build). If you are a ", "Facebook employee using PyTorch on mobile, please visit ", diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp index 7b55d2dbe283b..96d2504ec7de5 100644 --- a/c10/core/Device.cpp +++ b/c10/core/Device.cpp @@ -47,6 +47,9 @@ DeviceType parse_type(const std::string& device_string) { if (device != types.end()) { return device->second; } + if (device_string == get_privateuse1_backend()) { + return DeviceType::PrivateUse1; + } std::vector device_names; for (const auto& it : types) { if (it.first) { diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp index ac4c1f653efbf..22f0029d747d4 100644 --- a/c10/core/DeviceType.cpp +++ b/c10/core/DeviceType.cpp @@ -1,5 +1,9 @@ #include #include +#include +#include +#include +#include namespace c10 { @@ -46,7 +50,7 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) { case DeviceType::IPU: return lower_case ? "ipu" : "IPU"; case DeviceType::PrivateUse1: - return lower_case ? "privateuseone" : "PRIVATEUSEONE"; + return get_privateuse1_backend(/*lowercase=*/lower_case); default: TORCH_CHECK( false, @@ -101,4 +105,46 @@ std::ostream& operator<<(std::ostream& stream, DeviceType type) { return stream; } +// We use both a mutex and an atomic here because: +// (1) Mutex is needed during writing: +// We need to first check the value and potentially error, +// before setting the value (without any one else racing in the middle). +// It's also totally fine for this to be slow, since it happens exactly once +// at import time. +// (2) Atomic is needed during reading: +// Whenever a user prints a privatuse1 device name, they need to read this +// variable. 
Although unlikely, we'll data race if someone else is trying to +// set this variable at the same time that another thread is print the +// device name. We could re-use the same mutex, but reading the atomic will +// be much faster. +static std::atomic privateuse1_backend_name_set; +static std::string privateuse1_backend_name; +static std::mutex privateuse1_lock; + +std::string get_privateuse1_backend(bool lower_case) { + // Applying the same atomic read memory ordering logic as in Note [Memory + // ordering on Python interpreter tag]. + auto name_registered = + privateuse1_backend_name_set.load(std::memory_order_acquire); + // Guaranteed that if the flag is set, then privateuse1_backend_name has been + // set, and will never be written to. + auto backend_name = + name_registered ? privateuse1_backend_name : "privateuseone"; + return backend_name; +} + +void register_privateuse1_backend(std::string backend_name) { + std::lock_guard guard(privateuse1_lock); + TORCH_CHECK( + !privateuse1_backend_name_set.load() || + privateuse1_backend_name == backend_name, + "torch.register_privateuse1_backend() has already been set! Current backend: ", + privateuse1_backend_name); + + privateuse1_backend_name = backend_name; + // Invariant: once this flag is set, privateuse1_backend_name is NEVER written + // to. 
+ privateuse1_backend_name_set.store(true, std::memory_order_relaxed); +} + } // namespace c10 diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h index 000ad331828b0..065444827833d 100644 --- a/c10/core/DeviceType.h +++ b/c10/core/DeviceType.h @@ -95,6 +95,9 @@ C10_API bool isValidDeviceType(DeviceType d); C10_API std::ostream& operator<<(std::ostream& stream, DeviceType type); +C10_API void register_privateuse1_backend(std::string backend_name); +C10_API std::string get_privateuse1_backend(bool lower_case = true); + } // namespace c10 namespace std { diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 3c81b63721ccd..5b9049e4bdc7d 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -1017,6 +1017,9 @@ def _jit_pass_lint(Graph) -> None: ... # Defined in torch/csrc/jit/python/python_custome_class.cpp def _get_custom_class_python_wrapper(name: str, attr: str) -> Any: ... +# Defined in torch/csrc/Module.cpp +def _rename_privateuse1_backend(backend: str) -> None: ... 
+ # Defined in torch/csrc/Generator.cpp class Generator(object): device: _device diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 08b9b81217e93..e41f0305a2e11 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -441,6 +441,20 @@ PyObject* THModule_getCppBacktrace(PyObject* _unused, PyObject* args) { c10::get_backtrace(frames_to_skip, maximum_number_of_frames, true)); END_HANDLE_TH_ERRORS } +static PyObject* THModule_rename_privateuse1_backend( + PyObject* _unused, + PyObject* arg) { + HANDLE_TH_ERRORS + THPUtils_assert( + THPUtils_checkString(arg), + "_rename_privateuse1_backend expects a str, " + "but got %s", + THPUtils_typename(arg)); + const std::string backend_name = THPUtils_unpackString(arg); + c10::register_privateuse1_backend(backend_name); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} PyObject* THPModule_setAllowTF32CuDNN(PyObject* _unused, PyObject* arg) { THPUtils_assert( @@ -990,6 +1004,10 @@ static PyMethodDef TorchMethods[] = { {"_to_dlpack", THPModule_toDLPack, METH_O, nullptr}, {"_from_dlpack", THPModule_fromDLPack, METH_O, nullptr}, {"_get_cpp_backtrace", THModule_getCppBacktrace, METH_VARARGS, nullptr}, + {"_rename_privateuse1_backend", + THModule_rename_privateuse1_backend, + METH_O, + nullptr}, {"set_flush_denormal", THPModule_setFlushDenormal, METH_O, nullptr}, {"get_default_dtype", THPModule_getDefaultDtype, METH_NOARGS, nullptr}, {"_get_default_device", THPModule_getDefaultDevice, METH_NOARGS, nullptr}, diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py index f05ffc3fc96b8..c2054a9b5c653 100644 --- a/torch/utils/__init__.py +++ b/torch/utils/__init__.py @@ -4,6 +4,7 @@ from .throughput_benchmark import ThroughputBenchmark from ._crash_handler import enable_minidumps, disable_minidumps, enable_minidumps_on_exceptions from .cpp_backtrace import get_cpp_backtrace +from .backend_registration import rename_privateuse1_backend # Set the module for a given object for nicer printing def set_module(obj, mod): 
diff --git a/torch/utils/backend_registration.py b/torch/utils/backend_registration.py new file mode 100644 index 0000000000000..539d5c65d237e --- /dev/null +++ b/torch/utils/backend_registration.py @@ -0,0 +1,30 @@ +from torch._C import _rename_privateuse1_backend + +def rename_privateuse1_backend(backend_name: str) -> None: + r""" + rename_privateuse1_backend(backend_name) -> None + + This is a registration API for external backends that would like to register their + own device and C++ kernels out of tree. + + The steps are: + (1) (In C++) implement kernels for various torch operations, and register them + to the PrivateUse1 dispatch key. + (2) (In python) call torch.register_privateuse1_backend("foo") + + You can now use "foo" as an ordinary device string in python. + + Note: this API can only be called once per process. Attempting to change + the external backend after it's already been set will result in an error. + + For more details, see https://pytorch.org/tutorials/advanced/extend_dispatcher.html#get-a-dispatch-key-for-your-backend + For an existing example, see https://github.com/bdhirsh/pytorch_open_registration_example + + Example:: + + >>> torch.register_privateuse1_backend("foo") + # This will work, assuming that you've implemented the right C++ kernels + # to implement torch.ones. + >>> a = torch.ones(2, device="foo") + """ + return _rename_privateuse1_backend(backend_name) From b9fb94821f059036b59c14df2349fb5b344ef429 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 21 Oct 2022 11:30:56 +0000 Subject: [PATCH 0009/1922] [FSDP][2/N] Remove `_fsdp_wrapped_module.flat_param` (#86122) This removes **direct** usages of `_fsdp_wrapped_module.flat_param` with `_handles[0].flat_param`. The preferred way to access the `flat_param` will be through the handle. We may converge to only storing `self._handles` and no longer `self.params` in the future. Right now, `self.params` is always exactly `[handle.flat_param for handle in self._handles]`. 
cc @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @H-Huang @kwen2501 Pull Request resolved: https://github.com/pytorch/pytorch/pull/86122 Approved by: https://github.com/zhaojuanmao --- test/distributed/fsdp/test_fsdp_misc.py | 4 +-- .../fsdp/test_fsdp_summon_full_params.py | 28 +++++++------------ .../fsdp/fully_sharded_data_parallel.py | 12 ++++---- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py index f2ae0dcfcaeaf..ca566b984002a 100644 --- a/test/distributed/fsdp/test_fsdp_misc.py +++ b/test/distributed/fsdp/test_fsdp_misc.py @@ -206,8 +206,8 @@ def forward(self, x, y): loss.backward() # self.a receives grad, self.b does not - a_grad = fsdp.module.a._fsdp_wrapped_module.flat_param.grad - b_grad = fsdp.module.b._fsdp_wrapped_module.flat_param.grad + a_grad = fsdp.module.a._handles[0].flat_param.grad + b_grad = fsdp.module.b._handles[0].flat_param.grad self.assertIsNotNone(a_grad) self.assertIsNone(b_grad) diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index 29bf252b796fd..d78aa81a19d7a 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -52,10 +52,8 @@ def _run_test_summon_full_param_writeback( model = wrap(nn.Sequential(lin1, lin2)) # set the value - outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter( - "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" - ) + outer_param = model._handles[0].flat_param + inner_param = model.module[0]._handles[0].flat_param p = outer_param if modify_outer else inner_param with torch.no_grad(): @@ -176,10 +174,8 @@ def test_summon_full_param_recursive(self, recurse, summon_outer, mixed_precisio shard_inner_numel = int(math.ceil(global_inner_numel / 
self.world_size)) shard_outer_numel = int(math.ceil(global_outer_numel / self.world_size)) - outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter( - "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" - ) + outer_param = model._handles[0].flat_param + inner_param = model.module[0]._handles[0].flat_param self.assertEqual(shard_outer_numel, outer_param.numel()) self.assertEqual(shard_inner_numel, inner_param.numel()) @@ -259,10 +255,8 @@ def _test_summon_full_params_respects_reshard_after_forward( **fsdp_kwargs, ).cuda(self.rank) - outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter( - "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" - ) + outer_param = model._handles[0].flat_param + inner_param = model.module[0]._handles[0].flat_param outer_full_param_size = outer_param.numel() * self.world_size # trigger lazy init @@ -285,7 +279,7 @@ def _test_summon_full_params_respects_reshard_after_forward( def test_summon_single_param(self): model = FSDP(nn.Linear(1, 1, bias=False)).cuda(self.rank) - p = model.get_parameter("_fsdp_wrapped_module.flat_param") + p = model._handles[0].flat_param self.assertEqual(1, p.numel()) with torch.no_grad(): @@ -388,10 +382,8 @@ def test_reshard_outside_forward_backward_iteration( mixed_precision=mixed_precision, ).cuda(self.rank) - outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter( - "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" - ) + outer_param = model._handles[0].flat_param + inner_param = model.module[0]._handles[0].flat_param outer_full_param_size = outer_param.numel() * self.world_size # First lets validate our assumption about resharding @@ -451,7 +443,7 @@ def test_params_are_unflattenned(self, rank0_only, offload_to_cpu, mixed_precisi ) def _get_flat_param(): - return 
fsdp_model.get_parameter("_fsdp_wrapped_module.flat_param") + return fsdp_model._handles[0].flat_param flattened_param = _get_flat_param() self.assertEqual(layer_shape[0] * layer_shape[1] / 2, flattened_param.numel()) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 31bb2d5000b2b..338a232a4271a 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -1631,7 +1631,7 @@ def module(self) -> nn.Module: return self._fsdp_wrapped_module.module def __getattr__(self, name: str) -> Any: - """Forward missing attributes to wrapped module.""" + """Forward missing attributes to the wrapped module.""" try: return super().__getattr__(name) # defer to nn.Module's logic except AttributeError: @@ -2538,7 +2538,7 @@ def state_dict(self, *args, **kwargs): self._state_dict_type == StateDictType.SHARDED_STATE_DICT ): if ( - self._fsdp_wrapped_module.flat_param is not None and + self._fsdp_wrapped_module.has_params and not self._fsdp_wrapped_module.handle.uses_sharded_strategy ): raise RuntimeError( @@ -2606,8 +2606,8 @@ def _local_pre_load_state_dict_hook( _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_WRAPPED_MODULE}.") fqn = f"{prefix}{FSDP_WRAPPED_MODULE}.{FLAT_PARAM}" if fqn not in state_dict: - assert getattr(self._fsdp_wrapped_module, FLAT_PARAM, None) is None, ( - "No flat parameter in state_dict but self._fsdp_wrapped_module.flat_param is not None" + assert not self._fsdp_wrapped_module.has_params, ( + "No `FlatParameter` in `state_dict` for this FSDP instance but it has parameters" ) return load_tensor = state_dict[fqn] @@ -2622,7 +2622,7 @@ def _local_pre_load_state_dict_hook( # Get the metada of the flat_param to decide whether to pad the loaded # tensor. 
- flat_param = self._fsdp_wrapped_module.flat_param + flat_param = self._handles[0].flat_param assert flat_param is not None if flat_param._shard_numel_padded not in (0, flat_param.numel()): assert load_tensor.numel() < flat_param.numel(), ( @@ -2694,7 +2694,7 @@ def _sharded_pre_load_state_dict_hook( nonsharded_tensors.append(tensor) # Create a new flat_param from the loaded, non-sharded tensors. - flat_param = self._fsdp_wrapped_module.flat_param + flat_param = self._handles[0].flat_param loaded_flat_param = FlatParamHandle.flatten_params(nonsharded_tensors, requires_grad=False) # Get the chunk from the loaded flat_param for the local rank. From b142154711e9186446364a4d1a1682ab3d57ee0b Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 21 Oct 2022 11:30:57 +0000 Subject: [PATCH 0010/1922] [FSDP][3/N] Register `flat_param` to wrapped module (#87086) This PR registers each `FlatParameter` to the wrapped module, eliminating `FlattenParamsWrapper` usage completely from FSDP. Registering each `FlatParameter` to the wrapped module is preferred over registering to the `FullyShardedDataParallel` instance for both functional-like and non-recursive wrapping. It simplifies the `FlatParameter` naming to be a function of the number of `FlatParameter`s per wrapped module instead of the number of `FlatParameter`s per FSDP instance. For now, we assume 1 `FlatParameter` per wrapped module, so we can simply use a single name `FLAT_PARAM = _flat_param`. From an implementation perspective, we raise some methods from `FlattenParamsWrapper` directly up to `FullyShardedDataParallel`. There will need to be further refactoring for functional-like and non-recursive wrapping. For example, the property `self._has_params -> bool` may need to change to a method `self._has_params(wrapped_module) -> bool`. Such changes are out of scope for this PR and will be done in follow-ups. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87086 Approved by: https://github.com/zhaojuanmao --- test/distributed/fsdp/test_fsdp_state_dict.py | 20 +- torch/distributed/fsdp/flat_param.py | 2 +- .../fsdp/fully_sharded_data_parallel.py | 202 +++++++++++------- 3 files changed, 133 insertions(+), 91 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py index af56ee956743f..6592ec108f074 100644 --- a/test/distributed/fsdp/test_fsdp_state_dict.py +++ b/test/distributed/fsdp/test_fsdp_state_dict.py @@ -23,9 +23,7 @@ StateDictType, ) from torch.distributed.fsdp._shard_utils import _gather_state_dict -from torch.distributed.fsdp.fully_sharded_data_parallel import ( - FullyShardedDataParallel, -) +from torch.distributed.fsdp.fully_sharded_data_parallel import FLAT_PARAM from torch.distributed.fsdp.wrap import ( enable_wrap, transformer_auto_wrap_policy, @@ -124,10 +122,14 @@ def _broadcast_state_dict(self, state_dict): return olist[0] def _compare_models(self, model, model_new, assert_fn, check_fp16=False): - with FullyShardedDataParallel.summon_full_params(model): - with FullyShardedDataParallel.summon_full_params(model_new): + assert assert_fn in (self.assertEqual, self.assertNotEqual) + with FSDP.summon_full_params(model): + with FSDP.summon_full_params(model_new): params = list(model.parameters()) params_new = list(model_new.parameters()) + # Regardless of `assert_fn`, the number of parameters should be + # the same + self.assertEqual(len(params), len(params_new)) assert_fn(params, params_new) if check_fp16: for tensor in model_new.parameters(): @@ -327,8 +329,8 @@ def test_state_dict_rank0_offload_save_load_flow(self, use_orig_params: bool): assert_fn=self.assertEqual, ) # Check FSDP models correctly loaded the checkpoint - with FullyShardedDataParallel.summon_full_params(fsdp_model): - with FullyShardedDataParallel.summon_full_params(new_fsdp_model): + with 
FSDP.summon_full_params(fsdp_model): + with FSDP.summon_full_params(new_fsdp_model): params = list(fsdp_model.parameters()) params_new = list(new_fsdp_model.parameters()) self.assertEqual(params, params_new) @@ -570,7 +572,7 @@ def test_state_dict_save_load_flow(self, state_dict_type): def test_fsdp_state_dict_keys(self, state_dict_type): state_dict = self._state_dict(self._initialize_model(True), state_dict_type) if state_dict_type == "local_state_dict": - self.assertEqual(set(["flat_param", "inner.flat_param"]), state_dict.keys()) + self.assertEqual(set([FLAT_PARAM, f"inner.{FLAT_PARAM}"]), state_dict.keys()) elif state_dict_type in ("state_dict", "sharded_state_dict"): # Keys should match local model. local_model = self._initialize_model(wrap_fsdp=False, wrap_ddp=False) @@ -606,7 +608,7 @@ def test_state_dict_load_into_local_module( optim.step() optim.zero_grad() - with FullyShardedDataParallel.summon_full_params(model): + with FSDP.summon_full_params(model): fsdp_params = deepcopy(list(model.parameters())) # get FSDP state_dict. Note that by default we return full_state_dict. 
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py index 6e30a031a16c7..2c65dd80ea3c3 100644 --- a/torch/distributed/fsdp/flat_param.py +++ b/torch/distributed/fsdp/flat_param.py @@ -297,7 +297,7 @@ def __init__( device: torch.device, config: HandleConfig, use_orig_params: bool, - ) -> None: + ): super().__init__() self.device = device self._config = config diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 338a232a4271a..2cf3af6d540c0 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -83,11 +83,6 @@ HandleShardingStrategy, HandleTrainingState, ) -from .flatten_params_wrapper import ( - FLAT_PARAM, - FPW_MODULE, - FlattenParamsWrapper, -) from .wrap import ( ParamExecOrderWrapPolicy, _or_policy, @@ -120,8 +115,12 @@ ] +# NOTE: `FSDP_WRAPPED_MODULE` cannot be a substring of any other module wrapper +# name (e.g. for activation checkpointing) since then `replace()`-based FQN +# cleaning breaks. FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module" -FSDP_PREFIX = FSDP_WRAPPED_MODULE + "." + FPW_MODULE + "." +FSDP_PREFIX = FSDP_WRAPPED_MODULE + "." 
+FLAT_PARAM = "_flat_param" _PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024) @@ -1087,27 +1086,22 @@ def __init__( self.mixed_precision.reduce_dtype, self.mixed_precision.keep_low_precision_grads, ) - self._fsdp_wrapped_module = FlattenParamsWrapper( - module, - params_to_flatten, - self.compute_device, - config, - use_orig_params, - ) - if not use_orig_params: - self._check_orig_params_flattened(ignored_params) # Invariant: `self.params` contains exactly the `FlatParameter`s of the # handles in `self._handles` self._handles: List[FlatParamHandle] = [] self.params: List[FlatParameter] = [] - if self._fsdp_wrapped_module.has_params: - handle = self._fsdp_wrapped_module.handle + self._fsdp_wrapped_module = module + if params_to_flatten: + handle = FlatParamHandle(params_to_flatten, module, self.compute_device, config, use_orig_params) + self._handles.append(handle) self.params.append(handle.flat_param) self._register_param_handle(handle) handle.shard(self.process_group) if self.cpu_offload.offload_params and handle.flat_param.device != torch.device("cpu"): - with torch.no_grad(): - handle.flat_param_to(torch.device("cpu")) + handle.flat_param_to(torch.device("cpu")) + if not use_orig_params: + self._check_orig_params_flattened(ignored_params) + self._register_flat_param() self._sync_gradients = True self._communication_hook = self._get_default_comm_hook() @@ -1190,7 +1184,7 @@ def _get_ignored_modules( child for module in ignored_root_modules for child in module.modules() - if not isinstance(child, (FullyShardedDataParallel, FlattenParamsWrapper)) + if not isinstance(child, FullyShardedDataParallel) ) if root_module in ignored_modules: warnings.warn( @@ -1243,13 +1237,10 @@ def _get_buffer_names(self, root_module: nn.Module) -> Set[str]: """ def module_fn(module: nn.Module, prefix: str, buffer_names: Set[str]): - # For FSDP modules, only add the entry when considering the - # contained `FlattenParamsWrapper` to avoid duplication - if not isinstance(module, 
FullyShardedDataParallel): - for buffer_name, _ in module.named_buffers(recurse=False): - # Clean module wrapper prefixes in case of nested wrapping - prefixed_buffer_name = clean_tensor_name(prefix + buffer_name) - buffer_names.add(prefixed_buffer_name) + for buffer_name, _ in module.named_buffers(recurse=False): + # Clean module wrapper prefixes in case of nested wrapping + prefixed_buffer_name = clean_tensor_name(prefix + buffer_name) + buffer_names.add(prefixed_buffer_name) def return_fn(buffer_names: Set[str], *args): return buffer_names @@ -1627,8 +1618,16 @@ def module(self) -> nn.Module: """ Returns the wrapped module (like :class:`DistributedDataParallel`). """ - assert isinstance(self._fsdp_wrapped_module, FlattenParamsWrapper) - return self._fsdp_wrapped_module.module + return self._fsdp_wrapped_module + + @property + def _has_params(self) -> bool: + """Returns whether this FSDP instance manages any parameters.""" + return hasattr(self, "_handles") and len(self._handles) > 0 + + @property + def _flat_param(self) -> Optional[FlatParameter]: + return self._handles[0].flat_param if self._handles else None def __getattr__(self, name: str) -> Any: """Forward missing attributes to the wrapped module.""" @@ -1638,7 +1637,7 @@ def __getattr__(self, name: str) -> Any: return getattr(self._fsdp_wrapped_module, name) def __getitem__(self, key: int) -> Any: - """Forward indexing calls in case the module is a nn.Sequential.""" + """Forward indexing calls in case the module is an ``nn.Sequential``.""" return self._fsdp_wrapped_module.__getitem__(key) # type: ignore[operator] def check_is_root(self) -> bool: @@ -2228,8 +2227,8 @@ def state_dict_type( ) def _convert_to_wrapped_module_name(self, module_name: str) -> str: - module_name = module_name.replace(f"{FPW_MODULE}.", "") - module_name = module_name.replace(f"{FPW_MODULE}", "") + module_name = module_name.replace(f"{FSDP_PREFIX}", "") + module_name = module_name.replace(f"{FSDP_WRAPPED_MODULE}", "") if module_name: 
module_name = f"{module_name}." # Activation checkpoint adds a prefix that has to be @@ -2241,6 +2240,8 @@ def _convert_to_wrapped_module_name(self, module_name: str) -> str: @property def _param_fqns(self) -> Iterator[Tuple[str, str, str]]: + if not self._has_params: + return for param_name, module_name in ( self._handles[0].parameter_module_names() ): @@ -2266,12 +2267,12 @@ def _full_post_state_dict_hook( Hook that runs after model.state_dict() is called before returning result to user. For FSDP, we may have to clone the tensors in state_dict as params go back to sharded version after _summon_full_params ends, and also remove - "_fsdp_wrapped_module" prefix. + the ``FSDP_WRAPPED_MODULE`` prefix. """ - _replace_by_prefix(state_dict, prefix + f"{FSDP_WRAPPED_MODULE}.", prefix) + _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix) self._assert_state([TrainingState_.SUMMON_FULL_PARAMS]) # Return early for trivial cases - if not state_dict or not self._fsdp_wrapped_module.has_params: + if not state_dict or not self._has_params: return state_dict # If a rank has already exited the `summon_full_params()` context here @@ -2285,7 +2286,7 @@ def _full_post_state_dict_hook( if ( ( not self._use_orig_params - and "flat_param" in self._fsdp_wrapped_module._parameters + and FLAT_PARAM in self.module._parameters ) or ( self._use_orig_params @@ -2299,8 +2300,8 @@ def _full_post_state_dict_hook( offload_to_cpu = self._state_dict_config.offload_to_cpu cpu_device = torch.device("cpu") - # Loop only the parameters saved in self._fsdp_wrapped_module to avoid - # processing buffers. + # Loop only the parameters saved in this instance's wrapped module to + # avoid processing buffers. for fqn, param_name, module_name in self._param_fqns: fqn = f"{prefix}{fqn}" clean_key = fqn @@ -2361,16 +2362,16 @@ def _local_post_state_dict_hook( the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy will happen. The underlying storage is the same. 
""" - _replace_by_prefix(state_dict, f"{prefix}{FSDP_WRAPPED_MODULE}.", prefix) - if not self._fsdp_wrapped_module.has_params: + _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix) + if not self._has_params: return state_dict # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor # value as the flat_param but it is a pure Tensor because # nn.Module.state_dict() will detach the parameter. Therefore, we need - # to get flat_param from the FlattenParamsWrapper to get the metadata. - flat_param = getattr(self._fsdp_wrapped_module, FLAT_PARAM, None) - assert flat_param is not None + # to get flat_param to get the metadata. + assert self._handles, "Should have returned early" + flat_param = self._handles[0].flat_param # Construct a ShardedTensor from the flat_param. full_numel = flat_param._unpadded_unsharded_size.numel() # type: ignore[attr-defined] shard_offset = flat_param.numel() * self.rank @@ -2398,8 +2399,8 @@ def _sharded_post_state_dict_hook( The hook replaces the unflattened, unsharded parameter in the state_dict with a unflattened, sharded parameter (a ShardedTensor). 
""" - _replace_by_prefix(state_dict, f"{prefix}{FSDP_WRAPPED_MODULE}.", prefix) - if not self._fsdp_wrapped_module.has_params: + _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix) + if not self._has_params: return state_dict assert self.training_state != TrainingState_.SUMMON_FULL_PARAMS, ( @@ -2538,8 +2539,8 @@ def state_dict(self, *args, **kwargs): self._state_dict_type == StateDictType.SHARDED_STATE_DICT ): if ( - self._fsdp_wrapped_module.has_params and - not self._fsdp_wrapped_module.handle.uses_sharded_strategy + self._has_params and + not self._handles[0].uses_sharded_strategy ): raise RuntimeError( "sharded_state_dict/local_state_dict can only be called " @@ -2588,7 +2589,7 @@ def _full_pre_load_state_dict_hook( recurse=False, writeback=True ) self._full_param_ctx.__enter__() - _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_WRAPPED_MODULE}.") + _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}") def _local_post_load_state_dict_hook(self, *args, **kwargs) -> None: pass @@ -2603,10 +2604,10 @@ def _local_pre_load_state_dict_hook( state_dict. The flat_param should be a ShardedTensor. This hook converts the ShardedTensor to a tensor. No copy happen unless padding is required. """ - _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_WRAPPED_MODULE}.") - fqn = f"{prefix}{FSDP_WRAPPED_MODULE}.{FLAT_PARAM}" + _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}") + fqn = f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}" if fqn not in state_dict: - assert not self._fsdp_wrapped_module.has_params, ( + assert not self._has_params, ( "No `FlatParameter` in `state_dict` for this FSDP instance but it has parameters" ) return @@ -2645,11 +2646,11 @@ def _sharded_pre_load_state_dict_hook( The hook combines the unflattened, sharded parameters (ShardedTensor) to a new FlatParameter and shards the new FlatParameter to the local chunk. 
""" - _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_WRAPPED_MODULE}.") - if not self._fsdp_wrapped_module.has_params: + _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}") + if not self._has_params: return - if not self._fsdp_wrapped_module.handle.uses_sharded_strategy: + if not self._handles[0].uses_sharded_strategy: raise RuntimeError( "load_sharded_state_dict can only be called when parameters " "are flatten and sharded." @@ -2663,7 +2664,7 @@ def _sharded_pre_load_state_dict_hook( # https://github.com/pytorch/pytorch/issues/77461 shared_fqns = [fqn for fqn, _, _ in self._shared_param_fqns] for fqn, _, _ in self._param_fqns: - full_fqn = f"{prefix}{FSDP_WRAPPED_MODULE}.{fqn}" + full_fqn = f"{prefix}{FSDP_PREFIX}{fqn}" param = state_dict.pop(full_fqn) if fqn in shared_fqns: continue @@ -2710,7 +2711,7 @@ def _sharded_pre_load_state_dict_hook( f"The loaded local chunk has different padding({num_to_pad}) " f"from the local chunk {flat_param._shard_numel_padded}." ) - state_dict[f"{prefix}_fsdp_wrapped_module.flat_param"] = loaded_flat_param + state_dict[f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"] = loaded_flat_param if self._use_orig_params: self._deregister_orig_params() @@ -3157,7 +3158,7 @@ def _summon_full_params( # move parameters. # TODO (awgu): This FPW call assumes 1 `FlatParameter` if not self._use_orig_params: - stack.enter_context(self._fsdp_wrapped_module.unflatten_as_params()) + stack.enter_context(self._unflatten_as_params()) try: yield finally: @@ -3215,6 +3216,50 @@ def _writeback_to_local_shard( ) existing_grad[:grad_shard.numel()].copy_(grad_shard) + @contextlib.contextmanager + def _unflatten_as_params(self) -> Generator: + """ + Assumes that the flattened parameter is unsharded. When in the context, + de-registers the flattened parameter and unflattens the original + parameters as ``nn.Parameter`` views into the flattened parameter. 
+ After the context, re-registers the flattened parameter and restores + the original parameters as ``Tensor`` views into the flattened + parameter. + """ + if not self._handles: + yield + else: + self._deregister_flat_param() + try: + with self._handles[0].unflatten_as_params(): + yield + finally: + if not self._handles[0]._use_orig_params: + self._register_flat_param() + + def _register_flat_param(self): + """ + Registers the flattened parameter to the wrapped module, making it + visible to ``nn.Module`` methods. + + We do not use :meth:`nn.Module.register_parameter` because we want + ``FLAT_PARAM`` to always be an attribute but dynamically change whether + it is visible to ``nn.Module`` methods. + """ + if self._has_params: + self.module._parameters[FLAT_PARAM] = self._handles[0].flat_param + + def _deregister_flat_param(self): + """ + De-registers the flattened parameter from the wrapped module, hiding it + from ``nn.Module`` methods. + + We do not use ``del`` because we want ``FLAT_PARAM`` to always be an + attribute but dynamically change whether it is visible to ``nn.Module`` + methods. + """ + self.module._parameters.pop(FLAT_PARAM, None) + @contextlib.contextmanager def _deregister_orig_params_ctx(self): """ @@ -3254,7 +3299,7 @@ def _deregister_orig_params(self): f"handle: {handle._use_orig_params}" ) handle._deregister_orig_params() - self._fsdp_wrapped_module._register_flat_param() + self._register_flat_param() def _register_orig_params(self): """ @@ -3263,7 +3308,7 @@ def _register_orig_params(self): if not self._handles: return handle = self._handles[0] - self._fsdp_wrapped_module._deregister_flat_param() + self._deregister_flat_param() if handle.is_sharded(handle.flat_param): handle._use_sharded_views() handle._use_sharded_grad_views() @@ -4613,25 +4658,22 @@ def _get_param_to_unflat_param_names( unflattened parameter names. 
""" def module_fn(module, prefix, param_to_unflat_param_names): - # For FSDP modules, only add the entry when considering the contained - # `FlattenParamsWrapper` to avoid duplication - if not isinstance(module, FullyShardedDataParallel): - for param_name, param in module.named_parameters(recurse=False): - module_prefixed_param_names = ( - param._fqns if type(param) is FlatParameter - else [param_name] - ) # prefixed from `module` - fully_prefixed_param_names = [ - clean_tensor_name(prefix + name) - for name in module_prefixed_param_names - ] # fully prefixed from the top level including `prefix` - # If this parameter has already been visited, then it is a - # shared parameter; then, only take the first parameter name - is_shared_param = param in param_to_unflat_param_names - if not is_shared_param: - param_to_unflat_param_names[param] = fully_prefixed_param_names - elif not dedup_shared_params: - param_to_unflat_param_names[param].extend(fully_prefixed_param_names) + for param_name, param in module.named_parameters(recurse=False): + module_prefixed_param_names = ( + param._fqns if type(param) is FlatParameter + else [param_name] + ) # prefixed from `module` + fully_prefixed_param_names = [ + clean_tensor_name(prefix + name) + for name in module_prefixed_param_names + ] # fully prefixed from the top level including `prefix` + # If this parameter has already been visited, then it is a + # shared parameter; then, only take the first parameter name + is_shared_param = param in param_to_unflat_param_names + if not is_shared_param: + param_to_unflat_param_names[param] = fully_prefixed_param_names + elif not dedup_shared_params: + param_to_unflat_param_names[param].extend(fully_prefixed_param_names) def return_fn(param_to_unflat_param_names): return param_to_unflat_param_names @@ -4684,9 +4726,7 @@ def _get_param_name_to_param( def clean_tensor_name(tensor_name: str) -> str: """Cleans the parameter or buffer name by removing any module wrapper prefixes.""" - # Call 
`replace()` twice separately since the name may not have both - tensor_name = tensor_name.replace(FSDP_WRAPPED_MODULE + ".", "") - tensor_name = tensor_name.replace(FPW_MODULE + ".", "") + tensor_name = tensor_name.replace(FSDP_PREFIX, "") # TODO: Explicitly replacing checkpoint_wrapper prefix is not ideal, # as it increases coupling between CheckpointWrapper and FSDP. This is also not # scalable for additional wrapped modules, we should come up with a general solution From 1b165436c7087f898756d6f9224bc08614ed1038 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 21 Oct 2022 11:30:57 +0000 Subject: [PATCH 0011/1922] [FSDP][4/N] Rework FPW test to not use FPW (#87112) Testing coverage is pretty much preserved except that we do not test on CPU, which is not a tangible loss for FSDP anyway. I renamed a few tests slightly, and I moved some helpers to be immediately below the corresponding test method. This makes it a bit easier to read. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87112 Approved by: https://github.com/zhaojuanmao --- ...wrapper.py => test_fsdp_flatten_params.py} | 375 +++++++++++------- 1 file changed, 227 insertions(+), 148 deletions(-) rename test/distributed/fsdp/{test_flatten_params_wrapper.py => test_fsdp_flatten_params.py} (51%) diff --git a/test/distributed/fsdp/test_flatten_params_wrapper.py b/test/distributed/fsdp/test_fsdp_flatten_params.py similarity index 51% rename from test/distributed/fsdp/test_flatten_params_wrapper.py rename to test/distributed/fsdp/test_fsdp_flatten_params.py index 016398c88deba..cfc2a494d4406 100644 --- a/test/distributed/fsdp/test_flatten_params_wrapper.py +++ b/test/distributed/fsdp/test_fsdp_flatten_params.py @@ -1,44 +1,45 @@ # Owner(s): ["oncall: distributed"] import sys -import unittest import torch +import torch.nn as nn from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp.flat_param import ( + 
FlatParamHandle, FlatParamShardMetadata, HandleConfig, HandleShardingStrategy, ) -from torch.distributed.fsdp.flatten_params_wrapper import FlattenParamsWrapper -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) -class TestFlattenParams(TestCase): - """Base test class and used for CPU case.""" - - def _get_default_config(self): - return HandleConfig(HandleShardingStrategy.FULL_SHARD, False, None, None) - - def _get_empty_module(self, seed=0): - torch.manual_seed(seed) # keep everything deterministic - class Test(torch.nn.Module): - def forward(self, x): - return x + 1 +class TestFlattenParams(FSDPTest): + """Tests parameter flattening and shard metadata logic.""" - module = Test() + @property + def world_size(self) -> int: + # Clamp the world size to 1 since these unit tests either exercise only + # the flattening logic or check sharding subroutines directly without + # requiring multiple ranks + return 1 - def get_input(device, dtype): - torch.manual_seed(1) # keep everything deterministic - return torch.rand(1).to(device=device, dtype=dtype) - - module.get_input = get_input - return module + def _get_default_config(self): + return HandleConfig(HandleShardingStrategy.FULL_SHARD, False, None, None) def _get_transformer(self, seed=0): torch.manual_seed(seed) # keep everything deterministic @@ -68,152 +69,243 @@ def _get_shared_params_transformer(self, seed=0): dec_layer.linear2.weight = enc_layer.linear2.weight return module - def _get_output(self, module): - device = 
next(module.parameters()).device - dtype = next(module.parameters()).dtype - input = module.get_input(device, dtype) - return module(*input) - - def _get_pnorm_after_step(self, module): - optim = torch.optim.SGD(module.parameters(), lr=0.01) - loss = self._get_output(module).sum() - loss.backward() - optim.step() - return torch.norm(torch.stack([p.detach().norm() for p in module.parameters()])) - - def _test_num_params(self, module): - ref_num_params = sum(p.numel() for p in module.parameters()) - - params_to_flatten = list(module.parameters()) - flat_module = FlattenParamsWrapper( - module, - params_to_flatten, - torch.device("cuda"), - self._get_default_config(), - False, - ) - flat_num_params = sum(p.numel() for p in flat_module.parameters()) - - self.assertEqual(ref_num_params, flat_num_params) - self.assertEqual(flat_num_params, flat_module.flat_param.numel()) - - def _test_output(self, module): - ref_output = self._get_output(module) - - params_to_flatten = list(module.parameters()) - flat_module = FlattenParamsWrapper( - module, - params_to_flatten, - torch.device("cuda"), - self._get_default_config(), - False, + @skip_if_lt_x_gpu(1) + def test_partial_flattening(self): + """Tests flattening some submodules but not others.""" + self.run_subtests( + {"half": [False, True]}, + self._test_partial_flattening, ) - flat_output = self._get_output(flat_module) - self.assertEqual(ref_output, flat_output) - def test_partial_flattening(self): + def _test_partial_flattening(self, half: bool): module = self._get_transformer() - num_params = sum(p.numel() for p in module.parameters()) - - params_to_flatten = list(module.encoder.layers[1].parameters()) + list( - module.decoder.layers[0].parameters() + if half: + module = module.half() + numel = sum(p.numel() for p in module.parameters()) + + encoder_1_params = list(module.encoder.layers[1].parameters()) + decoder_0_params = list(module.decoder.layers[0].parameters()) + params_to_flatten = encoder_1_params + 
decoder_0_params + num_params = [len(encoder_1_params), len(decoder_0_params)] + numel_to_flatten = sum(p.numel() for p in params_to_flatten) + module.encoder.layers[1] = FSDP(module.encoder.layers[1]) + module.decoder.layers[0] = FSDP(module.decoder.layers[0]) + flat_params = [ + module.encoder.layers[1]._flat_param, + module.decoder.layers[0]._flat_param, + ] + + self.assertEqual(sum(fp.numel() for fp in flat_params), numel_to_flatten) + self.assertEqual(sum(p.numel() for p in module.parameters()), numel) + + # Check that flattened parameters have been replaced with a single + # `FlatParameter` + self.assertEqual(len(list(module.encoder.layers[1].parameters())), 1) + self.assertEqual(len(list(module.decoder.layers[0].parameters())), 1) + + # Check that non-flattened parameters remain + self.assertEqual( + len(list(module.encoder.layers[0].parameters())), num_params[0] ) - num_params_to_flatten = sum(p.numel() for p in params_to_flatten) - - module = FlattenParamsWrapper( - module, - params_to_flatten, - torch.device("cuda"), - self._get_default_config(), - False, + self.assertEqual( + len(list(module.decoder.layers[1].parameters())), num_params[1] ) - self.assertEqual(module.flat_param.numel(), num_params_to_flatten) - self.assertEqual(sum(p.numel() for p in module.parameters()), num_params) - - # flattened parameters are removed - self.assertEqual(len(list(module.encoder.layers[1].parameters())), 0) - self.assertEqual(len(list(module.decoder.layers[0].parameters())), 0) - - # non-flattened parameters remain - self.assertGreater(len(list(module.encoder.layers[0].parameters())), 0) - self.assertGreater(len(list(module.decoder.layers[1].parameters())), 0) - # test that changing the module dtype works properly + # Check that calling `module.to()` affects the `FlatParameter`s orig_dtype = params_to_flatten[0].dtype new_dtype = torch.float32 if orig_dtype == torch.float16 else torch.float16 - self.assertEqual(module.flat_param.dtype, orig_dtype) + for flat_param in 
flat_params: + self.assertEqual(flat_param.dtype, orig_dtype) self.assertTrue( all(p.dtype == orig_dtype for p in module.encoder.layers[0].parameters()) ) module = module.to(dtype=new_dtype) - self.assertEqual(module.flat_param.dtype, new_dtype) + for flat_param in flat_params: + self.assertEqual(flat_param.dtype, new_dtype) self.assertTrue( all(p.dtype == new_dtype for p in module.encoder.layers[0].parameters()) ) def test_flatten_nothing(self): - module = self._get_transformer() - module = FlattenParamsWrapper( - module, - [], - torch.device("cuda"), - self._get_default_config(), - False, + """ + Tests that constructing a ``FlatParamHandle`` with no parameters + raises an error. + """ + self.run_subtests( + {"half": [False, True]}, + self._test_flatten_nothing, ) - self.assertIsNone(module.flat_param) + def _test_flatten_nothing(self, half: bool): + module = self._get_transformer() + if half: + module = module.half() + with self.assertRaisesRegex( + ValueError, + "Cannot initialize a `FlatParameter` from an empty parameter list", + ): + FlatParamHandle( + [], + module, + torch.device("cuda"), + self._get_default_config(), + False, + ) + + @skip_if_lt_x_gpu(1) def test_empty_module(self): + """ + Tests flattening an empty module (i.e. one without any parameters). 
+ """ module = self._get_empty_module() in_data = torch.rand(1) ref_out = module(in_data) - module = FlattenParamsWrapper( - module, - [], - torch.device("cuda"), - self._get_default_config(), - False, + fsdp_module = FSDP(module) + self.assertEqual(len(list(fsdp_module.parameters())), 0) + self.assertIsNone(fsdp_module._flat_param) + fsdp_out = fsdp_module(in_data) + self.assertEqual(ref_out, fsdp_out) + + def _get_empty_module(self): + """Returns a module with no parameters.""" + torch.manual_seed(0) # keep everything deterministic + + class EmptyModule(torch.nn.Module): + def forward(self, x): + return x + 1 + + def get_input(self, device, dtype): + torch.manual_seed(1) # keep everything deterministic + return torch.rand(1).to(device=device, dtype=dtype) + + return EmptyModule() + + def test_numel_without_shared_params(self): + """ + Tests that numel is preserved after flattening when there are no shared + parameters in the module. + """ + self.run_subtests( + {"half": [False, True]}, + self._test_numel_without_shared_params, ) - self.assertEqual(len(list(module.parameters())), 0) - self.assertIsNone(module.flat_param) - fpw_out = module(in_data) - self.assertEqual(ref_out, fpw_out) - def test_num_params(self): + def _test_numel_without_shared_params(self, half: bool): module = self._get_transformer() - self._test_num_params(module) + if half: + module = module.half() + self._test_numel(module) + + def test_numel_with_shared_params(self): + """ + Tests that numel is preserved after flattening when there are shared + parameters in the module. 
+ """ + self.run_subtests( + {"half": [False, True]}, + self._test_numel_with_shared_params, + ) - def test_shared_params_num_params(self): + def _test_numel_with_shared_params(self, half: bool): module = self._get_shared_params_transformer() - self._test_num_params(module) + if half: + module = module.half() + self._test_numel(module) - def test_output(self): + def _test_numel(self, module): + ref_numel = sum(p.numel() for p in module.parameters()) + params_to_flatten = list(module.parameters()) + flat_param_handle = FlatParamHandle( + params_to_flatten, + module, + torch.device("cuda"), + self._get_default_config(), + False, + ) + self.assertEqual(ref_numel, flat_param_handle.flat_param.numel()) + + @skip_if_lt_x_gpu(1) + def test_output_without_shared_params(self): + """ + Tests a forward pass after flattening when there are no shared + parameters in the module. + """ + self.run_subtests( + {"half": [False, True]}, + self._test_output_without_shared_params, + ) + + def _test_output_without_shared_params(self, half: bool): module = self._get_transformer() + if half: + module = module.half() self._test_output(module) - def test_shared_params_output(self): + @skip_if_lt_x_gpu(1) + def test_output_with_shared_params(self): + """ + Tests a forward pass after flattening when there are shared parameters + in the module. 
+ """ + self.run_subtests( + {"half": [False, True]}, + self._test_output_with_shared_params, + ) + + def _test_output_with_shared_params(self, half: bool): module = self._get_shared_params_transformer() + if half: + module = module.half() self._test_output(module) - def test_shared_params_pnorm_after_step(self): - # incorrect parameter sharing is likely to cause problems after an - # optimization step - module = self._get_shared_params_transformer() - ref_pnorm_after_step = self._get_pnorm_after_step(module) + def _test_output(self, module: nn.Module): + module = module.to(self.rank) + ref_output = self._get_output(module) + fsdp_module = FSDP(module) + fsdp_output = self._get_output(fsdp_module) + self.assertEqual(ref_output, fsdp_output) - module = self._get_shared_params_transformer() # recreate - params_to_flatten = list(module.parameters()) - flat_module = FlattenParamsWrapper( - module, - params_to_flatten, - torch.device("cuda"), - self._get_default_config(), - False, + def _get_output(self, module): + device = next(module.parameters()).device + dtype = next(module.parameters()).dtype + input = module.get_input(device, dtype) + return module(*input) + + @skip_if_lt_x_gpu(1) + def test_pnorm_after_step_with_shared_params(self): + """ + Tests for parameter Frobenius norm parity after an optimizer step when + there are shared parameters in the module. If the parameter sharing is + handled incorrectly, then an optimizer step should reveal that. 
+ """ + self.run_subtests( + {"half": [False, True]}, + self._test_pnorm_after_step_with_shared_params, ) - flat_pnorm_after_step = self._get_pnorm_after_step(flat_module) - self.assertEqual(ref_pnorm_after_step, flat_pnorm_after_step) + def _test_pnorm_after_step_with_shared_params(self, half: bool): + module = self._get_shared_params_transformer().to(self.rank) + if half: + module = module.half() + ref_pnorm_after_step = self._get_pnorm_after_step(module) + module = self._get_shared_params_transformer().to(self.rank) # recreate + if half: + module = module.half() + fsdp_module = FSDP(module) + fsdp_pnorm_after_step = self._get_pnorm_after_step(fsdp_module) + self.assertEqual(ref_pnorm_after_step, fsdp_pnorm_after_step) - def test_sharded_flat_param(self): + def _get_pnorm_after_step(self, module): + optim = torch.optim.SGD(module.parameters(), lr=0.01) + loss = self._get_output(module).sum() + loss.backward() + optim.step() + return torch.norm(torch.stack([p.detach().norm() for p in module.parameters()])) + + def test_flat_param_shard_metadata(self): + """ + Tests that ``FlatParameter`` shard metadata are computed as expected. + """ module = torch.nn.Sequential( torch.nn.Linear(10, 10, bias=False), torch.nn.ReLU(), @@ -223,14 +315,13 @@ def test_sharded_flat_param(self): torch.nn.ReLU(), ) params_to_flatten = list(module.parameters()) - flat_module = FlattenParamsWrapper( - module, + flat_param_handle = FlatParamHandle( params_to_flatten, + module, torch.device("cuda"), self._get_default_config(), False, ) - flat_param_handle = flat_module.handle def _test(kwargs, expected): """ @@ -244,9 +335,11 @@ def _test(kwargs, expected): ``init_shard_info()`` with the start and end indices fixed based on rank and world size. 
""" - flat_param = flat_module.flat_param - flat_param._shard_param_offsets, flat_param._shard_indices = \ - flat_param_handle._get_shard_metadata(kwargs["start"], kwargs["end"]) + flat_param = flat_param_handle.flat_param + ( + flat_param._shard_param_offsets, + flat_param._shard_indices, + ) = flat_param_handle._get_shard_metadata(kwargs["start"], kwargs["end"]) self.assertEqual( flat_param_handle.shard_metadata(), expected, @@ -345,19 +438,5 @@ def _test(kwargs, expected): ) -@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") -class TestFlattenParamsCUDA(TestFlattenParams): - def _get_transformer(self, seed=0): - module = super()._get_transformer(seed=seed) - return module.cuda() - - -@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") -class TestFlattenParamsCUDAHalf(TestFlattenParams): - def _get_transformer(self, seed=0): - module = super()._get_transformer(seed=seed) - return module.cuda().half() - - if __name__ == "__main__": run_tests() From 1ff953af97b015c78c822f2bbc09a641ae0fa2ab Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 21 Oct 2022 11:30:58 +0000 Subject: [PATCH 0012/1922] [FSDP][5/N] Update `FlatParamHandle` after FPW deprecation (#87113) This PR resolves a TODO left in `FlatParamHandle` that was conditional on deprecating `FlattenParamsWrapper`. We simply pass in the process group into the `FlatParamHandle` constructor instead of later in `shard()`. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87113 Approved by: https://github.com/zhaojuanmao --- .../fsdp/test_fsdp_flatten_params.py | 3 ++ torch/distributed/fsdp/flat_param.py | 30 +++++++++++-------- .../fsdp/fully_sharded_data_parallel.py | 11 +++++-- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_flatten_params.py b/test/distributed/fsdp/test_fsdp_flatten_params.py index cfc2a494d4406..4f7178df4a109 100644 --- a/test/distributed/fsdp/test_fsdp_flatten_params.py +++ b/test/distributed/fsdp/test_fsdp_flatten_params.py @@ -149,6 +149,7 @@ def _test_flatten_nothing(self, half: bool): module, torch.device("cuda"), self._get_default_config(), + self.process_group, False, ) @@ -220,6 +221,7 @@ def _test_numel(self, module): module, torch.device("cuda"), self._get_default_config(), + self.process_group, False, ) self.assertEqual(ref_numel, flat_param_handle.flat_param.numel()) @@ -320,6 +322,7 @@ def test_flat_param_shard_metadata(self): module, torch.device("cuda"), self._get_default_config(), + self.process_group, False, ) diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py index 2c65dd80ea3c3..c96cd4a3f267a 100644 --- a/torch/distributed/fsdp/flat_param.py +++ b/torch/distributed/fsdp/flat_param.py @@ -296,11 +296,15 @@ def __init__( module: nn.Module, device: torch.device, config: HandleConfig, + process_group: dist.ProcessGroup, use_orig_params: bool, ): super().__init__() self.device = device self._config = config + self.process_group = process_group + self.rank = process_group.rank() + self.world_size = process_group.size() self._use_orig_params = use_orig_params self._training_state = HandleTrainingState.IDLE self._debug_level = dist.get_debug_level() @@ -436,7 +440,7 @@ def flatten_params( # SHARD INITIALIZATION & METADATA # ################################### @torch.no_grad() - def shard(self, process_group: dist.ProcessGroup): + def shard(self): """ 
Shards the handle's ``FlatParameter``. In terms of memory, this allocates new memory for the sharded flattened parameter and frees the @@ -446,16 +450,8 @@ def shard(self, process_group: dist.ProcessGroup): Shard metadata attributes are set for all sharding strategies. ``process_group``, ``rank``, and ``world_size`` attributes are set if using a sharded strategy. - - TODO (awgu): Once we retire ``FlattenParamsWrapper``, we should pass - the process group directly to the ``FlatParamHandle`` constructor. For - now, we decouple ``FlattenParamsWrapper` from a process group, but this - makes the process-group-related attributes not necessarily defined. """ flat_param = self.flat_param - self.process_group = process_group - self.rank = process_group.rank() - self.world_size = process_group.size() if not self.uses_sharded_strategy: self._init_shard_metadata(0, 0, flat_param.numel() - 1) else: @@ -863,7 +859,9 @@ def unshard_grad(self): sharded_grad = flat_param._saved_grad_shard # type: ignore[attr-defined] dist.all_gather_into_tensor(padded_unsharded_grad, sharded_grad, self.process_group) unsharded_size = self.flat_param._unpadded_unsharded_size - flat_param.grad = padded_unsharded_grad[:unsharded_size.numel()].view(unsharded_size) + flat_param.grad = padded_unsharded_grad[: unsharded_size.numel()].view( + unsharded_size + ) self._use_unsharded_grad_views() def reshard_grad(self): @@ -913,7 +911,7 @@ def prepare_gradient_for_backward(self): else: p_assert( hasattr(flat_param, "_cpu_grad"), - "`_cpu_grad` should be defined if the gradient is on CPU" + "`_cpu_grad` should be defined if the gradient is on CPU", ) sharded_grad = flat_param._cpu_grad # type: ignore[attr-defined] # If user specified to keep the gradient in low precision, then @@ -944,12 +942,15 @@ def prepare_gradient_for_optim(self): Prepares the gradient for optimizer computation by moving the sharded gradient to the ``.grad`` attribute. 
""" + def cast_grad_to_param_dtype_if_needed(flat_param): if self._config.keep_low_precision_grads: assert flat_param.grad is not None # mypy # This cast is meaningful when `param_dtype` is a low precision # dtype. - flat_param.grad.data = flat_param.grad.to(self._config.low_prec_param_dtype) + flat_param.grad.data = flat_param.grad.to( + self._config.low_prec_param_dtype + ) flat_param = self.flat_param # TODO (awgu): We should replace these conditional checks to encode @@ -1517,7 +1518,10 @@ def is_sharded(self, tensor: Tensor) -> bool: Returns if ``tensor`` is *currently* sharded. For ``NO_SHARD``, we choose to have this always return ``False`` for clarity. """ - if not hasattr(self.flat_param, "_sharded_size") or not self.uses_sharded_strategy: + if ( + not hasattr(self.flat_param, "_sharded_size") + or not self.uses_sharded_strategy + ): # `_sharded_size` is defined iff `handle.shard()` has been called return False sharded_size = self.flat_param._sharded_size # type: ignore[attr-defined] diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 2cf3af6d540c0..6648c606861c1 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -1092,11 +1092,18 @@ def __init__( self.params: List[FlatParameter] = [] self._fsdp_wrapped_module = module if params_to_flatten: - handle = FlatParamHandle(params_to_flatten, module, self.compute_device, config, use_orig_params) + handle = FlatParamHandle( + params_to_flatten, + module, + self.compute_device, + config, + self.process_group, + use_orig_params, + ) self._handles.append(handle) self.params.append(handle.flat_param) self._register_param_handle(handle) - handle.shard(self.process_group) + handle.shard() if self.cpu_offload.offload_params and handle.flat_param.device != torch.device("cpu"): handle.flat_param_to(torch.device("cpu")) if not use_orig_params: From 
66027fdf958794efefac5f93e7f781b11c06a593 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 21 Oct 2022 11:30:58 +0000 Subject: [PATCH 0013/1922] [FSDP][6/N] Remove FPW! (#87114) This PR simply deletes `flatten_params_wrapper.py`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87114 Approved by: https://github.com/zhaojuanmao --- .../fsdp/flatten_params_wrapper.py | 190 ------------------ 1 file changed, 190 deletions(-) delete mode 100644 torch/distributed/fsdp/flatten_params_wrapper.py diff --git a/torch/distributed/fsdp/flatten_params_wrapper.py b/torch/distributed/fsdp/flatten_params_wrapper.py deleted file mode 100644 index 2c72cdcf158c2..0000000000000 --- a/torch/distributed/fsdp/flatten_params_wrapper.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - -# Copyright (c) Tongzhou Wang -# Licensed under the MIT License. - -import contextlib -from typing import Any, Dict, Generator, List, Optional - -import torch -import torch.nn as nn -from torch.distributed.utils import _replace_by_prefix - -from .flat_param import FlatParameter, FlatParamHandle, HandleConfig - -FLAT_PARAM = "flat_param" -FPW_MODULE = "_fpw_module" - -__all__ = ["FlattenParamsWrapper"] - - -def _post_state_dict_hook( - module: nn.Module, state_dict: Dict[str, Any], prefix: str, *args: Any -) -> Dict[str, Any]: - """ - _post_state_dict_hook() is called after the state_dict() is executed - and before returning the state_dict to the users. - This API post-processes the keys of the state_dict to remove the - FlattenParamsWrapper internal prefix. - """ - # Move everything from FPW_MODULE up one level. 
- _replace_by_prefix(state_dict, prefix + f"{FPW_MODULE}.", prefix) - return state_dict - - -def _pre_load_state_dict_hook( - state_dict: Dict[str, Any], - prefix: str, - *args: Any, -) -> None: - """ - _pre_load_state_dict_hook() is called before the _load_from_state_dict() is - executed. This API pre-processes the keys of the state_dict to add the - FlattenParamsWrapper internal prefix. - """ - # Push everything down to FPW_MODULE level. - _replace_by_prefix(state_dict, prefix, prefix + f"{FPW_MODULE}.") - # The flat_param_* keys actually needs to move one level up. - flat_param_key = prefix + f"{FPW_MODULE}.{FLAT_PARAM}" - for k in list(state_dict.keys()): - if k.startswith(flat_param_key): - last_part = k.split(".")[-1] - assert last_part.startswith( - FLAT_PARAM - ), f"Expected key to contain flat_param, but key name is {k}" - _replace_by_prefix(state_dict, k, prefix + last_part) - - -class FlattenParamsWrapper(nn.Module): - """ - This is a wrapper for flattening parameters in a ``nn.Module`` 's subtree - into a single flattened parameter and is based on [1]. This is used for - :class:`FullyShardedDataParallel` 's recursive wrapping. - [1] https://github.com/SsnL/PyTorch-Reparam-Module - - Args: - module (nn.Module): Module to wrap. - params (List[nn.Parameter]): Parameters in ``module`` 's subtree to - flatten into a single flattened parameter. - device (torch.device): The compute and communication device for this - wrapper's handle. - config (HandleConfig): A config customizing this wrapper's handle based - on FSDP's available features. - - Attributes: - flat_param (Optional[FlatParameter]): The flattened parameter. - ``flat_param`` is ``None`` either when (1) this wrapper manages no - parameters or (2) the wrapped module's parameters are unflattened. - _fpw_module (nn.Module): The wrapped module. - _flat_param_handle (FlatParamHandle): A handle for the flattened - parameter; only present if this wrapper manages parameters. 
- """ - - def __init__( - self, - module: nn.Module, - params: List[nn.Parameter], - device: torch.device, - config: HandleConfig, - use_orig_params: bool, - ) -> None: - super().__init__() - self._fpw_module = module - # Register hooks to clean parameter names for state dict (even if this - # wrapper itself manages no parameters since it must clean names from - # submodules) - self._register_state_dict_hook(_post_state_dict_hook) - self._register_load_state_dict_pre_hook(_pre_load_state_dict_hook) - if len(params) == 0: - return - self._flat_param_handle = FlatParamHandle( - params, module, device, config, use_orig_params - ) - if not use_orig_params: - self._register_flat_param() - self._use_orig_params = use_orig_params - assert getattr(self, FPW_MODULE) is self._fpw_module - assert getattr(self, FLAT_PARAM) is self.flat_param - - @property - def has_params(self) -> bool: - """Returns whether this wrapper manages any parameters.""" - return hasattr(self, "_flat_param_handle") - - @property - def flat_param(self) -> Optional[FlatParameter]: - return self.handle.flat_param if self.has_params else None - - @property - def handle(self) -> FlatParamHandle: - assert hasattr(self, "_flat_param_handle"), ( - "Accessing the handle of a `FlattenParamsWrapper` that does not " - "manage any parameters" - ) - return self._flat_param_handle - - @property - def module(self) -> Any: - """Returns the wrapped module (like DDP).""" - return self._fpw_module - - @contextlib.contextmanager - def unflatten_as_params(self) -> Generator: - """ - Assumes that the flattened parameter is unsharded. When in the context, - de-registers the flattened parameter and unflattens the original - parameters as ``nn.Parameter`` views into the flattened parameter. - After the context, re-registers the flattened parameter and restores - the original parameters as ``Tensor`` views into the flattened - parameter. 
- """ - if self.flat_param is None: - yield - else: - self._deregister_flat_param() - try: - with self._flat_param_handle.unflatten_as_params(): - yield - finally: - if not self.handle._use_orig_params: - self._register_flat_param() - - def _register_flat_param(self): - """ - Registers the flattened parameter, making it visible to ``nn.Module`` - methods. - - We do not use :meth:`nn.Module.register_parameter` because we want - ``flat_param`` to always be an attribute but dynamically change whether - it is visible to ``nn.Module`` methods. - """ - self._parameters["flat_param"] = self.flat_param - - def _deregister_flat_param(self): - """ - De-registers the flattened parameter, hiding it from ``nn.Module`` - methods. - - We do not use ``del self.flat_param`` because we want ``flat_param`` to - always be an attribute but dynamically change whether it is visible to - ``nn.Module`` methods. - """ - self._parameters.pop("flat_param", None) - - def __getattr__(self, name: str) -> Any: - """Forward missing attributes of this wrapper to the wrapped module.""" - try: - return super().__getattr__(name) # defer to `nn.Module`'s logic - except AttributeError: - return getattr(self.module, name) # fall back to the wrapped module - - def __getitem__(self, key: int) -> Any: - """Forward indexing calls to the wrapped module in case the wrapped - module is an ``nn.Sequential``.""" - return self.module.__getitem__(key) - - def forward(self, *inputs: Any, **kwinputs: Any) -> Any: - return self.module(*inputs, **kwinputs) From 471f6c6d62d092d24b90b6551a7e04e8c134908f Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Fri, 21 Oct 2022 16:57:33 +0000 Subject: [PATCH 0014/1922] [quant][api] Add assert for backend in get_default_qconfig related apis (#86259) (#87331) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/86259 Add assertion to make sure backend is one of "fbgemm", "x86", "qnnpack" and "onednn" for get_default_qconfig, get_default_qat_qconfig, 
get_default_qconfig_mapping and get_default_qat_qconfig_mapping Test Plan: python test/test_quantization.py -k test_get_default_qconfig_mapping Imported from OSS Reviewed By: jcaip Differential Revision: D40236474 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87331 Approved by: https://github.com/andrewor14 --- .../eager/test_quantize_eager_qat.py | 1 + test/quantization/fx/test_quantize_fx.py | 14 ++++++++++++++ test/quantization/jit/test_quantize_jit.py | 5 +++++ torch/ao/quantization/qconfig.py | 19 +++++++++++++++++-- torch/ao/quantization/qconfig_mapping.py | 8 ++++---- 5 files changed, 41 insertions(+), 6 deletions(-) diff --git a/test/quantization/eager/test_quantize_eager_qat.py b/test/quantization/eager/test_quantize_eager_qat.py index bc118a82062d9..44911b6d9e11a 100644 --- a/test/quantization/eager/test_quantize_eager_qat.py +++ b/test/quantization/eager/test_quantize_eager_qat.py @@ -594,6 +594,7 @@ def forward(self, x): eps = 1e-5 self.assertTrue(torch.abs(mq.quant.scale * 2 - res.q_scale()) < eps) + @override_qengines def test_qat_embedding_bag_errors(self): default_qat_qconfig = get_default_qat_qconfig(torch.backends.quantized.engine) diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 6935081a5c923..2746b1c9a0173 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -5223,6 +5223,20 @@ def forward(self, x): # make sure this runs m = prepare_fx(m, qconfig_mapping, example_inputs, backend_config=backend_config) + def test_get_default_qconfig_valid_backend(self): + """ Checks that AssertionError is raised when non expected backend input is specified + """ + invalid_backends = ["imaginary_backend", 3] + for invalid_backend in invalid_backends: + with self.assertRaisesRegex(AssertionError, "not supported"): + qconfig = get_default_qconfig(invalid_backend) + with self.assertRaisesRegex(AssertionError, "not supported"): + qconfig = 
get_default_qat_qconfig(invalid_backend) + with self.assertRaisesRegex(AssertionError, "not supported"): + qconfig_mapping = get_default_qconfig_mapping(invalid_backend) + with self.assertRaisesRegex(AssertionError, "not supported"): + qconfig_mapping = get_default_qat_qconfig_mapping(invalid_backend) + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): def setUp(self): diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py index 84ab3a723b70f..49152a1097ac2 100644 --- a/test/quantization/jit/test_quantize_jit.py +++ b/test/quantization/jit/test_quantize_jit.py @@ -2674,6 +2674,7 @@ def forward(self, x): m.graph ) + @override_qengines def test_hardswish(self): class FunctionalHardswish(torch.nn.Module): def __init__(self, inplace): @@ -2698,6 +2699,7 @@ def forward(self, input): m.graph ) + @override_qengines def test_elu(self): class FunctionalELU(torch.nn.Module): def __init__(self, inplace=False): @@ -2714,6 +2716,7 @@ def forward(self, input): m = self.checkGraphModeOp(m, self.img_data_2d, "quantized::elu", tracing) FileCheck().check_not("aten::elu").check_not("aten::elu_").run(m.graph) + @override_qengines def test_layer_norm(self): data = [[torch.rand((1, 2, 5, 5), dtype=torch.float)] for _ in range(2)] layer_norm = torch.nn.LayerNorm([2, 5, 5]) @@ -2723,6 +2726,7 @@ def test_layer_norm(self): ) FileCheck().check_not("aten::layer_norm").run(m.graph) + @override_qengines def test_group_norm(self): data = [[torch.rand((1, 4, 5, 5), dtype=torch.float)] for _ in range(2)] group_norm = torch.nn.GroupNorm(2, 4) @@ -2732,6 +2736,7 @@ def test_group_norm(self): ) FileCheck().check_not("aten::group_norm").run(m.graph) + @override_qengines def test_instance_norm(self): data_1d = [[torch.rand((1, 4, 5), dtype=torch.float)] for _ in range(2)] data_2d = [[torch.rand((1, 4, 5, 1), dtype=torch.float)] for _ in range(2)] diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py index 
8e662e5745ce6..d1eb0a64a125d 100644 --- a/torch/ao/quantization/qconfig.py +++ b/torch/ao/quantization/qconfig.py @@ -228,12 +228,19 @@ def get_default_qconfig(backend='fbgemm', version=0): Returns the default PTQ qconfig for the specified backend. Args: - * `backend`: a string representing the target backend. Currently supports + * `backend` (str): a string representing the target backend. Currently supports `x86`, `fbgemm` (default), `qnnpack` and `onednn`. Return: qconfig """ + supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"] + if backend not in supported_backends: + raise AssertionError( + "backend: " + str(backend) + + " not supported. backend must be one of {}".format(supported_backends) + ) + if version == 0: if backend == 'fbgemm': qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True), @@ -249,6 +256,7 @@ def get_default_qconfig(backend='fbgemm', version=0): qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True), weight=default_per_channel_weight_observer) else: + # won't reach qconfig = default_qconfig else: raise AssertionError("Version number: " + str(version) + @@ -303,13 +311,20 @@ def get_default_qat_qconfig(backend='fbgemm', version=1): Returns the default QAT qconfig for the specified backend. Args: - * `backend`: a string representing the target backend. Currently supports + * `backend` (str): a string representing the target backend. Currently supports `x86`, `fbgemm` (default), `qnnpack` and `onednn`. * `version`: version, for backwards compatibility. Can be `None` or `1`. Return: qconfig """ + supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"] + if backend not in supported_backends: + raise AssertionError( + "backend: " + str(backend) + + " not supported. 
backend must be one of {}".format(supported_backends) + ) + # Histogram observer is too slow for quantization aware training if version == 0: if backend == 'fbgemm': diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py index 4dc4431aa99d1..418cbb334814c 100644 --- a/torch/ao/quantization/qconfig_mapping.py +++ b/torch/ao/quantization/qconfig_mapping.py @@ -126,9 +126,9 @@ def get_default_qconfig_mapping(backend="fbgemm", version=0) -> QConfigMapping: Return the default QConfigMapping for post training quantization. Args: - * ``backend`` : the quantization backend for the default qconfig mapping, should be + * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be one of ["x86", "fbgemm" (default), "qnnpack", "onednn"] - * ``version`` : the version for the default qconfig mapping + * ``version`` (int) : the version for the default qconfig mapping """ # TODO: add assert for backend choices return _get_default_qconfig_mapping(False, backend, version) @@ -138,9 +138,9 @@ def get_default_qat_qconfig_mapping(backend="fbgemm", version=1) -> QConfigMappi Return the default QConfigMapping for quantization aware training. 
Args: - * ``backend`` : the quantization backend for the default qconfig mapping, should be + * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be one of ["x86", "fbgemm" (default), "qnnpack", "onednn"] - * ``version`` : the version for the default qconfig mapping + * ``version`` (int) : the version for the default qconfig mapping """ return _get_default_qconfig_mapping(True, backend, version) From 84ae92c52db4b859de531c70e9e65dab071c6501 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 21 Oct 2022 11:35:30 +0000 Subject: [PATCH 0015/1922] [FSDP][1/N] Update `summon_full_params(with_grads)` `None` gradient (#87314) This PR changes `summon_full_params(with_grads=True)`'s behavior to be such that if all ranks have `flat_param.grad = None`, then the original parameters will correctly have `orig_param.grad = None`. This is achieved with a preliminary all-reduce. Note that if a particular original parameter's gradient is `None` on all of the containing ranks, but not all ranks' `flat_param.grad = None`, then that particular gradient is still going to be set to zeros. This can be handled if desired in follow-up work. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87314 Approved by: https://github.com/zhaojuanmao --- .../fsdp/test_fsdp_summon_full_params.py | 42 +++++++++++++++++-- torch/distributed/fsdp/flat_param.py | 30 ++++++++++++- 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index d78aa81a19d7a..82fd8e1c0737b 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -574,7 +574,8 @@ def test_named_parameters_buffers(self, prefix: str, recurse: bool): self.assertEqual(p1, p2) @skip_if_lt_x_gpu(2) - def test_with_grads(self): + def test_with_grads_core(self): + """Tests the core usage of ``summon_full_params(with_grads=True)``.""" self.run_subtests( { "writeback": [False, True], @@ -586,10 +587,10 @@ def test_with_grads(self): ], "use_orig_params": [True], }, - self._test_with_grads, + self._test_with_grads_core, ) - def _test_with_grads( + def _test_with_grads_core( self, writeback: bool, offload_to_cpu: bool, @@ -691,6 +692,41 @@ def _get_fsdp_grads(fsdp_model: FSDP, is_supported: bool): with _get_error_context(is_supported): _check_grads(ddp_model, fsdp_model, old_fsdp_grads) + @skip_if_lt_x_gpu(2) + def test_with_grads_none_grads(self): + """ + Tests that if all ranks' ``FlatParameter`` has ``None`` gradient, then + each original parameter sees ``None`` gradient as well. 
+ """ + self.run_subtests( + { + "sharding_strategy": [ + ShardingStrategy.FULL_SHARD, + ShardingStrategy.SHARD_GRAD_OP, + ShardingStrategy.NO_SHARD, + ] + }, + self._test_with_grads_none_grads + ) + + def _test_with_grads_none_grads(self, sharding_strategy: ShardingStrategy): + fsdp_model = TransformerWithSharedParams.init( + self.process_group, + FSDPInitMode.RECURSIVE, + CUDAInitMode.CUDA_BEFORE, + deterministic=True, + fsdp_kwargs={ + "use_orig_params": True, + "sharding_strategy": sharding_strategy, + }, + ) + for fsdp_module in FSDP.fsdp_modules(fsdp_model): + for handle in fsdp_module._handles: + assert handle.flat_param.grad is None + with FSDP.summon_full_params(fsdp_model, with_grads=True): + for param in fsdp_model.parameters(): + self.assertTrue(param.grad is None) + instantiate_parametrized_tests(TestSummonFullParams) instantiate_parametrized_tests(TestSummonFullParamsNoShard) diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py index c96cd4a3f267a..bb54e7c0e9613 100644 --- a/torch/distributed/fsdp/flat_param.py +++ b/torch/distributed/fsdp/flat_param.py @@ -841,18 +841,46 @@ def _free_low_precision_sharded_param(self): @torch.no_grad() def unshard_grad(self): + """ + Unshards the handle's ``FlatParameter`` 's gradient. If all ranks have + ``None`` gradient, then all original parameters will as well. This + method performs an all-reduce and an all-gather. The additional + all-reduce is tolerable since this method is not meant to be used on + the computation critical path. + + Postcondition: ``_saved_grad_shard`` is defined and contains the value + to set ``flat_param.grad`` after gradients are resharded. 
+ """ if not self.uses_sharded_strategy: self._use_unsharded_grad_views() return flat_param = self.flat_param self._check_unsharded(flat_param) + + # Check if all ranks have a `None` gradient + num_grad_none = torch.zeros(1, dtype=torch.int32, device=self.device) + num_grad_none[0] = flat_param.grad is None + dist.all_reduce(num_grad_none, group=self.process_group) + if num_grad_none[0] == self.world_size: + flat_param._saved_grad_shard = None # type: ignore[attr-defined] + self._use_unsharded_grad_views() + return + padded_unsharded_grad = torch.empty( flat_param._padded_unsharded_size, # type: ignore[attr-defined] device=self.device, ) if flat_param.grad is None: + # In the case that only some ranks have `None` gradient, we use + # zeros to approximate as a best effort attempt + if self._debug_level == dist.DebugLevel.DETAIL: + warnings.warn( + f"[Rank {self.rank}] Only some but not all ranks have a " + "`None` `FlatParameter` gradient, so FSDP is using zeros to " + "approximate those ranks' sharded gradients being `None`" + ) flat_param._saved_grad_shard = None # type: ignore[attr-defined] - sharded_grad = torch.zeros_like(flat_param) # type: ignore[attr-defined] + sharded_grad = torch.zeros(flat_param._sharded_size, device=self.device) # type: ignore[attr-defined] else: self._check_sharded(flat_param.grad) flat_param._saved_grad_shard = flat_param.grad # type: ignore[attr-defined] From 01449d66e927a13c4914873c3f161f0d386f8d9d Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Fri, 21 Oct 2022 11:35:30 +0000 Subject: [PATCH 0016/1922] [FSDP][2/N] Fix grad zero vs. `None` edge case (#87308) Some original parameters corresponding to one `FlatParameter` may have `None` gradient while others do not. In that case, the `flat_param.grad` must be non-`None`. However, FSDP should take care to expose the original parameters' gradients regardless. To achieve this, we track a `_is_grad_none` mask over the parameters' gradients. 
- `_is_grad_none` is initialized to `False` for all. - `_is_grad_none[i]` is set to `True` when writing zeros in place of `None` when writing back the `i`th gradient. - `_is_grad_none[i]` is set to `False` via `_reset_is_grad_none()`, which should be called in the post-backward. See the docstring for details. - `_is_grad_none[i]` must be `False` in order to set `param.grad` to be a view into `flat_param.grad`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87308 Approved by: https://github.com/zhaojuanmao --- .../fsdp/test_fsdp_use_orig_params.py | 211 ++++++++++++++---- torch/distributed/fsdp/flat_param.py | 124 ++++++++-- .../fsdp/fully_sharded_data_parallel.py | 5 + 3 files changed, 283 insertions(+), 57 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py index 69b0645a3fa34..1091200206135 100644 --- a/test/distributed/fsdp/test_fsdp_use_orig_params.py +++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py @@ -1,20 +1,23 @@ # Owner(s): ["oncall: distributed"] import functools +import itertools import sys -from typing import Callable, Optional, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type import torch import torch.nn as nn from torch import distributed as dist -from torch.distributed.fsdp import ( - BackwardPrefetch, - CPUOffload, - FullyShardedDataParallel as FSDP, - ShardingStrategy, +from torch.distributed.fsdp import BackwardPrefetch, CPUOffload +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import ShardingStrategy +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + clean_tensor_name, +) +from torch.distributed.fsdp.wrap import ( + always_wrap_policy, + transformer_auto_wrap_policy, ) -from torch.distributed.fsdp.fully_sharded_data_parallel import clean_tensor_name -from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy from 
torch.nn import TransformerDecoderLayer, TransformerEncoderLayer from torch.nn.parallel.distributed import DistributedDataParallel as DDP from torch.testing._internal.common_distributed import skip_if_lt_x_gpu @@ -25,10 +28,10 @@ TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, - TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -46,16 +49,14 @@ class TestFSDPUseOrigParamsMultipleParamGroups(FSDPTest): """Tests multiple parameter groups.""" - def _get_optim( - self, - model: nn.Module, - optim_class: Type[torch.optim.Optimizer], - multi_tensor: bool, - ) -> torch.optim.Optimizer: + @property + def world_size(self) -> int: + return 2 + + def _get_param_groups(self, model: nn.Module) -> List[Dict[str, Any]]: """ - Constructs an Adam optimizer with three parameter groups, one for - weights, one for biases, and one for everything else, each with - different weight decay and learning rates. + Constructs separate parameter groups for weights, biases, and other + parameters. """ param_groups = [ {"params": [], "weight_decay": 0.1, "lr": 1e-2}, @@ -69,18 +70,24 @@ def _get_optim( param_groups[1]["params"].append(param) else: param_groups[2]["params"].append(param) - return optim_class(param_groups, lr=5e-3, foreach=multi_tensor) + return param_groups - def _get_ddp_transformer_and_optim( + def _get_optim( self, + model: nn.Module, optim_class: Type[torch.optim.Optimizer], multi_tensor: bool, - find_unused_params: bool, - ) -> Tuple[DDP, torch.optim.Optimizer]: + ) -> torch.optim.Optimizer: """ - Returns a transformer with shared parameters wrapped with DDP and a - corresponding optimizer. + Constructs an Adam optimizer with three parameter groups, one for + weights, one for biases, and one for everything else, each with + different weight decay and learning rates. 
""" + param_groups = self._get_param_groups(model) + return optim_class(param_groups, lr=5e-3, foreach=multi_tensor) + + def _get_ddp_transformer(self, find_unused_params: bool) -> DDP: + """Returns a transformer with shared parameters wrapped with DDP.""" model = TransformerWithSharedParams.init( self.process_group, FSDPInitMode.NO_FSDP, @@ -92,8 +99,7 @@ def _get_ddp_transformer_and_optim( device_ids=[self.rank], find_unused_parameters=find_unused_params, ) - ddp_optim = self._get_optim(ddp_model, optim_class, multi_tensor) - return ddp_model, ddp_optim + return ddp_model def _get_fsdp_transformer_and_optim( self, @@ -174,11 +180,17 @@ def _check_train_parity( model.to(torch.device("cpu")) optim.step() if model is ddp_model and fsdp_model.cpu_offload.offload_params: - model.to(torch.device("cuda")) + model.to(device) torch.testing.assert_close(iter_losses[0], iter_losses[1]) iter_losses.clear() + self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model) + + def _check_ddp_fsdp_param_parity(self, ddp_model: DDP, fsdp_model: FSDP): with FSDP.summon_full_params(fsdp_model): - for p1, p2 in zip(ddp_model.parameters(), fsdp_model.parameters()): + for (n1, p1), (n2, p2) in zip( + ddp_model.module.named_parameters(), fsdp_model.named_parameters() + ): + self.assertEqual(n1, n2) torch.testing.assert_close(p1, p2) def _get_sharding_strategy_from_str( @@ -271,11 +283,8 @@ def _test_diff_hyperparams( """ if cuda_init_mode == CUDAInitMode.CUDA_AFTER and cpu_offload.offload_params: return # not supported - ddp_model, ddp_optim = self._get_ddp_transformer_and_optim( - optim_class=optim_class, - multi_tensor=multi_tensor, - find_unused_params=False, - ) + ddp_model = self._get_ddp_transformer(find_unused_params=False) + ddp_optim = self._get_optim(ddp_model, optim_class, multi_tensor) fsdp_model, fsdp_optim = self._get_fsdp_transformer_and_optim( cuda_init_mode=cuda_init_mode, init_optim_before_wrap=init_optim_before_wrap, @@ -313,11 +322,8 @@ def _test_diff_trainability( 
sharding_strategy: ShardingStrategy, ): optim_class = torch.optim.Adam - ddp_model, ddp_optim = self._get_ddp_transformer_and_optim( - optim_class=optim_class, - multi_tensor=multi_tensor, - find_unused_params=True, - ) + ddp_model = self._get_ddp_transformer(find_unused_params=True) + ddp_optim = self._get_optim(ddp_model, optim_class, multi_tensor) fsdp_model, fsdp_optim = self._get_fsdp_transformer_and_optim( cuda_init_mode=CUDAInitMode.CUDA_BEFORE, init_optim_before_wrap=False, @@ -336,10 +342,139 @@ def _test_diff_trainability( param.requires_grad_(False) self._check_train_parity(ddp_model, ddp_optim, fsdp_model, fsdp_optim, False) + @skip_if_lt_x_gpu(2) + def test_multiple_optimizers(self): + """ + Tests using two optimizers where only one sets gradients to ``None``. + """ + self.run_subtests( + { + "sharding_strategy": [ + ShardingStrategy.FULL_SHARD, + # ShardingStrategy.SHARD_GRAD_OP, + ] + }, + self._test_multiple_optimizers, + ) + + def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy): + ddp_model = self._get_ddp_transformer(find_unused_params=True) + ddp_param_groups = self._get_param_groups(ddp_model) + assert len(ddp_param_groups) == 3, f"{len(ddp_param_groups)}" + fsdp_model, _ = self._get_fsdp_transformer_and_optim( # ignore returned optimizer + cuda_init_mode=CUDAInitMode.CUDA_BEFORE, + init_optim_before_wrap=False, + optim_class=torch.optim.Adam, # ignored + multi_tensor=False, # ignored + sharding_strategy=sharding_strategy, + backward_prefetch=BackwardPrefetch.BACKWARD_PRE, + cpu_offload=None, + ) + fsdp_param_groups = self._get_param_groups(fsdp_model) + assert len(fsdp_param_groups) == 3, f"{len(fsdp_param_groups)}" + ddp_optims = [] + fsdp_optims = [] + # For the transformer model, every parameter is either a weight or a + # bias, so we only use the first two parameter groups. 
Moreover, we use + # Adam and AdamW in particular since they both use bias correction + # dependent on the step, which is incremented even if a parameter has a + # zero gradient but not if the gradient is `None`. This is to test that + # we are differentiating between a zero and `None` gradient correctly. + optim_ctors = [ + functools.partial(torch.optim.Adam, lr=5e-3), + functools.partial(torch.optim.AdamW, lr=1e-2), + ] + + for optim_ctor, ddp_param_group, fsdp_param_group in zip( + optim_ctors, ddp_param_groups[:2], fsdp_param_groups[:2], + ): + ddp_optims.append(optim_ctor(ddp_param_group["params"])) + fsdp_optims.append(optim_ctor(fsdp_param_group["params"])) + device = torch.device("cuda") + + # Check that there exists a `FlatParameter` that has both a weight and + # a bias in this rank's shard + has_both = False + for fsdp_module in FSDP.fsdp_modules(fsdp_model): + for handle in fsdp_module._handles: + flat_param = handle.flat_param + assert flat_param._params is not None + has_weight = False + has_bias = False + for param, fqn in zip(flat_param._params, flat_param._fqns): + if "weight" in fqn and param.numel() > 0: + has_weight = True + elif "bias" in fqn and param.numel() > 0: + has_bias = True + has_both |= (has_weight and has_bias) + assert has_both, ( + f"Rank {self.rank} does not have a `FlatParameter` with both a " + "weight and a bias in its shard, meaning that this test is vacuous" + ) + + # Run one iteration to generate gradients + def run_iter(): + iter_losses = [] + for model, optims in ((ddp_model, ddp_optims), (fsdp_model, fsdp_optims)): + module = model.module + inp = module.get_input(device) + output = model(*inp) + loss = module.get_loss(inp, output).to(device) + iter_losses.append(loss) + module.run_backward(loss) + for optim in optims: + optim.step() + torch.testing.assert_close(iter_losses[0], iter_losses[1]) + iter_losses.clear() + self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model) + + run_iter() + + # Only set the weights' 
gradients to None + ddp_optims[0].zero_grad(set_to_none=True) + fsdp_optims[0].zero_grad(set_to_none=True) + inp = ddp_model.module.get_input(device) + ddp_output = ddp_model(*inp) + fsdp_output = fsdp_model(*inp) + + # Check that FSDP correctly exposes gradients even after forward + # (namely, `None` for weights and non-`None` for biases) + for (ddp_n, ddp_p), (fsdp_n, fsdp_p) in zip( + ddp_model.module.named_parameters(), fsdp_model.named_parameters(), + ): + self.assertEqual(ddp_n, fsdp_n) + if fsdp_p.numel() == 0: + # Not in this rank's shard + self.assertTrue(fsdp_p.grad is None) + continue + if ddp_p.grad is None: + self.assertTrue(fsdp_p.grad is None) + else: + self.assertEqual(ddp_p.flatten(), fsdp_p.flatten()) + self.assertEqual(ddp_p.grad.flatten(), fsdp_p.grad.flatten()) + self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model) + + # Finish the iteration (backward pass and optimizer step) + ddp_loss = ddp_model.module.get_loss(inp, ddp_output).to(device) + fsdp_loss = fsdp_model.module.get_loss(inp, fsdp_output).to(device) + ddp_model.module.run_backward(ddp_loss) + fsdp_model.module.run_backward(fsdp_loss) + for optim in itertools.chain(ddp_optims, fsdp_optims): + optim.step() + self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model) + + # Run one more iteration to confirm bias corrections are correct + run_iter() + self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model) + class TestFSDPUseOrigParamsUnshardReshard(FSDPTest): """Tests the unshard/reshard flow.""" + @property + def world_size(self) -> int: + return 2 + def _get_fsdp_models_and_optims( self, sharding_strategy: ShardingStrategy, diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py index bb54e7c0e9613..1e34510bd0225 100644 --- a/torch/distributed/fsdp/flat_param.py +++ b/torch/distributed/fsdp/flat_param.py @@ -206,6 +206,20 @@ class FlatParameter(nn.Parameter): This is only defined when offloading parameters is enabled. 
_saved_grad_shard (Tensor): Sharded gradient with padding from previous iterations for gradient accumulation without :meth:`no_sync`. + + _params (Optional[List[nn.Parameter]]): The original parameter + variables if ``use_orig_params=True`` and ``None`` otherwise. + _shared_params (Optional[List[nn.Parameter]]): The original shared + parameter variables if ``use_orig_params=True`` and ``None`` + otherwise. + _is_grad_none (Optional[List[bool]]): A mask over the original + parameters' gradients indicating if it is logically ``None`` or not + if ``use_orig_params=True`` and ``None`` otherwise. This is needed + because only some of the parameters may have ``None`` gradient, in + which case the ``FlatParameter`` gradient must be non-``None`` and + must use zeros to approximate those original ``None`` gradients. + This mask informs FSDP to set the original parameter gradients to + ``None`` (instead of zeros) as needed. """ def _init_metadata( @@ -256,9 +270,13 @@ def _init_metadata( # another `FlatParameter` during recursive construction for param in chain(self._params, self._shared_params): _set_fsdp_flattened(param) + self._is_grad_none: Optional[List[bool]] = [ + False for _ in range(len(params)) + ] else: self._params = None self._shared_params = None + self._is_grad_none = None self._unpadded_unsharded_size = self.size() _set_fsdp_flattened(self) @@ -1115,6 +1133,11 @@ def _use_sharded_flat_param(self) -> None: flat_param.data = flat_param._local_shard # type: ignore[attr-defined] if self._use_orig_params: self._use_sharded_views() + # For the post-forward reshard, we may try to use sharded gradient + # views, but for the post-backward reshard, we delay the call to + # after the reduce-scatter + if self._training_state == HandleTrainingState.FORWARD: + self._use_sharded_grad_views() ######### # VIEWS # @@ -1215,6 +1238,13 @@ def _use_unsharded_grad_views(self) -> None: """ # Expects the gradient to be in `flat_param.grad` if self.flat_param.grad is None: + assert 
self.flat_param._params is not None # mypy + assert self.flat_param._shared_params is not None # mypy + for param in chain( + self.flat_param._params, # type: ignore[attr-defined] + self.flat_param._shared_params, # type: ignore[attr-defined] + ): + param.grad = None return self._check_unsharded(self.flat_param.grad) views = self._get_unflat_views(self.flat_param, self.flat_param.grad) @@ -1329,21 +1359,26 @@ def _use_sharded_grad_views(self) -> None: self._check_sharded(flat_param) grad = self.sharded_grad if grad is None: - return # no-op + assert flat_param._params is not None # mypy + assert flat_param._shared_params is not None # mypy + for param in chain(flat_param._params, flat_param._shared_params): # type: ignore[attr-defined] + param.grad = None + return self._check_sharded(grad) - start, end = self.flat_param._shard_indices # type: ignore[attr-defined] + start, end = flat_param._shard_indices # type: ignore[attr-defined] offset = 0 - assert self.flat_param._params is not None - for i, param in enumerate(self.flat_param._params): + assert flat_param._params is not None + for i, param in enumerate(flat_param._params): in_sharded_flat_param = ( i >= start and i <= end - and self.flat_param._shard_param_offsets # type: ignore[attr-defined] + and flat_param._shard_param_offsets # type: ignore[attr-defined] ) if in_sharded_flat_param: - param_start, param_end = self.flat_param._shard_param_offsets[i - start] # type: ignore[attr-defined] + param_start, param_end = flat_param._shard_param_offsets[i - start] # type: ignore[attr-defined] numel_in_shard = param_end - param_start + 1 - if param.requires_grad: + assert flat_param._is_grad_none is not None # mypy + if param.requires_grad and not flat_param._is_grad_none[i]: param.grad = grad[offset : offset + numel_in_shard].reshape( param.shape ) @@ -1352,9 +1387,9 @@ def _use_sharded_grad_views(self) -> None: offset += numel_in_shard else: param.grad = None - assert self.flat_param._shared_params is not None + 
assert flat_param._shared_params is not None for i, (param, (_, _, _, prim_param_name, prim_module, _)) in enumerate( - zip(self.flat_param._shared_params, self.flat_param._shared_param_infos) + zip(flat_param._shared_params, flat_param._shared_param_infos) ): in_sharded_flat_param = hasattr(prim_module, prim_param_name) if in_sharded_flat_param and param.requires_grad: @@ -1412,7 +1447,9 @@ def _writeback_orig_params(self) -> bool: flat_param._params[i] = param if needs_param_writeback: expected_shape = torch.Size([numel_in_shard]) - self._writeback_tensor(param, flat_param, expected_shape, offset, True) + self._writeback_tensor( + param, flat_param, i, expected_shape, offset, True + ) wroteback = True # Check for gradient writeback # NOTE: Since this method is called in the pre-unshard, which is @@ -1422,19 +1459,27 @@ def _writeback_orig_params(self) -> bool: if param.grad is None and flat_param.grad is not None: expected_shape = torch.Size([numel_in_shard]) self._writeback_tensor( - None, flat_param.grad, expected_shape, offset, False + None, flat_param.grad, i, expected_shape, offset, False ) elif param.grad is not None: - needs_grad_writeback = flat_param.grad is None or not _same_storage( - param.grad, flat_param.grad + # For `NO_SHARD` + CPU offloading, `_cpu_grad` is always in + # memory and owns the gradient storage, so it will never + # require gradient writeback. 
+ flat_param_grad = ( + flat_param.grad if self.uses_sharded_strategy or not self._config.offload_params + else flat_param._cpu_grad # type: ignore[attr-defined] + ) + needs_grad_writeback = flat_param_grad is None or not _same_storage( + param.grad, flat_param_grad ) if needs_grad_writeback: - if flat_param.grad is None: - flat_param.grad = torch.zeros_like(flat_param) + if flat_param_grad is None: + flat_param_grad = torch.zeros_like(flat_param) expected_shape = torch.Size([numel_in_shard]) self._writeback_tensor( - param.grad, flat_param.grad, expected_shape, offset, False + param.grad, flat_param_grad, i, expected_shape, offset, False ) + flat_param.grad = flat_param_grad offset += numel_in_shard # TODO (awgu): Handle shared parameters. We need to re-generate the # shared parameter data structures in case sharedness changed. @@ -1456,6 +1501,7 @@ def _writeback_tensor( self, src_tensor: Optional[Tensor], dst_tensor: Tensor, + tensor_index: int, expected_shape: torch.Size, offset: int, is_param: bool, # else gradient @@ -1465,7 +1511,8 @@ def _writeback_tensor( where ``src_tensor`` should have shape ``expected_shape``. ``is_param`` indicates if the tensor is the parameter (if ``True``) or gradient (if ``False``). If ``src_tensor`` is ``None``, then the effect is zeroing - instead of copying. + instead of copying. ``tensor_index`` gives the index of ``src_tensor`` + in the metadata structures. 
Raises: RuntimeError: If the ``src_tensor`` does not have the expected @@ -1497,6 +1544,8 @@ def _writeback_tensor( dst_tensor[offset : offset + expected_shape.numel()].copy_(src_tensor) else: dst_tensor[offset : offset + expected_shape.numel()].zero_() + assert self.flat_param._is_grad_none is not None + self.flat_param._is_grad_none[tensor_index] = True def _clear_grads_if_needed(self): """ @@ -1586,6 +1635,16 @@ def shared_parameter_module_names(self) -> Iterator[Tuple[str, str]]: ]: yield (param_name, module_name) + @property + def _fqns_in_shard(self) -> List[str]: + """Returns the FQNs of the parameters present in this rank's shard.""" + fqns_in_shard: List[str] = [] + start, end = self.flat_param._shard_indices # type: ignore[attr-defined] + for i in range(len(self.flat_param._fqns)): + if i >= start and i <= end and self.flat_param._shard_param_offsets: # type: ignore[attr-defined] + fqns_in_shard.append(self.flat_param._fqns[i]) + return fqns_in_shard + @property def sharded_grad(self) -> Optional[Tensor]: """Returns the handle's sharded gradient.""" @@ -1599,13 +1658,40 @@ def sharded_grad(self) -> Optional[Tensor]: elif hasattr(flat_param, "_saved_grad_shard"): grad = flat_param._saved_grad_shard # type: ignore[attr-defined] else: + # If in the forward, then there may be an accumulated gradient, + # which will be in `.grad` p_assert( - flat_param.grad is None or not self.uses_sharded_strategy, - "Sharded strategies should use `_cpu_grad` or `_saved_grad_shard`", + flat_param.grad is None + or not self.uses_sharded_strategy + or self._training_state == HandleTrainingState.FORWARD, + "Sharded strategies should use `_cpu_grad` or `_saved_grad_shard` " + "unless in FORWARD (for the post-forward reshard)", ) grad = flat_param.grad return grad + def _reset_is_grad_none(self) -> None: + """ + Resets the ``_is_grad_none`` mask as needed. 
This method should only be + called in the post-backward after gradient computation, in which case + if a parameter requires gradient, then it will surely receive a + gradient and we may reset its mask entry to ``False``. + """ + if not self._use_orig_params: + return + p_assert( + self._training_state == HandleTrainingState.BACKWARD_POST, + "Expects to only be called in the post-backward after gradient computation", + ) + flat_param = self.flat_param + assert flat_param._params is not None # mypy + for i, param in enumerate(flat_param._params): + # As long as the parameter requires gradient, it should receive a + # meaningful gradient (even if the gradient happens to be zeros) + if param.requires_grad: + assert flat_param._is_grad_none is not None # mypy + flat_param._is_grad_none[i] = False + ####################### # CHECKS & INVARIANTS # ####################### diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 6648c606861c1..a51df5195f0fc 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -3668,6 +3668,11 @@ def _post_backward_hook( orig_grad_data.record_stream(self._streams["post_backward"]) if handle._use_orig_params: + # Since the handle's `FlatParameter` completed its gradient + # computation, we should reset the gradient noneness mask + handle._reset_is_grad_none() + # Delay using sharded gradient views until after the + # reduce-scatter instead of immediately after resharding handle._use_sharded_grad_views() def _cast_grad_to_param_dtype( From 581c137fc9c8a1a5f4469a36e9b6f28b5ab7cfa8 Mon Sep 17 00:00:00 2001 From: "Edward Z. 
Yang" Date: Fri, 21 Oct 2022 07:29:38 -0700 Subject: [PATCH 0017/1922] [dynamo] Unify raise_on_* config to suppress_errors and raise by default (#87440) I noticed that a lot of bugs are being suppressed by torchdynamo's default error suppression, and worse yet, there's no way to unsuppress them. After discussion with voz and soumith, we decided that we will unify error suppression into a single option (suppress_errors) and default suppression to False. If your model used to work and no longer works, try TORCHDYNAMO_SUPPRESS_ERRORS=1 to bring back the old suppression behavior. Signed-off-by: Edward Z. Yang cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87440 Approved by: https://github.com/voznesenskym, https://github.com/albanD --- benchmarks/dynamo/common.py | 24 ++++------------------ test/dynamo/test_aot_cudagraphs.py | 3 +++ test/dynamo/test_misc.py | 3 ++- test/dynamo/test_no_fake_tensors.py | 12 +++++++++++ test/dynamo/test_optimizers.py | 5 ----- test/dynamo/test_replay_record.py | 5 +++++ test/dynamo/test_repros.py | 8 ++++++++ test/dynamo/test_unspec.py | 5 +++++ test/inductor/test_torchinductor.py | 1 - test/inductor/test_torchinductor_opinfo.py | 1 - torch/_dynamo/config.py | 10 ++++----- torch/_dynamo/convert_frame.py | 14 ++++++------- torch/_dynamo/debug_utils.py | 2 -- torch/_dynamo/exc.py | 2 +- torch/_dynamo/guards.py | 11 ++++------ torch/_dynamo/output_graph.py | 3 +-- torch/_dynamo/test_case.py | 3 --- torch/testing/_internal/common_utils.py | 3 +++ 18 files changed, 59 insertions(+), 56 deletions(-) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 8ff1fb5c3ae93..a2f8af2bc825a 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -1466,20 +1466,9 @@ def parse_args(): help="Use same settings as --inductor for baseline comparisons", ) parser.add_argument( - "--raise-on-assertion-error", + "--suppress-errors", 
action="store_true", - help="Fail a benchmark if torch._dynamo triggers an internal assertion", - ) - parser.add_argument( - "--raise-on-backend-error", - action="store_true", - help="Fail a benchmark if backend throws an exception", - ) - parser.add_argument( - "--raise-on-any", - "--raise", - action="store_true", - help="Raise on assertion or backend errors", + help="Suppress errors instead of raising them", ) parser.add_argument( "--output", @@ -1672,7 +1661,7 @@ def main(runner, original_dir=None): os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Stricter check to disable fallbacks - args.raise_on_any = True + args.suppress_errors = False elif args.performance: # Ensure that we test on real scenarios @@ -1736,12 +1725,7 @@ def main(runner, original_dir=None): if args.quiet: torch._dynamo.config.log_level = logging.ERROR - torch._dynamo.config.raise_on_assertion_error = ( - args.raise_on_assertion_error or args.raise_on_any - ) - torch._dynamo.config.raise_on_backend_error = ( - args.raise_on_backend_error or args.raise_on_any - ) + torch._dynamo.config.suppress_errors = args.suppress_errors if args.training: runner.model_iter_fn = runner.forward_and_backward_pass diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py index fdb7c88762b8b..cb1d2a0e601ff 100644 --- a/test/dynamo/test_aot_cudagraphs.py +++ b/test/dynamo/test_aot_cudagraphs.py @@ -71,6 +71,7 @@ def fn(x, y): y = torch.randn(3, device="cuda") fn(x, y) + @patch("torch._dynamo.config.suppress_errors", True) @patch_all() def test_dtoh(self): def model(x, y): @@ -104,6 +105,7 @@ def fn(x, y): y = torch.randn((), device="cpu") fn(x, y) + @patch("torch._dynamo.config.suppress_errors", True) @patch("functorch._src.config.use_functionalize", True) @patch_all(ok=False) # input mutation not supported yet def test_mutate_input(self): @@ -143,6 +145,7 @@ def fn(x, y): y = torch.randn(1, device="cuda") fn(x, y) + @patch("torch._dynamo.config.suppress_errors", True) @patch_all() def 
test_factory(self): def model(y): diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index 542a0319a48d3..a0f592212f4e1 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -578,6 +578,8 @@ def fn(count): self.assertEqual(cnts.frame_count, 0) self.assertEqual(cnts.op_count, 0) + # KeyError: '__name__' + @patch.object(torch._dynamo.config, "suppress_errors", True) def test_user_getattr1(self): class MyConfig(dict): def __getattr__(self, name): @@ -1959,7 +1961,6 @@ def check_sum_all(tensor: torch.Tensor) -> None: check_sum_all(torch.randn(200000, dtype=dtype, device=device)) - @patch.object(torch._dynamo.config, "raise_on_backend_error", True) def test_raise_on_backend_error(self): def my_compiler(gm, _): raise RuntimeError("duck!") diff --git a/test/dynamo/test_no_fake_tensors.py b/test/dynamo/test_no_fake_tensors.py index d65166f5762c5..df511f1affd55 100644 --- a/test/dynamo/test_no_fake_tensors.py +++ b/test/dynamo/test_no_fake_tensors.py @@ -1,4 +1,6 @@ # Owner(s): ["module: dynamo"] +import unittest + from torch._dynamo.testing import make_test_cls_with_patches try: @@ -23,6 +25,16 @@ def make_no_fake_cls(cls): NoFakeTensorsNNModuleTests = make_no_fake_cls(test_modules.NNModuleTests) NoFakeTensorsUnspecTests = make_no_fake_cls(test_unspec.UnspecTests) +unittest.expectedFailure( + NoFakeTensorsReproTests.test_guard_fail_tensor_bool_no_fake_tensors +) +NoFakeTensorsReproTests.test_numpy_list_no_fake_tensors.__unittest_expecting_failure__ = ( + False +) +NoFakeTensorsUnspecTests.test_builtin_getitem_no_fake_tensors.__unittest_expecting_failure__ = ( + False +) + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py index ebb2cde24f6ad..92b163b76d6dc 100644 --- a/test/dynamo/test_optimizers.py +++ b/test/dynamo/test_optimizers.py @@ -55,11 +55,6 @@ def setUpClass(cls): torch._dynamo.config, "fake_tensor_propagation", False ) ) - 
cls._exit_stack.enter_context( - unittest.mock.patch.object( - torch._dynamo.config, "raise_on_assertion_error", True - ) - ) test_sgd = make_test(torch.optim.SGD, lr=0.01) # lgbfs has data-dependent control and internally iterates diff --git a/test/dynamo/test_replay_record.py b/test/dynamo/test_replay_record.py index c158590a9d7f4..378fd2b78a9bc 100644 --- a/test/dynamo/test_replay_record.py +++ b/test/dynamo/test_replay_record.py @@ -29,6 +29,11 @@ def setUpClass(cls): cls._exit_stack.enter_context( unittest.mock.patch.object(torch._dynamo.config, "print_graph_breaks", True) ) + # Most of the tests are checking to see if errors got logged, so we + # ask for errors to be suppressed + cls._exit_stack.enter_context( + unittest.mock.patch.object(torch._dynamo.config, "suppress_errors", True) + ) cls._exit_stack.enter_context( unittest.mock.patch.object( torch._dynamo.config, diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 2bd3130958eb2..ffc71741d72c2 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -1347,6 +1347,8 @@ def fn(args): self.assertTrue(same(ref, res)) + # AssertionError: ABCMeta + @unittest.expectedFailure def test_numpy_list(self): @torch._dynamo.disable def rand_gen(): @@ -1426,6 +1428,8 @@ def fn(x): fn(torch.randn(3)) + # AssertionError: ABCMeta + @unittest.expectedFailure def test_isinstance_storage(self): @torch._dynamo.optimize("eager") def fn(x): @@ -1464,6 +1468,8 @@ def forward(self, x): self.assertEqual(y, 10) + # AssertionError: ABCMeta + @unittest.expectedFailure def test_sort_out(self): dtype = torch.float32 @@ -1481,6 +1487,8 @@ def fn(): opt_fn = torch._dynamo.optimize("eager")(fn) opt_fn() + # AssertionError: ABCMeta + @unittest.expectedFailure def test_sigmoid_out(self): dtype = torch.float32 diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py index fbf3983661935..22f975d0f9d68 100644 --- a/test/dynamo/test_unspec.py +++ b/test/dynamo/test_unspec.py @@ -50,6 +50,9 @@ 
class UnspecTest(cls): UnspecReproTests = make_unspec_cls(test_repros.ReproTests) UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests) +# RuntimeError: a leaf Variable that requires grad is being used in an in-place operation. +unittest.expectedFailure(UnspecReproTests.test_batch_norm_act_unspec) + @patch.object(torch._dynamo.config, "specialize_int_float", False) class UnspecTests(torch._dynamo.test_case.TestCase): @@ -171,6 +174,8 @@ def fn(x): res2 = opt_fn(x) self.assertTrue(same(res1, res2)) + # TypeError: zeros(): argument 'size' (position 1) must be tuple of SymInts, not FakeTensor + @unittest.expectedFailure def test_builtin_getitem(self): # builtin getitem args[0] is python list and args[1] is unspec def fn(x, idx): diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index df5a7fb0a21de..c4e82a8092437 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -167,7 +167,6 @@ def gather_leaf_tensors(args, kwargs): @patch.object(torch._inductor.config.triton, "cudagraphs", False) -@patch("torch._dynamo.config.raise_on_backend_error", True) def check_model( self: TestCase, model, diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py index e0638341eaa2c..220b711efcb51 100644 --- a/test/inductor/test_torchinductor_opinfo.py +++ b/test/inductor/test_torchinductor_opinfo.py @@ -597,6 +597,5 @@ def fn(*args, **kwargs): instantiate_device_type_tests(TestInductorOpInfo, globals()) if __name__ == "__main__": - torch._dynamo.config.raise_on_assertion_error = True if has_triton() and not TEST_WITH_ROCM: run_tests() diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index 18d1af0a743b4..7a2c79972ddaa 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -74,11 +74,11 @@ # __torch_function__ logic of the subclass. 
traceable_tensor_subclasses = set() -# Raise torchdynamo internal assertions -raise_on_assertion_error = False - -# Propagate backend exceptions up to torchdynamo.optimize -raise_on_backend_error = True +# Suppress errors in torchdynamo.optimize, instead forcing a fallback to eager. +# This is a good way to get your model to work one way or another, but you may +# lose optimization opportunities this way. Devs, if your benchmark model is failing +# this way, you should figure out why instead of suppressing it. +suppress_errors = bool(os.environ.get("TORCHDYNAMO_SUPPRESS_ERRORS", False)) # Record and write an execution record of the current frame to a file # if an exception is encountered diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index d4afed9f63e37..46a23b330a0a4 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -231,7 +231,7 @@ def replay_record_msg(): msg = f"WON'T CONVERT {code.co_name} {code.co_filename}\ line {code.co_firstlineno} \ndue to: \n{traceback.format_exc(limit=-1)}" - if hasattr(exc, "real_stack"): + if hasattr(exc, "real_stack") and len(exc.real_stack) > 0: msg += f"\nfrom user code:\n {''.join(traceback.format_list([exc.real_stack[-1]]))}" msg += replay_record_msg() @@ -439,7 +439,7 @@ def transform(instructions, code_options): raise except Exception as e: exception_handler(e, code, frame) - raise InternalTorchDynamoError() + raise InternalTorchDynamoError() from e def convert_frame(compiler_fn: typing.Callable, guard_export_fn=None): @@ -452,13 +452,11 @@ def _convert_frame(frame: types.FrameType, cache_size: int): result = inner_convert(frame, cache_size) counters["frames"]["ok"] += 1 return result - except AssertionError: - if config.raise_on_assertion_error: - raise - except BackendCompilerFailed: - raise - except Exception: + except (NotImplementedError, Unsupported): pass + except Exception: + if not config.suppress_errors: + raise return None 
_convert_frame._torchdynamo_orig_callable = compiler_fn diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py index 7a2466637b767..845c518a4f85d 100644 --- a/torch/_dynamo/debug_utils.py +++ b/torch/_dynamo/debug_utils.py @@ -790,8 +790,6 @@ def wrap_backend_debug(compiler_fn, compiler_name: str): def debug_wrapper(gm, example_inputs, **kwargs): assert config.repro_after in ("dynamo", "aot", None) if config.repro_after == "dynamo": - # Ensure that we fail when backend fails - config.raise_on_backend_error = True if config.repro_level == 3: dump_to_minify_after_dynamo(gm, example_inputs, compiler_name) diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py index 3001c8c823924..80a2a75712ab4 100644 --- a/torch/_dynamo/exc.py +++ b/torch/_dynamo/exc.py @@ -44,7 +44,7 @@ def __init__(self, backend_fn, inner_exception): f"{self.backend_name} raised {type(inner_exception).__name__}: {inner_exception}" "\n\n" "You can suppress this exception and fall back to eager by setting:\n" - " torchdynamo.config.raise_on_backend_error = False" + " torchdynamo.config.suppress_errors = True" ) diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 8f94714784d73..1f43ac667e579 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -5,7 +5,6 @@ import math import os import re -import textwrap import types import weakref from inspect import currentframe, getframeinfo @@ -560,12 +559,10 @@ def compile_check_fn(self, local_builder, global_builder): ] ) closure_vars.update(CLOSURE_VARS) - py_code = textwrap.dedent( - f""" - def ___make_guard_fn({','.join(closure_vars.keys())}): - return lambda {args}: {code} - """ - ) + py_code = f"""\ +def ___make_guard_fn({','.join(closure_vars.keys())}): + return lambda {args}: {code} +""" if os.environ.get("TORCHDYNAMO_PRINT_GUARDS", None) == "1": print("GUARDS", code) set_guard_fail_hook(guard_fail_hook) diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index 
f9b75b782aa00..f87b07996d73b 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -434,8 +434,7 @@ def call_user_compiler(self, gm): log.warning(e, exc_info=True) log.warning("-" * 40 + "\n") compiled_fn = gm.forward - if config.raise_on_backend_error: - raise BackendCompilerFailed(self.compiler_fn, e) from e + raise BackendCompilerFailed(self.compiler_fn, e) from e return compiled_fn def example_inputs(self): diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py index 089e5053d0625..39eda31646d2a 100644 --- a/torch/_dynamo/test_case.py +++ b/torch/_dynamo/test_case.py @@ -51,9 +51,6 @@ def tearDownClass(cls): def setUpClass(cls): super().setUpClass() cls._exit_stack = contextlib.ExitStack() - cls._exit_stack.enter_context( - patch.object(config, "raise_on_backend_error", True) - ) cls._exit_stack.enter_context( patch.object(config, "raise_on_ctx_manager_usage", True) ) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index edd7c31e98ac9..cb9b52c338118 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -940,6 +940,9 @@ def __torch_function__(self, func, types, args=(), kwargs=None): torch._dynamo.config.log_level = logging.ERROR # Do not spend time on helper functions that are called with different inputs torch._dynamo.config.cache_size_limit = 8 + # TODO: Remove this; this is grandfathered in because we suppressed errors + # on test suite previously + torch._dynamo.config.suppress_errors = True def skipIfTorchDynamo(msg="test doesn't currently work with dynamo"): From 7955596db64ecc0e67604650f9c5a8fd5b10e6ba Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Fri, 21 Oct 2022 03:51:25 +0000 Subject: [PATCH 0018/1922] [inductor] force 'fork' method for processes, cleanup (#87411) To cooperate with other multithreading methods, this forces the process pool to use 'fork' even if others have set it differently.
We require fork because otherwise `if __name__ == __main__` needs to be set which we do not control as a library. Furthermore this adds code to cleanup worker processes if the parent exits abnormally (e.g. segfault). Previously we would leave live but inactive workers around. cc @jansel @lezcano @fdrocha Pull Request resolved: https://github.com/pytorch/pytorch/pull/87411 Approved by: https://github.com/soumith, https://github.com/anijain2305 --- torch/_inductor/codecache.py | 31 +++++++++++++++++++++++++++++-- torch/_inductor/config.py | 3 ++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index c4400a35cce85..1d83633019cb8 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -3,16 +3,19 @@ import getpass import hashlib import logging +import multiprocessing import os import re import shutil +import signal import subprocess import sysconfig import tempfile import types from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor from ctypes import cdll -from time import time +from threading import Thread +from time import sleep, time from typing import Any, Dict import torch @@ -279,7 +282,31 @@ def process_pool(): # are forked cuda_properties._properties() assert config.compile_threads > 1 - return ProcessPoolExecutor(config.compile_threads) + orig_ppid = os.getpid() + + # if this process dies abnormally (e.g. segfault) + # it will not shut down the workers. Instead + # the workers will have their parent reassigned to the + # init process. This launches a separate thread to + # watch for the worker getting reassigned, + # and cleans it up in this case. 
+ def init(): + def run(): + while True: + sleep(1) + if orig_ppid != os.getppid(): + os.kill(os.getpid(), signal.SIGKILL) + + global _watchdog_thread + _watchdog_thread = Thread(target=run, daemon=True) + _watchdog_thread.start() + + # we rely on 'fork' because we cannot control whether users + # have an `if __name__ == '__main__'` in their main process. + fork_context = multiprocessing.get_context("fork") + return ProcessPoolExecutor( + config.compile_threads, mp_context=fork_context, initializer=init + ) @classmethod def warm_pool(cls): diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index cabaa3e7ce0ba..f4b847e50c820 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -1,4 +1,5 @@ import os +import sys # add some debug printouts debug = False @@ -53,7 +54,7 @@ comment_origin = False -compile_threads = min(32, os.cpu_count()) +compile_threads = min(32, os.cpu_count()) if sys.platform != "win32" else 1 # How to import torchinductor, either torchinductor or torch.inductor inductor_import = __name__.replace(".config", "") From 885ecbf0c5238d60d662567859b139af5fcdc0c9 Mon Sep 17 00:00:00 2001 From: William Wen Date: Fri, 21 Oct 2022 17:30:14 +0000 Subject: [PATCH 0019/1922] Add dynamo smoke test (#87400) https://github.com/pytorch/torchdynamo/issues/1733 Move the old smoke test over from the old dynamo repo. 
cc @jansel @lezcano @fdrocha Pull Request resolved: https://github.com/pytorch/pytorch/pull/87400 Approved by: https://github.com/msaroufim --- .lintrunner.toml | 1 + tools/dynamo/verify_dynamo.py | 156 ++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 tools/dynamo/verify_dynamo.py diff --git a/.lintrunner.toml b/.lintrunner.toml index 70e2a423edcc1..a48d411ea9a83 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -156,6 +156,7 @@ include_patterns = [ exclude_patterns = [ # (linbinyu) copied from internal repo 'tools/code_analyzer/gen_operators_yaml.py', + 'tools/dynamo/verify_dynamo.py', 'tools/gen_vulkan_spv.py', 'tools/test/gen_operators_yaml_test.py', 'tools/test/gen_oplist_test.py', diff --git a/tools/dynamo/verify_dynamo.py b/tools/dynamo/verify_dynamo.py new file mode 100644 index 0000000000000..cbc582a561573 --- /dev/null +++ b/tools/dynamo/verify_dynamo.py @@ -0,0 +1,156 @@ +import os +import re +import subprocess +import sys +import traceback +import warnings + +from pkg_resources import packaging + +MIN_CUDA_VERSION = packaging.version.parse("11.6") +MIN_PYTHON_VERSION = (3, 7) + + +class VerifyDynamoError(BaseException): + pass + + +def check_python(): + if sys.version_info < MIN_PYTHON_VERSION: + raise VerifyDynamoError( + f"Python version not supported: {sys.version_info} " + f"- minimum requirement: {MIN_PYTHON_VERSION}" + ) + return sys.version_info + + +def check_torch(): + import torch + + return packaging.version.parse(torch.__version__) + + +# based on torch/utils/cpp_extension.py +def get_cuda_version(): + from torch.utils import cpp_extension + + CUDA_HOME = cpp_extension._find_cuda_home() + if not CUDA_HOME: + raise VerifyDynamoError(cpp_extension.CUDA_NOT_FOUND_MESSAGE) + + nvcc = os.path.join(CUDA_HOME, "bin", "nvcc") + cuda_version_str = ( + subprocess.check_output([nvcc, "--version"]) + .strip() + .decode(*cpp_extension.SUBPROCESS_DECODE_ARGS) + ) + cuda_version = re.search(r"release 
(\d+[.]\d+)", cuda_version_str) + if cuda_version is None: + raise VerifyDynamoError("CUDA version not found in `nvcc --version` output") + + cuda_str_version = cuda_version.group(1) + return packaging.version.parse(cuda_str_version) + + +def check_cuda(): + import torch + + if not torch.cuda.is_available(): + return None + + torch_cuda_ver = packaging.version.parse(torch.version.cuda) + + # check if torch cuda version matches system cuda version + cuda_ver = get_cuda_version() + if cuda_ver != torch_cuda_ver: + # raise VerifyDynamoError( + warnings.warn( + f"CUDA version mismatch, `torch` version: {torch_cuda_ver}, env version: {cuda_ver}" + ) + + if torch_cuda_ver < MIN_CUDA_VERSION: + # raise VerifyDynamoError( + warnings.warn( + f"(`torch`) CUDA version not supported: {torch_cuda_ver} " + f"- minimum requirement: {MIN_CUDA_VERSION}" + ) + if cuda_ver < MIN_CUDA_VERSION: + # raise VerifyDynamoError( + warnings.warn( + f"(env) CUDA version not supported: {cuda_ver} " + f"- minimum requirement: {MIN_CUDA_VERSION}" + ) + + return cuda_ver + + +def check_dynamo(backend, device, err_msg): + import torch + + if device == "cuda" and not torch.cuda.is_available(): + print(f"CUDA not available -- skipping CUDA check on {backend} backend\n") + return + + try: + import torch._dynamo as dynamo + + dynamo.reset() + + @dynamo.optimize(backend, nopython=True) + def fn(x): + return x + x + + class Module(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x + x + + mod = Module() + opt_mod = dynamo.optimize(backend, nopython=True)(mod) + + for f in (fn, opt_mod): + x = torch.randn(10, 10).to(device) + x.requires_grad = True + y = f(x) + torch.testing.assert_close(y, x + x) + z = y.sum() + z.backward() + torch.testing.assert_close(x.grad, 2 * torch.ones_like(x)) + except Exception: + sys.stderr.write(traceback.format_exc() + "\n" + err_msg + "\n\n") + sys.exit(1) + + +_SANITY_CHECK_ARGS = ( + ("eager", "cpu", "CPU eager sanity check 
failed"), + ("eager", "cuda", "CUDA eager sanity check failed"), + ("aot_eager", "cpu", "CPU aot_eager sanity check failed"), + ("aot_eager", "cuda", "CUDA aot_eager sanity check failed"), + ("inductor", "cpu", "CPU inductor sanity check failed"), + ( + "inductor", + "cuda", + "CUDA inductor sanity check failed\n" + + "NOTE: Please check that you installed the correct hash/version of `triton`", + ), +) + + +def main(): + python_ver = check_python() + torch_ver = check_torch() + cuda_ver = check_cuda() + print( + f"Python version: {python_ver.major}.{python_ver.minor}.{python_ver.micro}\n" + f"`torch` version: {torch_ver}\n" + f"CUDA version: {cuda_ver}\n" + ) + for args in _SANITY_CHECK_ARGS: + check_dynamo(*args) + print("All required checks passed") + + +if __name__ == "__main__": + main() From 61ebcf1dc5eb5b7999f0450c37fd932ca95617b9 Mon Sep 17 00:00:00 2001 From: chuksmbaka Date: Fri, 21 Oct 2022 17:30:18 +0000 Subject: [PATCH 0020/1922] Grammatical update of the tech docs. (#87357) Fixes #ISSUE_NUMBER A more appropriate and correct word. ![grammatical correction](https://user-images.githubusercontent.com/25278471/196927273-7e4c0c9b-96a6-43d1-9b10-17b40665feed.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87357 Approved by: https://github.com/albanD --- torch/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/__init__.py b/torch/__init__.py index 8a824642ab57d..63995d6ec7f69 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -2,7 +2,7 @@ r""" The torch package contains data structures for multi-dimensional tensors and defines mathematical operations over these tensors. -Additionally, it provides many utilities for efficient serializing of +Additionally, it provides many utilities for efficient serialization of Tensors and arbitrary types, and other useful utilities. 
It has a CUDA counterpart, that enables you to run your tensor computations From 026cc96295b5f34f6fd8715a7912fb475e31938c Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Thu, 20 Oct 2022 18:06:25 +0100 Subject: [PATCH 0021/1922] Remove redundant zeroing in col2im/im2col (#87375) All of the kernels already either start by zeroing the output, or are careful in their implementation to write values to every output location. So, these `zero_` calls should be redundant. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87375 Approved by: https://github.com/albanD --- aten/src/ATen/native/Col2Im.cpp | 1 - aten/src/ATen/native/Im2Col.cpp | 1 - aten/src/ATen/native/cuda/Col2Im.cu | 1 - aten/src/ATen/native/cuda/Im2Col.cu | 1 - 4 files changed, 4 deletions(-) diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index 090a3a8a71db2..5ce747e9c7a7e 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -144,7 +144,6 @@ static void col2im_out_cpu_template( int64_t n_output_plane = n_input_plane / (kernel_width * kernel_height); output.resize_({batch_size, n_output_plane, output_height, output_width}); - output.zero_(); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "col2im_out_cpu", [&] { diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index dd6c8b303a5fe..7cb5133eef9ad 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -85,7 +85,6 @@ static void im2col_out_cpu_template( int64_t output_length = output_height * output_width; output.resize_({batch_size, n_output_plane, output_length}); - output.zero_(); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "im2col_out_cpu", [&] { diff --git a/aten/src/ATen/native/cuda/Col2Im.cu b/aten/src/ATen/native/cuda/Col2Im.cu index 98d1950004ef2..53eb2df3013eb 100644 --- a/aten/src/ATen/native/cuda/Col2Im.cu +++ 
b/aten/src/ATen/native/cuda/Col2Im.cu @@ -101,7 +101,6 @@ void col2im_out_cuda_template( int64_t input_batch_stride = input.stride(0); output.resize_({batch_size, n_output_plane, output_height, output_width}); - output.zero_(); int64_t output_batch_stride = output.stride(0); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, diff --git a/aten/src/ATen/native/cuda/Im2Col.cu b/aten/src/ATen/native/cuda/Im2Col.cu index a18d4d822c659..a209aa2764639 100644 --- a/aten/src/ATen/native/cuda/Im2Col.cu +++ b/aten/src/ATen/native/cuda/Im2Col.cu @@ -102,7 +102,6 @@ static void im2col_out_cuda_template( int64_t output_length = output_height * output_width; output.resize_({batch_size, n_output_plane, output_length}); - output.zero_(); // Launch kernel AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, From afbee9d887b334e8d74348bc56e272f6575ddf1e Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 21 Oct 2022 17:39:01 +0000 Subject: [PATCH 0022/1922] Skip auto request review on forked PR (#87482) Addresses the comment in https://github.com/pytorch/pytorch/pull/87409 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87482 Approved by: https://github.com/albanD --- .github/workflows/auto_request_review.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml index 01df7a054005f..352b1cf773b71 100644 --- a/.github/workflows/auto_request_review.yml +++ b/.github/workflows/auto_request_review.yml @@ -6,6 +6,8 @@ on: jobs: auto-request-review: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' name: Auto Request Review runs-on: ubuntu-latest steps: From 4e382ab15a7e7897a3d8c7bc45c96eed9f21ba6b Mon Sep 17 00:00:00 2001 From: Neel Patel Date: Fri, 21 Oct 2022 17:39:27 +0000 Subject: [PATCH 0023/1922] Create workflow to make sure PRs have valid labels (#86829) ### Context When a dev submits a PR against the repo, we want to validate that they applied 
two labels to the PR corresponding to the module they edited and the kind of change they're making. ### Change Extended the open source workflow CI to add a validation to ensure that the PR being checked has the required labels on it. If it doesn't, the check fails and a bot will post a message on the PR with instructions on what labels the developer needs to add (https://github.com/pytorch/pytorch/wiki/PyTorch-AutoLabel-Bot#why-categorize-for-release-notes-and-how-does-it-work). ### Impact Every time a new version of PyTorch is released, we want to compile all the changes made to each module. However, when devs forget to tag their PR, compiling the changes to write the release notes becomes a burdensome process (only ~20% of PRs are currently labeled appropriately, which means it can take up to 40 hours to compile release notes). With this new validation, the hope is that most PRs are labeled accordingly for more timely release notes compilation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/86829 Approved by: https://github.com/ZainRizvi --- .github/requirements-gha-cache.txt | 2 + .github/scripts/check_labels.py | 84 + .github/scripts/gql_mocks.json | 15011 +++++++++++++++++++++++++ .github/scripts/test_check_labels.py | 77 + .github/scripts/trymerge.py | 4 + .github/workflows/lint.yml | 32 + 6 files changed, 15210 insertions(+) create mode 100755 .github/scripts/check_labels.py create mode 100644 .github/scripts/test_check_labels.py diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt index f331d98351ae8..6badbe2cc65c8 100644 --- a/.github/requirements-gha-cache.txt +++ b/.github/requirements-gha-cache.txt @@ -5,12 +5,14 @@ # docs/cpp/requirements.txt # functorch/docs/requirements.txt # .circleci/docker/requirements-ci.txt +boto3==1.19.12 cffi==1.15.0 dataclasses==0.6 jinja2==3.0.1 lintrunner==0.9.2 ninja==1.10.0.post1 pynvml==11.4.1 +pyyaml==6.0 requests==2.26 rich==10.9.0 rockset==0.8.10 diff --git
a/.github/scripts/check_labels.py b/.github/scripts/check_labels.py new file mode 100755 index 0000000000000..ff40a94ee6fec --- /dev/null +++ b/.github/scripts/check_labels.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +"""check_labels.py""" + +from typing import Any, List +from datetime import datetime, timedelta + +from export_pytorch_labels import get_pytorch_labels +from gitutils import ( + get_git_remote_name, + get_git_repo_dir, + GitRepo, +) +from trymerge import ( + _fetch_url, + gh_post_pr_comment, + GitHubPR, +) + + +BOT_AUTHORS = ["github-actions", "pytorchmergebot", "pytorch-bot"] + +ERR_MSG_TITLE = "This PR needs a label" +ERR_MSG = ( + f"# {ERR_MSG_TITLE}\n" + "If your changes are user facing and intended to be a part of release notes, please use a label starting with `release notes:`.\n\n" # noqa: E501 pylint: disable=line-too-long + "If not, please add the `topic: not user facing` label.\n\n" + "For more information, see https://github.com/pytorch/pytorch/wiki/PyTorch-AutoLabel-Bot#why-categorize-for-release-notes-and-how-does-it-work." 
# noqa: E501 pylint: disable=line-too-long +) + + +def get_release_notes_labels() -> List[str]: + return [label for label in get_pytorch_labels() if label.lstrip().startswith("release notes:")] + + +def delete_comment(comment_id: int) -> None: + url = f"https://api.github.com/repos/pytorch/pytorch/issues/comments/{comment_id}" + _fetch_url(url, method="DELETE") + + +def has_required_labels(pr: GitHubPR) -> bool: + pr_labels = pr.get_labels() + + # Check if PR is not user facing + is_not_user_facing_pr = any(label.strip() == "topic: not user facing" for label in pr_labels) + if is_not_user_facing_pr: + return True + + # Check if bot has already posted a message within the past hour to include a release notes label + for comment in pr.get_comments(): + if comment.body_text.lstrip(" #").startswith(ERR_MSG_TITLE) and comment.author_login in BOT_AUTHORS: + ts = datetime.strptime(comment.created_at, "%Y-%m-%dT%H:%M:%SZ") + if (datetime.utcnow() - ts) < timedelta(hours=1): + return True + delete_comment(comment.database_id) + break + + return any(label.strip() in get_release_notes_labels() for label in pr_labels) + + +def parse_args() -> Any: + from argparse import ArgumentParser + parser = ArgumentParser("Check PR labels") + parser.add_argument("pr_num", type=int) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + org, project = repo.gh_owner_and_name() + pr = GitHubPR(org, project, args.pr_num) + + try: + if not has_required_labels(pr): + print(ERR_MSG) + gh_post_pr_comment(pr.org, pr.project, pr.pr_num, ERR_MSG) + exit(1) + except Exception as e: + pass + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/gql_mocks.json b/.github/scripts/gql_mocks.json index 4a6ea6a6402c7..164c1ac147e5b 100644 --- a/.github/scripts/gql_mocks.json +++ b/.github/scripts/gql_mocks.json @@ -20855,5 +20855,15016 @@ "team": null } } + }, + 
"query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=82169 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "ezyang" + }, + "title": "Move test_dtypes so it runs later", + "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #82169\n\nThe error messages it gives are very unhelpful (because a failure\ngets translated into \"dtype was not supported\" rather than the\nactual backtrace), so I'd rather get error messages about this after\nI've tested basic functionality.\n\nSigned-off-by: Edward Z. Yang ", + "headRefName": "gh/ezyang/1279/head", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "gh/ezyang/1279/base", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "cef34da55a59da5a32494bff218ccd4978b659d3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "83ad7e73a07111ac1d85e931d14360cc22c01edd" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. 
Yang" + }, + "oid": "28140e4008289251b695385acfb48ac7a47cd49c" + } + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + }, + "totalCount": 3 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310707890" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708140" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708223" + }, + { + "name": "Test collect_env (older_python_version)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708332" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708496" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708710" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708937" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310709169" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGj1lc=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696649" + }, + "cursor": 
"Y3Vyc29yOnYyOpHPAAAAAcHRc8k=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696651" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8s=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823982/jobs/4310707884" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjz0w=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696656" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9A=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696660" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9Q=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": 
"https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696715" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdAs=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708487" + }, + { + "name": "linux-bionic-cuda11.6-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708713" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708942" + }, + { + "name": "linux-focal-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709174" + }, + { + "name": "linux-bionic-py3_7-clang8-xla / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709340" + }, + { + "name": "linux-focal-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709579" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709844" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710003" + }, + { + "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710175" + }, + { + "name": "win-vs2019-cuda11.6-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710516" + }, + { + "name": "linux-focal-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710716" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710890" + }, + { + "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711097" + }, + { + "name": "linux-focal-py3.7-clang10-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711234" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711429" + }, + { + "name": "linux-focal-rocm5.2-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711603" + }, + { + "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711765" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711946" + }, + { + "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712129" + }, + { + "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / 
build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712276" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194495" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194591" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194659" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194749" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194858" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194934" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311195003" + }, + { + "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220458" + }, + { + "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220540" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222725" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222869" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223128" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223225" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223324" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223396" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223496" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223569" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223690" + }, + { + "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311224360" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311230050" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311301930" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302152" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302303" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302433" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302531" + }, + { + "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491082" + }, + { + "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491172" + }, + { + "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491232" + }, + { + "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491289" + }, + { + "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491348" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcG0YME=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696836" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdIQ=" + }, + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjyQg=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696896" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdMA=" + }, + { + "node": { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546697185" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdeE=" + }, + { + "node": { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + 
"hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546697205" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdfU=" + }, + { + "node": { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546697224" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdgg=" + } + ], + "pageInfo": { + "hasNextPage": true + } + }, + "status": null, + "pushedDate": "2022-07-27T15:34:17Z", + "oid": "28140e4008289251b695385acfb48ac7a47cd49c" + } + } + ] + }, + "changedFiles": 1, + "files": { + "nodes": [ + { + "path": "test/test_ops.py" + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "zou3519" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "Chillee" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0yNVQxNDo0NTozNS0wNzowMLkyMDIyLTA3LTI1VDE0OjQ1OjM1LTA3OjAwzj6XYmg=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "@pytorchbot merge -f FORCE", + "createdAt": "2022-07-27T17:56:43Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1197107402 + }, + { + "bodyText": "You need to provide a reason for using force merge, in the format @pytorchbot merge -f '[CATEGORY] Explanation'. 
With [CATEGORY] being one the following:\nEMERGENCY - an emergency fix to quickly address an issue\nMINOR - a minor fix such as cleaning locally unused variables, which shouldn't break anything\nPRE_TESTED - a previous CI run tested everything and you've only added minor changes like fixing lint\nOTHER - something not covered above", + "createdAt": "2022-07-27T17:56:45Z", + "author": { + "login": "pytorch-bot" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1197107439 + }, + { + "bodyText": "@pytorchbot merge -f \"[OTHER] normal land failed twice already\"", + "createdAt": "2022-07-27T17:57:28Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1197108130 + }, + { + "bodyText": "@pytorchbot successfully started a merge job. Check the current status here", + "createdAt": "2022-07-27T18:08:13Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1197119348 + }, + { + "bodyText": "Hey @ezyang.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-07-27T18:08:58Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1197120095 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOR1poyg==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "Merged" + } + }, + { + "node": { + "name": "cla signed" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "seemethere" + }, + "title": "ci: Migrate metrics credentials to managed IAM", + "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* __->__ #73811\n\r\nMigrates our credentials to upload metrics statistics to managed IAM\r\ncredentials in order to make it easier to know where the credentials are\r\ncoming from and to make it easier to add more permissions / less\r\npermissions later on.\r\n\r\nRelates to work done in [D34535827](https://www.internalfb.com/diff/D34535827)\r\n\r\nSigned-off-by: Eli Uriegas ", + "headRefName": "gh/seemethere/215/head", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "gh/seemethere/215/base", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "13c44d16a876a56bca479b4cf30715d21fa16e99" + } + }, + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": 
"9d26f4e6d8c8df275ea546180fef42548257d2d7" + } + } + ], + "pageInfo": { + "endCursor": "Mg", + "hasNextPage": false + }, + "totalCount": 2 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOaHA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658275867" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcBs=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276090" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPo=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276092" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPw=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + 
"endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276094" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP4=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276095" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP8=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276097" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQE=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276098" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQI=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1983602966/jobs/2839950629" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276099" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQM=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Test tools" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276100" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQQ=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276101" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQU=" + } + ], + "pageInfo": { + "hasNextPage": true + } + }, + "status": { + "contexts": [ + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044969?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17045014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: 
pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044975?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + } + ] + }, + "pushedDate": "2022-03-14T23:01:55Z", + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7" + } + } + ] + }, + "changedFiles": 3, + "files": { + "nodes": [ + { + "path": ".github/templates/common.yml.j2" + }, + { + "path": ".github/workflows/generated-macos-11-py3-x86-64.yml" + }, + { + "path": ".github/workflows/update_pytorch_labels.yml" + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "kit1980" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "janeyx99" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0wNFQxNDoyNDo0OC0wODowMLkyMDIyLTAzLTA0VDE0OjI0OjQ4LTA4OjAwzjWwwqA=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1988337976", + "createdAt": "2022-03-15T17:43:28Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068270969 + }, + { + "bodyText": "@pytorchbot force merge this", + "createdAt": "2022-03-15T20:26:36Z", + "author": { + "login": "seemethere" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068436128 + }, + { + "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1989076952", + "createdAt": "2022-03-15T20:27:47Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068437098 + }, + { + "bodyText": "@pytorchbot merge this", + "createdAt": "2022-03-15T21:18:55Z", + "author": { + "login": "seemethere" 
+ }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068482921 + }, + { + "bodyText": "Hey @seemethere.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-03-15T21:20:40Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1068484404 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOP6yFeQ==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "cla signed" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=31093 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "mingxiaoh" + }, + "title": "improve mkldnn convolution test coverage", + "body": "This pr will improve the test coverage of mkldnn convolution.\r\n1.test input: specific sensitive numbers\r\n2.pass criteria: output of mkldnn convolution matches output of thnn convolution\r\n3.coverage: by using coverage tool, we found out the following sensitive parameters. 
Overall the case will test 4352 patterns, takes 8.8s on my machine.\r\n\r\nto run the test case:\r\n\r\npython test_mkldnn_conv2d_ext.py\r\nor\r\npython run_test.py -i mkldnn_conv2d_ext\r\n\r\nIn case of failure, the pattern will be printed in the log for further debugging.\r\n\r\nactually, this PR is created to replace and improve that PR we created before(https://github.com/pytorch/pytorch/pull/25085) ", + "headRefName": "master", + "headRepository": { + "nameWithOwner": "mingxiaoh/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "11pikachu" + }, + "email": "junx.du@intel.com", + "name": "dujun" + }, + "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "clang-format" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676797?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQYu8fQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281097" + }, + "cursor": "Y3Vyc29yOnYyOpHORg1dyQ==" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/1099676800?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676817?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676829?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676840?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQYu8qA==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281099" + }, + "cursor": "Y3Vyc29yOnYyOpHORg1dyw==" + }, + { + "node": { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "codecov/project", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + }, + { + "name": "codecov/patch", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQZhcFQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100822" + }, + "cursor": "Y3Vyc29yOnYyOpHORhnf1g==" + }, + { + "node": { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "codecov/patch", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQZZsEQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100824" + }, + "cursor": "Y3Vyc29yOnYyOpHORhnf2A==" + }, + { + "node": 
{ + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOUquzJg==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1487517306" + }, + "cursor": "Y3Vyc29yOnYyOpHOWKm2eg==" + } + ], + "pageInfo": { + "hasNextPage": false + } + }, + "status": { + "contexts": [ + { + "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406538?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406947?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406544?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406931?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406550?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": 
"ci/circleci: binary_windows_libtorch_3_7_cpu_debug_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406887?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406526?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406707?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406533?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407256?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407254?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407255?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9", + "state": "SUCCESS", + "targetUrl": 
"https://circleci.com/gh/pytorch/pytorch/7406556?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.8-gcc9", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406532?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406527?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.8-gcc9", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406553?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-py3.6-clang9", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406537?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-py3.8-gcc9", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406529?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.5.1-py3.6", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406554?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.7-py3.6", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406545?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: 
docker-pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406543?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406536?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406552?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406535?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406540?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406528?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406541?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-asan", + "state": "SUCCESS", + "targetUrl": 
"https://circleci.com/gh/pytorch/pytorch/7406549?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-clang7", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406555?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc4.8", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406546?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc5.4", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406531?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406534?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7.2", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406523?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3.8", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406539?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.3-py3.6", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406547?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.5.1-py3.6", + "state": "SUCCESS", + "targetUrl": 
"https://circleci.com/gh/pytorch/pytorch/7406551?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407209?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406611?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_bazel_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406607?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_bazel_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406984?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_cpp_doc_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407013?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_doc_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407011?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_ios_11_2_1_x86_64_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406548?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build", + "state": "SUCCESS", + "targetUrl": 
"https://circleci.com/gh/pytorch/pytorch/7406563?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408680?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_backward_compatibility_check_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406567?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406945?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406561?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_coverage_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407422?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_bionic_rocm3_7_py3_6_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406562?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build", + 
"state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406612?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408107?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408111?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408101?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406613?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406565?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407017?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test", + "state": "SUCCESS", + "targetUrl": 
"https://circleci.com/gh/pytorch/pytorch/7407019?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407012?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407016?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_vulkan_x86_32_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406608?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406609?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406606?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test1", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407435?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test2", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407436?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: 
pytorch_linux_xenial_py3_clang5_mobile_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406605?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_custom_build_dynamic", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406610?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_macos_10_13_py3_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406525?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_macos_10_13_py3_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407415?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_python_doc_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407018?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406566?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406946?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_windows_vs2019_py36_cpu_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406542?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: 
pytorch_windows_vs2019_py36_cuda10.1_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406530?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test1", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407028?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test2", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda11.0_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406524?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406572?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_test", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407253?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "codecov/patch", + "state": "SUCCESS", + "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + }, + { + "context": "codecov/project", + "state": "SUCCESS", + "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + }, + { + "context": "pr/caffe2-pytorch-linux-bionic-rocm3.7-py3.6-test", + "state": "SUCCESS", + 
"targetUrl": "https://ci.pytorch.org/jenkins/job/caffe2-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger-test/2319/" + }, + { + "context": "pr/pytorch-linux-bionic-rocm3.7-py3.6", + "state": "SUCCESS", + "targetUrl": "https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger/2325/" + } + ] + }, + "pushedDate": "2020-09-11T01:58:24Z", + "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + } + } + ] + }, + "changedFiles": 5, + "files": { + "nodes": [ + { + "path": "test/math_libraries/convolutions.py" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_googlenet_v3.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_maskrcnn_p1.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_mobilenet.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_resnet_50.json" + } + ], + "pageInfo": { + "endCursor": "NQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "CHANGES_REQUESTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": 
"COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "CHANGES_REQUESTED" + }, + { + "author": { + "login": "ailzhang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mingxiaoh" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mingxiaoh" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAxOS0xMi0zMFQxMDoxOToxMS0wODowMLkyMDE5LTEyLTMwVDEwOjE5OjExLTA4OjAwzhQZLuY=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 
(__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilen
et:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = 
test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 
(__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:co
nv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.\n\n@mruberry It is suggested by @VitalyFedyunin that, we need to display fail test to avoid invalid inputs, I guess we should set it as expected failures under the pytest test framework, right? we will change it as expected failure cases under pytest test framework. The result will looks like be low, is it ok?\n2500 passed, 136 skipped, 0 failed, 0 errors, 2 expected failures, 0 unexpected passes", + "createdAt": "2020-08-14T01:36:20Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 673816925 + }, + { + "bodyText": "Displaying tests that fail is fine, but I don't think @VitalyFedyunin meant that it was OK if the tests didn't pass. If these are expected failures then yes, you can use with self.assertRaises(RuntimeError):... when testing them. 
If you also want to report that the test has test cases with these properties you can print or warn, which will appear in the test output.", + "createdAt": "2020-08-14T03:09:37Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 673858224 + }, + { + "bodyText": "Codecov Report\n\nMerging #31093 into master will not change coverage.\nThe diff coverage is n/a.\n\n\n@@ Coverage Diff @@\n## master #31093 +/- ##\n=======================================\n Coverage 68.00% 68.00% \n=======================================\n Files 382 382 \n Lines 49527 49527 \n=======================================\n Hits 33679 33679 \n Misses 15848 15848 \n\nContinue to review full report at Codecov.\n\nLegend - Click here to learn more\n\u0394 = absolute (impact), \u00f8 = not affected, ? = missing data\nPowered by Codecov. Last update 69f6d94...29f6aa6. Read the comment docs.", + "createdAt": "2020-09-04T05:41:01Z", + "author": { + "login": "codecov" + }, + "authorAssociation": "NONE", + "editor": { + "login": "codecov" + }, + "databaseId": 686921371 + }, + { + "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. Stale pull requests will automatically be closed 30 days after being marked Stale", + "createdAt": "2022-04-12T02:35:37Z", + "author": { + "login": "pytorchbot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1095860944 + }, + { + "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. 
If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.", + "createdAt": "2022-06-11T04:40:16Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1152854802 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOKCmhXQ==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "triaged" + } + }, + { + "node": { + "name": "open source" + } + }, + { + "node": { + "name": "cla signed" + } + }, + { + "node": { + "name": "Stale" + } + } + ] + } + } + } + } + }, + "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOKCmhXQ== name=pytorch number=31093 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "Hi, @mingfeima @soumith @Jianhui-Li\nthis will improve the test coverage of mkldnn convolution, would you please review it?\nThe current code is forward only, do we need to cover backward, if yes, we can add backward.", + "createdAt": "2019-12-12T01:19:02Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 564806270 + }, + { + "bodyText": "@mingxiaoh, what is the value in testing DNNL as part of Pytorch validation for the Pytorch developers? Shouldn't having these tests run in DNNL validation be enough?", + "createdAt": "2019-12-12T01:28:32Z", + "author": { + "login": "vpirogov" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 564808528 + }, + { + "bodyText": "@vpirogov The main value is to serve as a blind test to DNNL. If DNNL adds these test to DNNL test sets, it lost the value as a blind test. The spirit of validation is to cross check.\n@gottbrath @gchanan The test was developed per the request of Pytorch team. 
Mingxiao made an effort to reduce the execution time to a few second but still with good coverage. Although the test today is focused on DNNL, it could be easily extended to be blind test for any conv implementation used in Pytorch.", + "createdAt": "2019-12-20T07:44:30Z", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 567826907 + }, + { + "bodyText": "@mruberry thanks for the comment. As for the chainer dependency, we import it is because we would like to use its testing function for pytest test cases combinations, other wise we need to write much more code to achieve same effect. So, can we use it?", + "createdAt": "2020-01-15T09:04:34Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 574563012 + }, + { + "bodyText": "@mingxiaoh You cannot import chainer. Looking at the code you should be able to achieve the same effect without it.", + "createdAt": "2020-01-16T17:59:46Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 575272358 + }, + { + "bodyText": "@mruberry ok, we will change it according to your requirement. Thanks", + "createdAt": "2020-02-10T00:59:34Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 583917522 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/31093\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 29f6aa6 (more details on the Dr. CI page):\n\nCommit 29f6aa6 was recently pushed. Waiting for builds...\n\nThis comment was automatically generated by Dr. CI (expand for details).Follow this link to opt-out of these comments for your Pull Requests.\nPlease report bugs/suggestions to the (internal) Dr. 
CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2020-05-14T08:04:30Z", + "author": { + "login": "dr-ci" + }, + "authorAssociation": "NONE", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 628466876 + }, + { + "bodyText": "@mruberry how about those cudnn UT error? we add check for it but it should be NV to fix cudnn bugs.", + "createdAt": "2020-05-18T05:34:11Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 629955767 + }, + { + "bodyText": "Hey @mingxiaoh! You're right, of course, that you shouldn't have to fix cuDNN bugs. Would you please:\n\nAssert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update.\nFile a new issue explaining the behavior and providing a short PyTorch program to reproduce the issue.\n\nThen we can ping NVIDIA on that issue.", + "createdAt": "2020-05-18T07:27:08Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 629997129 + }, + { + "bodyText": "about the suggestion 'Assert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update. ', if we only assert it and continue the following test, I guess users might always ignore them in later test. Anyway, any similar example case for reference?", + "createdAt": "2020-05-18T07:55:08Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 630010734 + }, + { + "bodyText": "In this recent PR https://github.com/pytorch/pytorch/pull/38505/files, for example, you can see that the construction of bool tensors wasn't working properly, so the test author cited the relevant issue and asserted that the incorrect behavior happened, as expected. 
You can also see how these lines are being removed by https://github.com/pytorch/pytorch/pull/38392/files, which fixes the issue.\nAnother common pattern is to use with self.assertRaises(RuntimeError/AssertionError/etc.):.", + "createdAt": "2020-05-18T08:02:13Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 630014823 + }, + { + "bodyText": "@mruberry the failed UT case is not introduced by our modification, how to handle this issue?", + "createdAt": "2020-05-20T01:59:13Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631187735 + }, + { + "bodyText": "@mingxiaoh You mean the failures on ROCm? You may ignore them. Be sure to re-request review when you're ready.", + "createdAt": "2020-05-20T02:12:58Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 631191425 + }, + { + "bodyText": "@mruberry we already skipped those ROCm errors, but there are stil somel error caused by the original code, they are not introduced by our modification.", + "createdAt": "2020-05-21T05:18:07Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631886529 + }, + { + "bodyText": "I understand. Let me know when you're ready for me to review.", + "createdAt": "2020-05-21T06:24:15Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 631908011 + }, + { + "bodyText": "@mruberry thanks, we are ready for review now.", + "createdAt": "2020-05-21T06:28:11Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631909442 + }, + { + "bodyText": "@mingxiaoh Great! 
I'll take a look ASAP.", + "createdAt": "2020-05-21T06:31:10Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 631910556 + }, + { + "bodyText": "@mruberry we just pull the latest code and updated the patch according to your comment, may you please help double check it? BTW, the new failed case in preci is not introduced by our modification.", + "createdAt": "2020-05-25T07:44:58Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 633430458 + }, + { + "bodyText": "@ailzhang would you please check the comment below? Thanks.\nIs there a reason why this TestConv2dExt is a new class instead a test inside TestNN?\n//comment: it is actually suggested by Tongzhou Wang in another thread before.\nAlthough this test sits in generic testing framework, it's actually comparing thnn/mkldnn/cudnn results specially. I feel it's better to make it truly generic so that it compares any device result with CPU result. Alternatively you can mark this test only run when torch.backends.mkldnn.is_available()=True\n//comment: but our goal is to compare the result with that of thnn. Anyway, if you insist, we can start to compare it with cpu.", + "createdAt": "2020-05-27T05:11:08Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 634432326 + }, + { + "bodyText": "Pruning reviewers. @ngimel, @VitalyFedyunin, this PR is looking pretty good from a test framework perspective. Would one of you like to review?", + "createdAt": "2020-05-27T09:58:42Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 634557563 + }, + { + "bodyText": "@mruberry Thanks, would you please help review it again. 
BTW: failed case is not introduced by our modification.", + "createdAt": "2020-05-28T10:26:32Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 635256214 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code", + "createdAt": "2020-06-02T08:00:01Z", + "author": { + "login": "1pikachu" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637364148 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.", + "createdAt": "2020-06-02T10:23:47Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 637444457 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.\n\n@mruberry thank you", + "createdAt": "2020-06-02T11:32:06Z", + "author": { + "login": "1pikachu" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637479226 + }, + { + "bodyText": "Improving test coverage of math libraries is certainly a good goal and this PR is moving towards it. I have some doubts about implementation decisions made, and about running this PR as part of regular pytorch CI.\nIf the primary goal of this PR is to test correctness of the convolution implementations in the vendor library, then it does not serve this purpose. The absolute majority of the 4000+ test cases come from group 1, where different kernel sizes/strides/dilations are used to produce the output of size 1x1. 
This can test whether pytorch correctly passes convolution parameters to the backends (although there are cheaper ways to do that), but as actual library correctness check it is almost useless - libraries use very different kernels depending in the input/output sizes, and tests with toy sizes like this don't invoke the real bread-and-butter kernels.\nAlso, if this test suite is meant as primary a means of testing vendor libraries (which is a good goal!) it does not have a place as a part of pytorch regular CI, and should be run when the corresponding vendor libraries are updated. I'd suggest moving this test out into a separate file (maybe even outside of torch/test directory) and have it as a part of library update/qualification process rather than regular CI.\nAlso, if the primary goal is to enable easier testing of vendor libraries correctness, perhaps we should rethink the mechanism of the generation of test cases. It should be easy to add a test case with a particular set of parameters that was found to be buggy. Also, running a cross-product of cases in a multi-dimensional space (as this PR does) is rarely an efficient way of getting a signal, some forms of random sampling usually provide a way to get better correctness signal why using less resources.\nAlso, when testing libraries it is important to test both forward and backward functions, whereas this PR does forward only. 
I'm openminded on whether convTransposed should be tested or not - if we are testing vendor libraries, then it's not necessary, convTransposed calls the same underlying functions, if we are testing pytorch, then it makes sense to test it separately because it takes different codepaths.", + "createdAt": "2020-06-02T21:56:33Z", + "author": { + "login": "ngimel" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637827507 + }, + { + "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.", + "createdAt": "2020-06-03T02:16:07Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637912105 + }, + { + "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.\n\nWe know this PR has been open for awhile and we respect that your time is valuable, but we want to make sure we're making the right change here, and I think @ngimel's comments reflect that and should not be too difficult to address. As I understand, her points are:\n\nThis is a good PR with an exciting idea. 
To let it run longer and test more cases maybe it should run outside the regular PyTorch CI.\nTo remedy this, let's create a test/math_libraries folder and put this test there: test/math_libaries/convolutions.py. Yes, this is different from our requests in the past, which is our mistake, but it should be an easy change.\nTo make the test more interesting it'd be good for the test cases to resemble convolutions used in practice. The current test cases seem like similar \"toy\" examples. Without time pressure we should be able to run larger, more computationally intensive convolutions.\nLet's change the test cases to include some practical convolutions, make it easy to add test cases, and think about how we might generate other interesting cases. (We should also test backwards once we have more time!)\n\nAnd I think these are good points. Maybe the PR doesn't create a new way to generate interesting convolutions to start and instead only runs a few representative convolutions, but @ngimel is positioning the work for success so that it's useful and we can continue to improve on it in the future.\nDoes that make sense?", + "createdAt": "2020-06-03T03:04:55Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 637924703 + }, + { + "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. 
Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.", + "createdAt": "2020-06-03T05:22:43Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 637960626 + }, + { + "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.\n\nI'm sorry, I don't think I've talked to @Jianhui-Li before. It's true that the team we expressed a concern about timing if the test was to be run in the CI initially, but I think now that we understand what the test is trying to do better we're not sure the CI is the best place for it. The PR was also closed after a lengthy period of inactivity, and we assumed it had simply been abandoned.\nDo you know who @Jianhui-Li spoke with about this issue originally? Maybe I can follow-up with them for more context.", + "createdAt": "2020-06-03T05:42:28Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 637967153 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. 
So, what we should do next?", + "createdAt": "2020-06-03T06:13:14Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637978356 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.", + "createdAt": "2020-06-03T20:34:05Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 638446723 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.\n\nLet me sync with Mingxiao and follow up with this. Thanks.", + "createdAt": "2020-06-03T20:44:44Z", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 638451670 + }, + { + "bodyText": "@mruberry would you please help review it again?", + "createdAt": "2020-07-02T14:09:23Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 653028208 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?", + "createdAt": "2020-07-06T20:15:04Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 654443242 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? 
If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks", + "createdAt": "2020-07-09T11:04:06Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 656062287 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.", + "createdAt": "2020-07-14T09:16:48Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 658071151 + }, + { + "bodyText": "super nit: renaming files to .json will make it more IDE friendly.", + "createdAt": "2020-07-14T23:38:37Z", + "author": { + "login": "VitalyFedyunin" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 658464685 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.\n\nCool! 
I took a look with @ngimel, once these issues are addressed I think we're good to go!", + "createdAt": "2020-07-16T05:17:29Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 659164401 + }, + { + "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.", + "createdAt": "2020-07-20T08:30:01Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 660884305 + }, + { + "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.\n\nUpdated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.", + "createdAt": "2020-07-22T20:26:42Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 662678464 + }, + { + "bodyText": "Updated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.\n@mruberry we have finished the modification according to your comment, would you please review it again? 
Thanks.", + "createdAt": "2020-07-23T10:24:26Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 662930687 + }, + { + "bodyText": "The code looks good, but I tried running the test suite and hit the following failures:\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 102, in test_conv2d_ext\n msg=msg\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float16, group:1, batchsize:22input channel:448, output channel:384, bias:False, padding:[1, 1], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File 
\"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 102, in test_conv2d_ext\n msg=msg\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float32, group:1, batchsize:22input channel:80, output channel:192, bias:False, padding:[0, 0], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n 
File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 106, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\nLooking at the first invalid convolution, for example, it's:\n {\n \"case_name\":\"masknet_p1:conv33\",\n \"mb\":1,\n \"g\":1,\n \"ic\":512,\n \"ih\":64,\n \"iw\":64,\n \"oc\":12,\n \"kh\":1,\n \"kw\":1,\n \"sh\":1,\n \"sw\":1,\n \"ph\":0,\n \"pw\":0,\n \"dh\":0,\n \"dw\":0,\n \"bias\":\"False\"\n },\n\nwhich has a dh and dw of zero, causing it to be added 
to invalid cases here:\ndh, dw = case['dh'], case['dw']\n has_bias = case['bias']\n if dh == 0 or dw == 0:\n invalid_cases.append(case_name)", + "createdAt": "2020-07-23T21:25:19Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": { + "login": "mruberry" + }, + "databaseId": 663240268 + }, + { + "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.", + "createdAt": "2020-07-27T12:43:44Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 664373079 + }, + { + "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.\n\nBefore I run these tests again, is an atol of 1e-2 needed for all types or just half? Also, how does 1e-2 compare to the values that are being compared?", + "createdAt": "2020-07-27T18:39:27Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 664569507 + }, + { + "bodyText": "@mruberry 1e-2 is experimental result, details see below, random means it might be failed sometimes.\n\n\n\natol,rtol\n1e-2,1e-2\n1e-2,1e-3\n1e-3,1e-2\n1e-3,1e-3\n1e-4,1e-3\n1e-3,1e-4\n1e-4,1e-4\n1e-4,1e-5\n1e-5,1e-4\n\n\n\n\nCuda float16\npass\npass\npass\npass\npass\nfail\nFail\nFail\nfail\n\n\nCuda float32\npass\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nfail", + "createdAt": "2020-07-31T03:33:27Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 666894774 + }, + { + "bodyText": "@mruberry would you please find time to review it again? 
Thanks.", + "createdAt": "2020-08-04T05:01:20Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 668380451 + }, + { + "bodyText": "@mruberry would you please find time to review it again? Thanks.\n\nI was just about to try and run this again locally but it looks like the files describing the convolutions are missing?", + "createdAt": "2020-08-07T03:49:44Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 670306210 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?", + "createdAt": "2020-08-07T05:00:20Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 670322557 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.", + "createdAt": "2020-08-07T16:06:41Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 670591170 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.\n\n@mruberry sorry, we add them now, would you please check it again? 
Thanks.", + "createdAt": "2020-08-13T10:40:11Z", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 673402901 + }, + { + "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = 
test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 
(__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:co
nv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.", + "createdAt": "2020-08-13T23:35:00Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 673760580 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOIapCfg==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + 
"author": { + "login": "malfet" + }, + "title": "Dummy change with lots of commits", + "body": "Draft PR with 100+ commits, to test mergebot ", + "headRefName": "malfet/pr-with-lots-of-commits", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "3067f2240afc7a29dc348000aa19eccbd9772303" + } + }, + { + "commit": { + "author": { + "user": { + "login": "andrewor14" + }, + "email": "andrewor@fb.com", + "name": "Andrew Or" + }, + "oid": "2f655b71f70c496c4e645f6cdb27d7bb7e825701" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "0c6dcaa7f58a19c42a530f4ee14bb6f0f03ca9fb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "cad11c563d41ebcffb1683fe1f1288b8157413b3" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "jwtan@fb.com", + "name": "Jiewen Tan" + }, + "oid": "4dfd0875a68d87fccb5ad0d81692db480043b86e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "2d37e74690582a4a26890e4c8b98f1f80e589c82" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "jwtan@fb.com", + "name": "Jiewen Tan" + }, + "oid": "d4aee60947e1a3ef23c7c42990621e0746fdd0a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "aac6204bf710beb5e50a383d426ae6222396335a" + } + }, + { + "commit": { + "author": { + "user": { + "login": 
"dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "4b0362cab884584c24f5834b3874f5f357f56b5d" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "7536df613cbc645a9e68e6a3b0a8450753260fd1" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "20a50cb966d28d7bf82924adf781cf72a01ef90e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "486387e8644afb46edff5aa5925b55c8119f67f0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "acb9d78b9b732d3667b881727e6ed9f92a8c549f" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "683bb7959a5b973f8470c081ad02e8fc508e784a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "a870cb40af65adf0b77d55f6b554d7093d284d7a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Krovatkin" + }, + "email": "korovaikon@gmail.com", + "name": "Nikolay Korovaiko" + }, + "oid": "70793b9f328ddf52cc86336104c3a064c8582ef4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "suo" + }, + "email": "suo@fb.com", + "name": "Michael Suo" + }, + "oid": "f70b31f62b1c5159eef2725484b175983517c88c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "04d3ec1db60defe1c6904bf77e9f8dfa87dc0b63" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "46b754a55b63e3168ad5854ad412c124934b675d" + } + }, + { + 
"commit": { + "author": { + "user": { + "login": "robieta" + }, + "email": "taylorrobie@fb.com", + "name": "Taylor Robie" + }, + "oid": "13df69e13ee571fdd716139419a00aec47ade7d6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "70642e911ec80a47cdbf4a50aac475c11aa129b6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "59bb7c39384bf3e0b284a037adef8b3caa53c1c4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "007cfb97b55d70ff63e1ed71d1a674638f847376" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "0a7b858a5af1393fa3cf2853f92eca0e1d408dde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "7917d789f0a523715041ade5177d271082628236" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kit1980" + }, + "email": "sdym@fb.com", + "name": "Sergii Dymchenko (Meta Employee)" + }, + "oid": "91eb6017f0fb8a1b29e8cb48fac93bc9709f73b3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "bd04dca5fabb0c2a51ac87063a515f256ef274fa" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "1f805a5defda7dabc49d0059edb9ccb06bc29352" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@fb.com", + "name": "Mike Ruberry" + }, + "oid": "4982c0a8db8f23d15ec4bfcbca4ce939afc04954" + } + }, + { + "commit": 
{ + "author": { + "user": { + "login": "pearu" + }, + "email": "pearu.peterson@gmail.com", + "name": "Pearu Peterson" + }, + "oid": "28502265cb5925cb7db8dcb2dd2334963092714a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "e03fcaedb1342e6d65c7f7f20243000938ba60b2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pritamdamania" + }, + "email": "pritam.damania@fb.com", + "name": "pritam" + }, + "oid": "efb28f5a1a5d18aa96bd668ab2ab5c651be359f3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "MagiaSN" + }, + "email": "magialiao@tencent.com", + "name": "magialiao" + }, + "oid": "52cc1b9994f861ebdd3908759ed1ab11cba1f8de" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "3cd99f23d1acd6a5bedf6f3b02be79d64350a5b6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + "name": "Andrew Gu" + }, + "oid": "b00502c634a5146f4d996bd90e84d317f049e7b0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "davidberard98" + }, + "email": "dberard@fb.com", + "name": "David Berard" + }, + "oid": "976eb7cee799dddfbe6a4122b249aaee1b6c8854" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "9608ab28744d5cae32f371490557b248c9549c66" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "4e119f0c39eb5ff0777f0e71561e6b633d85fb34" + } + }, + { + "commit": { + "author": { + "user": { + "login": "rohan-varma" + }, + "email": "rvarm1@fb.com", + "name": "Rohan Varma" + }, + "oid": "447580dc565f3660eddb2c996c6ed25b88338684" + } + }, + { + "commit": { + "author": { + "user": { + "login": 
"malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "2bc8f43e9233008ea23053fab87b83ab36fca5e3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "c13a8e891c3e3e714f60649ca1e3b082e090e9fe" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "fddc861b7ee473f57d3c2161e4618a2663a237e8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jiyuanzFB" + }, + "email": "jiyuanz@fb.com", + "name": "Jiyuan Zhang" + }, + "oid": "e2336dbc539d6c021720cbe43c92c9e4c8463299" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "26e2759d1ad59aac12168b74d1ca55e42ba9455c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "ad7aa914ee3b3d1252e31514f010ba96c40aae87" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "f113c5d78065aafbe7b1c0e611945bfe9f67b3c0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "a366fd01136292544b7862968ae92feba4b6d8fe" + } + }, + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "afeba0773749da5883c378a2e6ac066e1ce62ca0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "d306c99addc543908f64666baeecacbd0749f4a7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + "name": "Andrew Gu" + }, + "oid": 
"c2456ea658f41f64ea054a422edf22a9c977399f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + "name": "Andrew Gu" + }, + "oid": "a8b0a1b681c9fe41e0d553c962a5c93e81d92503" + } + }, + { + "commit": { + "author": { + "user": { + "login": "anjali411" + }, + "email": "chourdiaanjali123@gmail.com", + "name": "anjali411" + }, + "oid": "af761d9a5d058c9188f16589bae4f307d35185be" + } + }, + { + "commit": { + "author": { + "user": { + "login": "clee2000" + }, + "email": "csl@fb.com", + "name": "Catherine Lee" + }, + "oid": "beceb417baef35b15c2716e23178fb49f7fd6f9d" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "1516554e22136db89d0aeba43a1a1a987e995d68" + } + }, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "68eb1fa8374eff6cbdcf0be5e37ed6775d22e722" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "3c7bcb99b5c0c879c2610f427880b03881f82f38" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "38c1a2028090353e40a019c673c9ab16b39e4825" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "8091cbea2c95ed2c4c406b3c61547a27c6319bae" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "d81f59121969a47c8b2213a88e02cf9be0219be9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. 
Yang" + }, + "oid": "20d798b319cd107a767fe220f7a3027c18a1c844" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "eb35381a770b58c1cd41e935910cb4df2f3d8f14" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "e6498a657b9aa47546dcd92d1b4ffb2e1a50ebdb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "7f821382db5ad08efe5b09a145c606852b8a9272" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "995c0e11a97d854ff969962bd81d7341e46ecb07" + } + }, + { + "commit": { + "author": { + "user": { + "login": "davidberard98" + }, + "email": "dberard@fb.com", + "name": "David Berard" + }, + "oid": "28d6258e62c9fc361a18689877c962c69889dc23" + } + }, + { + "commit": { + "author": { + "user": { + "login": "HarborYuan" + }, + "email": "yuanhaobo@whu.edu.cn", + "name": "Haobo Yuan" + }, + "oid": "2350fad8391367ebf81c7236a2c883644b4ff622" + } + }, + { + "commit": { + "author": { + "user": { + "login": "zou3519" + }, + "email": "zou3519@gmail.com", + "name": "Richard Zou" + }, + "oid": "3f789c9ccecdd7e2e52269453646e992a68c6b92" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jeffdaily" + }, + "email": "jeff.daily@amd.com", + "name": "Jeff Daily" + }, + "oid": "20f79f610c1a3314da96d49515bbfbee9442e4f8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "5823958f047f3b71a5dc8c52a20eb8ae3291bd3e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": 
"a0b15c49ecf3844daf2c0dcaef44f0214259db20" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "4afc38c25ca2ca126ba4987a419a58a5c572223b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "b606f58d4a36683fbe0a7d02adfdde7d5cc694c2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "2d61b4d630f6482a6c3cc7437091fad6d27c347e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "george-qi" + }, + "email": "georgeqi94@gmail.com", + "name": "George Qi" + }, + "oid": "bc5384c47036a6cda94129f3e2f9e43c43393698" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "60fc3277634365b64465712b13db2acb76d6c890" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "1b8762e95bc38d1847fe99ed3230546c8b800bfd" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jerryzh168" + }, + "email": "jerryzh168@gmail.com", + "name": "Jerry Zhang" + }, + "oid": "6acf60f95f59ecbc6e8ce830dea0abba7d3ec763" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ysiraichi" + }, + "email": "yukio.siraichi@gmail.com", + "name": "Yukio Siraichi" + }, + "oid": "8fb0276561fdd530c5a06ea195e930e0584f8705" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "1da7aed95a8700406671425eac1e4bbc2c7a24b5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "thiagocrepaldi" + }, + "email": "thiago.crepaldi@microsoft.com", + "name": "Thiago Crepaldi" + }, + "oid": 
"83208e7dee4503c1bee1df9f6632794694dffa01" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "1a46cf08dcd3d3564604c17b2c02d7e4eb45a7ff" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "b7f9b6689445f826c83694652fea5f7cfc7070d7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "fatcat-z" + }, + "email": "jiz@microsoft.com", + "name": "Jay Zhang" + }, + "oid": "f273961c1696b156e35f8c76f7ad37934031050d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pavithranrao" + }, + "email": "pavithran@fb.com", + "name": "Pavithran Ramachandran" + }, + "oid": "eb410a51fcbc716873fd80a970eb932d4aaaea61" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "7dbb12cdc02332fa64264ed0df576511a5070d7e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "43675665fa6b5154de8b25125dd03d7be35c884f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "6c4d23c402c413667463770d9a2fa801f493d3c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "cf3778a35129a40dee14366515201b7ed2c0f346" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "9d00a051373cb81f79cb6375942cf3ec9fff2fe6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": 
"pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "1eae67cf404aa8dffb80b8e85180f943878d52a6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "ce0e69dcda0fe41a6e964d6ac70ce8016979c71a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "swolchok" + }, + "email": "swolchok@fb.com", + "name": "Scott Wolchok" + }, + "oid": "6faba554f6e49777f24911928edb3061b6ed0e3d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "IvanYashchuk" + }, + "email": "ivan.yashchuk@aalto.fi", + "name": "Ivan Yashchuk" + }, + "oid": "d1d0e03f57a359f8f95331f9a34b8bed3e7cc845" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Chillee" + }, + "email": "chilli@fb.com", + "name": "Horace He" + }, + "oid": "bb46bd9233a9fc631802a902cb48a4c13c2722ca" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mehtanirav" + }, + "email": "niravmehta@fb.com", + "name": "Nirav Mehta" + }, + "oid": "3b1007fe4be12e483f2620fbac67cae42e703efc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mehtanirav" + }, + "email": "niravmehta@fb.com", + "name": "Nirav Mehta" + }, + "oid": "b4b65228dd0c109f5fdf17c7d9e56f60a98e398b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "d629e300705196d3ae0bac5ed983b197101fa2ee" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bigfootjon" + }, + "email": "jonjanzen@fb.com", + "name": "Jon Janzen" + }, + "oid": "52754b9e515f378f8476ad44d75b0a692bad8cde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "samdow" + }, + "email": "samdow@fb.com", + "name": "samdow" + }, + "oid": "128c3ad747093f4970329a82c7c4720420faeff2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "arindamroy-eng" + }, + "email": 
"61168652+arindamroy-eng@users.noreply.github.com", + "name": "arindamroy-eng" + }, + "oid": "2a0bda7d32a5bcc9827f7254a7b77cceb16ba973" + } + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + }, + "totalCount": 131 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNRg4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693698" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRAI=" + }, + { + "node": { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693712" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRBA=" + }, + { + "node": { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693725" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRB0=" + }, + { + "node": { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": 
"https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693741" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRC0=" + }, + { + "node": { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693761" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsREE=" + }, + { + "node": { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693774" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRE4=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192463/jobs/3232430975" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNR-Y=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694412" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRsw=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461134" + }, + { + 
"name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461211" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461301" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461386" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461521" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461634" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461717" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuN84s=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694417" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRtE=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460797" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460951" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461088" + }, + { + "name": 
"deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461294" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461410" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461543" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461628" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461719" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461789" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461869" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461946" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462044" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462112" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462244" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462360" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462432" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462521" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462621" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462683" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462738" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545510" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545571" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547522" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547612" 
+ }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547714" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547764" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547824" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547869" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547909" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547973" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553452" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553558" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553605" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553650" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563716" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563763" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582650" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582703" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582741" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232590204" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608872" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608976" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637097" 
+ }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637199" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637259" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232639932" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687012" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687074" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785088" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785153" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVD9M=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694439" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRuc=" + } + ], + "pageInfo": { + "hasNextPage": false + } + }, + "status": null, + "pushedDate": "2022-04-20T17:10:41Z", + "oid": "5696e8357cf38f852ef3d680381513e26f202371" + } + } + ] + }, 
+ "changedFiles": 348, + "files": { + "nodes": [ + { + "path": ".circleci/cimodel/data/pytorch_build_data.py" + }, + { + "path": ".circleci/cimodel/data/pytorch_build_definitions.py" + }, + { + "path": ".circleci/scripts/cpp_doc_push_script.sh" + }, + { + "path": ".circleci/scripts/python_doc_push_script.sh" + }, + { + "path": ".github/actions/checkout-pytorch/action.yml" + }, + { + "path": ".github/merge_rules.json" + }, + { + "path": ".github/scripts/gitutils.py" + }, + { + "path": ".github/scripts/gql_mocks.json" + }, + { + "path": ".github/scripts/trymerge.py" + }, + { + "path": ".github/workflows/_bazel-build-test.yml" + }, + { + "path": ".github/workflows/_linux-build.yml" + }, + { + "path": ".github/workflows/_linux-test.yml" + }, + { + "path": ".github/workflows/_mac-test.yml" + }, + { + "path": ".github/workflows/_rocm-test.yml" + }, + { + "path": ".github/workflows/_win-test.yml" + }, + { + "path": ".github/workflows/buck_build_test.yml" + }, + { + "path": ".github/workflows/lint.yml" + }, + { + "path": ".github/workflows/periodic.yml" + }, + { + "path": ".github/workflows/pull.yml" + }, + { + "path": ".github/workflows/trunk.yml" + }, + { + "path": ".jenkins/pytorch/macos-test.sh" + }, + { + "path": ".jenkins/pytorch/test.sh" + }, + { + "path": ".jenkins/pytorch/win-test.sh" + }, + { + "path": ".lintrunner.toml" + }, + { + "path": "BUILD.bazel" + }, + { + "path": "CODEOWNERS" + }, + { + "path": "README.md" + }, + { + "path": "aten/src/ATen/BatchingRegistrations.cpp" + }, + { + "path": "aten/src/ATen/Dispatch.h" + }, + { + "path": "aten/src/ATen/ExpandUtils.h" + }, + { + "path": "aten/src/ATen/FunctionalInverses.cpp" + }, + { + "path": "aten/src/ATen/FunctionalStorageImpl.cpp" + }, + { + "path": "aten/src/ATen/FunctionalStorageImpl.h" + }, + { + "path": "aten/src/ATen/FunctionalTensorWrapper.cpp" + }, + { + "path": "aten/src/ATen/FunctionalTensorWrapper.h" + }, + { + "path": "aten/src/ATen/FunctionalizeFallbackKernel.cpp" + }, + { + "path": 
"aten/src/ATen/NestedTensorImpl.cpp" + }, + { + "path": "aten/src/ATen/OpMathType.h" + }, + { + "path": "aten/src/ATen/SparseCsrTensorUtils.h" + }, + { + "path": "aten/src/ATen/ThreadLocalState.cpp" + }, + { + "path": "aten/src/ATen/ThreadLocalState.h" + }, + { + "path": "aten/src/ATen/autocast_mode.cpp" + }, + { + "path": "aten/src/ATen/autocast_mode.h" + }, + { + "path": "aten/src/ATen/core/SymIntArrayRef.cpp" + }, + { + "path": "aten/src/ATen/core/SymIntArrayRef.h" + }, + { + "path": "aten/src/ATen/core/TensorBase.h" + }, + { + "path": "aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h" + }, + { + "path": "aten/src/ATen/core/dispatch/Dispatcher.h" + }, + { + "path": "aten/src/ATen/core/interned_strings.h" + }, + { + "path": "aten/src/ATen/core/ivalue.cpp" + }, + { + "path": "aten/src/ATen/core/ivalue.h" + }, + { + "path": "aten/src/ATen/core/ivalue_inl.h" + }, + { + "path": "aten/src/ATen/core/jit_type.h" + }, + { + "path": "aten/src/ATen/core/jit_type_base.h" + }, + { + "path": "aten/src/ATen/core/type.cpp" + }, + { + "path": "aten/src/ATen/cuda/CUDASparse.h" + }, + { + "path": "aten/src/ATen/cuda/llvm_complex.cpp" + }, + { + "path": "aten/src/ATen/cuda/llvm_jit_strings.h" + }, + { + "path": "aten/src/ATen/native/Blas.cpp" + }, + { + "path": "aten/src/ATen/native/Itertools.cpp" + }, + { + "path": "aten/src/ATen/native/LinearAlgebra.cpp" + }, + { + "path": "aten/src/ATen/native/SoftMax.cpp" + }, + { + "path": "aten/src/ATen/native/TensorConversions.cpp" + }, + { + "path": "aten/src/ATen/native/TensorShape.cpp" + }, + { + "path": "aten/src/ATen/native/TensorShape.h" + }, + { + "path": "aten/src/ATen/native/Unique.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu" + }, + { + "path": "aten/src/ATen/native/cuda/CUDAJitLoops.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/JitLoops.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/Lerp.cu" + }, + { + "path": "aten/src/ATen/native/cuda/PersistentSoftmax.cuh" + }, + { 
+ "path": "aten/src/ATen/native/cuda/SoftMax.cu" + }, + { + "path": "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu" + }, + { + "path": "aten/src/ATen/native/cuda/Unique.cu" + }, + { + "path": "aten/src/ATen/native/cuda/jit_utils.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/jit_utils.h" + }, + { + "path": "aten/src/ATen/native/native_functions.yaml" + }, + { + "path": "aten/src/ATen/native/nested/NestedTensorMath.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/Linear.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/utils.h" + }, + { + "path": "aten/src/ATen/native/sparse/SparseCsrTensor.cpp" + }, + { + "path": "aten/src/ATen/native/ts_native_functions.yaml" + }, + { + "path": "aten/src/ATen/record_function.cpp" + }, + { + "path": "aten/src/ATen/record_function.h" + }, + { + "path": "aten/src/ATen/templates/Operators.h" + }, + { + "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp" + }, + { + "path": "aten/src/ATen/test/basic.cpp" + }, + { + "path": "aten/src/ATen/test/vmap_test.cpp" + }, + { + "path": "binaries/record_function_benchmark.cc" + }, + { + "path": "c10/core/DispatchKey.cpp" + }, + { + "path": "c10/core/DispatchKey.h" + }, + { + "path": "c10/core/DispatchKeySet.h" + }, + { + "path": "c10/test/core/DispatchKeySet_test.cpp" + }, + { + "path": "c10/util/ArrayRef.h" + }, + { + "path": "caffe2/core/tensor.h" + }, + { + "path": "docs/source/conf.py" + }, + { + "path": "docs/source/fx.rst" + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + } + }, + "reviews": { + "nodes": [], + "pageInfo": { + "startCursor": null, + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any 
of:zou3519,abhikrish,mehtanirav,wconstab,lc0, ...", + "createdAt": "2022-04-20T17:26:18Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104215370 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet", + "createdAt": "2022-04-20T17:31:26Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104220908 + }, + { + "bodyText": "@pytorchbot merge this", + "createdAt": "2022-04-20T19:30:50Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104378397 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet\nRaised by https://github.com/pytorch/pytorch/actions/runs/2197877090", + "createdAt": "2022-04-20T19:32:10Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104379712 + }, + { + "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. 
If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.", + "createdAt": "2022-06-20T16:44:05Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1160658699 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQdD9Sg==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "cla signed" + } + }, + { + "node": { + "name": "Stale" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=76123 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "kumpera" + }, + "title": "Introduce distributed checkpoint with ShardedTensor.", + "body": "Co-authored-by: Wen Zhang \r\nCo-authored-by: Yifu Wang \r\n\r\n", + "headRefName": "st_checkpoint", + "headRepository": { + "nameWithOwner": "kumpera/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "6bf248bc20a71f248064b795f38276326fe43aae" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "10f84fb90bf02d7062e565ebf2c1da6352b64db7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747" + } + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + }, + "totalCount": 3 + }, + "commits": { + "nodes": [ + { + 
"commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS2l4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755666" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSmtI=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063614/jobs/3379894109" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2r3Q=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755785" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0k=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894107" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894332" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894444" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894520" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894567" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894616" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894672" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2shU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755786" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0o=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902301" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902363" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902507" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902560" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902579" 
+ }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902603" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902637" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902685" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902740" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902761" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902794" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902874" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903006" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903111" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903193" + }, + { + "name": "linux-xenial-py3.7-gcc7 / 
build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903284" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903357" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903446" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903512" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903546" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944655" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944695" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946308" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946337" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946359" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946391" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946423" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946453" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946496" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946529" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950041" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950137" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950165" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950192" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950646" + }, + { + "name": 
"linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951202" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951230" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963877" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963928" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963976" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379964018" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379966372" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996173" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996218" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": 
"SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379997861" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998374" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998397" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998422" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998441" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3380042106" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd5yuY=", + "hasNextPage": true + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755806" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm14=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419477" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419699" + }, 
+ { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419923" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419992" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420129" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420208" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420309" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS3SE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363240" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNGg=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796862/jobs/3387419465" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS1-o=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363271" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNIc=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-bionic-rocm5.1-py3.7 / 
build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387419999" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420164" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420316" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420477" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420675" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420934" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421278" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421672" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421888" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421982" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422191" + }, + { + "name": 
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422303" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422476" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422715" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422963" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423092" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423234" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423421" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423622" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423739" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387545789" + }, + { + "name": 
"linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546032" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546119" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553028" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553144" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553251" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553438" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553556" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553668" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554002" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554098" + }, + { 
+ "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387558927" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559016" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559071" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559139" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563803" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563894" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580868" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580936" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580993" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387581053" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387592286" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387631950" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387632035" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649916" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649974" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650084" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650151" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650373" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387753429" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgaCXo=", + "hasNextPage": true + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363300" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNKQ=" + } + ], + "pageInfo": { + "hasNextPage": false + } + }, + "status": null, + "pushedDate": "2022-05-05T00:34:26Z", + "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747" + } + } + ] + }, + "changedFiles": 11, + "files": { + "nodes": [ + { + "path": "test/distributed/_shard/checkpoint/test_checkpoint.py" + }, + { + "path": "test/distributed/_shard/checkpoint/test_file_system_checkpoint.py" + }, + { + "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/__init__.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/filesystem.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/metadata.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/resharding.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/state_dict_loader.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/state_dict_saver.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/storage.py" + }, + { + "path": "torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py" + } + ], + "pageInfo": { + "endCursor": "MTE", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + 
"author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wanchaol" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": 
"COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "DISMISSED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": 
"COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + 
"login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0=", + "hasPreviousPage": true + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "createdAt": "2022-05-05T12:35:49Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118495479 + }, + { + "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "createdAt": "2022-05-05T12:53:15Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118511287 + }, + { + "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "createdAt": "2022-05-05T15:00:08Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118662274 + }, + { + 
"bodyText": "Merge failed due to Can't fetch all PR reviews Raised by https://github.com/pytorch/pytorch/actions/runs/2275691136\n\n@osalpekar @malfet This is failing because there are 109 review comments on this PR but we only fetch the first 100. This could be solved with a similar concept as how we fetch more comments/check_runs.", + "createdAt": "2022-05-05T15:20:46Z", + "author": { + "login": "janeyx99" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118689010 + }, + { + "bodyText": "On a side note, has the test_fsdp_clip_grad_norm_norm_type_2_0_nested_fsdp_False_cpu_offload_CPUOffload failure on the distributed test first shard of this PR been addressed?", + "createdAt": "2022-05-05T15:24:08Z", + "author": { + "login": "janeyx99" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118693497 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQqri9w==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "oncall: distributed" + } + }, + { + "node": { + "name": "cla signed" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "coolteemf" + }, + "title": "Optimize grid sample 3d", + "body": "Fixes #71415\r\nI have implemented the changes that replicate what @to-mi did in this [PR](https://github.com/pytorch/pytorch/pull/65986#issue-1012959443) for the 3D case :\r\n\r\n> Fixes #64977\r\n> \r\n> Avoids creating a tensor for and calculating `input` gradient if it's not needed in the backward pass of `grid_sample` (2d case, native CPU & CUDA kernels). Especially the tensor creation seemed time consuming (see #64977).\r\n> \r\n> Brief description of the changes:\r\n> \r\n> * I have tried to go with rather minimal changes. 
It would probably be possible to make a more elegant version with a bit larger refactoring (or possibly with better understanding of PyTorch internals and C++ functionalities).\r\n> \r\n> * Changed the `native_functions.yaml` and `derivatives.yaml` so that the gradient input mask is passed to the functions.\r\n> \r\n> * Changed the CPU kernels:\r\n> (1) added `bool input_requires_grad` template parameter to the `backward` function,\r\n> (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n> (3) feed in `TensorAccessor* gInp_slice_ptr` instead of `TensorAccessor& gInp_slice` so that I can pass a `nullptr` in case gradient for `input` is not requested. (A bit inelegant perhaps, but allows to keep one signature for `backward` function and not require breaking it to smaller pieces. Perhaps there's a more elegant way to achieve this?)\r\n> \r\n> * Changed CUDA kernel:\r\n> (1) added ~`bool input_requires_grad` template parameter~ `const bool input_requires_grad` argument to the `backward` function,\r\n> (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n> (3) feed in `TensorInfo()` instead of `getTensorInfo(grad_input)` in case gradient for `input` is not requested.\r\n> \r\n> * Modified tests in `test/test_nn.py` so that they run also cases with no `input` gradient needed.\r\n> \r\n> * Have not touched the CPU fallback kernel.\r\n\r\nNote: the changes number (3) are N/A in this case.\r\n\r\n", + "headRefName": "optimize_grid_sample_3d", + "headRepository": { + "nameWithOwner": "coolteemf/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": 
"e0b0d1e695aeddceaf265da602c4704592053e9e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "563ec73747ad53b63b36736c47c4342f962c2a09" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "51abe41a132d9dd5b1c0551bdca902aacc028ff8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "be9898205992034a00e8ace8a55c2ecdcee2c2f8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "2929c60b64384c2deae0f7dea8bab94ad4bc9ec8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "9241b737e7e2b257905cc74ad9c50b737d7f9d0a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "64d6b795d0636928a8aa2fd3da01302fb5f5f7af" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "4503577e53760a0006f1e80ca6bfe04d2be90470" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "b16f4b11ffbbbf2ca2098f9702af4ef6b6fc5e1f" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "7ffc23368a604afdc92d2818747f730ce31a2bb5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "b85292604b9ad6c31706b76b5a5498c4f6d94309" + } + }, + { + "commit": { + "author": { + 
"user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "9d81d7bae8ad91aaa24b3ceab83e3138894dbc69" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "e79f6a2202512b294c55bf4bfb2e0524fafd4c48" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "f683e8aec7aea76097a264eec01511e704c31154" + } + }, + { + "commit": { + "author": { + "user": { + "login": "coolteemf" + }, + "email": "67541941+coolteemf@users.noreply.github.com", + "name": "Fran\u00e7ois Lecomte" + }, + "oid": "b932e9e286c22aaf352375186df851ef060b295a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22" + } + } + ], + "pageInfo": { + "endCursor": "MTY", + "hasNextPage": false + }, + "totalCount": 16 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGYqY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801320" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_T6g=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663109808" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214802" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214856" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIob0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801849" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubk=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754064/jobs/2663109676" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1E=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801852" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubw=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663109684" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401083" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": 
"SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401143" + }, + { + "name": "test (distributed, 1, 1, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401186" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwMsZY=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801853" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub0=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663109680" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995756" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995819" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995900" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwZbzg=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801855" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub8=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": 
"mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109683" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109827" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109962" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110044" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110132" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110233" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110320" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110461" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110575" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGbAQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801856" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcA=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663109804" + }, + { + "name": "test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233675" + }, + { + "name": "test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233731" + }, + { + "name": "test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233805" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwJC4U=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801857" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcE=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754076/jobs/2663109810" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ_w=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801862" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcY=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663109777" + }, + { + "name": "test 
(backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201383" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201458" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201512" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201580" + }, + { + "name": "test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201672" + }, + { + "name": "test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201839" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIWu4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801866" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uco=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754079/jobs/2663109681" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1k=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801869" + }, 
+ "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uc0=" + } + ], + "pageInfo": { + "hasNextPage": true + } + }, + "status": { + "contexts": [ + { + "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017798?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017799?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017816?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017800?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + } + ] + }, + "pushedDate": "2022-02-23T10:39:30Z", + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22" + } + } + ] + }, + "changedFiles": 9, + "files": { + "nodes": [ + { + "path": "aten/src/ATen/native/GridSampler.cpp" + }, + { + "path": "aten/src/ATen/native/cpu/GridSamplerKernel.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.cu" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.h" + }, + { + "path": "aten/src/ATen/native/native_functions.yaml" + }, + { + "path": "test/forward_backward_compatibility/check_forward_backward_compatibility.py" + }, + { + "path": "test/test_nn.py" + }, + { + "path": "tools/autograd/derivatives.yaml" + } + ], + "pageInfo": { + "endCursor": "OQ", + 
"hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "albanD" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMS0yNVQwODoyODoxMC0wODowMLkyMDIyLTAxLTI1VDA3OjU0OjA1LTA4OjAwzjNooqI=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by https://github.com/pytorch/pytorch/actions/runs/1887945630", + "createdAt": "2022-02-23T14:55:36Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048868910 + }, + { + "bodyText": "Thanks for the update! 
The windows failure is not your fault, you can ignore it!\n\nThank you very much for all of your feedback and sorry for the delay !", + "createdAt": "2022-02-23T16:44:36Z", + "author": { + "login": "coolteemf" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1048983572 + }, + { + "bodyText": "@coolteemf can you please send either me or @albanD an email? (or I can send you and invite to collab on private repo)", + "createdAt": "2022-02-23T17:49:55Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1049048119 + }, + { + "bodyText": "@pytorchbot merge this please", + "createdAt": "2022-02-23T19:23:55Z", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1049131992 + }, + { + "bodyText": "Hey @coolteemf.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-02-23T19:26:51Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1049134520 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOPoR4Lg==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "triaged" + } + }, + { + "node": { + "name": "open source" + } + }, + { + "node": { + "name": "cla signed" + } + }, + { + "node": { + "name": "release notes: nn" + } + }, + { + "node": { + "name": "topic: performance" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=75095 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "mruberry" + }, + "title": "Initial prims, references, and test architecture for them", + "body": "This PR adds an initial set of experimental primitive operations and Python references that reimplement existing PyTorch operations using them. See https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-0/577 for additional context.\r\n\r\nThe following experimental primitives are added:\r\n\r\n- Elementwise unary prims -- abs, acos, acosh, asin, atan, cos, cosh, bessel_i0e, bessel_i1e, cbrt, ceil, digamma, erf, erf_inv, erfc, exp, expm1, floor, igamma, igammac, is_finite, lgamma, log, log1p, neg, reciprocal, round, sign, sinh, sqrt, square, tan. 
\r\n- Elementwise binary prims -- add, atan2, bitwise_and, bitwise_not, bitwise_or, bitwise_xor, div, eq, ge, gt, le, lt, max, min, mul, ne, nextafter, pow, rsqrt, shift_left, shift_right_arithmetic\r\n- View prims -- brodcast_in_dim, collapse_view, split_dim, squeeze\r\n- Shape prims -- collapse, concatenate, reshape\r\n- Conditional prims -- select\r\n- Data conversion & movement prims -- convert_element_type, device_put\r\n- Inplace prims -- copy_to, resize\r\n\r\nThese primitives do not add any new functionality to PyTorch, but are intended to be the semantic building blocks for reference operators. We have tried to make them consistent with the operations in [jax.lax](https://jax.readthedocs.io/en/latest/jax.lax.html) where possible (because PyTorch prefers being consistent with other frameworks), although there are key differences between these prims and operations in jax.lax. Most notably is that these prims model view semantics and inplace operations.\r\n\r\nIn addition to these primitives the following elementwise binary Python references are added:\r\n\r\n- Elementwise binary Python references -- add, atan2, bitwise_and, bitwise_left_shift, bitwise_or, bitwise_right_shift, bitwise_xor, eq, float_power, ge, gt, le, lt, maximum, minimum, mul, ne, nextafter, pow, sub, true_divide\r\n- Conditional Python references - where\r\n- Data conversion & movement references - copy_to\r\n\r\nA Python reference implements the same behavior as its corresponding PyTorch operator (excepting slight numerical differences, bug fixes, and in some cases additional features). \r\n\r\nThe start of an OpInfo-based test architecture for these references is also included in this PR. A new list, `python_ref_db`, is added to `common_methods_invocations.py`. 
This list introduces the new `ElementwiseBinaryPythonRefInfo`, which inherits input arguments from the original operators' OpInfo, allows them to be overridden, and then constructs the OpInfo for the Python reference using the (potentially modified) arguments. OpInfo-based tests can opt-into testing references by including this new list in the Sequence passed to the `@ops` decorator. \r\n\r\ncc @ngimel @csarofeen @kevinstephano @Lezcano ", + "headRefName": "prims_and_references", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "a790467c650be92775103cde5e866c90b56f5376" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "bd6fcf50692e208ebecdc2eaa517a2bfcdcd35cf" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "4a119c8f21529fe1375e7e8789b91f41a3df80c5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "ea6750dc34d66be759fdfe84b09fb0e23ee59c79" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "2eef8a55fe0227e1921b51bf1f56f9d0a29b49ac" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "b886ed6c20dd1785fd31ed6fa6a8c5b6d0d0b16c" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": 
"9ad9b63d09aa4f7a8549bcf1d88ea4ff0674299c" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "63fdd580118477416ae160e0670ae722ea248090" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "0ccf7dc292af1d40d0a094eb2b2fb0c7ab4ccc70" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "e8a8a4d1fbe35f20eb88e1a43cf5a653883638e5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "186634dfdd25645c05b58a212f9e8d77c4125fc0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "f5b4741312b5c42a79f6c8a1d3930b79db38ed8f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "23d50391bb0fd12111fd3171591c4235ffb2fc1a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "bac9d45422d58f513b60b4b854441cfdc253d4c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "13240ae0b4a0332c3167b65ac026a3172da90cb7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "1ee34468cb1db3dc6cbae204669f4fec20e2a466" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. 
Yang" + }, + "oid": "561d132bc686d00e8911f7feb3da5901b2bdc574" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "ac42bedc84b7c96256376ad09917263bb020b2c3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "7f7d5ba40a0b5e10526d90b018b30b54673d12d8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "37a6b4a8b1adb712d5777c7c3479866c27fb3c4e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "65b613868c44e519c1777af79b9fd3498c5a7e58" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "442c405e9da0d66744ef03e379224c41eedf5b57" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "031ac49ae9c192989385986b6707fa781e3229e0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "9a6c3b00039c0c985c1c9cb59490012d1c0b38ba" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "d5c30e408af1889b90012d2e09f6ec3cda333bcb" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "db355d55655bb252a699cd532441bb98e52b98d5" + } + } + ], + "pageInfo": { + "endCursor": "MjY", + "hasNextPage": false + }, + "totalCount": 26 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + 
"databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6ux14=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454954" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2o=" + }, + { + "node": { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454956" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2w=" + }, + { + "node": { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454965" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3U=" + }, + { + "node": { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454970" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3o=" + }, + { + "node": { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + 
"endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454974" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC34=" + }, + { + "node": { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454977" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC4E=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622865/jobs/3270915028" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-c8=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455322" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDNo=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915027" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915071" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915141" + }, + { + "name": "Test collect_env 
(with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915194" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915229" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915283" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915321" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-zM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455334" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDOY=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927344" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927442" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927507" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927567" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927674" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927727" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927802" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927853" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927948" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927996" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928061" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928116" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928198" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928256" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928291" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928317" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928338" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928367" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928410" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928445" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991071" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991125" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991162" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991195" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991233" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991261" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991305" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991349" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996024" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996068" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996092" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996505" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270998987" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270999027" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006886" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006941" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018097" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018135" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018162" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271021143" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034041" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034072" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271048218" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049553" + }, + { + "name": 
"linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049587" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049616" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068293" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068336" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149276" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149321" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6jVK8=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455360" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDQA=" + } + ], + "pageInfo": { + "hasNextPage": false + } + }, + "status": null, + "pushedDate": "2022-04-25T02:30:31Z", + "oid": "db355d55655bb252a699cd532441bb98e52b98d5" + } + } + ] + }, + "changedFiles": 5, + "files": { + "nodes": [ + { + "path": "test/test_ops.py" + }, + { + "path": "torch/_prims/__init__.py" + }, + { + "path": "torch/_prims/utils.py" + }, + { + "path": "torch/_refs/__init__.py" + }, + { + "path": 
"torch/testing/_internal/common_methods_invocations.py" + } + ], + "pageInfo": { + "endCursor": "NQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zou3519" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "peterbell10" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + 
"login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + 
"author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": 
"COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0wNlQxMjo1NjoyNC0wNzowMLkyMDIyLTA0LTA2VDA4OjQwOjM4LTA3OjAwzjenO6Y=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Ref implementations by themselves can handle any shapes (and broadcast ops by themselves don't bake in any shapes). The question is can we decide if a particular trace is applicable for a different input, but that depends on the tracing technology and what we are caching on, so out of scope for initial PR.", + "createdAt": "2022-04-21T19:00:28Z", + "author": { + "login": "ngimel" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1105643418 + }, + { + "bodyText": "@pytorchbot merge this please", + "createdAt": "2022-04-25T04:42:29Z", + "author": { + "login": "mruberry" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1108072887 + }, + { + "bodyText": "Merge failed due to 'mruberry'\nRaised by https://github.com/pytorch/pytorch/actions/runs/2218044244", + "createdAt": "2022-04-25T04:43:54Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108073536 + }, + { + "bodyText": "@mruberry has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "createdAt": "2022-04-25T04:51:11Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108075965 + }, + { + "bodyText": "Hey @mruberry.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. 
The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-04-25T09:57:56Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1108351107 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQebHmg==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "cla signed" + } + }, + { + "node": { + "name": "topic: not user facing" + } + }, + { + "node": { + "name": "module: primTorch" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=77700 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "kit1980" + }, + "title": "Move pull linux-docs job to Ubuntu 20.04", + "body": "", + "headRefName": "sdym/pull-xenial-focal-linux-docs", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "kit1980" + }, + "email": "sdym@fb.com", + "name": "Sergii Dymchenko" + }, + "oid": "81261599614423baa17df72300b8e109677b6799" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": 
"Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNmNqE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147714" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuMI=" + }, + { + "node": { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147726" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuM4=" + }, + { + "node": { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147733" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuNU=" + }, + { + "node": { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147746" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuOI=" + }, + { + "node": { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": 
"https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147762" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuPI=" + }, + { + "node": { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147780" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuQQ=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528127876" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128023" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128196" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128519" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128575" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128663" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128857" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYVY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148336" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzA=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867843/jobs/3528127882" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdXEg=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148344" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzg=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "docker-builds" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "docker-build (pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127883" + }, + { + "name": "docker-build (pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127945" + }, + { + "name": "docker-build (pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128001" + }, + { + "name": "docker-build (pytorch-linux-bionic-py3.7-clang9)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128067" + }, + { + "name": "docker-build (pytorch-linux-bionic-rocm5.0-py3.7)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128124" + }, + { + "name": "docker-build (pytorch-linux-bionic-rocm5.1-py3.7)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128191" + }, + { + "name": "docker-build (pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128259" + }, + { + "name": "docker-build (pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128321" + }, + { + "name": "docker-build (pytorch-linux-xenial-py3-clang5-android-ndk-r19c)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128365" + }, + { + "name": "docker-build (pytorch-linux-xenial-py3-clang5-asan)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128446" + }, + { + "name": "docker-build (pytorch-linux-xenial-py3-clang7-asan)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128507" + }, + { + "name": "docker-build (pytorch-linux-xenial-py3-clang7-onnx)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128563" + }, + { + "name": "docker-build (pytorch-linux-xenial-py3.7-gcc5.4)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128639" + }, + { + "name": "docker-build (pytorch-linux-xenial-py3.7-gcc7)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128687" + }, + { + "name": "docker-build (pytorch-linux-focal-py3.7-gcc7)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128741" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYLI=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148352" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu0A=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150762" + }, + { + "name": "linux-focal-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150903" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151086" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151258" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151511" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151776" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151896" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152014" + }, + { + "name": 
"linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152139" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152216" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152378" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152516" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152599" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152723" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152802" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152913" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152969" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153005" + }, + { + "name": 
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153062" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153125" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153207" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242483" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242528" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245875" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245914" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245964" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528246008" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528248520" + }, 
+ { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255086" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255128" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274064" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274097" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274133" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274173" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274209" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528277014" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528308958" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 4, 
linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309747" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309810" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309837" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309864" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309895" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309925" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310044" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310101" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384337" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": 
"SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384379" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384408" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384441" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384471" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNi1Nc=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148369" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu1E=" + } + ], + "pageInfo": { + "hasNextPage": false + } + }, + "status": null, + "pushedDate": "2022-05-19T00:02:11Z", + "oid": "81261599614423baa17df72300b8e109677b6799" + } + } + ] + }, + "changedFiles": 3, + "files": { + "nodes": [ + { + "path": ".circleci/docker/build.sh" + }, + { + "path": ".circleci/docker/common/install_katex.sh" + }, + { + "path": ".github/workflows/pull.yml" + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "suo" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kit1980" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "janeyx99" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNS0xOFQxMjo0MTowNS0wNzowMLkyMDIyLTA1LTE4VDEyOjQxOjA0LTA3OjAwzjpD7es=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "\ud83d\udd17 Helpful 
links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/77700\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 8126159 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2022-05-17T23:01:48Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1129400934 + }, + { + "bodyText": "@pytorchbot merge", + "createdAt": "2022-05-19T15:39:05Z", + "author": { + "login": "kit1980" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1131884232 + }, + { + "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) linux-docs / build-docs (cpp), linux-docs / build-docs (python) are pending/not yet run for rule OSS CI\nRaised by https://github.com/pytorch/pytorch/actions/runs/2353067846", + "createdAt": "2022-05-19T15:40:59Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1131886153 + }, + { + "bodyText": "@pytorchbot merge -f", + "createdAt": "2022-05-19T16:41:29Z", + "author": { + "login": "kit1980" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1131945610 + }, + { + "bodyText": "Hey @kit1980.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' 
label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-05-19T16:43:37Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1131947473 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQ1FKZg==", + "hasPreviousPage": false + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "Merged" + } + }, + { + "node": { + "name": "cla signed" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=68111 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "chunyuan-w" + }, + "title": "Add JIT graph fuser for oneDNN Graph API (Preview4)", + "body": "## Description\r\nPreview4 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444).\r\n\r\nOn the basis of https://github.com/pytorch/pytorch/pull/50256, the below improvements are included:\r\n\r\n- The [preview4 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.4.1) of the oneDNN Graph API is used\r\n- The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties.\r\n\r\n### User API:\r\nThe optimization pass is disabled by default. 
Users could enable it by:\r\n```\r\ntorch.jit.enable_onednn_fusion(True)\r\n```\r\n\r\n### Performance:\r\n[pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance:\r\n- SkyLake 8180 (1 socket of 28 cores):\r\n\r\n ![image](https://user-images.githubusercontent.com/65992142/151162305-05e44425-a24e-4d5e-94e1-743b40b87a8c.png)\r\n\r\n- SkyLake 8180 (single thread):\r\n\r\n ![image](https://user-images.githubusercontent.com/65992142/151162528-69f90b79-d08d-46b8-8775-d80a6ccbce8a.png)\r\n \\* By mapping hardswish to oneDNN Graph, it\u2019s 8% faster than PyTorch JIT (NNC + OFI)\r\n \\** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops\r\n\r\n\r\n### Directory structure of the integration code\r\nFuser-related code are placed under:\r\n```\r\ntorch/csrc/jit/codegen/onednn/\r\n```\r\n\r\nOptimization pass registration is done in:\r\n```\r\ntorch/csrc/jit/passes/onednn_graph_fuser.h\r\n```\r\n\r\nCMake for the integration code is:\r\n```\r\ncaffe2/CMakeLists.txt\r\n```\r\n\r\n## Limitations\r\n\r\n- In this PR, we have only supported the optimization on Linux platform. 
The support on Windows and MacOS will be enabled as the next step.\r\n- We have only optimized the inference use case.", + "headRefName": "chunyuan/llga_preview2", + "headRepository": { + "nameWithOwner": "chunyuan-w/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "0096fcc49f277fd8e006fcb42e0cb28a1422ec98" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "7bcc4de26a5472f1d252735dd425b46794b0844f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "3a2a588bfe6bbf9bf74d88d441cd22affda207da" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "ca7df12fbfaa3ddbabeca39b76300d17f4a33f2f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "81d44f35b8bc043c38837d0694e5bc072203b832" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "14fd5d1bfc2c58a71379f778871e3fca0a8e79b2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "954dc23663125897f4b199eb2a8607dc5fca3274" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": 
"9f77a0b476accc678b6f0569e4ff33fa6bbe97fc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "fbf3b23bc1288697e1aec539a7c4ee3dc0bcb84c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "f8b8e78f786586c3cdf3966fd83ffa124d3eda70" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "6fffa2f7453ee7e0f8d8e2f73ea8a65230539589" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "849385404e6f3cd1cf7cef19f931ecf4fa28afdb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "adbae7b77f8c0dbc59fccf15207d97ba86cfade2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "6dcf2a4981aff24fa16fc7461ae4ec29690f956f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "54f3e05ad524cffd0911ee93be3c50f589b51f58" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "edbfc640ea79a0af85757d9e73796dcc90231519" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "67654db7cba562809d1b4a44cdda58af5cc9daaf" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": 
"9c9d99b930b11af9ff03f52d45bf49c652df758d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ffb25119cd9ce815cc4d9d14a2317fcbbfa9ea86" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ab9eee84512ca1bdfbc81e25c6eb67b29d0f302a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "62a4642cf3330524990a69ac29e002c97812320a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ca9b1223be4af2c8b4929303d498eafd71793128" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "6f4a23d24514a02954d2ec792830085f612223c9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "b2a9a9c0926b02d0b2e87722ed61450f224a61d0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e88b492be733f24b6aa395829c76add67d0901e7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c44336d7a914952bfb78e012e08d9a6d6dde5937" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "5157930f7b3921d41a586260582b574c915f6ca1" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + 
"name": "sanchit.jain" + }, + "oid": "04cb8353813f6bbd0d913a994923cc7e1e291406" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "62991eaad0e638bb0bced327e03f932f66f68732" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "7496bf1588050191595d833d23b8972b2f22655e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "d9d35f23cca0cd29c78a845731b24826152dcf1c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "f74ec134f18a65a7c72455bdf44f72e3ebb27105" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "eb32cc65a975361160948bfc3d6a577991ea262e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c7665f8d695b680c54db0bad2b7b7df46d886b50" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e6321ad8f59ea01130568c202d186448bb9cb9d0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "a72cd0d02693f45e5354a70654581ad514581ec7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "b3cd3028b4ed31805e82f7eaf02217ab74ca59b9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + 
"email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "49a592d9788d08e6cd0593882f867e129057c1cc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "0575766b2144b13f6a38227c4e2b8d22ec8db80f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "b5c9b10ff87d622350e8ca64fae3a476eb70d5aa" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "66bc652a30ccc329adb929870a4ac726bb98b38c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "72b9ca9c8e2dac98cbb7199b3dfac7c7305b80c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "a7892ed7373207d96406c8b5734a089643c5cdbd" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "d54cb084e1daad8a08c3f8de0ad3f7afb5b05ac1" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "aef71d692a8a159e0ca56be363e2cc1225ce7647" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "bf618e205ec31cff962dcc8ab478e0a699a9572d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e4a331f1088448f7d7d86256ce71e0e71da006b0" + } + }, + { + "commit": { + "author": { + "user": { + 
"login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "0b743523d1430fec759d5fefbb687f17c89335a5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e80a351a62d98b810ec8985c4b25257af1d6c5bb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c189eca154b6691919d0e21489d1c322c7435c0b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "e080a067c75d7b888a8a362682a2d5ba70e0c3a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "028561fbf8f3ed90e074e6e0e3a4ca4dd7ffa2a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "d550cf14037badd4caa2f52202e2f20bc4db8432" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "574159ebadd1dec24daaf883879ffeca8d9e71b7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "9eb3ee98ea756067ed1c8f52f309f6d3e211a904" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "29929f48be03dcdd1bbfade572de7feafa825547" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "8a7358ca8da547b40ea1a99ddc57ebed19959684" + } + }, + { + "commit": 
{ + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "6606637d2c5525b43e294a8b366a85052e1be0c6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "5ecfd1f28b87045deb8bc8ffe33b3d8b906f3264" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "be2d4345c65442c4cfbe8afdfb2ae0893945da42" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "b5b89d3644a43e2dbda841cafb71b32edbe07c8a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nikita.shulga@gmail.com", + "name": "Nikita Shulga" + }, + "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75" + } + } + ], + "pageInfo": { + "endCursor": "NjI", + "hasNextPage": false + }, + "totalCount": 62 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NXnc=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625010" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYwzI=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895825" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895911" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895963" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896134" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896253" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896371" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896525" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896658" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896771" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896795" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896838" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896897" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NZqw=", + 
"hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625458" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPI=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440031/jobs/2903895828" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NYIw=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625463" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPc=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896014" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896165" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896394" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896572" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896666" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896778" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896837" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896896" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896936" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897025" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897161" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897213" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897280" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897368" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897431" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897476" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897578" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897630" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897699" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897733" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327787" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327838" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327956" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327997" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328035" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, 
linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328093" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328131" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328177" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904333962" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904334006" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430419" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430459" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430508" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430573" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443663" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443723" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443787" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454239" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454303" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554602" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554698" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588855" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588886" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588924" + }, + { + "name": 
"linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904655702" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656104" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656150" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656192" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706520" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706565" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_fN1g=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625483" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxQs=" + } + ], + "pageInfo": { + "hasNextPage": false + } + }, + "status": { + "contexts": [ + { + "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048428?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: 
docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048429?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048431?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048430?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + } + ] + }, + "pushedDate": "2022-03-21T19:58:52Z", + "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75" + } + } + ] + }, + "changedFiles": 37, + "files": { + "nodes": [ + { + "path": "aten/src/ATen/core/interned_strings.h" + }, + { + "path": "caffe2/CMakeLists.txt" + }, + { + "path": "cmake/Dependencies.cmake" + }, + { + "path": "cmake/Modules/FindMKLDNN.cmake" + }, + { + "path": "cmake/public/mkldnn.cmake" + }, + { + "path": "docs/source/jit.rst" + }, + { + "path": "test/test_jit_llga_fuser.py" + }, + { + "path": "torch/_C/__init__.pyi.in" + }, + { + "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/README.md" + }, + { + "path": "torch/csrc/jit/codegen/onednn/defer_size_check.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/defer_size_check.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_fuser.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_fuser.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_helper.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_helper.h" + }, + { + "path": 
"torch/csrc/jit/codegen/onednn/graph_rewriter.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/guard_shape.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/guard_shape.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/interface.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/interface.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/kernel.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/kernel.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/layout_propagation.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/layout_propagation.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/operator.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/prepare_binary.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/prepare_binary.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/register_interface.cpp" + }, + { + "path": "torch/csrc/jit/ir/alias_analysis.cpp" + }, + { + "path": "torch/csrc/jit/ir/ir.cpp" + }, + { + "path": "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp" + }, + { + "path": "torch/csrc/jit/passes/onednn_graph_fuser.h" + }, + { + "path": "torch/csrc/jit/python/init.cpp" + }, + { + "path": "torch/csrc/jit/runtime/operator.cpp" + }, + { + "path": "torch/jit/__init__.py" + } + ], + "pageInfo": { + "endCursor": "Mzc", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "chunyuan-w" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, 
+ "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wukong1992" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": 
"COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMS0xMi0xMFQwOToyNDoxOS0wODowMLkyMDIxLTEyLTEwVDA5OjI0OjE5LTA4OjAwzjFryLE=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.", + "createdAt": "2022-03-21T22:51:38Z", + "author": { + "login": "suo" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074498483 + }, + { + "bodyText": "@pytorchbot revert this", + "createdAt": "2022-03-21T22:51:44Z", + "author": { + "login": "suo" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074498550 + }, + { + "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.\n\nOops! 
Will fix it ASAP.", + "createdAt": "2022-03-21T22:53:34Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1074499668 + }, + { + "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).", + "createdAt": "2022-03-21T23:07:23Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074508608 + }, + { + "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).", + "createdAt": "2022-03-30T00:53:50Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1082508130 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQAuLsw==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "oncall: jit" + } + }, + { + "node": { + "name": "triaged" + } + }, + { + "node": { + "name": "open source" + } + }, + { + "node": { + "name": "cla signed" + } + }, + { + "node": { + "name": "Reverted" + } + }, + { + "node": { + "name": "intel priority" + } + } + ] + } + } + } + } + }, + "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOQAuLsw== name=pytorch number=68111 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: 
https://github.com/chunyuan-w/pytorch/blob/7496bf1588050191595d833d23b8972b2f22655e/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 
triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries/conda\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/binaries, 
ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-manywheel\nciflow/binaries, ciflow/binaries/wheel\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab 
skipped\n\n\n\n\nYou can add a comment to the PR and tag @pytorchbot with the following commands:\n\n# ciflow rerun, \"ciflow/default\" will always be added automatically\n@pytorchbot ciflow rerun\n\n# ciflow rerun with additional labels \"-l \", which is equivalent to adding these labels manually and trigger the rerun\n@pytorchbot ciflow rerun -l ciflow/scheduled -l ciflow/slow\n\nFor more information, please take a look at the CI Flow Wiki.", + "createdAt": "2021-11-10T08:42:49Z", + "author": { + "login": "pytorch-probot" + }, + "authorAssociation": "NONE", + "editor": { + "login": "pytorch-probot" + }, + "databaseId": 964902865 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/68111\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 7388141 (more details on the Dr. CI page):\n\n\n29/29 failures introduced in this PR\n\n\n\ud83d\udd75\ufe0f 29 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n pull / linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge) (1/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:31:38.6978776Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:31:38.3001628Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:31:38.5169168Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:31:38.5362923Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:31:38.5413452Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:31:38.5458747Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in 
/home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:31:38.5484014Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:31:38.5497924Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:31:38.5656491Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:31:38.5678893Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:31:38.6888479Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f6488c20adb4dca4\n2022-03-21T21:31:38.6978776Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:31:38.6992648Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:31:38.7003010Z ##[error]Process completed with exit code 2.\n2022-03-21T21:31:38.7044027Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:31:38.7044261Z with:\n2022-03-21T21:31:38.7044413Z env:\n2022-03-21T21:31:38.7044565Z IN_CI: 1\n2022-03-21T21:31:38.7044709Z IS_GHA: 1\n2022-03-21T21:31:38.7044885Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:31:38.7045067Z ##[endgroup]\n2022-03-21T21:31:38.7060958Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge) (2/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:35:19.2635222Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:35:18.9028722Z + python3 -m 
pip install boto3==1.19.12\n2022-03-21T21:35:19.1132721Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:35:19.1310590Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:35:19.1360251Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:35:19.1386865Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:35:19.1429182Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:35:19.1441925Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:35:19.1468280Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:35:19.1617667Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:35:19.2545368Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-098be2985e0392130\n2022-03-21T21:35:19.2635222Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:35:19.2648463Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:35:19.2658727Z ##[error]Process completed with exit code 2.\n2022-03-21T21:35:19.2706355Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:35:19.2706591Z with:\n2022-03-21T21:35:19.2706748Z env:\n2022-03-21T21:35:19.2706908Z IN_CI: 1\n2022-03-21T21:35:19.2707061Z 
IS_GHA: 1\n2022-03-21T21:35:19.2707246Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:35:19.2707438Z ##[endgroup]\n2022-03-21T21:35:19.2724554Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge) (3/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:11:52.7662022Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T23:11:53.1213298Z ---------------------------------------- 8.1/8.1 MB 23.6 MB/s eta 0:00:00\n2022-03-21T23:11:53.1644665Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:11:53.2218699Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T23:11:53.2389674Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T23:11:53.2787295Z -------------------------------------- 247.7/247.7 KB 7.4 MB/s eta 0:00:00\n2022-03-21T23:11:53.3761842Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:11:53.5457622Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T23:11:57.4175080Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T23:11:57.5296815Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0105d4db093574f40\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 
2] No such file or directory\n2022-03-21T23:11:57.5564814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:11:57.5587712Z ##[error]Process completed with exit code 2.\n2022-03-21T23:11:57.5790311Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T23:11:57.5790832Z with:\n2022-03-21T23:11:57.5791104Z env:\n2022-03-21T23:11:57.5791358Z IN_CI: 1\n2022-03-21T23:11:57.5791620Z IS_GHA: 1\n2022-03-21T23:11:57.5791939Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:11:57.5792425Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T23:11:57.5792884Z ##[endgroup]\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu) (4/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T02:17:12.6257577Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T02:17:11.9280556Z Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T02:17:11.9335199Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:11.9682045Z Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T02:17:11.9850357Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0403171Z Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T02:17:12.0468875Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0590000Z Using cached 
https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T02:17:12.0607093Z Installing collected packages: jmespath, urllib3, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T02:17:12.5273459Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T02:17:12.6032812Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-114\n2022-03-22T02:17:12.6257577Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T02:17:12.6259543Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T02:17:12.6291924Z ##[error]Process completed with exit code 2.\n2022-03-22T02:17:12.6387977Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T02:17:12.6388298Z with:\n2022-03-22T02:17:12.6388521Z wait-ssh: false\n2022-03-22T02:17:12.6388727Z env:\n2022-03-22T02:17:12.6388932Z IN_CI: 1\n2022-03-22T02:17:12.6389143Z IS_GHA: 1\n2022-03-22T02:17:12.6389368Z GIT_DEFAULT_BRANCH: master\n2022-03-22T02:17:12.6389669Z DOCKER_HOST: unix:///run/user/1121/docker.sock\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge) (5/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:19:24.4890693Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:19:24.0962005Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:19:24.3152253Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:19:24.3341183Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:19:24.3391374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) 
(1.22.12)\n2022-03-21T22:19:24.3436392Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:19:24.3448982Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:19:24.3474092Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:19:24.3502003Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:19:24.3655072Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:19:24.4799309Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0bc9250521f338cae\n2022-03-21T22:19:24.4890693Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:19:24.4903625Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:19:24.4913841Z ##[error]Process completed with exit code 2.\n2022-03-21T22:19:24.4957338Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:19:24.4957575Z with:\n2022-03-21T22:19:24.4957735Z env:\n2022-03-21T22:19:24.4957900Z IN_CI: 1\n2022-03-21T22:19:24.4958055Z IS_GHA: 1\n2022-03-21T22:19:24.4958246Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:19:24.4958437Z ##[endgroup]\n2022-03-21T22:19:24.4989649Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu) (6/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T01:05:07.6983899Z python3: can't 
ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T01:05:06.8364546Z Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T01:05:06.8431763Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.8949391Z Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T01:05:06.9180079Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.9803351Z Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T01:05:06.9882133Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:07.0067062Z Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T01:05:07.0088676Z Installing collected packages: urllib3, jmespath, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T01:05:07.5819667Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T01:05:07.6774717Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-60\n2022-03-22T01:05:07.6983899Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T01:05:07.6988652Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T01:05:07.7023073Z ##[error]Process completed with exit code 2.\n2022-03-22T01:05:07.7102087Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T01:05:07.7102389Z with:\n2022-03-22T01:05:07.7102603Z 
wait-ssh: false\n2022-03-22T01:05:07.7102820Z env:\n2022-03-22T01:05:07.7103015Z IN_CI: 1\n2022-03-22T01:05:07.7103224Z IS_GHA: 1\n2022-03-22T01:05:07.7103458Z GIT_DEFAULT_BRANCH: master\n2022-03-22T01:05:07.7103737Z DOCKER_HOST: unix:///run/user/1502/docker.sock\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge) (7/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:51:39.3637996Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:51:39.2041249Z Attempting uninstall: s3transfer\n2022-03-21T20:51:39.2043010Z Found existing installation: s3transfer 0.3.7\n2022-03-21T20:51:39.2083799Z Uninstalling s3transfer-0.3.7:\n2022-03-21T20:51:39.2089675Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:51:39.2480546Z Attempting uninstall: boto3\n2022-03-21T20:51:39.2482953Z Found existing installation: boto3 1.16.34\n2022-03-21T20:51:39.2584292Z Uninstalling boto3-1.16.34:\n2022-03-21T20:51:39.2599474Z Successfully uninstalled boto3-1.16.34\n2022-03-21T20:51:39.3130921Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:51:39.3550598Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03ef7efc3078e3da5\n2022-03-21T20:51:39.3637996Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:51:39.3650651Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:51:39.3660484Z ##[error]Process completed with exit code 2.\n2022-03-21T20:51:39.3696465Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:51:39.3696693Z with:\n2022-03-21T20:51:39.3696850Z env:\n2022-03-21T20:51:39.3697012Z IN_CI: 1\n2022-03-21T20:51:39.3697161Z IS_GHA: 1\n2022-03-21T20:51:39.3697342Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:51:39.3697528Z ##[endgroup]\n2022-03-21T20:51:39.3730420Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be 
empty\n\n\n pull / linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge) (8/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:36.3916860Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:03:36.0096309Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:03:36.2278560Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:03:36.2461618Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:03:36.2513260Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:03:36.2541524Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:03:36.2554899Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:03:36.2598277Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:03:36.2758299Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:03:36.2780690Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:03:36.3825021Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0a4a552890e6ef7d3\n2022-03-21T21:03:36.3916860Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:03:36.3930343Z + 
GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:03:36.3941263Z ##[error]Process completed with exit code 2.\n2022-03-21T21:03:36.3979258Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:03:36.3979496Z with:\n2022-03-21T21:03:36.3979654Z env:\n2022-03-21T21:03:36.3979814Z IN_CI: 1\n2022-03-21T21:03:36.3979968Z IS_GHA: 1\n2022-03-21T21:03:36.3980157Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:03:36.3980360Z ##[endgroup]\n2022-03-21T21:03:36.3996257Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu) (9/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:41:10.3015614Z Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n2022-03-22T00:41:10.3625659Z ---------------------------------------- 79.5/79.5 KB 1.1 MB/s eta 0:00:00\n2022-03-22T00:41:10.4120236Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-22T00:41:10.4170155Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-22T00:41:10.4722115Z -------------------------------------- 247.7/247.7 KB 5.2 MB/s eta 0:00:00\n2022-03-22T00:41:10.4843512Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:41:10.6596108Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:41:10.8733354Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-22T00:41:15.3745408Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 
s3transfer-0.5.2\n2022-03-22T00:41:15.4987162Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-09cacc848abc3dd32\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:41:15.5373630Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:41:15.5404353Z ##[error]Process completed with exit code 2.\n2022-03-22T00:41:15.5790508Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-22T00:41:15.5791192Z with:\n2022-03-22T00:41:15.5791530Z env:\n2022-03-22T00:41:15.5791849Z IN_CI: 1\n2022-03-22T00:41:15.5792186Z IS_GHA: 1\n2022-03-22T00:41:15.5792599Z GIT_DEFAULT_BRANCH: master\n2022-03-22T00:41:15.5793237Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-22T00:41:15.5793831Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge) (10/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:32.9799307Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:32.8167560Z Attempting uninstall: s3transfer\n2022-03-21T20:50:32.8169351Z Found existing installation: s3transfer 0.3.7\n2022-03-21T20:50:32.8213295Z Uninstalling s3transfer-0.3.7:\n2022-03-21T20:50:32.8219209Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:50:32.8602320Z Attempting uninstall: boto3\n2022-03-21T20:50:32.8603289Z Found existing installation: boto3 1.16.34\n2022-03-21T20:50:32.8704535Z Uninstalling boto3-1.16.34:\n2022-03-21T20:50:32.8719403Z Successfully uninstalled boto3-1.16.34\n2022-03-21T20:50:32.9244278Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:50:32.9710449Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 
i-0c568461a276d4a71\n2022-03-21T20:50:32.9799307Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:32.9812238Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:32.9823052Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:32.9859290Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:32.9859527Z with:\n2022-03-21T20:50:32.9859664Z env:\n2022-03-21T20:50:32.9859817Z IN_CI: 1\n2022-03-21T20:50:32.9859977Z IS_GHA: 1\n2022-03-21T20:50:32.9860144Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:32.9860327Z ##[endgroup]\n2022-03-21T20:50:32.9893642Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge) (11/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7163042Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.6660824Z #10 0x55fc8a3ea801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.6661768Z #11 0x55fc8a3f57a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.6662455Z #12 0x55fc8a3f580b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.6663570Z #13 0x55fc8a3f5908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.6663952Z #14 0x55fc8a3f5908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.6664431Z #15 0x55fc8a3f5908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.6665304Z #16 0x55fc8a3f5ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7162113Z #17 
0x7f940d00f83f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7162534Z #18 0x55fc8a39a554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7162711Z \n2022-03-21T21:05:00.7163042Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.7334595Z + retcode=1\n2022-03-21T21:05:00.7334954Z + set -e\n2022-03-21T21:05:00.7335215Z + return 1\n2022-03-21T21:05:00.7338688Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.7339232Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.7340113Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.7340612Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.7341187Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.7341668Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.7344466Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge) (12/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:06:03.4437430Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:06:03.0752199Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:06:03.2853252Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:06:03.3032326Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:06:03.3081589Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:06:03.3093911Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in 
/home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:06:03.3120244Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:06:03.3162406Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:06:03.3188431Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:06:03.3337181Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:06:03.4348072Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ee48c8811fafc444\n2022-03-21T22:06:03.4437430Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:06:03.4450920Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:06:03.4461263Z ##[error]Process completed with exit code 2.\n2022-03-21T22:06:03.4502346Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:06:03.4502576Z with:\n2022-03-21T22:06:03.4502730Z env:\n2022-03-21T22:06:03.4502888Z IN_CI: 1\n2022-03-21T22:06:03.4503038Z IS_GHA: 1\n2022-03-21T22:06:03.4503302Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:06:03.4503492Z ##[endgroup]\n2022-03-21T22:06:03.4519156Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge) (13/29)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:13.2205634Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:12.8679322Z + python3 -m pip 
install boto3==1.19.12\n2022-03-21T20:50:13.0744228Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T20:50:13.0916284Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T20:50:13.0964264Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T20:50:13.1005656Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T20:50:13.1017299Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T20:50:13.1041042Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T20:50:13.1189450Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T20:50:13.1208751Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T20:50:13.2119445Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d02da60fd18c22f5\n2022-03-21T20:50:13.2205634Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:13.2217939Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:13.2220259Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:13.2248664Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:13.2249012Z with:\n2022-03-21T20:50:13.2249260Z env:\n2022-03-21T20:50:13.2249500Z IN_CI: 1\n2022-03-21T20:50:13.2249738Z IS_GHA: 
1\n2022-03-21T20:50:13.2250025Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:13.2250329Z ##[endgroup]\n2022-03-21T20:50:13.2272735Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) (14/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:47:38.0451999Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:47:37.5554508Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:47:37.8411473Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:47:37.8631484Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:47:37.8699561Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:47:37.8737037Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:47:37.8754443Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:47:37.8814393Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:47:37.8849540Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:47:37.9059579Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:47:38.0336298Z ++ python3 
.github/scripts/get_workflow_job_id.py 2018440039 i-0b44f47f4292089a2\n2022-03-21T23:47:38.0451999Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:47:38.0469471Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:47:38.0484106Z ##[error]Process completed with exit code 2.\n2022-03-21T23:47:38.0532678Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:47:38.0533007Z with:\n2022-03-21T23:47:38.0533223Z env:\n2022-03-21T23:47:38.0533440Z IN_CI: 1\n2022-03-21T23:47:38.0533649Z IS_GHA: 1\n2022-03-21T23:47:38.0533902Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:47:38.0534170Z GPU_FLAG: --gpus all\n2022-03-21T23:47:38.0534401Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge) (15/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:04:59.3115800Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:04:59.2595213Z #10 0x55a7f39a4801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:04:59.2595707Z #11 0x55a7f39af7a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:04:59.2597203Z #12 0x55a7f39af80b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:04:59.2598205Z #13 0x55a7f39af908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:04:59.2598697Z #14 0x55a7f39af908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:04:59.2599178Z #15 0x55a7f39af908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:04:59.2599747Z #16 0x55a7f39afccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:04:59.3114751Z #17 
0x7f3b3822383f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:04:59.3115277Z #18 0x55a7f3954554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:04:59.3115468Z \n2022-03-21T21:04:59.3115800Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:04:59.3292385Z + retcode=1\n2022-03-21T21:04:59.3292781Z + set -e\n2022-03-21T21:04:59.3293062Z + return 1\n2022-03-21T21:04:59.3295462Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:04:59.3295802Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:04:59.3296394Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:04:59.3296700Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:04:59.3297055Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:04:59.3297416Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:04:59.3299623Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (16/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:14:25.5525714Z Collecting jmespath<1.0.0,>=0.7.1\n2022-03-21T22:14:25.5568155Z Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n2022-03-21T22:14:25.5952617Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:14:25.6169392Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:14:25.6629996Z -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:14:25.6710247Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from 
botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:14:25.8284354Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:14:25.9816751Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:14:31.6672236Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:14:31.7630473Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ed0915ecee5d2424\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:14:31.7876742Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:14:31.7897140Z ##[error]Process completed with exit code 2.\n2022-03-21T22:14:31.8195621Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:14:31.8196110Z with:\n2022-03-21T22:14:31.8196356Z env:\n2022-03-21T22:14:31.8196614Z IN_CI: 1\n2022-03-21T22:14:31.8196876Z IS_GHA: 1\n2022-03-21T22:14:31.8197169Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:14:31.8197652Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:14:31.8198093Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge) (17/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:19:15.8845728Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:19:15.5116060Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:19:15.7231476Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:19:15.7409711Z Requirement already satisfied: 
boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:19:15.7458478Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:19:15.7470508Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:19:15.7496799Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:19:15.7538362Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:19:15.7566161Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:19:15.7711630Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:19:15.8753543Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0e2b3b4ddb246ff2a\n2022-03-21T21:19:15.8845728Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:19:15.8859814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:19:15.8870165Z ##[error]Process completed with exit code 2.\n2022-03-21T21:19:15.8917039Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:19:15.8917279Z with:\n2022-03-21T21:19:15.8917433Z env:\n2022-03-21T21:19:15.8917586Z IN_CI: 1\n2022-03-21T21:19:15.8917734Z IS_GHA: 1\n2022-03-21T21:19:15.8917917Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:19:15.8918102Z ##[endgroup]\n2022-03-21T21:19:15.8934572Z ##[group]Run # ignore expansion of \"docker ps -q\" since it 
could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu) (18/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:19:48.5900162Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:19:48.0742254Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:19:48.3742563Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:19:48.3976536Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:19:48.4048700Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:19:48.4065374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:19:48.4128076Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:19:48.4164273Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:19:48.4202610Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:19:48.4416723Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:19:48.5773033Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-07ab7a3c4a5402af2\n2022-03-21T23:19:48.5900162Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or 
directory\n2022-03-21T23:19:48.5919822Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:19:48.5936087Z ##[error]Process completed with exit code 2.\n2022-03-21T23:19:48.6007930Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:19:48.6008268Z with:\n2022-03-21T23:19:48.6008483Z env:\n2022-03-21T23:19:48.6008701Z IN_CI: 1\n2022-03-21T23:19:48.6008920Z IS_GHA: 1\n2022-03-21T23:19:48.6009170Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:19:48.6009440Z GPU_FLAG: --gpus all\n2022-03-21T23:19:48.6009671Z ##[endgroup]\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (19/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:53:59.0889659Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T22:53:59.6881416Z ---------------------------------------- 8.1/8.1 MB 14.0 MB/s eta 0:00:00\n2022-03-21T22:53:59.7427779Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:53:59.7691882Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:53:59.7779847Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:53:59.8281663Z -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:54:00.0185115Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:54:00.2359770Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:54:04.1208891Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 
s3transfer-0.5.2\n2022-03-21T22:54:04.2505862Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03b4fbe63be8ef4b0\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:54:04.2891082Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:54:04.2919900Z ##[error]Process completed with exit code 2.\n2022-03-21T22:54:04.3377901Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:54:04.3378575Z with:\n2022-03-21T22:54:04.3378930Z env:\n2022-03-21T22:54:04.3379275Z IN_CI: 1\n2022-03-21T22:54:04.3379600Z IS_GHA: 1\n2022-03-21T22:54:04.3380023Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:54:04.3380691Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:54:04.3381278Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge) (20/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:09:34.0074610Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:09:33.6365531Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:09:33.8475619Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:09:33.8655152Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:09:33.8704395Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:09:33.8716774Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:09:33.8760145Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in 
/home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:09:33.8785000Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:09:33.8811316Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:09:33.8960134Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:09:33.9984866Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d325eb9fd156146f\n2022-03-21T22:09:34.0074610Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:09:34.0087465Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:09:34.0101743Z ##[error]Process completed with exit code 2.\n2022-03-21T22:09:34.0154014Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:09:34.0154246Z with:\n2022-03-21T22:09:34.0154412Z env:\n2022-03-21T22:09:34.0154574Z IN_CI: 1\n2022-03-21T22:09:34.0154728Z IS_GHA: 1\n2022-03-21T22:09:34.0154917Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:09:34.0155112Z ##[endgroup]\n2022-03-21T22:09:34.0191047Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge) (21/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:17.8502655Z [E request_callbac...yUniqueId(created_on=0, local_id=0) to be created.\n\n2022-03-21T21:03:14.4669960Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpxgdsmeer\n2022-03-21T21:03:14.4671407Z INFO:torch.distributed.nn.jit.instantiator:Writing 
/tmp/tmpxgdsmeer/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.4973023Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp1i2hfmpc\n2022-03-21T21:03:14.4973800Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp1i2hfmpc/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.5532339Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpgx4da7b0\n2022-03-21T21:03:14.5533064Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpgx4da7b0/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.7050673Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 0\n2022-03-21T21:03:14.7097127Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 3\n2022-03-21T21:03:14.7398339Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 2\n2022-03-21T21:03:14.7922283Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 1\n2022-03-21T21:03:17.8502655Z [E request_callback_no_python.cpp:559] Received error while processing request type 261: false INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp\":387, please report a bug to PyTorch. 
Expected OwnerRRef with id GloballyUniqueId(created_on=0, local_id=0) to be created.\n2022-03-21T21:03:17.8503603Z Exception raised from getOwnerRRef at /var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp:387 (most recent call first):\n2022-03-21T21:03:17.8504385Z frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x69 (0x7f180df19e19 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505131Z frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string, std::allocator > const&) + 0xd2 (0x7f180df160e2 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505927Z frame #2: c10::detail::torchInternalAssertFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string, std::allocator > const&) + 0x4e (0x7f180df17a7e in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8506674Z frame #3: torch::distributed::rpc::RRefContext::getOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, bool) + 0x4b4 (0x7f18118b7b64 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8507642Z frame #4: torch::distributed::rpc::RequestCallbackNoPython::assignOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, torch::distributed::rpc::GloballyUniqueId const&, c10::intrusive_ptr >) const + 0x70 (0x7f18118a7bf0 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8508613Z frame #5: torch::distributed::rpc::RequestCallbackImpl::processPythonRemoteCall(torch::distributed::rpc::RpcCommandBase&, std::vector >) const + 0xc8 (0x7f1819736208 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8509749Z frame #6: torch::distributed::rpc::RequestCallbackNoPython::processRpc(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType 
const&, std::vector >) const + 0x194 (0x7f18118ac914 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8510708Z frame #7: torch::distributed::rpc::RequestCallbackImpl::processRpcWithErrors(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector >) const + 0x65 (0x7f1819735865 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8511369Z frame #8: + 0x375249a (0x7f18118a949a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test (22/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m echo \"ERR...t available for the merge-base of your branch\"\ufffd[0m\n\n2022-03-21T20:01:07.7012399Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7012634Z \ufffd[36;1m# Covers the case where a previous tag doesn't exist for the tree\ufffd[0m\n2022-03-21T20:01:07.7012992Z \ufffd[36;1m# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly\ufffd[0m\n2022-03-21T20:01:07.7013373Z \ufffd[36;1mif ! 
git rev-parse \"$MERGE_BASE:.circleci/docker\"; then\ufffd[0m\n2022-03-21T20:01:07.7013784Z \ufffd[36;1m echo \"Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit\"\ufffd[0m\n2022-03-21T20:01:07.7014149Z \ufffd[36;1m exit 1\ufffd[0m\n2022-03-21T20:01:07.7014325Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7014573Z \ufffd[36;1mPREVIOUS_DOCKER_TAG=$(git rev-parse \"$MERGE_BASE:.circleci/docker\")\ufffd[0m\n2022-03-21T20:01:07.7014907Z \ufffd[36;1m# If no image exists but the hash is the same as the previous hash then we should error out here\ufffd[0m\n2022-03-21T20:01:07.7015231Z \ufffd[36;1mif [[ \"${PREVIOUS_DOCKER_TAG}\" = \"${DOCKER_TAG}\" ]]; then\ufffd[0m\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m echo \"ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch\"\ufffd[0m\n2022-03-21T20:01:07.7015931Z \ufffd[36;1m echo \" contact the PyTorch team to restore the original images\"\ufffd[0m\n2022-03-21T20:01:07.7016225Z \ufffd[36;1m exit 1\ufffd[0m\n2022-03-21T20:01:07.7016400Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7016608Z \ufffd[36;1mecho ::set-output name=rebuild::yes\ufffd[0m\n2022-03-21T20:01:07.7027605Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}\n2022-03-21T20:01:07.7027837Z env:\n2022-03-21T20:01:07.7028006Z IN_CI: 1\n2022-03-21T20:01:07.7028159Z IS_GHA: 1\n2022-03-21T20:01:07.7028346Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:01:07.7028589Z BASE_REVISION: 6643522db9ff595f564b8081de58b3a33c546178\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu) (23/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:49:54.2949572Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:49:53.8049151Z + python3 -m pip install boto3==1.19.12\n2022-03-22T00:49:54.0981629Z Defaulting to user installation because 
normal site-packages is not writeable\n2022-03-22T00:49:54.1207562Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-22T00:49:54.1277146Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-22T00:49:54.1315027Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-22T00:49:54.1331813Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-22T00:49:54.1391622Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:49:54.1609217Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-22T00:49:54.1637417Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:49:54.2830197Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f7c32fe13be12fea\n2022-03-22T00:49:54.2949572Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:49:54.2966933Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:49:54.2982588Z ##[error]Process completed with exit code 2.\n2022-03-22T00:49:54.3031464Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T00:49:54.3031794Z with:\n2022-03-22T00:49:54.3032012Z env:\n2022-03-22T00:49:54.3032227Z IN_CI: 1\n2022-03-22T00:49:54.3032434Z IS_GHA: 1\n2022-03-22T00:49:54.3032681Z GIT_DEFAULT_BRANCH: master\n2022-03-22T00:49:54.3033084Z 
GPU_FLAG: --gpus all\n2022-03-22T00:49:54.3033312Z ##[endgroup]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge) (24/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:56:07.3365589Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T21:56:07.7926584Z ---------------------------------------- 8.1/8.1 MB 17.3 MB/s eta 0:00:00\n2022-03-21T21:56:07.9319362Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T21:56:07.9366132Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T21:56:08.0077590Z -------------------------------------- 247.7/247.7 KB 3.0 MB/s eta 0:00:00\n2022-03-21T21:56:08.0164070Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:56:08.1775537Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:56:08.3393469Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T21:56:12.4576766Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T21:56:12.5641959Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0afad69838118af0e\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:56:12.5905611Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:56:12.5927729Z ##[error]Process completed with exit code 
2.\n2022-03-21T21:56:12.6239531Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T21:56:12.6240039Z with:\n2022-03-21T21:56:12.6240299Z env:\n2022-03-21T21:56:12.6240557Z IN_CI: 1\n2022-03-21T21:56:12.6240805Z IS_GHA: 1\n2022-03-21T21:56:12.6241118Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:56:12.6241613Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T21:56:12.6242052Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge) (25/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:46:39.5474616Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:46:39.1884210Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:46:39.3928976Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:46:39.4105069Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:46:39.4152571Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:46:39.4194931Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:46:39.4218947Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:46:39.4230812Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:46:39.4380089Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:46:39.4399461Z Requirement already 
satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:46:39.5387703Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0888bed1149cca415\n2022-03-21T21:46:39.5474616Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:46:39.5487145Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:46:39.5497480Z ##[error]Process completed with exit code 2.\n2022-03-21T21:46:39.5541319Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:46:39.5541544Z with:\n2022-03-21T21:46:39.5541698Z env:\n2022-03-21T21:46:39.5541851Z IN_CI: 1\n2022-03-21T21:46:39.5541997Z IS_GHA: 1\n2022-03-21T21:46:39.5542176Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:46:39.5542361Z ##[endgroup]\n2022-03-21T21:46:39.5557878Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge) (26/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:34:57.0623859Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:34:56.9039884Z Attempting uninstall: s3transfer\n2022-03-21T21:34:56.9041446Z Found existing installation: s3transfer 0.3.7\n2022-03-21T21:34:56.9090783Z Uninstalling s3transfer-0.3.7:\n2022-03-21T21:34:56.9095968Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:34:56.9453014Z Attempting uninstall: boto3\n2022-03-21T21:34:56.9454356Z Found existing installation: boto3 1.16.34\n2022-03-21T21:34:56.9564320Z Uninstalling boto3-1.16.34:\n2022-03-21T21:34:56.9578035Z Successfully uninstalled boto3-1.16.34\n2022-03-21T21:34:57.0091363Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:34:57.0536230Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 
i-034a3afd5d80b91fd\n2022-03-21T21:34:57.0623859Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:34:57.0637167Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:34:57.0647396Z ##[error]Process completed with exit code 2.\n2022-03-21T21:34:57.0688237Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:34:57.0688481Z with:\n2022-03-21T21:34:57.0688631Z env:\n2022-03-21T21:34:57.0688769Z IN_CI: 1\n2022-03-21T21:34:57.0688930Z IS_GHA: 1\n2022-03-21T21:34:57.0689109Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:34:57.0689462Z ##[endgroup]\n2022-03-21T21:34:57.0704768Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge) (27/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7896545Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.7395504Z #10 0x5597fd5a9801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.7396330Z #11 0x5597fd5b47a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.7396688Z #12 0x5597fd5b480b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.7398664Z #13 0x5597fd5b4908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.7399177Z #14 0x5597fd5b4908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.7399663Z #15 0x5597fd5b4908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.7399986Z #16 0x5597fd5b4ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7895241Z #17 
0x7f0a5905983f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7895772Z #18 0x5597fd559554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7896033Z \n2022-03-21T21:05:00.7896545Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.8063448Z + retcode=1\n2022-03-21T21:05:00.8063787Z + set -e\n2022-03-21T21:05:00.8064058Z + return 1\n2022-03-21T21:05:00.8067638Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.8068127Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.8069018Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.8069500Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.8070105Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.8070580Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.8072640Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu) (28/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:48:17.3384813Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:48:16.8599645Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:48:17.1464241Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:48:17.1685222Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:48:17.1754164Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:48:17.1771662Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in 
/home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:48:17.1808722Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:48:17.1868636Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:48:17.1903889Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:48:17.2113746Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:48:17.3267404Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-01fe178c405417375\n2022-03-21T22:48:17.3384813Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:48:17.3402286Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:48:17.3418376Z ##[error]Process completed with exit code 2.\n2022-03-21T22:48:17.3470528Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:48:17.3470874Z with:\n2022-03-21T22:48:17.3471096Z env:\n2022-03-21T22:48:17.3471327Z IN_CI: 1\n2022-03-21T22:48:17.3471538Z IS_GHA: 1\n2022-03-21T22:48:17.3471802Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:48:17.3472083Z GPU_FLAG: --gpus all\n2022-03-21T22:48:17.3472322Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge) (29/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:16:38.9646300Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:16:38.7995969Z Attempting uninstall: s3transfer\n2022-03-21T21:16:38.7998039Z 
Found existing installation: s3transfer 0.3.7\n2022-03-21T21:16:38.8066994Z Uninstalling s3transfer-0.3.7:\n2022-03-21T21:16:38.8072844Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:16:38.8449275Z Attempting uninstall: boto3\n2022-03-21T21:16:38.8451430Z Found existing installation: boto3 1.16.34\n2022-03-21T21:16:38.8559828Z Uninstalling boto3-1.16.34:\n2022-03-21T21:16:38.8574290Z Successfully uninstalled boto3-1.16.34\n2022-03-21T21:16:38.9100438Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:16:38.9558098Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d779c59d277d32ee\n2022-03-21T21:16:38.9646300Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:16:38.9658894Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:16:38.9673240Z ##[error]Process completed with exit code 2.\n2022-03-21T21:16:38.9720106Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:16:38.9720333Z with:\n2022-03-21T21:16:38.9720485Z env:\n2022-03-21T21:16:38.9720645Z IN_CI: 1\n2022-03-21T21:16:38.9720793Z IS_GHA: 1\n2022-03-21T21:16:38.9720970Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:16:38.9721151Z ##[endgroup]\n2022-03-21T21:16:38.9736762Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2021-11-10T08:42:52Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 964902894 + }, + { + "bodyText": "@vitaly-fedyunin @gottbrath FYI that this is the oneDNN Graph API integration. 
It depends on the #63748.", + "createdAt": "2021-11-16T16:36:52Z", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 970451860 + }, + { + "bodyText": "CI failures are currently being caused by some issues in the CI infra, and are also occurring with other PRs.", + "createdAt": "2021-12-10T05:59:17Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 990641309 + }, + { + "bodyText": "CI failures are unrelated.", + "createdAt": "2021-12-10T20:44:09Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 991281407 + }, + { + "bodyText": "The CI failure is unrelated.", + "createdAt": "2021-12-16T02:45:59Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 995389295 + }, + { + "bodyText": "Hi, thank you for the PR!\nDo you mind running a larger amount of torchbench and reporting numbers ? You can look at Jason's post here for what models are supported in script. Initially just the vision models would be useful. @Krovatkin also did some benchmarking of a traced Bert model and found on average a ~16% speedup with this PR.", + "createdAt": "2022-01-18T18:22:34Z", + "author": { + "login": "eellison" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1015689390 + }, + { + "bodyText": "Thanks a lot for reviewing, @eellison & @Krovatkin!\nWe just wanted to let you know that we're working on the benchmarking & will get back to you in a day, or two.\nUPDATE (Jan 21): While running some TorchBench models, we discovered some composability issues, and are working to ensure that oneDNN Graph would complement PyTorch's existing fusion capabilities, not hinder them.\nUPDATE (Jan 24): We've resolved the issues & will update this PR later today. 
Thanks!", + "createdAt": "2022-01-20T00:31:01Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1016996190 + }, + { + "bodyText": "Hello @eellison,\nWe used this TorchBench branch for comparison. compare_llga.sh can be run for comparison.\nFor benchmarking mobilenet_v3_large with hardswish support in oneDNN Graph, this oneDNN Graph branch can be used in third_party/ideep/mkl-dnn. It delivers a speedup over PyTorch JIT (NNC + OFI) because 21 additional reorders are prevented (the major factor here), and fusion with conv also helps further.\nThe next release of oneDNN Graph would have hardswish support.\nWe're also exploring adding a hardsigmoid op in oneDNN Graph.\nThank you!", + "createdAt": "2022-01-26T23:51:38Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1022709513 + }, + { + "bodyText": "Please note that this PR should be merged after #71546, as #71546 changes the third_party/ideep commit (this PR also uses that ideep commit, but it'd probably be better to merge #71546 first, so that oneDNN v2.5.2 upgrade would be in a separate PR). Thank you!", + "createdAt": "2022-01-31T23:57:21Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1026330085 + }, + { + "bodyText": "@sanchitintel mind rebasing and i'll land ?", + "createdAt": "2022-03-01T20:07:57Z", + "author": { + "login": "eellison" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1055813984 + }, + { + "bodyText": "@eellison has imported this pull request. 
If you are a Meta employee, you can view this diff on Phabricator.", + "createdAt": "2022-03-02T17:44:47Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1057203495 + }, + { + "bodyText": "Thanks a lot for taking a look, @eellison! To fix this error, we would enable Bazel build for oneDNN Graph.", + "createdAt": "2022-03-07T23:03:45Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1061230087 + }, + { + "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "createdAt": "2022-03-09T19:24:13Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1063276600 + }, + { + "bodyText": "@malfet has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "createdAt": "2022-03-21T19:59:41Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074355779 + }, + { + "bodyText": "And graph_rewriter.cpp is full of DOS newlines...", + "createdAt": "2022-03-21T20:53:40Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074407452 + }, + { + "bodyText": "Hey @chunyuan-w.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-03-21T22:12:51Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1074471758 + }, + { + "bodyText": "Thanks a ton for your help, @malfet & @eellison! :)\nWe'll incorporate your suggestions in subsequent PR(s).", + "createdAt": "2022-03-21T22:41:25Z", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "COLLABORATOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1074492365 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOOYM_0Q==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=73969 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "malfet" + }, + "title": "Dummy change", + "body": "Test Plan: None at all\n\nDifferential Revision: D34753911\n\n", + "headRefName": "export-D34753911", + "headRepository": { + "nameWithOwner": "malfet/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "4746da707a9912356f5179625da89616b228dc21" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": 
"build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794078044" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794189060" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRQMQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592963" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QM=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280135/jobs/2794078023" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592965" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QU=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794078060" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292071" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292205" + 
}, + { + "name": "test (distributed, 1, 1, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292306" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbTiXw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592966" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QY=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794078053" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536907" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536998" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794537089" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbY_vU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592967" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qc=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1958280136/jobs/2794078031" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2ao=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592969" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qk=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794078055" + }, + { + "name": "build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183768" + }, + { + "name": "build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183828" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRIt0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592970" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qo=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794078017" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181109" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181305" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181488" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRFm4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592971" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qs=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280143/jobs/2794078025" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592974" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q4=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078028" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078196" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078407" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078610" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078760" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078898" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078999" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079087" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079199" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO4Es=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592975" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q8=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280146/jobs/2794078040" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2b0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592976" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RA=" + } + ], + "pageInfo": { + "hasNextPage": true + } + }, + "status": { + "contexts": [ + { + "context": "ci/circleci: 
docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040614?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040643?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040615?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + } + ] + }, + "pushedDate": "2022-03-09T15:57:16Z", + "oid": "4746da707a9912356f5179625da89616b228dc21" + } + } + ] + }, + "changedFiles": 1, + "files": { + "nodes": [ + { + "path": "tools/build_variables.bzl" + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [], + "pageInfo": { + "startCursor": null, + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/malfet/pytorch/blob/4746da707a9912356f5179625da89616b228dc21/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 
triggered\n\n\nlinux-binary-manywheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-rocm4.5-py3.7\nciflow/all, ciflow/default, ciflow/linux, ciflow/rocm, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build\nciflow/all, ciflow/cpu, ciflow/default, ciflow/libtorch, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 
triggered\n\n\nmacos-arm64-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-arm64-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwindows-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nwindows-binary-libtorch-debug\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-libtorch-release\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-wheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, 
ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, 
ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.3-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\npytorch-xla-linux-bionic-py3.7-clang8\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk, ciflow/xla\n\ud83d\udeab skipped", + "createdAt": "2022-03-09T15:57:11Z", + "author": { + "login": "pytorch-bot" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1063079053 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/73969\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 4746da7 (more details on the Dr. CI page):\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2022-03-09T15:57:12Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1063079113 + }, + { + "bodyText": "This pull request was exported from Phabricator. 
Differential Revision: D34753911", + "createdAt": "2022-03-09T15:57:34Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1063079731 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOP11MjQ==", + "hasPreviousPage": false + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "fb-exported" + } + }, + { + "node": { + "name": "cla signed" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=73099 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "BowenBao" + }, + "title": "[ONNX] Make graph name spec-compliant (#71961)", + "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* #73104\n* #73103\n* #73102\n* #73101\n* #73100\n* __->__ #73099\n\n[According to the ONNX spec](https://github.com/onnx/onnx/blob/main/docs/IR.md#names-within-a-graph),\nall names must adhere to C90 identifier syntax rules, which means no\ndashes.\n\nFixes: #30952", + "headRefName": "gh/BowenBao/138/head", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "gh/BowenBao/138/base", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "BowenBao" + }, + "email": "bowbao@microsoft.com", + "name": "BowenBao" + }, + "oid": "3038b939eb2069653305c419326a0f47d2598e39" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } 
+ }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041786/jobs/2626264278" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNn9o=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189561" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7k=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626264385" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417658" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417743" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417885" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkRE_E=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189562" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7o=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1866041789/jobs/2626264416" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189563" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7s=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041787/jobs/2626264407" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoIY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189564" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7w=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041788/jobs/2626264422" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189566" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS74=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626264414" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349405" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349522" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349618" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiwA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189567" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS78=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626264431" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626359364" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPxgQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189568" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8A=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041792/jobs/2626264427" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoKA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189570" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8I=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626264386" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722677" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722710" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkX070=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189571" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8M=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626264401" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349045" + }, + { + "name": "test (default, 2, 2, 
linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349141" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349272" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiQA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189572" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8Q=" + } + ], + "pageInfo": { + "hasNextPage": true + } + }, + "status": { + "contexts": [ + { + "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010288?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010289?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010488?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + }, + { + "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "state": "SUCCESS", + "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010326?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link" + } + ] + }, + "pushedDate": "2022-02-18T18:46:28Z", + "oid": "3038b939eb2069653305c419326a0f47d2598e39" + } + } + ] + }, + "changedFiles": 162, + "files": { + "nodes": [ + { + "path": 
"test/onnx/expect/TestOperators.test_acos.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_left_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_addconstant.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_addmm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_arange_dynamic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_argmax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_asin.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_at_op.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_atan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_aten_embedding_1.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_aten_embedding_2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_avg_pool2d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_baddbmm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_basic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_1d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_training.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_bitshift.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_c2_op.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_chunk.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_clip.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip_max.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip_min.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_concat2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_convtranspose.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_cos.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_cumsum.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_det.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dict.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dict_str.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_default.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_opset12.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_training.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_training_opset12.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_elu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_embedding_bags.expect" + }, 
+ { + "path": "test/onnx/expect/TestOperators.test_empty_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_empty_like_opset7.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_equal.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_erf.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_exp.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_expand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_flatten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_flatten2D.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_fmod.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_frobenius_norm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_full.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_full_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gather.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gather_opset11.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ge.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gelu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_hardtanh.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_implicit_expand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_index.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_isnan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_le.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_linear.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_log_sigmoid.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_logsoftmax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_lt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_master_opset.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_max.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool_dilations.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool_indices.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mean_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_meshgrid.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_min.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_narrow.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ne.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_nonzero.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_norm_p1.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_norm_p2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ones_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_pad.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_params.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_params_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_permute2.expect" + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "garymm" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMi0xOFQxNzoxODo0NC0wODowMLkyMDIyLTAyLTE4VDE3OjE4OjQ0LTA4OjAwzjTr0H0=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "This PR cannot be merged by bot due to changing > 100 files. 
@malfet \n \n \n pytorch/.github/scripts/trymerge.py\n \n \n Line 63\n in\n 932adf2\n \n \n \n \n\n \n \n files(last: 100) { \n \n \n \n\n Can this be relaxed? If not please import.", + "createdAt": "2022-02-22T18:22:40Z", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1048084569 + }, + { + "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet\nCan this be relaxed? If not please import.\n\nWow, you've hit a really interesting problem. 100 is a limitation enforced by GitHub, see https://docs.github.com/en/graphql/overview/resource-limitations, but I can implement a pagination. Do you mind keeping it like that for a bit, want to land a fix soonish.", + "createdAt": "2022-02-22T18:27:29Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048088691 + }, + { + "bodyText": "@malfet Thank you for info. Sure, I have separated the rest of stack from this one, we'll wait for the fix to try again.", + "createdAt": "2022-02-22T18:29:48Z", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1048090640 + }, + { + "bodyText": "@pytorchbot merge this", + "createdAt": "2022-02-24T21:42:36Z", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1050293881 + }, + { + "bodyText": "Hey @BowenBao.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-02-24T21:44:39Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1050295451 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOPniAWQ==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "oncall: jit" + } + }, + { + "node": { + "name": "open source" + } + }, + { + "node": { + "name": "cla signed" + } + }, + { + "node": { + "name": "release notes: onnx" + } + }, + { + "node": { + "name": "topic: bug fixes" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=74649 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "malfet" + }, + "title": "This should fail flake8", + "body": "Test issue for GHF mandatory checks", + "headRefName": "malfet-patch-8", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "57c86ff1c5ab948888fd329986c9d55796680e33" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4" + } + } + ], + "pageInfo": { + "endCursor": "Mg", + "hasNextPage": false + }, + "totalCount": 2 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 
12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsK3w=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018129" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1E=" + }, + { + "node": { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018131" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1M=" + }, + { + "node": { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018132" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Q=" + }, + { + "node": { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018134" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Y=" + }, + { + "node": { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": 
"https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018139" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1s=" + }, + { + "node": { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018142" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj14=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925132" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925189" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925230" + }, + { + "name": "flake8-py3", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925307" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925365" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925427" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925449" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925537" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925644" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925688" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925809" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925945" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsMiY=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018384" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFA=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576288/jobs/2928925134" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsLW0=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018395" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFs=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935743" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935775" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935850" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935994" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936064" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936179" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936265" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936309" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936353" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936395" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936426" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936483" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936516" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936558" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936633" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936705" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936736" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936756" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936796" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936823" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990551" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test 
(default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990588" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992832" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992868" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992932" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992965" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993011" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993042" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993086" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993128" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995802" + }, + { + "name": 
"linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995853" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995889" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928997626" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999058" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999075" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012407" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012438" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012469" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034328" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034340" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929040801" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929045939" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046016" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046063" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082254" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082275" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157614" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157635" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157656" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHxIT4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018405" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkGU=" + } + ], + "pageInfo": { + "hasNextPage": false + } + }, + "status": null, + "pushedDate": "2022-03-24T00:42:33Z", + "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4" + } + } + ] + }, + "changedFiles": 1, + "files": { + "nodes": [ + { + "path": "torch/nn/cpp.py" + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "seemethere" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0yM1QxNTo1MDo0NS0wNzowMLkyMDIyLTAzLTIzVDE1OjUwOjQ1LTA3OjAwzjbPEDg=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/74649\n\u21a9\ufe0f \u00a0[fb-only] Re-run with SSH instructions\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 6c3c3de (more details on the Dr. CI page):\n\n\n1/1 failures introduced in this PR\n\n\n1 failure not recognized by patterns:\n\n\n\nJob\nStep\nAction\n\n\n\n\n Lint / flake8-py3\nFail if there were any warnings\n\ud83d\udd01 rerun\n\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. 
CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2022-03-23T22:40:51Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1076891218 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQDAOUg==", + "hasPreviousPage": false + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "cla signed" + } + } + ] + } + } + } + } + }, + "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=79694 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "kshitij12345" + }, + "title": "[complex] conv_transpose1d", + "body": "Reference: https://github.com/pytorch/pytorch/issues/71108", + "headRefName": "develop/complex/conv_transpose1d", + "headRepository": { + "nameWithOwner": "kshitij12345/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "d1ea948e65ac6d31ad056287ab65d38ecc68b30d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "b4ba1db9a3a71bd8c03158dcd1b68711360633d8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "655a4220beae163bfe578f0318a130df01ec05d6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "Kshiteej K" + }, + "oid": 
"8181716be7a8005eb13ad5c3f2e1279ed1c60aff" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "9e5ca3663e7471786eeebebfdf84aea5d761712f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "9c110f39bcdc4e56386b6f9c4e2c082c8940ade6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "49315e79d0eee8008e2a74575c6fc0f6a9531ee4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "728752480760226270c374a0acc08e28b9b133f3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "ffe43399d6f60ef7844523a5f465c11d9a67062f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "9672a2198472567bae4ac6f55d004f7e1fa8a9fa" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "48a0ebf32b895286f036b36c871f671dc867e400" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "52fbe80d5c8a94e03d816c0bd21fd82019dcd5ac" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce" + } + } + ], + "pageInfo": { + "endCursor": "MTM", + "hasNextPage": false + 
}, + "totalCount": 13 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "edges": [ + { + "node": { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdtq8Hc=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899098" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqFo=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393316/jobs/4628529923" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTEwk=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899387" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXs=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628529910" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530162" + }, + { + "name": "Test collect_env 
(with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530698" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530867" + }, + { + "name": "Test collect_env (older_python_version)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530989" + }, + { + "name": "pr-sanity-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531151" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531475" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531753" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531853" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTHFY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899388" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXw=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-focal-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531149" + }, + { + "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531473" + }, + { + "name": "linux-bionic-py3.7-clang9 / 
build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531754" + }, + { + "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531857" + }, + { + "name": "linux-focal-py3.7-gcc7-pch / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532179" + }, + { + "name": "linux-focal-py3.7-clang10-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532543" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532694" + }, + { + "name": "linux-focal-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532918" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533033" + }, + { + "name": "linux-focal-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533181" + }, + { + "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533420" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533630" + }, + { + "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533825" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533959" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534129" + }, + { + "name": "linux-bionic-py3_7-clang8-xla / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534256" + }, + { + "name": "linux-focal-rocm5.2-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534388" + }, + { + "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534571" + }, + { + "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534714" + }, + { + "name": "win-vs2019-cuda11.6-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534989" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628535311" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639115" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639198" + }, + { + "name": 
"linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639265" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639339" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639395" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639450" + }, + { + "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639509" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639572" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639635" + }, + { + "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647047" + }, + { + "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647119" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647215" + }, + { 
+ "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647277" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647348" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647432" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647522" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647641" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647762" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628653797" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679376" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679431" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679469" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679519" + }, + { + "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679594" + }, + { + "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628681226" + }, + { + "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628854932" + }, + { + "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856434" + }, + { + "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856501" + }, + { + "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856575" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqZ2fA=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899419" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqZs=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + 
}, + "workflowRun": { + "workflow": { + "name": "windows-binary-libtorch-debug" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "libtorch-cpu-shared-with-deps-debug-build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4634503587" + }, + { + "name": "libtorch-cpu-shared-with-deps-debug-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4635312938" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsbsmM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953056" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuA=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "windows-binary-wheel" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "wheel-py3_7-cuda11_3-build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4634503571" + }, + { + "name": "wheel-py3_7-cuda11_3-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4636146265" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsskcw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953059" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuM=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "windows-binary-libtorch-release" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "libtorch-cpu-shared-with-deps-release-build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4634503570" + }, + { + "name": "libtorch-cpu-shared-with-deps-release-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4635003925" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsVbD8=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953061" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuU=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-binary-libtorch-cxx11-abi" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4634504079" + }, + { + "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4635072931" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW5Aw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953185" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2E=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-binary-libtorch-pre-cxx11" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4634503897" + }, + { + "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4635077148" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW-jo=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953186" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2I=" + }, + { + "node": { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-binary-manywheel" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "manywheel-py3_7-cuda10_2-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4634503896" + }, + { + "name": "manywheel-py3_7-cuda10_2-test / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4635934290" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsoMEA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953187" + }, + "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2M=" + } + ], + "pageInfo": { + "hasNextPage": true + } + }, + "status": null, + "pushedDate": "2022-08-22T22:04:19Z", + "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce" + } + } + ] + }, + "changedFiles": 3, + "files": { + "nodes": [ + { + "path": "aten/src/ATen/native/Convolution.cpp" + }, + { + "path": "torch/testing/_internal/common_methods_invocations.py" + }, + { + "path": "torch/testing/_internal/common_modules.py" + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "ngimel" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0xOVQxMDowNzo1NC0wNzowMLkyMDIyLTA3LTE5VDEwOjA3OjU0LTA3OjAwzj43QcY=", + "hasPreviousPage": false + 
} + }, + "comments": { + "nodes": [ + { + "bodyText": "@pytorchbot merge -g\nAll is green internally!", + "createdAt": "2022-08-23T19:29:55Z", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1224702749 + }, + { + "bodyText": "@pytorchbot successfully started a merge job. Check the current status here.\nThe merge job was triggered with the green (-g) flag. This means that your change will be merged once all checks on your PR have passed (ETA: 0-4 Hours). If this is not the intended behavior, feel free to use some of the other merge options in the wiki.\nPlease reach out to the PyTorch DevX Team with feedback or questions!", + "createdAt": "2022-08-23T19:31:18Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1224705564 + }, + { + "bodyText": "Thanks for looking into it \ud83d\ude42 @albanD @jeanschmidt", + "createdAt": "2022-08-23T19:34:36Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1224712351 + }, + { + "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-08-23T22:31:58Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1224956051 + }, + { + "bodyText": "Yeah, discussed with my manager and I got the required permissions to do so. 
Sorry for not responding promptly yesterday. But I am available from now on to provide assistance :)", + "createdAt": "2022-08-24T09:24:04Z", + "author": { + "login": "jeanschmidt" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1225462612 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOSP97HQ==", + "hasPreviousPage": true + } + }, + "labels": { + "edges": [ + { + "node": { + "name": "open source" + } + }, + { + "node": { + "name": "Merged" + } + }, + { + "node": { + "name": "cla signed" + } + }, + { + "node": { + "name": "Reverted" + } + }, + { + "node": { + "name": "ciflow/trunk" + } + }, + { + "node": { + "name": "ciflow/periodic" + } + } + ] + } + } + } + } + }, + "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOSP97HQ== name=pytorch number=79694 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/79694\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 2fd08f1 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. 
CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2022-06-16T09:43:16Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1157454523 + }, + { + "bodyText": "Unable to reproduce jit failure locally (will skip the test)\nCI Failure : https://github.com/pytorch/pytorch/runs/6926187074?check_suite_focus=true#step:9:20230\npytest test/test_ops_jit.py -k test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 -v\n=============================================================== test session starts ===============================================================\nplatform linux -- Python 3.10.0, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 -- /home/kshiteej/.conda/envs/pytorch-cuda-dev/bin/python\ncachedir: .pytest_cache\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/kshiteej/Pytorch/pytorch_complex_convolution.py/.hypothesis/examples')\nrootdir: /home/kshiteej/Pytorch/pytorch_complex_convolution.py, configfile: pytest.ini\nplugins: hypothesis-6.23.2, repeat-0.9.1\ncollected 1976 items / 1975 deselected / 1 selected \n\ntest/test_ops_jit.py::TestJitCPU::test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 PASSED [100%]\n\n================================================================ warnings summary =================================================================\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9\n /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. 
Use setuptools or check PEP 632 for potential alternatives\n from distutils.version import LooseVersion\n\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91\n /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.\n warnings.warn(\n\n-- Docs: https://docs.pytest.org/en/stable/warnings.html\n================================================= 1 passed, 1975 deselected, 2 warnings in 4.90s =================================================", + "createdAt": "2022-07-18T09:05:35Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": { + "login": "kshitij12345" + }, + "databaseId": 1186949486 + }, + { + "bodyText": "@pytorchbot merge", + "createdAt": "2022-07-19T17:12:23Z", + "author": { + "login": "ngimel" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1189347786 + }, + { + "bodyText": "@pytorchbot successfully started a merge job. Check the current status here", + "createdAt": "2022-07-19T17:13:42Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1189350009 + }, + { + "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-07-19T17:14:25Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1189350932 + }, + { + "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"", + "createdAt": "2022-07-19T19:15:41Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1189459845 + }, + { + "bodyText": "@pytorchbot successfully started a revert job. Check the current status here", + "createdAt": "2022-07-19T19:16:59Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1189460926 + }, + { + "bodyText": "Will not revert as @kshitij12345 is not a MEMBER, but COLLABORATOR", + "createdAt": "2022-07-19T19:17:00Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1189460942 + }, + { + "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"", + "createdAt": "2022-07-19T20:40:04Z", + "author": { + "login": "anjali411" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1189529734 + }, + { + "bodyText": "@pytorchbot successfully started a revert job. 
Check the current status here", + "createdAt": "2022-07-19T20:41:20Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1189530756 + }, + { + "bodyText": "@kshitij12345 your PR has been successfully reverted.", + "createdAt": "2022-07-19T20:41:25Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1189530831 + }, + { + "bodyText": "@pytorchbot merge -g", + "createdAt": "2022-07-20T09:53:08Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1190070141 + }, + { + "bodyText": "@pytorchbot successfully started a merge job. Check the current status here", + "createdAt": "2022-07-20T09:54:24Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1190071424 + }, + { + "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "createdAt": "2022-07-20T13:00:51Z", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1190258272 + }, + { + "bodyText": "commit is breaking internal builds/tests https://pastebin.com/HX4RUusH (pytorch/functorch/test:test_eager_transforms)", + "createdAt": "2022-07-21T10:39:01Z", + "author": { + "login": "jeanschmidt" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1191327616 + }, + { + "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"", + "createdAt": "2022-07-21T10:39:27Z", + "author": { + "login": "jeanschmidt" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1191328013 + }, + { + "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"", + "createdAt": "2022-07-21T10:41:23Z", + "author": { + "login": "jeanschmidt" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1191329792 + }, + { + "bodyText": "@pytorchbot successfully started a revert job. Check the current status here", + "createdAt": "2022-07-21T10:42:16Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1191330586 + }, + { + "bodyText": "@kshitij12345 your PR has been successfully reverted.", + "createdAt": "2022-07-21T10:42:23Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1191330690 + }, + { + "bodyText": "@jeanschmidt which test is it failing on? 
I tried running the test_eager_transforms in functorch but couldn't reproduce it.", + "createdAt": "2022-07-25T07:11:19Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1193667568 + }, + { + "bodyText": "@jbschlosser have added a ref as discussed offline. Can you please take a look? And if it looks good, can you import the PR to check if it is breaking anything internally.\nThanks", + "createdAt": "2022-08-03T18:30:17Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1204329491 + }, + { + "bodyText": "@jbschlosser @jeanschmidt @albanD anything we can do to unblock this on our side?", + "createdAt": "2022-08-20T09:27:17Z", + "author": { + "login": "lezcano" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1221266218 + }, + { + "bodyText": "Functorch tests should be running here now so can you rebase on top of master please?", + "createdAt": "2022-08-22T21:42:37Z", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1223129944 + }, + { + "bodyText": "@albanD have rebased on latest master.", + "createdAt": "2022-08-23T08:49:10Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1223758571 + }, + { + "bodyText": "I triggered all the tests not to have any issues with slow tests again", + "createdAt": "2022-08-23T09:20:18Z", + "author": { + "login": "lezcano" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1223796413 + }, + { + "bodyText": "Thanks @lezcano! However, last time it was reverted for internal failures. 
So it would be great if someone can import and verify that.\ncc: @albanD @jeanschmidt", + "createdAt": "2022-08-23T10:17:50Z", + "author": { + "login": "kshitij12345" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1223863075 + }, + { + "bodyText": "@albanD has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "createdAt": "2022-08-23T14:43:02Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1224175731 + }, + { + "bodyText": "I am not the right person to provide assistence, as currently I am not based in a Tier 1 location, so my permissions to access are so restricted that I am not able to import this commit, run the tests and provide meaningful responses.", + "createdAt": "2022-08-23T15:57:48Z", + "author": { + "login": "jeanschmidt" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1224272324 + }, + { + "bodyText": "@jeanschmidt has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "createdAt": "2022-08-23T17:00:53Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1224351135 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHORP1auw==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOR1poyg== name=pytorch number=82169 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/82169\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? 
Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 28140e4 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2022-07-25T21:41:41Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1194667199 + }, + { + "bodyText": "@pytorchbot merge -g", + "createdAt": "2022-07-25T21:46:04Z", + "author": { + "login": "ezyang" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1194671445 + }, + { + "bodyText": "@pytorchbot successfully started a merge job. Check the current status here", + "createdAt": "2022-07-25T21:47:25Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1194672744 + }, + { + "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) pull failed for rule superuser\nRaised by https://github.com/pytorch/pytorch/actions/runs/2735501647", + "createdAt": "2022-07-25T23:22:45Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1194761219 + }, + { + "bodyText": "@pytorchbot rebase", + "createdAt": "2022-07-26T00:54:17Z", + "author": { + "login": "ezyang" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1194839920 + }, + { + "bodyText": "@pytorchbot successfully started a rebase job. 
Check the current status here", + "createdAt": "2022-07-26T01:01:32Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1194846575 + }, + { + "bodyText": "Successfully rebased gh/ezyang/1279/orig onto refs/remotes/origin/master, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/82169)", + "createdAt": "2022-07-26T01:01:53Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1194846838 + }, + { + "bodyText": "@pytorchbot rebase", + "createdAt": "2022-07-27T15:32:13Z", + "author": { + "login": "ezyang" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1196915484 + }, + { + "bodyText": "@pytorchbot successfully started a rebase job. Check the current status here", + "createdAt": "2022-07-27T15:33:49Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1196917359 + }, + { + "bodyText": "Successfully rebased gh/ezyang/1279/orig onto refs/remotes/origin/master, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/82169)", + "createdAt": "2022-07-27T15:34:03Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1196917609 + }, + { + "bodyText": "@pytorchbot merge -g", + "createdAt": "2022-07-27T15:41:52Z", + "author": { + "login": "ezyang" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1196927174 + }, + { + "bodyText": "@pytorchbot successfully started a merge job. 
Check the current status here", + "createdAt": "2022-07-27T15:43:11Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1196928771 + }, + { + "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) Lint failed for rule superuser\nRaised by https://github.com/pytorch/pytorch/actions/runs/2747872935", + "createdAt": "2022-07-27T15:43:14Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1196928849 + }, + { + "bodyText": "@pytorchbot merge -g", + "createdAt": "2022-07-27T16:59:37Z", + "author": { + "login": "ezyang" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1197046487 + }, + { + "bodyText": "@pytorchbot successfully started a merge job. Check the current status here", + "createdAt": "2022-07-27T17:07:32Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1197055101 + }, + { + "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) Lint failed for rule superuser\nRaised by https://github.com/pytorch/pytorch/actions/runs/2748317347", + "createdAt": "2022-07-27T17:07:36Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1197055259 + }, + { + "bodyText": "@pytorchbot merge -f", + "createdAt": "2022-07-27T17:56:26Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1197107106 + }, + { + "bodyText": "\u274c \ud83e\udd16 pytorchbot command failed:\n@pytorchbot merge: error: argument -f/--force: expected one argument\n\nusage: @pytorchbot merge [-g | -f FORCE | -l]\n\nTry @pytorchbot --help for more info.", + "createdAt": "2022-07-27T17:56:27Z", + "author": { + "login": "pytorch-bot" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1197107129 + } + ], + "pageInfo": { + 
"startCursor": "Y3Vyc29yOnYyOpHORzUsvw==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOPoR4Lg== name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/coolteemf/pytorch/blob/7647f7953a68e4f1c3feaa19c77d925abfe8e377/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.6-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/xla\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux\n\u2705 triggered\n\n\nlinux-xenial-py3.6-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers\n\u2705 triggered\n\n\nlinux-xenial-py3.6-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux\n\u2705 triggered\n\n\nlinux-xenial-py3.6-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\nlibtorch-linux-xenial-cuda10.2-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda10.2-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow\n\ud83d\udeab 
skipped\n\n\nparallelnative-linux-xenial-py3.6-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npuretorch-linux-xenial-py3.6-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux\n\ud83d\udeab skipped", + "createdAt": "2022-01-25T09:31:05Z", + "author": { + "login": "pytorch-bot" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1020983378 + }, + { + "bodyText": "Hi @coolteemf!\nThank you for your pull request and welcome to our community.\nAction Required\nIn order to merge any pull request (code, docs, etc.), we require contributors to sign our Contributor License Agreement, and we don't seem to have one on file for you.\nProcess\nIn order for us to review and merge your suggested changes, please sign at https://code.facebook.com/cla. If you are contributing on behalf of someone else (eg your employer), the individual CLA may not be sufficient and your employer may need to sign the corporate CLA.\nOnce the CLA is signed, our tooling will perform checks and validations. Afterwards, the pull request will be tagged with CLA signed. The tagging process may take up to 1 hour after signing. Please give it that time before contacting us about it.\nIf you have received this in error or have any questions, please contact us at cla@fb.com. 
Thanks!", + "createdAt": "2022-01-25T09:31:06Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1020983383 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/71759\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 346e0c5 (more details on the Dr. CI page):\n\n\n2/3 failures introduced in this PR\n1/3 tentatively recognized as flaky \u2744\ufe0f\n\nClick here to rerun these jobs\n\n\n\n\n\ud83d\udd75\ufe0f 2 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (1/2)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-02-23T14:12:58.9371445Z FAIL [0.010s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n\n2022-02-23T14:12:58.9258506Z test_sparse_zeros_tanh_cpu_float64 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.002s)\n2022-02-23T14:12:58.9274771Z test_sparse_zeros_tanh_cpu_int16 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.001s)\n2022-02-23T14:12:58.9290805Z test_sparse_zeros_tanh_cpu_int32 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.001s)\n2022-02-23T14:12:58.9306695Z test_sparse_zeros_tanh_cpu_int64 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9322595Z test_sparse_zeros_tanh_cpu_int8 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9338535Z test_sparse_zeros_tanh_cpu_uint8 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9354468Z test_sparse_zeros_trunc_cpu_float32 (__main__.TestSparseUnaryUfuncsCPU) ... 
ok (0.000s)\n2022-02-23T14:12:58.9370208Z test_sparse_zeros_trunc_cpu_float64 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9370712Z \n2022-02-23T14:12:58.9370976Z ======================================================================\n2022-02-23T14:12:58.9371445Z FAIL [0.010s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n2022-02-23T14:12:58.9372134Z ----------------------------------------------------------------------\n2022-02-23T14:12:58.9372597Z Traceback (most recent call last):\n2022-02-23T14:12:58.9374021Z File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_device_type.py\", line 376, in instantiated_test\n2022-02-23T14:12:58.9374740Z result = test(self, **param_kwargs)\n2022-02-23T14:12:58.9375570Z File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_utils.py\", line 2951, in wrapped\n2022-02-23T14:12:58.9376266Z f(self, *args, **kwargs, coalesced=False)\n2022-02-23T14:12:58.9376972Z File \"test_sparse.py\", line 1272, in test_sparse_addmm\n2022-02-23T14:12:58.9377402Z test_shape(7, 8, 9, 20, True, None)\n2022-02-23T14:12:58.9377939Z File \"test_sparse.py\", line 1264, in test_shape\n2022-02-23T14:12:58.9378373Z self.assertEqual(Y, Y_dense)\n\n\n win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (2/2)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-02-23T15:20:20.5710678Z FAIL [0.031s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n\n2022-02-23T15:20:20.5569146Z test_sparse_zeros_tanh_cuda_float64 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5589083Z test_sparse_zeros_tanh_cuda_int16 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5609025Z test_sparse_zeros_tanh_cuda_int32 (__main__.TestSparseUnaryUfuncsCUDA) ... 
ok (0.000s)\n2022-02-23T15:20:20.5629080Z test_sparse_zeros_tanh_cuda_int64 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.016s)\n2022-02-23T15:20:20.5649102Z test_sparse_zeros_tanh_cuda_int8 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5668867Z test_sparse_zeros_tanh_cuda_uint8 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5688700Z test_sparse_zeros_trunc_cuda_float32 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5708285Z test_sparse_zeros_trunc_cuda_float64 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5709405Z \n2022-02-23T15:20:20.5709879Z ======================================================================\n2022-02-23T15:20:20.5710678Z FAIL [0.031s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n2022-02-23T15:20:20.5711399Z ----------------------------------------------------------------------\n2022-02-23T15:20:20.5712013Z Traceback (most recent call last):\n2022-02-23T15:20:20.5713280Z File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_device_type.py\", line 376, in instantiated_test\n2022-02-23T15:20:20.5714267Z result = test(self, **param_kwargs)\n2022-02-23T15:20:20.5715299Z File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_utils.py\", line 2951, in wrapped\n2022-02-23T15:20:20.5716240Z f(self, *args, **kwargs, coalesced=False)\n2022-02-23T15:20:20.5716943Z File \"test_sparse.py\", line 1275, in test_sparse_addmm\n2022-02-23T15:20:20.5717516Z test_shape(7, 8, 9, 20, False, (1, 1))\n2022-02-23T15:20:20.5718323Z File \"test_sparse.py\", line 1264, in test_shape\n2022-02-23T15:20:20.5718915Z self.assertEqual(Y, Y_dense)\n\n\n\n\u2744\ufe0f 1 failure tentatively classified as flaky\nbut reruns have not yet been triggered to confirm:\n linux-bionic-rocm4.5-py3.7 / test (distributed, 1, 1, linux.rocm.gpu) (1/1)\nStep: 
\"Test\" (full log | diagnosis details | \ud83d\udd01 rerun) \u2744\ufe0f\n\n\n2022-02-23T16:16:26.7221984Z RuntimeError: Proc...ated or timed out after 100.06913685798645 seconds\n\n2022-02-23T16:16:26.7207909Z ERROR [100.093s]: test_collect_shards (__main__.TestZeroRedundancyOptimizerDistributed)\n2022-02-23T16:16:26.7209206Z Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer\n2022-02-23T16:16:26.7213073Z ----------------------------------------------------------------------\n2022-02-23T16:16:26.7213996Z Traceback (most recent call last):\n2022-02-23T16:16:26.7215434Z File \"/opt/conda/lib/python3.7/site-packages/torch/testing/_internal/common_distributed.py\", line 483, in wrapper\n2022-02-23T16:16:26.7216409Z self._join_processes(fn)\n2022-02-23T16:16:26.7217801Z File \"/opt/conda/lib/python3.7/site-packages/torch/testing/_internal/common_distributed.py\", line 702, in _join_processes\n2022-02-23T16:16:26.7218822Z self._check_return_codes(elapsed_time)\n2022-02-23T16:16:26.7220266Z File \"/opt/conda/lib/python3.7/site-packages/torch/testing/_internal/common_distributed.py\", line 754, in _check_return_codes\n2022-02-23T16:16:26.7221201Z i, elapsed_time\n2022-02-23T16:16:26.7221984Z RuntimeError: Process 0 terminated or timed out after 100.06913685798645 seconds\n2022-02-23T16:16:26.7222551Z \n2022-02-23T16:16:26.7223245Z ----------------------------------------------------------------------\n2022-02-23T16:16:26.7224032Z Ran 26 tests in 303.663s\n2022-02-23T16:16:26.7224400Z \n2022-02-23T16:16:26.7224780Z FAILED (errors=1, skipped=8, unexpected successes=3)\n2022-02-23T16:16:26.7225718Z \n2022-02-23T16:16:26.7225992Z Generating XML reports...\n2022-02-23T16:16:26.7336797Z Generated XML report: test-reports/python-unittest/distributed.optim.test_zero_redundancy_optimizer/TEST-TestZeroRedundancyOptimizerDistributed-20220223161123.xml\n2022-02-23T16:16:26.7349296Z Generated XML report: 
test-reports/python-unittest/distributed.optim.test_zero_redundancy_optimizer/TEST-TestZeroRedundancyOptimizerSingleRank-20220223161123.xml\n2022-02-23T16:16:27.6823633Z Traceback (most recent call last):\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "createdAt": "2022-01-25T09:31:08Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1020983433 + }, + { + "bodyText": "Thank you for signing our Contributor License Agreement. We can now accept your code for this (and any) Meta Open Source project. Thanks!", + "createdAt": "2022-01-25T18:07:45Z", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1021467314 + }, + { + "bodyText": "@albanD Is there something that needs to be done to correct the failed check ?", + "createdAt": "2022-02-04T13:18:05Z", + "author": { + "login": "coolteemf" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1029978104 + }, + { + "bodyText": "Hi,\nI think you didn't do the merge properly as there are now a lot more commits than it should be in this PR.\nYou can either clean up the branch locally and force push here or open a new clean PR.\nNote that in general, it is better to rebase on top of master than merge master into your branch!", + "createdAt": "2022-02-04T14:28:28Z", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1030038719 + }, + { + "bodyText": "Okay thank you for the heads up", + "createdAt": "2022-02-04T16:44:46Z", + "author": { + "login": "coolteemf" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1030159616 + }, + { + "bodyText": "@albanD I just rebased and updated the branch to take into 
account changes from 28388b4. Is it all clear for merging ?", + "createdAt": "2022-02-16T15:34:59Z", + "author": { + "login": "coolteemf" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1041720345 + }, + { + "bodyText": "Thanks! The CI needs fixing for bc-compat and lint though\n\nThe lint should be fixed, however I didn't find clear instructions on how to fix the bc compat.\nI guess output_mask could be made optional, however in the case of native_group_norm_backward the same argument is not optional.", + "createdAt": "2022-02-17T08:04:30Z", + "author": { + "login": "coolteemf" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1042672732 + }, + { + "bodyText": "Since we are changing the signature on purpose here, you can add it to the list at https://github.com/pytorch/pytorch/blob/master/test/forward_backward_compatibility/check_forward_backward_compatibility.py#L29 to silence the test.", + "createdAt": "2022-02-17T14:41:16Z", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1043020903 + }, + { + "bodyText": "@pytorchbot merge this please", + "createdAt": "2022-02-23T14:48:05Z", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048861185 + }, + { + "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by https://github.com/pytorch/pytorch/actions/runs/1887914411", + "createdAt": "2022-02-23T14:49:16Z", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048862374 + }, + { + "bodyText": "@coolteemf you can ignore me playing with the bot. 
Nothing is needed on your end anymore, I'll take it from here.", + "createdAt": "2022-02-23T14:52:10Z", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048865236 + }, + { + "bodyText": "@pytorchbot merge this", + "createdAt": "2022-02-23T14:54:23Z", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048867615 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOPNr4Ug==", + "hasPreviousPage": false + } + } + } + } + } } } diff --git a/.github/scripts/test_check_labels.py b/.github/scripts/test_check_labels.py new file mode 100644 index 0000000000000..64e91dcd8ecbe --- /dev/null +++ b/.github/scripts/test_check_labels.py @@ -0,0 +1,77 @@ +"""test_check_labels.py""" + +from typing import Any +from unittest import TestCase, mock, main + +from trymerge import GitHubPR +from test_trymerge import mocked_gh_graphql +from check_labels import has_required_labels + +release_notes_labels = [ + "release notes: AO frontend", + "release notes: autograd", + "release notes: benchmark", + "release notes: build", + "release notes: complex", + "release notes: composability", + "release notes: cpp", + "release notes: cuda", + "release notes: cudnn", + "release notes: dataloader", + "release notes: distributed (c10d)", + "release notes: distributed (ddp)", + "release notes: distributed (fsdp)", + "release notes: distributed (pipeline)", + "release notes: distributed (rpc)", + "release notes: distributed (sharded)", + "release notes: foreach_frontend", + "release notes: functorch", + "release notes: fx", + "release notes: hub", + "release notes: jit", + "release notes: lazy", + "release notes: linalg_frontend", + "release notes: memory format", + "release notes: Meta API", + "release notes: mobile", + "release notes: mps", + "release notes: nested tensor", + "release notes: nn", + "release notes: onnx", + "release notes: package/deploy", + "release notes: 
performance_as_product", + "release notes: profiler", + "release notes: python_frontend", + "release notes: quantization", + "release notes: releng", + "release notes: rocm", + "release notes: sparse", + "release notes: visualization", + "release notes: vulkan", +] + + +class TestCheckLabels(TestCase): + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels) + def test_pr_with_missing_labels(self, mocked_rn_labels: Any, mocked_gql: Any) -> None: + "Test PR with no 'release notes:' label or 'topic: not user facing' label" + pr = GitHubPR("pytorch", "pytorch", 82169) + self.assertFalse(has_required_labels(pr)) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels) + def test_pr_with_release_notes_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None: + "Test PR with 'release notes: nn' label" + pr = GitHubPR("pytorch", "pytorch", 71759) + self.assertTrue(has_required_labels(pr)) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels) + def test_pr_with_not_user_facing_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None: + "Test PR with 'topic: not user facing' label" + pr = GitHubPR("pytorch", "pytorch", 75095) + self.assertTrue(has_required_labels(pr)) + +if __name__ == "__main__": + main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 87ca3ac06579c..502b22d847d23 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -179,6 +179,7 @@ class WorkflowCheckState(NamedTuple): comments(last: 5) { nodes { bodyText + createdAt author { login } @@ -336,6 +337,7 @@ class WorkflowCheckState(NamedTuple): comments(last: 100, before: $cursor) { nodes { bodyText + createdAt author { login } @@ -583,6 +585,7 
@@ def can_skip_internal_checks(pr: "GitHubPR", comment_id: Optional[int] = None) - @dataclass class GitHubComment: body_text: str + created_at: str author_login: str author_association: str editor_login: Optional[str] @@ -807,6 +810,7 @@ def get_pr_url(self) -> str: def _comment_from_node(node: Any) -> GitHubComment: editor = node["editor"] return GitHubComment(body_text=node["bodyText"], + created_at=node["createdAt"] if "createdAt" in node else "", author_login=node["author"]["login"], author_association=node["authorAssociation"], editor_login=editor["login"] if editor else None, diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 1803395f81d97..bcee8ab86c83e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -226,6 +226,38 @@ jobs: false fi + check-labels: + name: Check labels + runs-on: linux.20_04.16x + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + fetch-depth: 1 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.8 + architecture: x64 + cache: 'pip' + cache-dependency-path: | + **/.github/requirements-gha-cache.txt + + - name: Install requirements + id: requirements + run: | + pip install -r .github/requirements-gha-cache.txt --user + + - name: Check labels + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUM: ${{ github.event.number }} + run: | + set -ex + python3 .github/scripts/check_labels.py "${PR_NUM}" + test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} From b6b67977dd1da2954a95d7f3e7f8dae78724c51d Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Fri, 21 Oct 2022 08:29:10 -0700 Subject: [PATCH 0024/1922] functionalization: make view_copy outputs always contiguous (#85747) This fixes an issue with mobile: The output of view_copy ops should always be contiguous. 
Later, we can consider adding optional arguments to the `view_copy()` functions to let you explicitly say what the contiguity of the output can be (e.g. channels_last) Pull Request resolved: https://github.com/pytorch/pytorch/pull/85747 Approved by: https://github.com/ezyang --- test/test_functionalization.py | 28 +++++++++++++++++--------- test/test_view_ops.py | 6 ++++++ torchgen/gen_functionalization_type.py | 6 +++--- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/test/test_functionalization.py b/test/test_functionalization.py index 2eb79c73cc0bd..041e5b84f6945 100644 --- a/test/test_functionalization.py +++ b/test/test_functionalization.py @@ -659,23 +659,31 @@ def forward(self, a_1): getitem_1 = split_copy[1]; split_copy = None add_1 = torch.ops.aten.add.Tensor(getitem, ones); getitem = ones = None select_copy = torch.ops.aten.select_copy.int(_reshape_alias_copy, 0, 0); _reshape_alias_copy = None - clone = torch.ops.aten.clone.default(add_1, memory_format = torch.contiguous_format) - _unsafe_view = torch.ops.aten._unsafe_view.default(clone, [4]); clone = None + _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(add_1, [4], [1]) view_copy_1 = torch.ops.aten.view_copy.default(add, [8]); add = None - _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(view_copy_1, [2, 4], [4, 1]); view_copy_1 = None - transpose_copy_1 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_1, 1, 0); _reshape_alias_copy_1 = None + _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(view_copy_1, [2, 4], [4, 1]); view_copy_1 = None + transpose_copy_1 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_2, 1, 0); _reshape_alias_copy_2 = None unsqueeze_copy_1 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_1, 0); transpose_copy_1 = None squeeze_copy_1 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_1); unsqueeze_copy_1 = None slice_scatter = torch.ops.aten.slice_scatter.default(squeeze_copy_1, add_1, 0, 0, 
2); squeeze_copy_1 = None unsqueeze_copy_2 = torch.ops.aten.unsqueeze_copy.default(slice_scatter, 0); slice_scatter = None squeeze_copy_2 = torch.ops.aten.squeeze_copy.dim(unsqueeze_copy_2, 0); unsqueeze_copy_2 = None transpose_copy_2 = torch.ops.aten.transpose_copy.int(squeeze_copy_2, 1, 0); squeeze_copy_2 = None - _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(transpose_copy_2, [8], [1]); transpose_copy_2 = None - view_copy_2 = torch.ops.aten.view_copy.default(_reshape_alias_copy_2, [4, 2]); _reshape_alias_copy_2 = None - view_copy_3 = torch.ops.aten.view_copy.default(view_copy_2, [8]); view_copy_2 = None - _reshape_alias_copy_3 = torch.ops.aten._reshape_alias_copy.default(view_copy_3, [2, 4], [4, 1]); view_copy_3 = None - select_copy_1 = torch.ops.aten.select_copy.int(_reshape_alias_copy_3, 0, 0); _reshape_alias_copy_3 = None - add_2 = torch.ops.aten.add.Tensor(select_copy_1, _unsafe_view); select_copy_1 = _unsafe_view = None + _reshape_alias_copy_3 = torch.ops.aten._reshape_alias_copy.default(transpose_copy_2, [8], [1]); transpose_copy_2 = None + view_copy_2 = torch.ops.aten.view_copy.default(_reshape_alias_copy_3, [4, 2]); _reshape_alias_copy_3 = None + view_copy_3 = torch.ops.aten.view_copy.default(view_copy_2, [8]) + _reshape_alias_copy_4 = torch.ops.aten._reshape_alias_copy.default(view_copy_3, [2, 4], [4, 1]); view_copy_3 = None + select_copy_1 = torch.ops.aten.select_copy.int(_reshape_alias_copy_4, 0, 0); _reshape_alias_copy_4 = None + view_copy_4 = torch.ops.aten.view_copy.default(view_copy_2, [8]); view_copy_2 = None + _reshape_alias_copy_5 = torch.ops.aten._reshape_alias_copy.default(view_copy_4, [2, 4], [4, 1]); view_copy_4 = None + transpose_copy_3 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_5, 1, 0); _reshape_alias_copy_5 = None + unsqueeze_copy_3 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_3, 0); transpose_copy_3 = None + squeeze_copy_3 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_3); 
unsqueeze_copy_3 = None + split_copy_1 = torch.ops.aten.split_copy.Tensor(squeeze_copy_3, 2); squeeze_copy_3 = None + getitem_2 = split_copy_1[0] + getitem_3 = split_copy_1[1]; split_copy_1 = None + _reshape_alias_copy_6 = torch.ops.aten._reshape_alias_copy.default(getitem_2, [4], [1]); getitem_2 = None + add_2 = torch.ops.aten.add.Tensor(select_copy_1, _reshape_alias_copy_6); select_copy_1 = _reshape_alias_copy_6 = None return add_1 """) # noqa: B950 diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 6c65457ae24f1..3c5987e65ae75 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -926,6 +926,12 @@ def test_view_copy(self, device): self.assertEqual(a_view_copy, a_view) self.assertEqual(a.grad, a_ref.grad) + # Testing that the output of a view_copy kernel (by default) is contiguous. + def test_view_copy_output_contiguous(self, device): + a = torch.randn(4, 4, 4, 4, device=device).to(memory_format=torch.channels_last) + b = torch.ops.aten.slice_copy(a, 0, 0, 2) + self.assertTrue(b.is_contiguous()) + def test_view_copy_out(self, device): a = torch.randn(2, 2, device=device) out = torch.empty(2, device=device) diff --git a/torchgen/gen_functionalization_type.py b/torchgen/gen_functionalization_type.py index a3f9b0b0ff2cb..a27b4f327b2ac 100644 --- a/torchgen/gen_functionalization_type.py +++ b/torchgen/gen_functionalization_type.py @@ -91,7 +91,7 @@ def gen_composite_view_copy_kernel(g: NativeFunctionsViewGroup) -> Optional[str] return self.reshape_symint(size); } else { auto output = at::_ops::view::call(self, size); - return output.clone(); + return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous); } } """ @@ -117,13 +117,13 @@ def gen_composite_view_copy_kernel(g: NativeFunctionsViewGroup) -> Optional[str] if g.view.func.returns[0].type == BaseType(BaseTy.Tensor): return_cloned_output = """\ - return output.clone();""" + return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous);""" else: # If the return type is a 
list, we need to clone each tensor in the list. return_cloned_output = f"""\ {view_copy_sig.returns_type().cpp_type()} out_clone; for (const auto i : c10::irange(output.size())) {{ - out_clone.push_back(output[i].clone()); + out_clone.push_back(output[i].clone(/*memory_format=*/at::MemoryFormat::Contiguous)); }} return out_clone;""" From aa2decaa9cfd6177fe860fb36d037849607a391e Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Fri, 21 Oct 2022 11:17:39 -0400 Subject: [PATCH 0025/1922] ci: Allow nvidia-smi to continue with non-0 exit (#87464) Allows nvidia-smi to return a non-0 exit status like status 14 since status 14 is a warning and doesn't affect actual execution see https://github.com/NVIDIA/gpu-operator/issues/285 Signed-off-by: Eli Uriegas Pull Request resolved: https://github.com/pytorch/pytorch/pull/87464 Approved by: https://github.com/atalman, https://github.com/malfet, https://github.com/ZainRizvi --- .github/scripts/install_nvidia_utils_linux.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh index 855d15dde83b4..79f588633794e 100755 --- a/.github/scripts/install_nvidia_utils_linux.sh +++ b/.github/scripts/install_nvidia_utils_linux.sh @@ -51,7 +51,18 @@ install_nvidia_driver_amzn2() { sudo rm -fv /tmp/nvidia_driver fi - nvidia-smi + ( + set +e + nvidia-smi + status=$? 
+ # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285 + if [ $status -eq 0 ] || [ $status -eq 14 ]; then + echo "INFO: Ignoring allowed status ${status}" + else + echo "ERROR: nvidia-smi exited with unresolved status ${status}" + exit ${status} + fi + ) ) } From cb300de040239a690b94ca187a7ca8dbe522bc84 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 21 Oct 2022 18:13:56 +0000 Subject: [PATCH 0026/1922] fix for dynamo xml reporting (#87378) dynamo tests call a helper function in torch/_dynamo/test_case.py which then calls run_tests in common_utils.py so the test report path looked something like /opt/conda/lib/python3/10/site-packages/torch/_dynamo/test_case * instead of using frame, use argv[0] which should be the invoking file * got rid of sanitize functorch test name because theyve been moved into the test folder Pull Request resolved: https://github.com/pytorch/pytorch/pull/87378 Approved by: https://github.com/huydhn --- torch/testing/_internal/common_utils.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index cb9b52c338118..77887574e1888 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -612,15 +612,6 @@ def sanitize_test_filename(filename): strip_py = re.sub(r'.py$', '', filename) return re.sub('/', r'.', strip_py) -# hack until https://github.com/pytorch/pytorch/issues/82109 is resolved -def sanitize_if_functorch_test_filename(filename): - # absolute filenames must be converted to relative paths, otherwise, - # we cannot prepend test-reports/ to it - # (e.g. test-reports\\C:\\... 
on windows is nonsense) - if filename.startswith(CI_FUNCTORCH_ROOT): - filename = filename[len(CI_PT_ROOT) + 1:] - return filename - def lint_test_case_extension(suite): succeed = True for test_case_or_suite in suite: @@ -640,10 +631,8 @@ def lint_test_case_extension(suite): return succeed -def get_report_path(pytest=False): - test_filename = inspect.getfile(sys._getframe(2)) - test_filename = sanitize_if_functorch_test_filename(test_filename) - test_filename = sanitize_test_filename(test_filename) +def get_report_path(argv=UNITTEST_ARGS, pytest=False): + test_filename = sanitize_test_filename(argv[0]) test_report_path = TEST_SAVE_XML + LOG_SUFFIX test_report_path = os.path.join(test_report_path, test_filename) if pytest: From bd7e75696f6d5d907db413d412a16ea4ac29c8de Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Fri, 21 Oct 2022 18:15:38 +0000 Subject: [PATCH 0027/1922] Only label checks against pull requests (#87488) When a commit is triggered via any mechanism other than a pull request, there will not be a PR to check labels for. 
The job will fail with the error: ``` 2022-10-21T17:50:53.2938592Z + python3 .github/scripts/check_labels.py '' 2022-10-21T17:50:53.4758863Z usage: Check PR labels [-h] pr_num 2022-10-21T17:50:53.4759337Z Check PR labels: error: argument pr_num: invalid int value: '' ``` Instead, we should limit the workflow to only run on pull requests Pull Request resolved: https://github.com/pytorch/pytorch/pull/87488 Approved by: https://github.com/huydhn --- .github/workflows/lint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index bcee8ab86c83e..669977b143a5e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -229,6 +229,7 @@ jobs: check-labels: name: Check labels runs-on: linux.20_04.16x + if: github.event_name == 'pull_request' steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@master From 549d6e0828fd96a407c4a81fb53e975801397a2c Mon Sep 17 00:00:00 2001 From: Iris Zhang Date: Fri, 21 Oct 2022 18:45:38 +0000 Subject: [PATCH 0028/1922] [1/N][C10D] Add a customized ScubaLogHandler implementation for internal FB use (#86699) (#87123) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/86699 This diff does the following: 1. **c10d_error_logger.py**: Add an API to create a logger with a specific logging handler based on the destination. 2. The API from above would get a logging handler based on the destination provided. - **caffe2/torch/distributed/logging_handlers.py**: For OSS, we simply use a NullHandler() for now. 3. Add associated test files for 1 and 2. 
Test Plan: ## Unit Test ``` buck test @//mode/dev-nosan //caffe2/test/distributed:test_c10d_error_logger -- --print-passing-details ``` ``` File changed: fbcode//caffe2/test/distributed/test_c10d_error_logger.py File changed: fbsource//xplat/caffe2/test/distributed/TARGETS 9 additional file changes waiting for all tests to finish... ✓ Listing success: caffe2/test/distributed:test_c10d_error_logger (0.2s) Found 1 tests ✓ Pass: caffe2/test/distributed:test_c10d_error_logger - test_get_or_create_logger (caffe2.test.distributed.test_c10d_error_logger.C10dErrorLoggerTest) (0.2s) stdout: stderr: Buck UI: https://www.internalfb.com/buck2/b975f6b0-77e9-4287-8722-f95b48036181 Test Session: https://www.internalfb.com/intern/testinfra/testrun/1407375150206593 RE: reSessionID-4d7ab8ca-1051-48e9-a5a8-6edbe15d1fe4 Up: 124 B Down: 0 B Jobs completed: 5. Time elapsed: 3.5s. Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. 0 builds failed ``` Differential Revision: D39920391 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87123 Approved by: https://github.com/fduwjj, https://github.com/H-Huang --- test/distributed/test_c10d_error_logger.py | 17 ++++++++++ test/run_test.py | 1 + torch/distributed/c10d_error_logger.py | 36 ++++++++++++++++++++++ torch/distributed/logging_handlers.py | 14 +++++++++ 4 files changed, 68 insertions(+) create mode 100644 test/distributed/test_c10d_error_logger.py create mode 100644 torch/distributed/c10d_error_logger.py create mode 100644 torch/distributed/logging_handlers.py diff --git a/test/distributed/test_c10d_error_logger.py b/test/distributed/test_c10d_error_logger.py new file mode 100644 index 0000000000000..8001f2b869d83 --- /dev/null +++ b/test/distributed/test_c10d_error_logger.py @@ -0,0 +1,17 @@ +# Owner(s): ["oncall: distributed"] + +import logging +import unittest +from unittest.mock import patch + +from torch.distributed.c10d_error_logger import _get_or_create_logger + +class C10dErrorLoggerTest(unittest.TestCase): + + 
@patch("torch.distributed.c10d_error_logger._get_logging_handler") + def test_get_or_create_logger(self, logging_handler_mock): + logging_handler_mock.return_value = logging.NullHandler(), "NullHandler" + logger = _get_or_create_logger() + self.assertIsNotNone(logger) + self.assertEqual(1, len(logger.handlers)) + self.assertIsInstance(logger.handlers[0], logging.NullHandler) diff --git a/test/run_test.py b/test/run_test.py index 35004406d0115..620a8b712aeeb 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -170,6 +170,7 @@ def skip_test_p(name: str) -> bool: "distributed/elastic/events/lib_test", "distributed/elastic/agent/server/test/api_test", "test_deploy", + "distributed/test_c10d_error_logger.py" ] WINDOWS_BLOCKLIST = [ diff --git a/torch/distributed/c10d_error_logger.py b/torch/distributed/c10d_error_logger.py new file mode 100644 index 0000000000000..10605c69be476 --- /dev/null +++ b/torch/distributed/c10d_error_logger.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from typing import Tuple + +from torch.distributed.logging_handlers import _log_handlers + +_c10d_error_logger = None + + +def _get_or_create_logger() -> logging.Logger: + global _c10d_error_logger + if _c10d_error_logger: + return _c10d_error_logger + logging_handler, log_handler_name = _get_logging_handler() + _c10d_error_logger = logging.getLogger(f"c10d-collectives-{log_handler_name}") + _c10d_error_logger.setLevel(logging.DEBUG) + formatter = logging.Formatter( + "%(asctime)s %(filename)s:%(lineno)s %(levelname)s p:%(processName)s t:%(threadName)s: %(message)s" + ) + logging_handler.setFormatter(formatter) + _c10d_error_logger.propagate = False + _c10d_error_logger.addHandler(logging_handler) + return _c10d_error_logger + + +def _get_logging_handler(destination: str = "default") -> Tuple[logging.Handler, str]: + log_handler = _log_handlers[destination] + log_handler_name = type(log_handler).__name__ + return (log_handler, log_handler_name) diff --git a/torch/distributed/logging_handlers.py b/torch/distributed/logging_handlers.py new file mode 100644 index 0000000000000..7c3b3249f6c79 --- /dev/null +++ b/torch/distributed/logging_handlers.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from typing import Dict + +_log_handlers: Dict[str, logging.Handler] = { + "default": logging.NullHandler(), +} From ec6e885aa0fc4e2e457b21aebc03fccbdfde955f Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 21 Oct 2022 19:03:00 +0000 Subject: [PATCH 0029/1922] Slowly introduce ops to be tested by test_numpy_ref on MPS backend (#87342) Enable a test that would have caught https://github.com/pytorch/pytorch/issues/86239 Prior to the fix for that bug, this test fails with ``` _____________________________ TestCommonMPS.test_numpy_ref_mps_where_mps_float32 _____________________________ Traceback (most recent call last): File "/Users/alex/git/pytorch/test/test_ops.py", line 197, in test_numpy_ref_mps self.compare_with_reference( File "/Users/alex/git/pytorch/torch/testing/_internal/common_utils.py", line 2366, in compare_with_reference actual = torch_fn(t_inp, *t_args, **t_kwargs) File "/Users/alex/git/pytorch/torch/testing/_internal/opinfo/core.py", line 1068, in __call__ return self.op(*args, **kwargs) File "/Users/alex/git/pytorch/torch/testing/_internal/common_methods_invocations.py", line 15167, in op=lambda self, condition, other: torch.where(condition, self, other), RuntimeError: 0'th index 3 of x tensor does not match the other tensors ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87342 Approved by: https://github.com/albanD --- test/test_mps.py | 75 +++++++++++++++++-- torch/testing/_internal/common_device_type.py | 17 ++++- .../_internal/common_methods_invocations.py | 40 ++++++++-- .../_internal/opinfo/definitions/linalg.py | 33 ++++++++ .../_internal/opinfo/definitions/signal.py | 21 ++++++ 5 files changed, 173 insertions(+), 13 deletions(-) diff --git a/test/test_mps.py b/test/test_mps.py index 9702239df95df..8eeae7dbcaf7b 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -18,9 +18,10 @@ from collections import defaultdict from torch._six import inf from torch.nn import Parameter +from torch.testing._internal 
import opinfo from torch.testing._internal.common_utils import \ - (gradcheck, gradgradcheck, run_tests, TestCase, download_file, - TEST_WITH_UBSAN, dtype_abbrs) + (gradcheck, gradgradcheck, run_tests, TestCase, download_file, IS_CI, + TEST_WITH_UBSAN, dtype_abbrs, skipIfSlowGradcheckEnv, TEST_WITH_ASAN, suppress_warnings) from torch.testing import make_tensor from torch.testing._comparison import TensorLikePair from torch.testing._internal.common_dtype import get_all_dtypes, integral_types @@ -28,13 +29,31 @@ from torch.distributions import Uniform, Exponential from functools import partial -from torch.testing._internal.common_methods_invocations import op_db -from torch.testing._internal.common_device_type import ops, instantiate_device_type_tests +from torch.testing._internal.common_methods_invocations import ( + op_db, + UnaryUfuncInfo, + ReductionOpInfo, + SpectralFuncInfo, + BinaryUfuncInfo, +) +from torch.testing._internal.common_device_type import ops, instantiate_device_type_tests, onlyMPS from torch.testing._internal.common_nn import NNTestCase import numpy as np import torch import torch.utils._pytree as pytree + +# Copied from `test_ops.py` for the purposes of duplicating `test_numpy_ref` +_ref_test_ops = tuple( + filter( + lambda op: not isinstance( + op, (UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo, BinaryUfuncInfo) + ) + and op.ref is not None, + op_db, + ) +) + # Same logic as test_cuda.py if not torch.backends.mps.is_available(): print('MPS not available, skipping tests', file=sys.stderr) @@ -7790,10 +7809,56 @@ def req_grad(t): # So each test append to the dict and write it. 
with open("new_mps_allowlist_grad.txt", "w") as f: pprint.pprint(self.NEW_ALLOW_LIST_GRAD, stream=f) + + +# Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS +@skipIfSlowGradcheckEnv +class TestCommon(TestCase): + exact_dtype = True + + # Verifies, on teardown, that no OpInfo is still using dynamic dtypes in CI + @classmethod + def tearDownClass(cls): + super().tearDownClass() + + if IS_CI: + err_msg = ( + "The operator(s) below is(are) using dynamic_dtypes in the OpInfo entries." + "This is OK for testing, but be sure to set the dtypes manually before landing your PR!" + ) + # Assure no opinfo entry has dynamic_dtypes + filtered_ops = list(filter(opinfo.utils.is_dynamic_dtype_set, op_db)) + for op in filtered_ops: + fmt_str = opinfo.utils.str_format_dynamic_dtype(op) + err_msg += "\n" + fmt_str + + assert len(filtered_ops) == 0, err_msg + + # This is the MPS equivalent of `test_numpy_ref` from `test_ops.py`. It lives over here while + # MPS still requires some fairly heavy special casing in the test framework. + # When MPS becomes more consistent, this can probably be merged with that test using + # `@dtypesIfMPS(torch.float32)`, but for now, the assertions themselves need to be loosened + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyMPS + @suppress_warnings + # MPS only supports float32 + @ops(_ref_test_ops, allowed_dtypes=(torch.float32,)) + def test_numpy_ref_mps(self, device, dtype, op): + # Unlike `test_numpy_ref`, this test compares in `float32` since at the time of this test's creation MPS + # does not support float64 Tensors. + # A few ops are currently broken on their reference inputs, but not their sample inputs. These should + # get patched up and this workaround removed. 
+ broken_on_ref_inputs = op.name in ['cat', 'clamp', 'where'] + inputs = op.reference_inputs(device, dtype) if not broken_on_ref_inputs else op.sample_inputs(device, dtype) + for sample_input in inputs: + self.compare_with_reference(op, op.ref, sample_input) + # TODO: Actually instantiate that test for the "mps" device to better reflect what it is doing. # This requires mps to be properly registered in the device generic test framework which is not the -# case right now. +# case right now. We can probably use `allow_mps` introduced in https://github.com/pytorch/pytorch/pull/87342 +# to achieve this. instantiate_device_type_tests(TestConsistency, globals(), only_for="cpu") +instantiate_device_type_tests(TestCommon, globals(), allow_mps=True) if __name__ == "__main__": run_tests() diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index aec7191c3c6eb..7d9f31330ef6d 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -10,6 +10,7 @@ import unittest import os import torch +import torch.backends.mps from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \ IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \ @@ -198,6 +199,8 @@ # Skips the test if the device is not a CPU device # - @onlyCUDA # Skips the test if the device is not a CUDA device +# - @onlyMPS +# Skips the test if the device is not a MPS device # - @skipCPUIfNoLapack # Skips the test if the device is a CPU device and LAPACK is not installed # - @skipCPUIfNoMkl @@ -590,7 +593,7 @@ def filter_desired_device_types(device_type_test_bases, except_for=None, only_fo # The tests in these test cases are derived from the generic tests in # generic_test_class. # See note "Generic Device Type Testing." 
-def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False): +def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False, allow_mps=False): # Removes the generic test class from its enclosing scope so its tests # are not discoverable. del scope[generic_test_class.__name__] @@ -609,9 +612,13 @@ def instantiate_device_type_tests(generic_test_class, scope, except_for=None, on generic_members = set(generic_test_class.__dict__.keys()) - set(empty_class.__dict__.keys()) generic_tests = [x for x in generic_members if x.startswith('test')] + # MPS backend support is disabled in `get_device_type_test_bases` while support is being ramped + # up, so allow callers to specifically opt tests into being tested on MPS, similar to `include_lazy` + test_bases = device_type_test_bases.copy() + if allow_mps and torch.backends.mps.is_available() and MPSTestBase not in test_bases: + test_bases.append(MPSTestBase) # Filter out the device types based on user inputs - desired_device_type_test_bases = filter_desired_device_types(device_type_test_bases, - except_for, only_for) + desired_device_type_test_bases = filter_desired_device_types(test_bases, except_for, only_for) if include_lazy: # Note [Lazy Tensor tests in device agnostic testing] # Right now, test_view_ops.py runs with LazyTensor. 
@@ -1143,6 +1150,10 @@ def onlyCUDA(fn): return onlyOn('cuda')(fn) +def onlyMPS(fn): + return onlyOn('mps')(fn) + + def disablecuDNN(fn): @wraps(fn) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index e5d6e6efe18a9..34f54f2fb5ae1 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -6257,7 +6257,7 @@ def make_bool_mask(shape): if mask_t.sum() == 0: def random_index(shape): - return tuple(map(lambda max_idx: random.randint(0, max_idx), shape)) + return tuple(map(lambda max_idx: random.randrange(0, max_idx), shape)) mask_t[random_index(mask_t.shape)] = True return mask_t @@ -6268,7 +6268,9 @@ def random_index(shape): ((M, 1, M), (M, M), (M, M, 1), True), ((), (), (), False), ((M, 1, M), (), (M, M, 1), True), - ((), (M, M), (), True),) + ((), (M, M), (), True), + ((), (2), (1, 1), True), + ) for shape, mask_shape, other_shape, broadcasts_input in cases: yield SampleInput(make_arg(shape), @@ -8206,6 +8208,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1): toleranceOverride({torch.float32: tol(atol=1.3e-05, rtol=1.3e-05), torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), 'TestCommon', 'test_numpy_refs'), + # MPS has slightly worse precision. Is this acceptable? 
+ DecorateInfo( + toleranceOverride({torch.float32: tol(atol=1.3e-04, rtol=1.3e-04), + torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), + 'TestCommon', 'test_numpy_ref_mps'), DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-5, rtol=1e-5)}), 'TestConsistency', @@ -8701,6 +8708,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # TypeError: _copy_dispatcher() got an unexpected keyword argument 'memory_format' # (NumPy reference needs to be extended with memory_format) DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref'), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref_mps'), ),), OpInfo('contiguous', op=lambda x, *args, **kwargs: x.contiguous(*args, **kwargs), @@ -10398,6 +10406,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # Extremal value issue on aten::native_layer_norm, which returns 'nan' for mean on 'inf' inputs # possibly because of the welford implementation. DecorateInfo(unittest.skip("Skipped!"), 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'), + DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), )), OpInfo('native_batch_norm', aten_name='native_batch_norm', @@ -10664,6 +10673,19 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # RuntimeError: !lhs.isAliasOf(rhs)INTERNAL ASSERT FAILED at # "../torch/csrc/jit/passes/utils/check_alias_annotation.cpp":104, please report a bug to PyTorch. 
DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'), + # RuntimeError: UNSUPPORTED DTYPE: complex + DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness', + dtypes=(torch.complex64, torch.complex128)), + # RuntimeError: "slow_conv2d_cpu_grad_input" not implemented for 'Long' + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref', + dtypes=(torch.int64,)), + # Reference: https://github.com/pytorch/pytorch/issues/86356 + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref', + dtypes=(torch.double, torch.cdouble)), + DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), + # AssertionError: None mismatch: torch.complex64 is not None + DecorateInfo(unittest.expectedFailure, 'TestDtypeCustomRules', 'test_custom_rules', + dtypes=(torch.complex64, torch.complex128)), ), supports_out=False,), OpInfo('nn.functional.conv_transpose3d', @@ -10826,7 +10848,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1e-03)}), 'TestCommon', 'test_numpy_refs' - ) + ), + DecorateInfo(unittest.skip("Bug in MPS backend!"), 'TestCommon', 'test_numpy_ref_mps'), ], sample_inputs_func=sample_inputs_layer_norm, supports_expanded_weight=True,), @@ -12116,7 +12139,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): skips=( # AssertionError: Tensor-likes are not close! 
# May not replicate in CI - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out'),)), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out'), + DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), + )), UnaryUfuncInfo('nn.functional.relu6', aten_name="relu6", dtypes=all_types_and(torch.bfloat16), @@ -14507,6 +14532,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # JIT tests don't work with Tensor keyword arguments # https://github.com/pytorch/pytorch/issues/58507 DecorateInfo(unittest.skip("Expected failure!"), 'TestJit', 'test_variant_consistency_jit'), + DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), )), OpInfo('cat', ref=_cat_np, @@ -16123,7 +16149,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=sample_inputs_pdist, dtypes=floating_types(), supports_out=False, - supports_gradgrad=False), + supports_gradgrad=False, + skips=( + DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), + ) + ), OpInfo( "nn.functional.poisson_nll_loss", dtypes=all_types_and(torch.bfloat16), diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py index 2d899dcd0ca24..193f1f2db85cc 100644 --- a/torch/testing/_internal/opinfo/definitions/linalg.py +++ b/torch/testing/_internal/opinfo/definitions/linalg.py @@ -1093,6 +1093,13 @@ def make_input(): supports_out=True, supports_fwgrad_bwgrad=True, supports_forward_ad=True, + skips=( + DecorateInfo( + unittest.skip("Unsupported on MPS for now"), + "TestCommon", + "test_numpy_ref_mps", + ), + ), ), OpInfo( "linalg.det", @@ -1211,6 +1218,11 @@ def make_input(): "test_schema_correctness", dtypes=(torch.complex64, torch.complex128), ), + DecorateInfo( + unittest.skip("Unsupported on MPS for now"), + "TestCommon", + "test_numpy_ref_mps", + ), ), ), OpInfo( @@ -1647,6 +1659,13 @@ def 
make_input(): supports_fwgrad_bwgrad=True, supports_out=False, sample_inputs_func=sample_inputs_linalg_vander, + skips=( + DecorateInfo( + unittest.skip("Unsupported on MPS for now"), + "TestCommon", + "test_numpy_ref_mps", + ), + ), ), ReductionOpInfo( "linalg.vector_norm", @@ -2123,6 +2142,13 @@ def make_input(): # See https://github.com/pytorch/pytorch/pull/78358 check_batched_forward_grad=False, decorators=[skipCPUIfNoLapack, skipCUDAIfNoMagmaAndNoCusolver], + skips=( + DecorateInfo( + unittest.skip("Unsupported on MPS for now"), + "TestCommon", + "test_numpy_ref_mps", + ), + ), ), OpInfo( "linalg.tensorsolve", @@ -2141,6 +2167,13 @@ def make_input(): device_type="cuda", ), ], + skips=( + DecorateInfo( + unittest.skip("Unsupported on MPS for now"), + "TestCommon", + "test_numpy_ref_mps", + ), + ), ), ] diff --git a/torch/testing/_internal/opinfo/definitions/signal.py b/torch/testing/_internal/opinfo/definitions/signal.py index 1f1c8d7e6a6d1..3b7f3e4de4001 100644 --- a/torch/testing/_internal/opinfo/definitions/signal.py +++ b/torch/testing/_internal/opinfo/definitions/signal.py @@ -291,6 +291,13 @@ def make_signal_windows_opinfo( sample_inputs_func=sample_inputs_window, reference_inputs_func=reference_inputs_window, error_inputs_func=error_inputs_window, + skips=( + DecorateInfo( + unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"), + "TestCommon", + "test_numpy_ref_mps", + ), + ), ), make_signal_windows_opinfo( name="signal.windows.exponential", @@ -300,6 +307,13 @@ def make_signal_windows_opinfo( sample_inputs_func=partial(sample_inputs_window, tau=2.78), reference_inputs_func=partial(reference_inputs_exponential_window, tau=2.78), error_inputs_func=error_inputs_exponential_window, + skips=( + DecorateInfo( + unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"), + "TestCommon", + "test_numpy_ref_mps", + ), + ), ), make_signal_windows_opinfo( name="signal.windows.gaussian", @@ -309,5 +323,12 @@ def 
make_signal_windows_opinfo( sample_inputs_func=partial(sample_inputs_window, std=1.92), reference_inputs_func=partial(reference_inputs_gaussian_window, std=1.92), error_inputs_func=error_inputs_gaussian_window, + skips=( + DecorateInfo( + unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"), + "TestCommon", + "test_numpy_ref_mps", + ), + ), ), ] From bd60463f279b8aad2308562392d76f0d5ba6f08a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 21 Oct 2022 19:14:28 +0000 Subject: [PATCH 0030/1922] [BE] Remove pip and conda installation in Linux build workflow (#87256) All the dependencies should come from the Docker container already. This only updates Linux build workflow, Linux test workflow comes later in a separate PR. The `opt-einsum` package that was installed as part of PyTorch wheel has already been installed in the Docker container [requirements-ci.txt](https://github.com/pytorch/pytorch/blob/master/.circleci/docker/requirements-ci.txt#L127) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87256 Approved by: https://github.com/malfet --- .jenkins/pytorch/build-asan.sh | 2 +- .jenkins/pytorch/build-tsan.sh | 2 +- .jenkins/pytorch/build.sh | 7 +------ .jenkins/pytorch/common_utils.sh | 6 ++++++ 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh index d2cafa323fc56..91953c322f223 100755 --- a/.jenkins/pytorch/build-asan.sh +++ b/.jenkins/pytorch/build-asan.sh @@ -26,7 +26,7 @@ CC="clang" CXX="clang++" LDSHARED="clang --shared" \ CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fsanitize-address-use-after-scope -shared-libasan" \ USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \ python setup.py bdist_wheel - python -mpip install "$(echo dist/*.whl)[opt-einsum]" + pip_install_whl "$(echo dist/*.whl)" # Test building via the sdist source tarball python setup.py sdist diff --git a/.jenkins/pytorch/build-tsan.sh b/.jenkins/pytorch/build-tsan.sh index 
41ebdd5cb1eed..e10edb310d813 100755 --- a/.jenkins/pytorch/build-tsan.sh +++ b/.jenkins/pytorch/build-tsan.sh @@ -22,7 +22,7 @@ CC="clang" CXX="clang++" LDSHARED="clang --shared" \ CFLAGS="-fsanitize=thread" \ USE_TSAN=1 USE_CUDA=0 USE_MKLDNN=0 \ python setup.py bdist_wheel - python -mpip install dist/*.whl + pip_install_whl "$(echo dist/*.whl)" print_sccache_stats diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 24567449424a6..58cdc1227ac2d 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -62,9 +62,6 @@ elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then export ATEN_THREADING=NATIVE fi -# TODO: Don't run this... -pip_install -r requirements.txt || true - # Enable LLVM dependency for TensorExpr testing if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then export USE_LLVM=/opt/rocm/llvm @@ -74,13 +71,11 @@ else export LLVM_DIR=/opt/llvm/lib/cmake/llvm fi -# TODO: Don't install this here if ! which conda; then # In ROCm CIs, we are doing cross compilation on build machines with # intel cpu and later run tests on machines with amd cpu. # Also leave out two builds to make sure non-mkldnn builds still work. 
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then - pip_install mkl mkl-devel export USE_MKLDNN=1 else export USE_MKLDNN=0 @@ -230,7 +225,7 @@ else else python setup.py bdist_wheel fi - python -mpip install "$(echo dist/*.whl)[opt-einsum]" + pip_install_whl "$(echo dist/*.whl)" # TODO: I'm not sure why, but somehow we lose verbose commands set -x diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index c0e51bc80aa8c..d8c853f97ab23 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -49,6 +49,12 @@ function assert_git_not_dirty() { fi } +function pip_install_whl() { + # This is used to install PyTorch and other build artifacts wheel locally + # without using any network connection + python3 -mpip install --no-index --no-deps "$@" +} + function pip_install() { # retry 3 times # old versions of pip don't have the "--progress-bar" flag From 71a05b857d14033e5192cbc584ea414aa05f7fd1 Mon Sep 17 00:00:00 2001 From: samdow Date: Wed, 19 Oct 2022 10:36:40 -0400 Subject: [PATCH 0031/1922] [Modes] refactor modes to only use a stack in cpp (#86458) Refactors the mode code to only have the C++ mode stack and not the "C++ mode" like we originally had. 
This also simplifies the mode logic in a number of places Pull Request resolved: https://github.com/pytorch/pytorch/pull/86458 Approved by: https://github.com/zou3519 --- aten/src/ATen/PythonTorchFunctionTLS.cpp | 16 +----- aten/src/ATen/PythonTorchFunctionTLS.h | 9 +-- aten/src/ATen/core/PythonFallbackKernel.cpp | 7 ++- c10/core/TensorImpl.cpp | 9 +-- c10/core/impl/TorchDispatchModeTLS.cpp | 55 ++++++------------- c10/core/impl/TorchDispatchModeTLS.h | 11 ---- test/test_overrides.py | 4 +- test/test_python_dispatch.py | 2 +- torch/csrc/autograd/init.cpp | 31 +---------- torch/csrc/autograd/python_variable.cpp | 4 +- torch/csrc/jit/python/pybind_utils.cpp | 3 +- torch/csrc/utils/disable_torch_function.cpp | 2 +- torch/csrc/utils/python_arg_parser.cpp | 42 +++++++------- torch/csrc/utils/python_arg_parser.h | 2 +- torch/csrc/utils/python_torch_function_mode.h | 15 +++-- torch/csrc/utils/torch_dispatch_mode.h | 24 +++++++- torch/overrides.py | 38 ++----------- torch/utils/_python_dispatch.py | 22 +------- 18 files changed, 98 insertions(+), 198 deletions(-) diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp index c4e1241805a88..c9487c6958cbf 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.cpp +++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp @@ -6,18 +6,6 @@ namespace impl { static thread_local PythonTorchFunctionTLS pythonTorchFunctionState; -void PythonTorchFunctionTLS::set_mode(std::shared_ptr mode) { - pythonTorchFunctionState.mode_ = std::move(mode); -} - -const std::shared_ptr& PythonTorchFunctionTLS::get_mode() { - return pythonTorchFunctionState.mode_; -} - -void PythonTorchFunctionTLS::swap_mode(std::shared_ptr& mode) { - pythonTorchFunctionState.mode_.swap(mode); -} - void PythonTorchFunctionTLS::push_onto_stack(std::shared_ptr mode) { pythonTorchFunctionState.stack_.push_back(std::move(mode)); } @@ -54,8 +42,8 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() { return pythonTorchFunctionState; 
} -bool function_mode_enabled() { - return static_cast(PythonTorchFunctionTLS::get_mode()); +bool torch_function_mode_enabled() { + return PythonTorchFunctionTLS::stack_len() > 0; } } // namespace impl diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h index ef283164246d3..5940fb6f2dee2 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.h +++ b/aten/src/ATen/PythonTorchFunctionTLS.h @@ -10,10 +10,6 @@ struct TORCH_API PythonTorchFunctionTLS { static void set_disabled(bool); static bool is_disabled(); - static void set_mode(std::shared_ptr); - static const std::shared_ptr& get_mode(); - static void swap_mode(std::shared_ptr&); - static void push_onto_stack(std::shared_ptr mode); static const std::shared_ptr pop_stack(); static const std::shared_ptr& get_stack_at(int64_t idx); @@ -26,16 +22,13 @@ struct TORCH_API PythonTorchFunctionTLS { // The mode TLS is split into // - disabled_, which says whether or not to disable all torch function // modes - // - mode_, which is the C++ mode, that can only be the mode handling mode - // or null // - stack_, which is a vector of modes representing the stack of user // defined modes bool disabled_; - std::shared_ptr mode_ = nullptr; std::vector> stack_; }; -TORCH_API bool function_mode_enabled(); +TORCH_API bool torch_function_mode_enabled(); } // namespace impl } // namespace at diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index fcdb018b6ff7b..e16874a83f966 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -52,9 +52,10 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch - const auto& maybe_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_mode(); - if (maybe_torch_dispatch_mode_state) { - maybe_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, 
stack); + const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); + if (mode_stack_len > 0) { + const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); + cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack); return; } diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 3951578a848cc..976382cf2ee7f 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -611,12 +611,13 @@ c10::intrusive_ptr TensorImpl::shallow_copy_and_detach_core( VariableVersion&& version_counter, bool allow_tensor_metadata_change) const { c10::intrusive_ptr r; - const auto& maybe_torch_dispatch_mode_state = - c10::impl::TorchDispatchModeTLS::get_mode(); + const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); // TODO: do we have to exclude after Python dispatch key set? - if (maybe_torch_dispatch_mode_state && + if (mode_stack_len > 0 && !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { - r = maybe_torch_dispatch_mode_state->pyinterpreter()->detach(this); + const auto& cur_torch_dispatch_mode_state = + c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); + r = cur_torch_dispatch_mode_state->pyinterpreter()->detach(this); } else if ( key_set_.has(DispatchKey::Python) && !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { diff --git a/c10/core/impl/TorchDispatchModeTLS.cpp b/c10/core/impl/TorchDispatchModeTLS.cpp index 5f02686584255..6755657b73687 100644 --- a/c10/core/impl/TorchDispatchModeTLS.cpp +++ b/c10/core/impl/TorchDispatchModeTLS.cpp @@ -8,44 +8,12 @@ namespace impl { thread_local TorchDispatchModeTLS torchDispatchModeState; -// MODE -void TorchDispatchModeTLS::set_mode(std::shared_ptr mode) { - if (mode) { - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); - c10::impl::tls_set_dispatch_key_included( - DispatchKey::PythonTLSSnapshot, true); - } else { - TorchDispatchModeTLS::reset_mode(); - } 
- torchDispatchModeState.mode_ = std::move(mode); -} - -const std::shared_ptr& TorchDispatchModeTLS::get_mode() { - return torchDispatchModeState.mode_; -} - -void TorchDispatchModeTLS::reset_mode() { - torchDispatchModeState.mode_.reset(); - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); - c10::impl::tls_set_dispatch_key_included( - DispatchKey::PythonTLSSnapshot, false); -} - -void TorchDispatchModeTLS::swap_mode(std::shared_ptr& mode) { - if (mode) { +void TorchDispatchModeTLS::push_onto_stack(std::shared_ptr mode) { + if (torchDispatchModeState.stack_.size() == 0) { c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); c10::impl::tls_set_dispatch_key_included( DispatchKey::PythonTLSSnapshot, true); - } else { - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); - c10::impl::tls_set_dispatch_key_included( - DispatchKey::PythonTLSSnapshot, false); } - torchDispatchModeState.mode_.swap(mode); -} - -// STACK -void TorchDispatchModeTLS::push_onto_stack(std::shared_ptr mode) { torchDispatchModeState.stack_.push_back(std::move(mode)); } @@ -56,6 +24,12 @@ const std::shared_ptr TorchDispatchModeTLS::pop_stack() { const std::shared_ptr out = torchDispatchModeState.stack_.back(); torchDispatchModeState.stack_.pop_back(); + + if (torchDispatchModeState.stack_.size() == 0) { + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); + c10::impl::tls_set_dispatch_key_included( + DispatchKey::PythonTLSSnapshot, false); + } return out; } @@ -71,20 +45,27 @@ int64_t TorchDispatchModeTLS::stack_len() { return torchDispatchModeState.stack_.size(); } -// STATE - const TorchDispatchModeTLS& TorchDispatchModeTLS::get_state() { return torchDispatchModeState; } void TorchDispatchModeTLS::set_state(const TorchDispatchModeTLS& state) { torchDispatchModeState = state; + if (torchDispatchModeState.stack_.size() == 0) { + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); + 
c10::impl::tls_set_dispatch_key_included( + DispatchKey::PythonTLSSnapshot, false); + } else { + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); + c10::impl::tls_set_dispatch_key_included( + DispatchKey::PythonTLSSnapshot, true); + } } // UTIL bool dispatch_mode_enabled() { - return static_cast(c10::impl::TorchDispatchModeTLS::get_mode()); + return TorchDispatchModeTLS::stack_len() > 0; } } // namespace impl diff --git a/c10/core/impl/TorchDispatchModeTLS.h b/c10/core/impl/TorchDispatchModeTLS.h index 708c22e014ad4..da30d0460427c 100644 --- a/c10/core/impl/TorchDispatchModeTLS.h +++ b/c10/core/impl/TorchDispatchModeTLS.h @@ -9,11 +9,6 @@ namespace c10 { namespace impl { struct C10_API TorchDispatchModeTLS { - static void set_mode(std::shared_ptr mode); - static const std::shared_ptr& get_mode(); - static void reset_mode(); - static void swap_mode(std::shared_ptr& mode); - static void push_onto_stack(std::shared_ptr mode); static const std::shared_ptr pop_stack(); static const std::shared_ptr& get_stack_at(int64_t idx); @@ -23,12 +18,6 @@ struct C10_API TorchDispatchModeTLS { static void set_state(const TorchDispatchModeTLS& state); private: - // The mode TLS is split into - // - mode_, which is the C++ mode, that can only be the mode handling mode - // or null - // - stack_, which is a vector of modes representing the stack of user - // defined modes - std::shared_ptr mode_; std::vector> stack_; }; diff --git a/test/test_overrides.py b/test/test_overrides.py index e9e01684bda53..879b27277f0d8 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -1175,7 +1175,7 @@ def __torch_function__(self, *args, **kwargs): self.assertEqual(torch.mm(x, x), -1) self.assertEqual(bar(x), 1) self.assertRaisesRegex( - TypeError, r'SubTensor.+TorchFunctionStackMode', + TypeError, r'SubTensor', lambda: self.assertEqual(torch.max(x, x))) def test_with_mode(self): @@ -1248,7 +1248,7 @@ def __torch_function__(cls, func, _, args=(), kwargs=None): return 
func(args, kwargs) x = torch.tensor(5.) - with self.assertRaisesRegex(RuntimeError, "should be a normal method not a class method"): + with self.assertRaisesRegex(RuntimeError, "classmethod is not supported, please make it a plain method"): with A(): x + x diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index dea96d19b74c4..380f85f568f72 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -1050,7 +1050,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): return func(args, kwargs) x = torch.tensor(5.) - with self.assertRaisesRegex(RuntimeError, "should be a normal method not a class method"): + with self.assertRaisesRegex(RuntimeError, "classmethod is not supported, please make it a plain method"): with A(): x + x diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 007150002dbb6..ee963232d3166 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -606,24 +607,11 @@ static PyObject* python_exit_dual_level( END_HANDLE_TH_ERRORS } -static PyObject* set_torch_function_mode(PyObject* _unused, PyObject* arg) { - HANDLE_TH_ERRORS - if (arg == Py_None) { - at::impl::PythonTorchFunctionTLS::set_mode(nullptr); - } else { - Py_INCREF(arg); - at::impl::PythonTorchFunctionTLS::set_mode( - std::make_shared(arg, getPyInterpreter())); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS; -} - static PyObject* is_torch_function_mode_enabled( PyObject* _unused, PyObject* _unused2) { HANDLE_TH_ERRORS - if (at::impl::function_mode_enabled()) { + if (at::impl::torch_function_mode_enabled()) { Py_RETURN_TRUE; } else { Py_RETURN_FALSE; @@ -682,19 +670,6 @@ static PyObject* len_torch_function_stack( END_HANDLE_TH_ERRORS } -static PyObject* set_torch_dispatch_mode(PyObject* _unused, PyObject* arg) { - HANDLE_TH_ERRORS - if (arg == Py_None) { - c10::impl::TorchDispatchModeTLS::set_mode(nullptr); - } else { - 
Py_INCREF(arg); - c10::impl::TorchDispatchModeTLS::set_mode( - std::make_shared(arg, getPyInterpreter())); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS; -} - static PyObject* push_on_torch_dispatch_stack( PyObject* _unused, PyObject* arg) { @@ -795,7 +770,6 @@ static PyMethodDef methods[] = { // NOLINT is_torch_function_mode_enabled, METH_NOARGS, nullptr}, - {"_set_torch_function_mode", set_torch_function_mode, METH_O, nullptr}, {"_push_on_torch_function_stack", push_on_torch_function_stack, METH_O, @@ -812,7 +786,6 @@ static PyMethodDef methods[] = { // NOLINT len_torch_function_stack, METH_NOARGS, nullptr}, - {"_set_torch_dispatch_mode", set_torch_dispatch_mode, METH_O, nullptr}, {"_push_on_torch_dispatch_stack", push_on_torch_dispatch_stack, METH_O, diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 9b52f7b50943a..66b8ad2d8351b 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -686,7 +686,9 @@ static PyObject* THPVariable_make_subclass( throw torch::TypeError( "cls must be a type (got %s)", Py_TYPE(cls)->tp_name); } - torch_dispatch_mode::StashTorchDispatchModeGuard td_g; + // guard completely turns off torch dispatch modes, doesn't just pop off the + // stack + torch_dispatch_mode::StashTorchDispatchStackGuard td_g; c10::impl::DisablePythonDispatcher dpd_g; auto data = r.tensor(1).detach(); // creates a fresh Tensor (DEFINITELY_UNINITIALIZED) diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp index 60c7247ada62a..68317f76524b2 100644 --- a/torch/csrc/jit/python/pybind_utils.cpp +++ b/torch/csrc/jit/python/pybind_utils.cpp @@ -755,8 +755,7 @@ py::object _get_operation_for_overload_or_packet( total_arg_num, false /* throw_error */); } - if (overloaded_args.size() > 0 || - at::impl::PythonTorchFunctionTLS::get_mode()) { + if (overloaded_args.size() > 0 || at::impl::torch_function_mode_enabled()) { py::object ret; 
std::string ns = symbol.ns().toUnqualString(); std::string method_name = symbol.toUnqualString(); diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp index ac29a9157a9c1..3031493a704f6 100644 --- a/torch/csrc/utils/disable_torch_function.cpp +++ b/torch/csrc/utils/disable_torch_function.cpp @@ -221,7 +221,7 @@ inline bool has_torch_function_attr(PyObject* obj) { namespace torch { auto check_has_torch_function(PyObject* obj, bool ignore_mode) -> bool { - if (!ignore_mode && at::impl::PythonTorchFunctionTLS::get_mode()) + if (!ignore_mode && at::impl::torch_function_mode_enabled()) return true; PyTypeObject* tp = Py_TYPE(obj); return ( diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index b1b0d2769df46..177346614704f 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -289,27 +289,41 @@ auto handle_torch_function_no_python_arg_parser( py::tuple py_types = py::cast(overloaded_types); py::object ret; PyObject* mode_obj = nullptr; + const bool is_torch_function = torch_function_name == TorchFunctionName::TorchFunction; - auto get_mode = [&]() { - return is_torch_function ? at::impl::PythonTorchFunctionTLS::get_mode() - : c10::impl::TorchDispatchModeTLS::get_mode(); + auto get_stack_len = [&]() { + return is_torch_function ? at::impl::PythonTorchFunctionTLS::stack_len() + : c10::impl::TorchDispatchModeTLS::stack_len(); }; - const auto& maybe_mode = get_mode(); - if (maybe_mode) { - mode_obj = maybe_mode->ptr(getPyInterpreter()); - TORCH_INTERNAL_ASSERT(py_types.ptr() != nullptr); - TORCH_INTERNAL_ASSERT(args != nullptr); + if (get_stack_len() > 0) { // Disable mode on the inside; this makes for a more user-friendly // experience if you try to, e.g., print your tensors. 
at::optional tf_g; at::optional td_g; if (is_torch_function) { tf_g.emplace(); + mode_obj = tf_g->get_cur_mode()->ptr(getPyInterpreter()); } else { td_g.emplace(); + mode_obj = td_g->get_cur_mode()->ptr(getPyInterpreter()); } + py::object torch_function = + PyObject_FastGetAttrString(mode_obj, torch_function_name_str); + if (!torch_function) { + TORCH_INTERNAL_ASSERT(0); + } + TORCH_INTERNAL_ASSERT(py_types.ptr() != nullptr); + TORCH_INTERNAL_ASSERT(args != nullptr); + + TORCH_CHECK( + PyObject_FastGetAttrString(torch_function.ptr(), "__self__") + .is(py::reinterpret_borrow(mode_obj)), + "Defining your mode's `", + torch_function_name_str, + "` as a classmethod is not supported, please make it a plain method"); + // Blegh. This accidentally works in PyObject_CallFunctionObjArgs below // because the nullptr terminates the argument list ick ick ick. if (kwargs == nullptr) { @@ -393,18 +407,6 @@ auto handle_torch_function_no_python_arg_parser( } } ss << "]"; - if (mode_obj) { - // Note [Paranoid check mode is same] - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // If a user forcibly changes the mode in a non-lexical way - // in the inner context, the mode could be invalid here. 
So just be - // a bit safe, it doesn't cost us anything since this is error reporting - const auto& maybe_mode = get_mode(); - TORCH_INTERNAL_ASSERT( - maybe_mode && mode_obj == maybe_mode->ptr(getPyInterpreter())); - ss << " nor was it found on the currently active mode " - << py::repr(mode_obj); - } const std::string& tmp = ss.str(); PyErr_SetString(PyExc_TypeError, tmp.c_str()); throw python_error(); diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 9b23af5829786..a08441369db82 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -443,7 +443,7 @@ inline PythonArgs PythonArgParser::parse(PyObject* self, ParsedArgs<0>& dst) { inline bool PythonArgs::has_torch_function() { return !this->signature.overloaded_args.empty() || - at::impl::PythonTorchFunctionTLS::get_mode(); + at::impl::torch_function_mode_enabled(); } inline std::string PythonArgs::get_func_name() { diff --git a/torch/csrc/utils/python_torch_function_mode.h b/torch/csrc/utils/python_torch_function_mode.h index 5faf75778469d..f6652dfd93084 100644 --- a/torch/csrc/utils/python_torch_function_mode.h +++ b/torch/csrc/utils/python_torch_function_mode.h @@ -5,21 +5,20 @@ namespace torch { namespace overrides { -// Corresponds to torch.overrides._no_torch_function_mode. We discourage use -// of this in userland because it's non-compositional; there might be another -// mode waiting to go after you, and you shouldn't just blindly disable it. -// From C++ side, there is no such thing as compositional modes, there is one -// mode and of course you should be able to clear it. 
struct StashTorchFunctionModeGuard { StashTorchFunctionModeGuard() { - at::impl::PythonTorchFunctionTLS::swap_mode(old_mode_); + cur_mode_ = at::impl::PythonTorchFunctionTLS::pop_stack(); } ~StashTorchFunctionModeGuard() { - at::impl::PythonTorchFunctionTLS::set_mode(std::move(old_mode_)); + at::impl::PythonTorchFunctionTLS::push_onto_stack(cur_mode_); + } + + const std::shared_ptr& get_cur_mode() { + return cur_mode_; } private: - std::shared_ptr old_mode_; + std::shared_ptr cur_mode_; }; } // namespace overrides diff --git a/torch/csrc/utils/torch_dispatch_mode.h b/torch/csrc/utils/torch_dispatch_mode.h index 81b219f71c095..2c97a7d96c320 100644 --- a/torch/csrc/utils/torch_dispatch_mode.h +++ b/torch/csrc/utils/torch_dispatch_mode.h @@ -8,16 +8,36 @@ namespace torch_dispatch_mode { struct StashTorchDispatchModeGuard { public: StashTorchDispatchModeGuard() { - c10::impl::TorchDispatchModeTLS::swap_mode(saved_mode_); + saved_mode_ = c10::impl::TorchDispatchModeTLS::pop_stack(); } ~StashTorchDispatchModeGuard() { - c10::impl::TorchDispatchModeTLS::set_mode(std::move(saved_mode_)); + c10::impl::TorchDispatchModeTLS::push_onto_stack(std::move(saved_mode_)); + } + + const std::shared_ptr& get_cur_mode() { + return saved_mode_; } private: std::shared_ptr saved_mode_; }; +struct StashTorchDispatchStackGuard { + public: + StashTorchDispatchStackGuard() { + const auto old = c10::impl::TorchDispatchModeTLS::get_state(); + c10::impl::TorchDispatchModeTLS::set_state(saved_state_); + saved_state_ = std::move(old); + } + + ~StashTorchDispatchStackGuard() { + c10::impl::TorchDispatchModeTLS::set_state(std::move(saved_state_)); + } + + private: + c10::impl::TorchDispatchModeTLS saved_state_; +}; + } // namespace torch_dispatch_mode } // namespace torch diff --git a/torch/overrides.py b/torch/overrides.py index 0d252f1114aa8..c463cf3ca94d4 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -26,7 +26,7 @@ import functools import types import warnings -from typing import 
Dict, Set, List, Any, Callable, Iterable, Type, Iterator, Tuple +from typing import Dict, Set, List, Any, Callable, Iterable, Type, Tuple import contextlib import torch @@ -34,7 +34,7 @@ _has_torch_function, _has_torch_function_unary, _has_torch_function_variadic, _add_docstr, _push_on_torch_function_stack, _pop_torch_function_stack, _get_function_stack_at, _len_torch_function_stack, - _set_torch_function_mode, _is_torch_function_mode_enabled) + _is_torch_function_mode_enabled) __all__ = [ "get_ignored_functions", @@ -1512,8 +1512,8 @@ def handle_torch_function( if _is_torch_function_mode_enabled(): # if we're here, the mode must be set to a TorchFunctionStackMode # this unsets it and calls directly into TorchFunctionStackMode's torch function - with _no_torch_function_mode(): - result = _TorchFunctionStackMode().__torch_function__(public_api, types, args, kwargs) + with _pop_mode_temporarily() as mode: + result = mode.__torch_function__(public_api, types, args, kwargs) if result is not NotImplemented: return result @@ -1828,15 +1828,11 @@ def _get_current_function_mode_stack(): return [_get_function_stack_at(i) for i in range(stack_len)] def _push_mode(mode): - if _len_torch_function_stack() == 0: - _set_torch_function_mode(_TorchFunctionStackMode()) _push_on_torch_function_stack(mode) def _pop_mode(): old = _pop_torch_function_stack() - if _len_torch_function_stack() == 0: - _set_torch_function_mode(None) return old @@ -1848,19 +1844,6 @@ def _pop_mode_temporarily(): finally: _push_mode(old) -# a helper "mode" used by the torch_function push helper method. 
This is the only mode that will ever -# be active at the C++ level and it will run the current mode -class _TorchFunctionStackMode: - def __torch_function__(self, func, types, args=(), kwargs=None): - with _pop_mode_temporarily() as old: - if _len_torch_function_stack() > 0: - _set_torch_function_mode(self) - # we can't check the type of __torch_function__ here but this is sufficient for checking it's a classmethod - if old.__torch_function__.__self__ is type(old): - raise RuntimeError("TorchFunctionMode's torch_function function " + - "should be a normal method not a class method") - return old.__torch_function__(func, types, args, kwargs) - class BaseTorchFunctionMode(TorchFunctionMode): def __torch_function__(self, func, types, args=(), kwargs=None): if kwargs is None: @@ -1868,19 +1851,6 @@ def __torch_function__(self, func, types, args=(), kwargs=None): return func(*args, **kwargs) -# This is private API as I'm not sure it's possible for users to use this -# compositionally (easy to discard too many modes). 
It is useful for -# library code though, e.g., in handle_torch_function -@contextlib.contextmanager -def _no_torch_function_mode() -> Iterator[None]: - _set_torch_function_mode(None) - try: - yield - finally: - if _len_torch_function_stack() > 0: - _set_torch_function_mode(_TorchFunctionStackMode()) - - class enable_reentrant_dispatch(): def __enter__(self): self._raii_guard = torch._C._RestorePythonTLSSnapshot() diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py index 08ef67d7333fa..5d22ae69a185f 100644 --- a/torch/utils/_python_dispatch.py +++ b/torch/utils/_python_dispatch.py @@ -2,7 +2,7 @@ import warnings from torch._C import _len_torch_dispatch_stack, _get_dispatch_stack_at,\ - _pop_torch_dispatch_stack, _push_on_torch_dispatch_stack, _set_torch_dispatch_mode + _pop_torch_dispatch_stack, _push_on_torch_dispatch_stack # TODO: Limitations and things about enable_torch_dispatch_mode we should fix before exposing it: @@ -67,16 +67,11 @@ def _get_current_dispatch_mode_stack(): return [_get_dispatch_stack_at(i) for i in range(stack_len)] def _push_mode(mode): - if _len_torch_dispatch_stack() == 0: - _set_torch_dispatch_mode(_TorchDispatchStackMode()) _push_on_torch_dispatch_stack(mode) def _pop_mode(): - old = _pop_torch_dispatch_stack() - if _len_torch_dispatch_stack() == 0: - _set_torch_dispatch_mode(None) - return old + return _pop_torch_dispatch_stack() @contextlib.contextmanager @@ -87,19 +82,6 @@ def _pop_mode_temporarily(): finally: _push_mode(old) -# a helper "mode" used by the torch dispatch push helper method. 
This is the only mode that will ever -# be active at the C++ level and it will run the current mode -class _TorchDispatchStackMode: - def __torch_dispatch__(self, func, types, args=(), kwargs=None): - with _pop_mode_temporarily() as old: - if _len_torch_dispatch_stack() > 0: - _set_torch_dispatch_mode(self) - # we can't check the type of __torch_dispatch__ here but this is sufficient for checking it's a classmethod - if old.__torch_dispatch__.__self__ is type(old): - raise RuntimeError(f"{type(old)}'s torch_dispatch function " + - "should be a normal method not a class method") - return old.__torch_dispatch__(func, types, args, kwargs) - class BaseTorchDispatchMode(TorchDispatchMode): def __torch_dispatch__(self, func, types, args=(), kwargs=None): if kwargs is None: From 39be343c25c6fb5e11d660a1c06d3c8429aad284 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 21 Oct 2022 16:21:42 +0000 Subject: [PATCH 0032/1922] Make torchbench setup a function (#87469) cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87469 Approved by: https://github.com/anijain2305 --- benchmarks/dynamo/torchbench.py | 40 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py index c37422a19bfd9..b6577745ab154 100755 --- a/benchmarks/dynamo/torchbench.py +++ b/benchmarks/dynamo/torchbench.py @@ -20,25 +20,30 @@ # We are primarily interested in tf32 datatype torch.backends.cuda.matmul.allow_tf32 = True -original_dir = abspath(os.getcwd()) - -os.environ["KALDI_ROOT"] = "/tmp" # avoids some spam -for torchbench_dir in ( - "./torchbenchmark", - "../torchbenchmark", - "../torchbench", - "../benchmark", - "../../torchbenchmark", - "../../torchbench", - "../../benchmark", -): + + +def setup_torchbench_cwd(): + original_dir = abspath(os.getcwd()) + + os.environ["KALDI_ROOT"] = "/tmp" # avoids some spam + 
for torchbench_dir in ( + "./torchbenchmark", + "../torchbenchmark", + "../torchbench", + "../benchmark", + "../../torchbenchmark", + "../../torchbench", + "../../benchmark", + ): + if exists(torchbench_dir): + break + if exists(torchbench_dir): - break + torchbench_dir = abspath(torchbench_dir) + os.chdir(torchbench_dir) + sys.path.append(torchbench_dir) -if exists(torchbench_dir): - torchbench_dir = abspath(torchbench_dir) - os.chdir(torchbench_dir) - sys.path.append(torchbench_dir) + return original_dir # Some models have large dataset that doesn't fit in memory. Lower the batch @@ -338,6 +343,7 @@ def forward_and_backward_pass(self, mod, inputs, collect_outputs=True): if __name__ == "__main__": + original_dir = setup_torchbench_cwd() logging.basicConfig(level=logging.WARNING) warnings.filterwarnings("ignore") main(TorchBenchmarkRunner(), original_dir) From 83a6448f8b1b6fabd347b39944c52753592d5270 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 21 Oct 2022 16:21:43 +0000 Subject: [PATCH 0033/1922] Remove unused cold_start experiment (#87470) - this `--cold_start` experiment didn't end up being used - there is a new `--cold_start_latency` flag that is used - this experiment was only hooked up for nvfuser anyway cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87470 Approved by: https://github.com/anijain2305 --- benchmarks/dynamo/common.py | 89 ------------------------------------- 1 file changed, 89 deletions(-) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index a2f8af2bc825a..507f9db2d5b11 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -348,84 +348,6 @@ def randomize_input(inputs): ) -def cold_start_experiment(args, model_iter_fn, model, example_inputs, optimize_ctx): - compile_iters = 2 - total_iters = compile_iters + 2 - timings = np.zeros((total_iters, 2), np.float64) - # if we randomize the input, we should 
also check the result is correct - should_check_result = should_randomize_input = args.randomize_input - is_correct = True - - optimized_model_iter_fn = optimize_ctx(model_iter_fn) - for rep in range(total_iters): - inputs = ( - randomize_input(copy.deepcopy(example_inputs)) - if should_randomize_input - else example_inputs - ) - - # interleave the runs to handle frequency scaling and load changes - timings[rep, 0], expected_output = timed( - model, model_iter_fn, inputs, return_result=True - ) - timings[rep, 1], actual_output = timed( - model, optimized_model_iter_fn, inputs, return_result=True - ) - if should_check_result: - is_correct = is_correct and same(expected_output, actual_output) - pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue - worst = np.max(timings, axis=0) - - def breakeven(dynamo_times, eager_times): - """ - Solve for the number of iterations it takes dynamo to 'catch up' with eager, - taking into account the time it spent compiling. Assumes all compilation - happens up front and the model is static thereafter, which is definitely not - true in general but might be across torchbench. 
- - dc1, dc2 = dynamo compilation iterations (with Prof Exec) - d, e = dynamo, eager warmed up iteration - B = num iters to break even - dc1 + dc2 + (B-2)d = B*e - B = (dc1 + dc2 - 2d) / (e - d) - """ - dc1, dc2, d = dynamo_times[0], dynamo_times[1], np.median(dynamo_times[2:]) - e = np.median(eager_times) - if d < e: - return (dc1 + dc2 + 2 * d) / (e - d) - else: - # if optimized dynamo is not faster than eager we'll compute - # a nonsense negative number - return 0 - - speedup = worst[0] / worst[1] - eager_times, dynamo_times = timings[:, 0], timings[:, 1] - output_csv( - output_filename, - ("dev", "name", "batch_size", "cold-start speedup", "breakeven iters"), - [ - current_device, - current_name, - current_batch_size, - float(speedup), - breakeven(dynamo_times, eager_times), - ], - ) - - def format_speedup( - speedup, pvalue, breakeven_iters, is_correct=True, pvalue_threshold=0.1 - ): - if not is_correct: - return "ERROR" - if pvalue > pvalue_threshold: - return f"{speedup:.3f}x breakeven={breakeven_iters:.2f} iters SAME" - return f"{speedup:.3f}x breakeven={breakeven_iters:.2f} iters p={pvalue:.2f}" - - return format_speedup( - speedup, pvalue, breakeven(dynamo_times, eager_times), is_correct=is_correct - ) - - def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs): """ Measure speedups over eager. 
@@ -1527,9 +1449,6 @@ def parse_args(): action="store_true", help="speedup using the ltc backend without reusing compiled graph", ) - group.add_argument( - "--cold-start", action="store_true", help=help(cold_start_experiment) - ) group.add_argument( "--overhead", action="store_true", help=help(overhead_experiment) ) @@ -1769,14 +1688,6 @@ def main(runner, original_dir=None): optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython) experiment = speedup_experiment output_filename = "overheads.csv" - elif args.cold_start: - optimize_ctx = torch._dynamo.optimize("aot_nvfuser", nopython=args.nopython) - experiment = cold_start_experiment - assert args.nvfuser, "TODO - Add another aot string for mem fusion with NNC" - backend_str = "nvfuser" if args.nvfuser else "nnc" - output_filename = f"cold_start_{backend_str}.csv" - # TODO(whc) should we move this to a more general part of the script? - torch.backends.cuda.matmul.allow_tf32 = True elif args.inductor or args.inductor_dynamic: from torch._inductor import config as inductor_config From e83cc19610bc3610db7063b6876f78e7954fcb72 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 21 Oct 2022 16:21:43 +0000 Subject: [PATCH 0034/1922] Delete unused ltc experiments (#87471) cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87471 Approved by: https://github.com/anijain2305 --- benchmarks/dynamo/common.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 507f9db2d5b11..b31cf1a0642ab 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -1439,16 +1439,6 @@ def parse_args(): group.add_argument( "--coverage", action="store_true", help="(default) " + help(coverage_experiment) ) - group.add_argument( - "--speedup-ltc", - action="store_true", - help="speedup using the ltc backend", - ) - group.add_argument( - 
"--speedup-ltc-trivial", - action="store_true", - help="speedup using the ltc backend without reusing compiled graph", - ) group.add_argument( "--overhead", action="store_true", help=help(overhead_experiment) ) @@ -1707,18 +1697,6 @@ def main(runner, original_dir=None): optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython) experiment = speedup_experiment output_filename = "inductor.csv" - elif args.speedup_ltc: - optimize_ctx = torch._dynamo.optimize( - backends.ltc_reuse_graph, nopython=args.nopython - ) - experiment = speedup_experiment - output_filename = "speedups_ltc.csv" - elif args.speedup_ltc_trivial: - optimize_ctx = torch._dynamo.optimize( - backends.ltc_trivial, nopython=args.nopython - ) - experiment = speedup_experiment - output_filename = "speedups_ltc_trivial.csv" elif args.speedup_ts: experiment = speedup_experiment_ts output_filename = "baseline_ts.csv" From b1e71d0db2b56ac75fbafca16a83b7976a97ca5f Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 21 Oct 2022 16:21:43 +0000 Subject: [PATCH 0035/1922] Delete unused ts experiment (#87472) cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87472 Approved by: https://github.com/anijain2305 --- benchmarks/dynamo/common.py | 40 ------------------------------------- 1 file changed, 40 deletions(-) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index b31cf1a0642ab..0597cc513781d 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -587,40 +587,6 @@ def try_script(model, example_inputs): return None -def speedup_experiment_ts(args, model_iter_fn, model, example_inputs): - """ - Measure baseline performance (without using TorchDynamo) of TorchScript and optimize_for_inference. 
- - Writes to ./baseline_ts.csv - """ - if args.training: - return baselines( - [ - ("eager", model), - ("ts", try_script(model, example_inputs)), - ], - model_iter_fn, - example_inputs, - args, - ) - - return baselines( - [ - ("eager", model), - ("ts", try_script(model, example_inputs)), - ( - "ofi", - backends.ofi(try_script(model, example_inputs), example_inputs), - ), - # ("nnc", backends.nnc(try_script(model, example_inputs), example_inputs)), - # ("nvfuser", backends.nvfuser(try_script(model, example_inputs), example_inputs)), - ], - model_iter_fn, - example_inputs, - args, - ) - - def speedup_experiment_sr(args, model_iter_fn, model, example_inputs): """ Measure baseline performance (without using TorchDynamo) of static runtime. @@ -1442,9 +1408,6 @@ def parse_args(): group.add_argument( "--overhead", action="store_true", help=help(overhead_experiment) ) - group.add_argument( - "--speedup-ts", action="store_true", help=help(speedup_experiment_ts) - ) group.add_argument( "--speedup-sr", action="store_true", help=help(speedup_experiment_sr) ) @@ -1697,9 +1660,6 @@ def main(runner, original_dir=None): optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython) experiment = speedup_experiment output_filename = "inductor.csv" - elif args.speedup_ts: - experiment = speedup_experiment_ts - output_filename = "baseline_ts.csv" elif args.speedup_sr: experiment = speedup_experiment_sr output_filename = "baseline_sr.csv" From 4e4cd9d07b2a95bead2c577333434a25b1bea556 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 21 Oct 2022 16:21:43 +0000 Subject: [PATCH 0036/1922] Delete unused static runtime experiment (#87473) cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87473 Approved by: https://github.com/anijain2305 --- benchmarks/dynamo/common.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/benchmarks/dynamo/common.py 
b/benchmarks/dynamo/common.py index 0597cc513781d..01793b01e0e03 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -587,32 +587,6 @@ def try_script(model, example_inputs): return None -def speedup_experiment_sr(args, model_iter_fn, model, example_inputs): - """ - Measure baseline performance (without using TorchDynamo) of static runtime. - - Writes to ./baseline_sr.csv - """ - - if current_name not in ("opacus_cifar10", "timm_nfnet", "hf_T5"): - sr = backends.static_runtime(try_script(model, example_inputs), example_inputs) - else: - # segfaults on these models - sr = None - return baselines( - [ - ("eager", model), - ( - "sr", - sr, - ), - ], - model_iter_fn, - example_inputs, - args, - ) - - def speedup_experiment_onnx(args, model_iter_fn, model, example_inputs): """ Measure baseline performance (without using TorchDynamo) of ONNXRT and TensorFlow. @@ -1408,9 +1382,6 @@ def parse_args(): group.add_argument( "--overhead", action="store_true", help=help(overhead_experiment) ) - group.add_argument( - "--speedup-sr", action="store_true", help=help(speedup_experiment_sr) - ) group.add_argument( "--speedup-onnx", action="store_true", help=help(speedup_experiment_onnx) ) @@ -1660,9 +1631,6 @@ def main(runner, original_dir=None): optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython) experiment = speedup_experiment output_filename = "inductor.csv" - elif args.speedup_sr: - experiment = speedup_experiment_sr - output_filename = "baseline_sr.csv" elif args.speedup_onnx: experiment = speedup_experiment_onnx output_filename = "baseline_onnx.csv" From a0001256b681844324a50070bd3e3df0f9e441ff Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Fri, 21 Oct 2022 12:57:55 -0400 Subject: [PATCH 0037/1922] as_strided_scatter storage offset defaults to None not 0 (#87481) Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87481 Approved by: https://github.com/bdhirsh --- torch/_tensor_docs.py | 2 +- torch/_torch_docs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 9f8ce0c0f8520..b564351acf590 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1584,7 +1584,7 @@ def add_docstr_all(method, docstr): add_docstr_all( "as_strided_scatter", r""" -as_strided_scatter(src, size, stride, storage_offset=0) -> Tensor +as_strided_scatter(src, size, stride, storage_offset=None) -> Tensor See :func:`torch.as_strided_scatter` """, diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 00e7129cfb10e..d84ed259b6d38 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3740,7 +3740,7 @@ def merge_dicts(*dicts): add_docstr( torch.as_strided_scatter, r""" -as_strided_scatter(input, src, size, stride, storage_offset=0) -> Tensor +as_strided_scatter(input, src, size, stride, storage_offset=None) -> Tensor Embeds the values of the :attr:`src` tensor into :attr:`input` along the elements corresponding to the result of calling From ba51abbc4016d6564c79acd6d9b132285090c9cf Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 21 Oct 2022 22:53:35 +0000 Subject: [PATCH 0038/1922] fix docs push (#87498) push docs to temp branch first then push to actual branch to satisfy CLA check in branch protections Pull Request resolved: https://github.com/pytorch/pytorch/pull/87498 Approved by: https://github.com/malfet --- .circleci/scripts/cpp_doc_push_script.sh | 3 +++ .circleci/scripts/python_doc_push_script.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh index 4c22677e94bd3..6e66514ae93b9 100755 --- a/.circleci/scripts/cpp_doc_push_script.sh +++ b/.circleci/scripts/cpp_doc_push_script.sh @@ -98,6 +98,9 @@ git commit -m "Generate C++ docs from 
pytorch/pytorch@${GITHUB_SHA}" || true git status if [[ "${WITH_PUSH:-}" == true ]]; then + # push to a temp branch first to trigger CLA check and satisfy branch protections + git push -u origin HEAD:pytorchbot/temp-branch-cpp -f + sleep 30 git push -u origin fi diff --git a/.circleci/scripts/python_doc_push_script.sh b/.circleci/scripts/python_doc_push_script.sh index f9b019ec069b3..d255f77c82e8e 100755 --- a/.circleci/scripts/python_doc_push_script.sh +++ b/.circleci/scripts/python_doc_push_script.sh @@ -135,6 +135,9 @@ git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true git status if [[ "${WITH_PUSH:-}" == true ]]; then + # push to a temp branch first to trigger CLA check and satisfy branch protections + git push -u origin HEAD:pytorchbot/temp-branch-py -f + sleep 30 git push -u origin "${branch}" fi From 8c2df0853a5cf7761031d1b5bfcca39a7fb15b05 Mon Sep 17 00:00:00 2001 From: Jason Ansel Date: Fri, 21 Oct 2022 15:14:15 -0700 Subject: [PATCH 0039/1922] Reland #87025 and fix periodic tests (#87084) - Relands #87025 - disables failing tests related to https://github.com/pytorch/torchdynamo/issues/1697 - Reverts https://github.com/pytorch/pytorch/commit/d01eea6027c26bf100fc99a705669f60648964ae cc @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87084 Approved by: https://github.com/malfet, https://github.com/voznesenskym --- .jenkins/pytorch/common_utils.sh | 1 + .jenkins/pytorch/test.sh | 16 ++++------------ test/inductor/test_torchinductor.py | 13 +++++++++++++ test/inductor/test_torchinductor_opinfo.py | 19 +++++++++++++++++-- torch/_inductor/decomposition.py | 12 +++++++----- torch/_inductor/lowering.py | 2 -- 6 files changed, 42 insertions(+), 21 deletions(-) diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index d8c853f97ab23..d673a37f17b8f 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ 
-140,6 +140,7 @@ function install_triton() { else commit=$(get_pinned_commit triton) pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python" + pip_install --user jinja2 fi } diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 7e9d4f37edec1..ec77478769b4f 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -741,16 +741,12 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SH elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then test_without_numpy install_torchvision - if ! [[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then - install_triton - fi + install_triton test_python_shard 1 test_aten elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision - if ! [[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then - install_triton - fi + install_triton test_python_shard 2 test_libtorch test_aot_compilation @@ -759,9 +755,7 @@ elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then test_torch_function_benchmark elif [[ "${SHARD_NUMBER}" -gt 2 ]]; then # Handle arbitrary number of shards - if ! [[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then - install_triton - fi + install_triton test_python_shard "$SHARD_NUMBER" elif [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then test_vulkan @@ -779,9 +773,7 @@ elif [[ "${TEST_CONFIG}" == *functorch* ]]; then test_functorch else install_torchvision - if ! 
[[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then - install_triton - fi + install_triton install_monkeytype test_python test_aten diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index c4e82a8092437..52f36500b5025 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -758,6 +758,11 @@ def fn(a): self.common(fn, ((torch.rand((10, 3, 352, 352), dtype=torch.float16),))) def test_expanded_reduction(self): + if self.device == "cpu": + raise unittest.SkipTest( + "https://github.com/pytorch/torchdynamo/issues/1697" + ) + def fn(x, y): z = x * y return z.sum((0, 1)) @@ -3145,6 +3150,9 @@ def fn(a, dim, index, b): ) def test_scatter2(self): + if self.device == "cuda": + raise unittest.SkipTest("unstable on sm86") + def fn(a, dim, index, b): return aten.scatter.reduce(a, dim, index, b, reduce="add") @@ -3259,6 +3267,11 @@ def fn(a, dim, index, b): # issue #1150 def test_dense_mask_index(self): + if self.device == "cpu": + raise unittest.SkipTest( + "https://github.com/pytorch/torchdynamo/issues/1697" + ) + def fn(x, y): y = torch.ops.aten.select.int(y, 0, 2) z = x * y diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py index 220b711efcb51..371a825b28a30 100644 --- a/test/inductor/test_torchinductor_opinfo.py +++ b/test/inductor/test_torchinductor_opinfo.py @@ -140,6 +140,23 @@ def process(device_type): # Disabled on migration to core "linalg.pinv.singular": {f32, f64}, "linalg.householder_product": {f32}, + # These might be passing now? 
+ "T": {b8, f16, f32, f64, i32, i64}, + "H": {b8, f16, f32, f64, i32, i64}, + "__getitem__": {b8, f16, f32, f64, i32, i64}, + "acos": {b8, f16, f32, f64, i32, i64}, + "acosh": {b8, f16, f32, f64, i32, i64}, + "nn.functional.conv_transpose3d": {f16}, + "max.reduction_with_dim": {i32, i64}, + "min.reduction_with_dim": {i32, i64}, + "linalg.lu": {f32, f64}, + "lu_unpack": {f32, f64}, + "native_batch_norm": {f16, f32, f64}, + "native_layer_norm": {f16, f32, f64}, + # Issues on sm86 periodic job (complex numbers) + "cdouble": {b8, f16, f32, f64, i32, i64}, + "cfloat": {b8, f16, f32, f64, i32, i64}, + "randint": {b8, f16, f32, f64, i32, i64}, } inductor_expected_failures_single_sample = defaultdict(dict) @@ -354,7 +371,6 @@ def process(device_type): # AssertionError: Tensor-likes are not close! "erf": {b8, f64}, "nn.functional.gelu": {f64}, - "nn.functional.conv_transpose3d": {f16}, "nn.functional.triplet_margin_loss": {f16}, } @@ -365,7 +381,6 @@ def process(device_type): "cumprod": {f16}, "linalg.vector_norm": {f64, f64}, "linalg.householder_product": {f32}, - "linalg.lu": {f32, f64}, "kron": {f16}, "nanquantile": {f32, f64}, "native_batch_norm": {f16, f32, f64}, diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py index 5e67bfe6ef29e..6fed9ca691240 100644 --- a/torch/_inductor/decomposition.py +++ b/torch/_inductor/decomposition.py @@ -81,24 +81,26 @@ aten._reshape_alias, aten.select_backward, aten.select_scatter, + aten.sgn, aten.sigmoid_backward, + aten.silu, aten.silu_backward, aten.slice_backward, - aten.sgn, - aten.std_mean.correction, aten._softmax, aten._softmax_backward_data, + aten.softplus, + aten.softplus_backward, aten.stack, + aten.std_mean.correction, aten.t, aten.tanh_backward, aten.threshold_backward, aten.transpose.int, aten.tril.default, + aten.unfold, + aten.unfold_backward, aten.upsample_bilinear2d.vec, aten.upsample_nearest2d_backward, - aten.softplus, - aten.softplus_backward, - aten.silu, ] ) diff --git 
a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 49a136b440ed2..fd94aa9bc5d5a 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -1084,8 +1084,6 @@ def inner_fn(index): make_fallback(aten._sparse_coo_tensor_with_dims_and_tensors) make_fallback(aten._thnn_fused_lstm_cell) make_fallback(aten.topk) -make_fallback(aten.unfold) -make_fallback(aten.unfold_backward) make_fallback(aten.upsample_bicubic2d_backward) make_fallback(aten.upsample_bilinear2d_backward) From 95a7c042137260cdc0bf99038b4575f0aed9a315 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 21 Oct 2022 23:13:39 +0000 Subject: [PATCH 0040/1922] Re-enable dynamo ddp tests (#87524) - Move dynamo dist tests to another shard Pull Request resolved: https://github.com/pytorch/pytorch/pull/87524 Approved by: https://github.com/davidberard98 --- .jenkins/pytorch/test.sh | 2 + .../test_dynamo_distributed.py} | 47 +++++-------------- 2 files changed, 14 insertions(+), 35 deletions(-) rename test/{dynamo/test_distributed.py => distributed/test_dynamo_distributed.py} (88%) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index ec77478769b4f..adcaf82ffdd38 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -708,6 +708,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "$TEST_CONFIG" == distributed ]]; then + install_filelock + install_triton test_distributed # Only run RPC C++ tests on the first shard if [[ "${SHARD_NUMBER}" == 1 ]]; then diff --git a/test/dynamo/test_distributed.py b/test/distributed/test_dynamo_distributed.py similarity index 88% rename from test/dynamo/test_distributed.py rename to test/distributed/test_dynamo_distributed.py index 695e34817f37b..0fefd4ec507a7 100644 --- a/test/dynamo/test_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -3,15 +3,14 @@ import unittest from unittest.mock import patch -import pytest import torch - 
import torch._dynamo import torch._dynamo.test_case import torch.distributed as dist from torch import nn from torch._dynamo import config -from torch._dynamo.testing import same +from torch._dynamo.utils import same +from torch.nn.parallel import DistributedDataParallel as DDP class ToyModel(nn.Module): @@ -36,14 +35,6 @@ def compile_fn(self, gm, example_inputs): return gm -def skip_if_no_active_ddp(): - from torch.nn.parallel import DistributedDataParallel as DDP - - if not hasattr(DDP, "_get_active_ddp_module"): - raise unittest.SkipTest("requires pytorch landing in parallel") - - -@pytest.mark.skip("Module hangs in PyTorch CI") class TestDistributed(torch._dynamo.test_case.TestCase): """ Test harness initializes dist process group @@ -98,8 +89,10 @@ def test_ddp_baseline_inductor(self): outputs = ddp_m(inputs) self.assertTrue(same(correct_outputs, outputs)) - # can't run with gloo (no support for _allgather_base) and nccl not available in CI - @pytest.mark.xfail + # TODO(whc) move these tests to 'distributed' shard to get nccl, or see if it's available already in pytorch CI? 
+ @unittest.skip( + "can't run with gloo (no support for _allgather_base) and nccl not available in CI" + ) @patch.object(config, "optimize_ddp", False) def test_fsdp_baseline_aot_eager(self): from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -110,8 +103,7 @@ def test_fsdp_baseline_aot_eager(self): outputs = fsdp_m(inputs) self.assertTrue(same(correct_outputs, outputs)) - # hangs/crashes with inductor currently - @pytest.mark.skip + @unittest.skip("hangs/crashes with inductor currently") @patch.object(config, "optimize_ddp", False) def test_fsdp_baseline_inductor(self): from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -130,9 +122,6 @@ def test_graph_split(self): the user-provided compiler is called by the DDPOptimizer which is doing the graph splitting """ - from torch.nn.parallel import DistributedDataParallel as DDP - - skip_if_no_active_ddp() m, inputs, correct_outputs = self.get_model() ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=25) @@ -148,16 +137,13 @@ def opt_fn(inputs): self.assertEqual(check_splits_compiler.compiler_called, 3) # hangs/crashes with inductor currently - @pytest.mark.skip + @unittest.skip("hangs/crashes with inductor currently") @patch.object(config, "optimize_ddp", True) def test_graph_split_inductor(self): """ Same as above, but using inductor backend. We observed issues with inductor/fx interface in the past. """ - from torch.nn.parallel import DistributedDataParallel as DDP - - skip_if_no_active_ddp() m, inputs, correct_outputs = self.get_model() ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=25) @@ -174,9 +160,6 @@ def test_no_split(self): Ensures the DDPOptimizer returns a correct, compiled module without introducing graph splits. 
(Based on model parmeters fitting in the bucket) """ - from torch.nn.parallel import DistributedDataParallel as DDP - - skip_if_no_active_ddp() m, inputs, correct_outputs = self.get_model() ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250) @@ -196,9 +179,6 @@ def test_aot_autograd(self): Explicitly check AotAutograd family of compilers work, since they require example inputs propagated between graph splits. """ - from torch.nn.parallel import DistributedDataParallel as DDP - - skip_if_no_active_ddp() m, inputs, correct_outputs = self.get_model() ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=25) @@ -218,9 +198,6 @@ def test_custom_layer(self): the user-provided compiler is called by the DDPOptimizer which is doing the graph splitting """ - from torch.nn.parallel import DistributedDataParallel as DDP - - skip_if_no_active_ddp() class MyCustomLinear(torch.nn.Module): def __init__(self): @@ -281,7 +258,7 @@ def fn(): self.assertEqual(res, 1) -# TODO(jansel): debug issues running this in CI -# if __name__ == "__main__": -# from torch._dynamo.testing import run_tests -# run_tests() +if __name__ == "__main__": + from torch._dynamo.test_case import run_tests + + run_tests() From 4dd3a33e784f2419b588e786c4d39ee88db7e381 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Sat, 22 Oct 2022 03:43:08 +0000 Subject: [PATCH 0041/1922] Unified debug directory for dynamo/inductor tools (#87438) Fixes https://github.com/pytorch/torchdynamo/issues/1705 Fixes https://github.com/pytorch/torchdynamo/issues/1383 Adds a debug directory by default called `torchdynamo_debug` in the current working directory. In the debug directory for each run of dynamo (an enter and exit of optimize) folder run_\ is created which contains any minifier/inductor/torchdynamo artifacts under respective folders. 
Updated the minifier, record replay, and inductor tracing to use this directory cc @jansel @lezcano @fdrocha @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87438 Approved by: https://github.com/soumith --- test/dynamo/test_debug_dir.py | 96 +++++++++++++++++++++++++++++++ test/dynamo/test_minifier.py | 36 +++++++++--- test/dynamo/test_replay_record.py | 7 +-- torch/_dynamo/config.py | 6 +- torch/_dynamo/debug_utils.py | 24 ++++---- torch/_dynamo/eval_frame.py | 4 ++ torch/_dynamo/utils.py | 37 +++++++++++- torch/_inductor/debug.py | 7 ++- 8 files changed, 183 insertions(+), 34 deletions(-) create mode 100644 test/dynamo/test_debug_dir.py diff --git a/test/dynamo/test_debug_dir.py b/test/dynamo/test_debug_dir.py new file mode 100644 index 0000000000000..5827ff40ea781 --- /dev/null +++ b/test/dynamo/test_debug_dir.py @@ -0,0 +1,96 @@ +# Owner(s): ["module: dynamo"] +import shutil +import unittest + +import torch +import torch._dynamo.test_case +import torch._dynamo.testing +from torch._dynamo.utils import DebugDir, get_debug_dir + + +class DebugDirTests(torch._dynamo.test_case.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls._exit_stack.enter_context( + unittest.mock.patch.object( + torch._dynamo.config, + "debug_dir_root", + "/tmp/torch._dynamo_debug_dirs/", + ) + ) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True) + cls._exit_stack.close() + + def setUp(self): + super().setUp() + torch._dynamo.utils.debug_dir = DebugDir() + + def tearDown(self): + torch._dynamo.utils.debug_dir = DebugDir() + super().tearDown() + + def _setup(self): + debug_dir = torch._dynamo.utils.debug_dir + debug_dir.setup() + self.assertIsNotNone(debug_dir.debug_path) + self.assertEqual(debug_dir.num_setup_calls, 1) + return debug_dir + + def test_setup(self): + self._setup() + + def test_clear(self): + debug_dir = self._setup() + debug_dir.clear() + 
self.assertIsNone(debug_dir.debug_path) + self.assertEqual(debug_dir.num_setup_calls, 0) + + def test_multi_setup_single_clear(self): + debug_dir = self._setup() + prev = get_debug_dir() + + debug_dir.setup() + self.assertEqual(prev, get_debug_dir()) + self.assertEqual(debug_dir.num_setup_calls, 2) + + debug_dir.clear() + self.assertEqual(prev, get_debug_dir()) + self.assertEqual(debug_dir.num_setup_calls, 1) + + def test_multi_setup_multi_clear(self): + debug_dir = self._setup() + prev = get_debug_dir() + + debug_dir.setup() + self.assertEqual(prev, get_debug_dir()) + self.assertEqual(debug_dir.num_setup_calls, 2) + + debug_dir.clear() + self.assertEqual(prev, get_debug_dir()) + self.assertEqual(debug_dir.num_setup_calls, 1) + + debug_dir.clear() + self.assertIsNone(debug_dir.debug_path) + self.assertEqual(debug_dir.num_setup_calls, 0) + + def test_single_setup_single_clear(self): + debug_dir = self._setup() + debug_dir.clear() + self.assertIsNone(debug_dir.debug_path) + self.assertEqual(debug_dir.num_setup_calls, 0) + + def test_multi_get(self): + self._setup() + prev = get_debug_dir() + next = get_debug_dir() + self.assertEqual(prev, next) + + +if __name__ == "__main__": + from torch._dynamo.test_case import run_tests + + run_tests() diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py index 4570d15b2d148..a282485285797 100644 --- a/test/dynamo/test_minifier.py +++ b/test/dynamo/test_minifier.py @@ -1,10 +1,10 @@ # Owner(s): ["module: dynamo"] import os import shutil +import unittest from unittest.mock import patch import torch - import torch._dynamo import torch._dynamo.test_case import torch._dynamo.testing @@ -25,6 +25,30 @@ def forward(self, x): class MinfierTests(torch._dynamo.test_case.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls._exit_stack.enter_context( + unittest.mock.patch.object( + torch._dynamo.config, + "debug_dir_root", + "/tmp/_torchdynamo_debug_/", + ) + ) + + @classmethod + def 
tearDownClass(cls): + shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True) + cls._exit_stack.close() + + def setUp(self): + super().setUp() + torch._dynamo.utils.debug_dir.setup() + + def tearDown(self): + torch._dynamo.utils.debug_dir.clear() + super().tearDown() + def test_after_dynamo(self): @create_backend def bad_dynamo_backend(subgraph): @@ -43,12 +67,9 @@ def f(*args): mod = MockModule() opt_mod = torch._dynamo.optimize("bad_dynamo_backend")(mod) - repro_dir = "/tmp/test_minifier" - repro_file = os.path.join(repro_dir, "minifier_launcher.py") - shutil.rmtree(repro_dir, ignore_errors=True) + repro_file = torch._dynamo.debug_utils.get_minifier_repro_path() @patch.object(torch._dynamo.config, "repro_after", "dynamo") - @patch.object(torch._dynamo.config, "repro_dir", repro_dir) def inner(): x = torch.randn(4) try: @@ -65,14 +86,11 @@ def inner(): def _test_around_aot(self, error_at_aot): mod = MockModule() opt_mod = torch._dynamo.optimize("inductor")(mod) - repro_dir = "/tmp/test_minifier" - repro_file = os.path.join(repro_dir, "minifier_launcher.py") - shutil.rmtree(repro_dir, ignore_errors=True) + repro_file = torch._dynamo.debug_utils.get_minifier_repro_path() repro_after = "dynamo" if error_at_aot else "aot" @patch.object(torch._dynamo.config, "repro_after", repro_after) - @patch.object(torch._dynamo.config, "repro_dir", repro_dir) def inner(): x = torch.randn(4) x.requires_grad = error_at_aot diff --git a/test/dynamo/test_replay_record.py b/test/dynamo/test_replay_record.py index 378fd2b78a9bc..5235e355e0d1c 100644 --- a/test/dynamo/test_replay_record.py +++ b/test/dynamo/test_replay_record.py @@ -5,7 +5,6 @@ import unittest import torch - import torch._dynamo.test_case import torch._dynamo.testing @@ -37,14 +36,14 @@ def setUpClass(cls): cls._exit_stack.enter_context( unittest.mock.patch.object( torch._dynamo.config, - "replay_record_dir_name", - "/tmp/torch._dynamo_error_records/", + "debug_dir_root", + "/tmp/_torchdynamo_debug_/", ) ) 
@classmethod def tearDownClass(cls): - shutil.rmtree(torch._dynamo.config.replay_record_dir_name, ignore_errors=True) + shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True) cls._exit_stack.close() def check_replay(self, fn, *args, exp_exc_name=None): diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index 7a2c79972ddaa..701036789ffcb 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -83,7 +83,6 @@ # Record and write an execution record of the current frame to a file # if an exception is encountered replay_record_enabled = False -replay_record_dir_name = "./torchdynamo_error_records" # Show a warning on every graph break print_graph_breaks = False @@ -126,9 +125,6 @@ # 4: Dumps a minifier_launcher.py if the accuracy fails. repro_level = int(os.environ.get("TORCHDYNAMO_REPRO_LEVEL", 2)) -# Specify the directory where to save the repro artifacts -repro_dir = os.environ.get("TORCHDYNAMO_REPRO_DIR", None) - # Not all backends support scalars. Some calls on torch.Tensor (like .item()) return a scalar type. # When this flag is set to False, we introduce a graph break instead of capturing. capture_scalar_outputs = False @@ -159,6 +155,8 @@ else: base_dir = dirname(dirname(abspath(__file__))) +debug_dir_root = os.path.join(os.getcwd(), "torchdynamo_debug") + class _AccessLimitingConfig(ModuleType): def __setattr__(self, name, value): diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py index 845c518a4f85d..1134267c5f60d 100644 --- a/torch/_dynamo/debug_utils.py +++ b/torch/_dynamo/debug_utils.py @@ -16,13 +16,13 @@ from . 
import config from .optimizations.backends import register_backend -from .utils import clone_inputs +from .utils import clone_inputs, get_debug_dir log = logging.getLogger(__name__) def minifier_dir(): - path = config.repro_dir + path = os.path.join(get_debug_dir(), "minifier") if path is None: path = f"/tmp/minifier_{getpass.getuser()}" if not os.path.exists(path): @@ -331,8 +331,12 @@ def inductor_accuracy_fails(fx_g, args, check_str=None): return backend_aot_accuracy_fails(fx_g, args, compile_fx_inner) +def get_minifier_repro_path(): + return os.path.join(minifier_dir(), "minifier_launcher.py") + + def helper_for_dump_minify(contents): - minified_repro_path = os.path.join(minifier_dir(), "minifier_launcher.py") + minified_repro_path = get_minifier_repro_path() log.warning(f"Writing minified repro to {minified_repro_path}") try: with open(minified_repro_path, "w") as fd: @@ -341,15 +345,6 @@ def helper_for_dump_minify(contents): log.exception(e) raise NotImplementedError("Could not write to {minified_repro_path}") - local_path = os.path.join(config.base_dir, "minifier_launcher.py") - try: - shutil.copyfile(minified_repro_path, local_path) - log.warning( - f"Copying minified repro from {minified_repro_path} to {local_path} for convenience" - ) - except OSError: - log.warning(f"Don't have write permissions for {local_path}") - def dump_to_minify(gm, args, compiler_name: str): favored_device = 1 if torch.cuda.device_count() >= 2 else 0 @@ -827,7 +822,9 @@ def debug_wrapper(gm, example_inputs, **kwargs): example_inputs, compiler_name, ) - raise ValueError("Issue deteced. Repro at minifier_launcher.py.") + raise ValueError( + f"Issue detected. Repro at {get_minifier_repro_path()}." 
+ ) else: compiled_gm = compiler_fn(gm, example_inputs, **kwargs) @@ -877,7 +874,6 @@ def dynamo_minifier_backend(gm, example_inputs, compiler_name): @register_backend def dynamo_accuracy_minifier_backend(gm, example_inputs, compiler_name): from functorch.compile import minifier - from torchdynamo.optimizations.backends import BACKENDS if compiler_name == "inductor": diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index bf9a230a420b8..40beba357b1cf 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -103,12 +103,14 @@ def __enter__(self): "Please refer to https://github.com/pytorch/torchdynamo#usage-example " "to use torchdynamo.optimize(...) as an annotation/decorator. " ) + utils.debug_dir.setup() self.on_enter() self.prior = set_eval_frame(self.callback) self.backend_ctx = self.extra_ctx_ctor() self.backend_ctx.__enter__() def __exit__(self, exc_type, exc_val, exc_tb): + utils.debug_dir.clear() set_eval_frame(self.prior) self.prior = unset self.backend_ctx.__exit__(exc_type, exc_val, exc_tb) @@ -150,12 +152,14 @@ def __call__(self, *args, **kwargs): @functools.wraps(fn) def _fn(*args, **kwargs): on_enter() + utils.debug_dir.setup() prior = set_eval_frame(callback) backend_ctx = backend_ctx_ctor() backend_ctx.__enter__() try: return fn(*args, **kwargs) finally: + utils.debug_dir.clear() set_eval_frame(prior) backend_ctx.__exit__(None, None, None) diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index b66c240e0f04d..aa64de0eeef3b 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -3,6 +3,7 @@ import copy import cProfile import dataclasses +import datetime import dis import functools import gc @@ -197,7 +198,7 @@ def format_bytecode(prefix, name, filename, line_no, code): def gen_record_file_name(exc, code): - return f"{config.replay_record_dir_name}/\ + return f"{get_debug_dir()}/error_recordings/\ {code.co_name}_{type(exc).__name__}_{code.co_firstlineno}.rec" @@ -928,3 +929,37 @@ def 
recompile_reasons(code): rpt += "No cache-limited recompilations detected.\n" return rpt + + +class DebugDir: + def __init__(self): + self.num_setup_calls = 0 + self.debug_path = None + + def setup(self): + assert self.num_setup_calls >= 0 + if self.num_setup_calls == 0: + debug_root = config.debug_dir_root + dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f") + self.debug_path = os.path.join(debug_root, dir_name) + + self.num_setup_calls += 1 + + def clear(self): + assert self.num_setup_calls >= 0 + if self.num_setup_calls == 1: + self.debug_path = None + + self.num_setup_calls -= 1 + assert self.num_setup_calls >= 0 + + def get(self): + assert self.debug_path is not None + return self.debug_path + + +debug_dir = DebugDir() + + +def get_debug_dir(): + return debug_dir.get() diff --git a/torch/_inductor/debug.py b/torch/_inductor/debug.py index d2bc9bcd73344..f7fbfe218be39 100644 --- a/torch/_inductor/debug.py +++ b/torch/_inductor/debug.py @@ -19,7 +19,6 @@ from torch.fx.passes.tools_common import legalize_graph from . 
import config, ir -from .codecache import cache_dir from .scheduler import ( BaseSchedulerNode, ExternKernelSchedulerNode, @@ -182,7 +181,11 @@ def inner(*args, **kwargs): @staticmethod def create_debug_dir(): for n in DebugContext._counter: - dirname = os.path.join(cache_dir(), f"debug.{os.getpid()}.{n}") + dirname = os.path.join( + dynamo_utils.get_debug_dir(), + "torchinductor", + f"debug.{os.getpid()}.{n}", + ) if not os.path.exists(dirname): os.makedirs(dirname) return dirname From 409a9a385383719690fb5e9c6bffc0a0abee1c92 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Sat, 22 Oct 2022 01:03:41 +0000 Subject: [PATCH 0042/1922] Improvements for DDP Optimizer (#87525) - adds support for 'first_bucket_cap' arg, to align bucketing more precisely with DDP, which may start a smaller first bucket - refactors the bucket splitting logic to be cleaner - adds pretty-print for bucket info, and a way to access bucket info from the DDPOptimizer class from a test case or benchmark - dumps debug logs to stdout cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87525 Approved by: https://github.com/davidberard98 --- test/distributed/test_dynamo_distributed.py | 9 +- torch/_dynamo/optimizations/distributed.py | 120 ++++++++++++-------- 2 files changed, 76 insertions(+), 53 deletions(-) diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 0fefd4ec507a7..43a4a23039175 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -18,8 +18,8 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5): super().__init__() self.net = nn.Sequential( *[nn.Linear(in_feat, hidden_feat), nn.ReLU()] - + [nn.Linear(5000, 5000), nn.ReLU()] * num_hidden - + [nn.Linear(5000, 5), nn.ReLU()] + + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden + + [nn.Linear(hidden_feat, 5), 
nn.ReLU()] ) def forward(self, inputs): @@ -160,7 +160,10 @@ def test_no_split(self): Ensures the DDPOptimizer returns a correct, compiled module without introducing graph splits. (Based on model parmeters fitting in the bucket) """ - m, inputs, correct_outputs = self.get_model() + # DDP will always do a 'first bucket' with a really small size; so only a tiny model will escape this + m = ToyModel(hidden_feat=5).to(self.device) + inputs = torch.randn(20, 10).to(self.device) + correct_outputs = m(inputs) ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250) check_splits_compiler = CheckSplitsCompiler() diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py index f65c16483aec6..e674820032d97 100644 --- a/torch/_dynamo/optimizations/distributed.py +++ b/torch/_dynamo/optimizations/distributed.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass, field from typing import Any, List import torch @@ -18,6 +19,28 @@ def args_str(args): return str(args) +@dataclass +class Bucket: + size: int = 0 + params: List[str] = field(default_factory=list) + nodes: List[fx.Node] = field(default_factory=list) + + +def pretty_print_buckets(buckets: List[Bucket]): + headers = ("Index", "Size (b)", "Param Names") + rows = [] + for idx, bucket in enumerate(reversed(buckets)): + rows.append((idx, bucket.size, bucket.params[0])) + for param in bucket.params[1:]: + rows.append((None, None, param)) + try: + from tabulate import tabulate + + print(tabulate(rows, headers=headers, tablefmt="simple_grid")) + except ImportError: + print("Please `pip install tabulate` in order to pretty-print ddp bucket sizes") + + class DDPOptimizer: def __init__( self, @@ -25,8 +48,13 @@ def __init__( parameters_to_ignore: List[str], backend_compile_fn, debug=False, + first_bucket_cap: int = torch.distributed._DEFAULT_FIRST_BUCKET_BYTES, ): self.bucket_bytes_cap = bucket_bytes_cap + assert ( + first_bucket_cap <= bucket_bytes_cap + ), "First bucket should be 
smaller/equal to other buckets to get comms warmed up ASAP" + self.first_bucket_cap = first_bucket_cap self.parameters_to_ignore = parameters_to_ignore self.backend_compile_fn = backend_compile_fn self.debug = debug @@ -35,76 +63,69 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]): """ TODO: - handle params_and_buffers_to_ignore - - handle kwargs """ # 1: compute the partition map according to DDP bucket logic - bucket_bytes = 0 - bucket_actual_sizes = [] - node_splits = [[]] + buckets = [Bucket()] # (size, param_names) for node in reversed(gm.graph.nodes): - if node.op == "output" or node.op == "placeholder": + if node.op in ("output", "placeholder"): continue - if bucket_bytes >= self.bucket_bytes_cap: - bucket_actual_sizes.insert(0, bucket_bytes) - bucket_bytes = 0 - node_splits.insert(0, []) + if ( + buckets[0].size >= self.bucket_bytes_cap + or len(buckets) == 1 + and buckets[0].size >= self.first_bucket_cap + ): + buckets.insert(0, Bucket()) - elif node.op == "call_module": + if node.op == "call_module": target = gm.get_submodule(node.target) - params_size_b = sum( - [ - p.storage().nbytes() - for p in target.parameters() - if p.requires_grad - ] - ) - bucket_bytes += params_size_b - # print(f"accumulated {params_size_b} b from {node}") + for name, p in target.named_parameters(): + if p.requires_grad: + buckets[0].size += p.storage().nbytes() + # TODO correct FQ name? + buckets[0].params.append(f"{node}_{name}") elif node.op == "get_attr": maybe_param = getattr(gm, node.target) if maybe_param.requires_grad: - bucket_bytes += maybe_param.storage().nbytes() - else: - # TODO(whc) confirm this: - # (e.g. call_method, call_function aren't expected to 'have' parameters) - pass - - node_splits[0].append(node) - - if len(node_splits) == 1: - if self.debug: - print( - "DDPOptimizer did not split graphs." 
- f" Accumulated {bucket_bytes} bytes, and bucket cap is {self.bucket_bytes_cap}" - ) - return self.backend_compile_fn(gm, example_inputs) + buckets[0].size += maybe_param.storage().nbytes() + buckets[0].params.append(node.target) - if len(bucket_actual_sizes) < len(node_splits): - bucket_actual_sizes.insert(0, bucket_bytes) + # All nodes have to be mapped to a bucket, even if they don't have their own params + buckets[0].nodes.append(node) + # stash buckets for testing/debugging purposes + self.buckets = buckets if self.debug: print( - f"DDPOptimizer used bucket cap {self.bucket_bytes_cap}" - f" and split graphs into parameter sizes {', '.join([str(b) for b in bucket_actual_sizes])}" + f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:" ) + pretty_print_buckets(buckets) + + if len(buckets) == 1: + # bypass split/fuse logic if there is only one bucket + return self.backend_compile_fn(gm, example_inputs) # 2: partition the graphmodule according to bucket capacity partition_map = {} - for p, nodes in enumerate(node_splits): - for node in nodes: - partition_map[node] = p + for idx, b in enumerate(buckets): + for node in b.nodes: + partition_map[node] = idx split_gm = fx.passes.split_module.split_module( gm, None, lambda node: partition_map[node] ) if self.debug: - with open("debug_ddp_optimizer.log", "w") as dump_file: - dump_file.write("---orig graph---") - dump_file.write(str(gm.graph)) - dump_file.write("\n---split graph---") - dump_file.write(str(split_gm.graph)) + print("---orig graph---") + print(str(gm.graph)) + print("\n---split graph---") + print(str(split_gm.graph)) + for name, module in split_gm.named_modules(): + if "." 
not in name: + # only print the submod graphs, not their children + print(f"\n---{name} graph---") + print(str(module.graph)) + print("---------------") # 3: compile each of the partitioned submodules using the user-provided compiler class SubmodCompiler(torch.fx.interpreter.Interpreter): @@ -171,7 +192,6 @@ def run_node(self, n: Node) -> Any: self.module.delete_submodule(n.target) n.target = "compiled_" + n.target self.module.add_submodule(n.target, compiled_submod) - # then we execute the modified node using the usual logic return getattr(self, n.op)(n.target, args, kwargs) @@ -180,8 +200,8 @@ def run_node(self, n: Node) -> Any: split_gm.recompile() if self.debug: - with open("debug_ddp_optimizer.log", "a") as dump_file: - dump_file.write("\n---final graph---") - dump_file.write(str(split_gm.graph)) + print("\n---final graph---") + print(str(split_gm.graph)) + print("---------------") return split_gm From 1a0cc9ca29bf9d5336330def1acaf896194759ef Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Sat, 22 Oct 2022 04:51:33 +0000 Subject: [PATCH 0043/1922] Revert "Improvements for DDP Optimizer (#87525)" This reverts commit cf693a02e0f6a022d10fd882af20efacfe7ecb76. 
Reverted https://github.com/pytorch/pytorch/pull/87525 on behalf of https://github.com/ZainRizvi due to The macos error messages look like they were indeed caused by this PR --- test/distributed/test_dynamo_distributed.py | 9 +- torch/_dynamo/optimizations/distributed.py | 120 ++++++++------------ 2 files changed, 53 insertions(+), 76 deletions(-) diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 43a4a23039175..0fefd4ec507a7 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -18,8 +18,8 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5): super().__init__() self.net = nn.Sequential( *[nn.Linear(in_feat, hidden_feat), nn.ReLU()] - + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden - + [nn.Linear(hidden_feat, 5), nn.ReLU()] + + [nn.Linear(5000, 5000), nn.ReLU()] * num_hidden + + [nn.Linear(5000, 5), nn.ReLU()] ) def forward(self, inputs): @@ -160,10 +160,7 @@ def test_no_split(self): Ensures the DDPOptimizer returns a correct, compiled module without introducing graph splits. 
(Based on model parmeters fitting in the bucket) """ - # DDP will always do a 'first bucket' with a really small size; so only a tiny model will escape this - m = ToyModel(hidden_feat=5).to(self.device) - inputs = torch.randn(20, 10).to(self.device) - correct_outputs = m(inputs) + m, inputs, correct_outputs = self.get_model() ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250) check_splits_compiler = CheckSplitsCompiler() diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py index e674820032d97..f65c16483aec6 100644 --- a/torch/_dynamo/optimizations/distributed.py +++ b/torch/_dynamo/optimizations/distributed.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass, field from typing import Any, List import torch @@ -19,28 +18,6 @@ def args_str(args): return str(args) -@dataclass -class Bucket: - size: int = 0 - params: List[str] = field(default_factory=list) - nodes: List[fx.Node] = field(default_factory=list) - - -def pretty_print_buckets(buckets: List[Bucket]): - headers = ("Index", "Size (b)", "Param Names") - rows = [] - for idx, bucket in enumerate(reversed(buckets)): - rows.append((idx, bucket.size, bucket.params[0])) - for param in bucket.params[1:]: - rows.append((None, None, param)) - try: - from tabulate import tabulate - - print(tabulate(rows, headers=headers, tablefmt="simple_grid")) - except ImportError: - print("Please `pip install tabulate` in order to pretty-print ddp bucket sizes") - - class DDPOptimizer: def __init__( self, @@ -48,13 +25,8 @@ def __init__( parameters_to_ignore: List[str], backend_compile_fn, debug=False, - first_bucket_cap: int = torch.distributed._DEFAULT_FIRST_BUCKET_BYTES, ): self.bucket_bytes_cap = bucket_bytes_cap - assert ( - first_bucket_cap <= bucket_bytes_cap - ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP" - self.first_bucket_cap = first_bucket_cap self.parameters_to_ignore = parameters_to_ignore self.backend_compile_fn = 
backend_compile_fn self.debug = debug @@ -63,69 +35,76 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]): """ TODO: - handle params_and_buffers_to_ignore + - handle kwargs """ # 1: compute the partition map according to DDP bucket logic - buckets = [Bucket()] # (size, param_names) + bucket_bytes = 0 + bucket_actual_sizes = [] + node_splits = [[]] for node in reversed(gm.graph.nodes): - if node.op in ("output", "placeholder"): + if node.op == "output" or node.op == "placeholder": continue - if ( - buckets[0].size >= self.bucket_bytes_cap - or len(buckets) == 1 - and buckets[0].size >= self.first_bucket_cap - ): - buckets.insert(0, Bucket()) + if bucket_bytes >= self.bucket_bytes_cap: + bucket_actual_sizes.insert(0, bucket_bytes) + bucket_bytes = 0 + node_splits.insert(0, []) - if node.op == "call_module": + elif node.op == "call_module": target = gm.get_submodule(node.target) - for name, p in target.named_parameters(): - if p.requires_grad: - buckets[0].size += p.storage().nbytes() - # TODO correct FQ name? - buckets[0].params.append(f"{node}_{name}") + params_size_b = sum( + [ + p.storage().nbytes() + for p in target.parameters() + if p.requires_grad + ] + ) + bucket_bytes += params_size_b + # print(f"accumulated {params_size_b} b from {node}") elif node.op == "get_attr": maybe_param = getattr(gm, node.target) if maybe_param.requires_grad: - buckets[0].size += maybe_param.storage().nbytes() - buckets[0].params.append(node.target) + bucket_bytes += maybe_param.storage().nbytes() + else: + # TODO(whc) confirm this: + # (e.g. call_method, call_function aren't expected to 'have' parameters) + pass + + node_splits[0].append(node) + + if len(node_splits) == 1: + if self.debug: + print( + "DDPOptimizer did not split graphs." 
+ f" Accumulated {bucket_bytes} bytes, and bucket cap is {self.bucket_bytes_cap}" + ) + return self.backend_compile_fn(gm, example_inputs) - # All nodes have to be mapped to a bucket, even if they don't have their own params - buckets[0].nodes.append(node) + if len(bucket_actual_sizes) < len(node_splits): + bucket_actual_sizes.insert(0, bucket_bytes) - # stash buckets for testing/debugging purposes - self.buckets = buckets if self.debug: print( - f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:" + f"DDPOptimizer used bucket cap {self.bucket_bytes_cap}" + f" and split graphs into parameter sizes {', '.join([str(b) for b in bucket_actual_sizes])}" ) - pretty_print_buckets(buckets) - - if len(buckets) == 1: - # bypass split/fuse logic if there is only one bucket - return self.backend_compile_fn(gm, example_inputs) # 2: partition the graphmodule according to bucket capacity partition_map = {} - for idx, b in enumerate(buckets): - for node in b.nodes: - partition_map[node] = idx + for p, nodes in enumerate(node_splits): + for node in nodes: + partition_map[node] = p split_gm = fx.passes.split_module.split_module( gm, None, lambda node: partition_map[node] ) if self.debug: - print("---orig graph---") - print(str(gm.graph)) - print("\n---split graph---") - print(str(split_gm.graph)) - for name, module in split_gm.named_modules(): - if "." 
not in name: - # only print the submod graphs, not their children - print(f"\n---{name} graph---") - print(str(module.graph)) - print("---------------") + with open("debug_ddp_optimizer.log", "w") as dump_file: + dump_file.write("---orig graph---") + dump_file.write(str(gm.graph)) + dump_file.write("\n---split graph---") + dump_file.write(str(split_gm.graph)) # 3: compile each of the partitioned submodules using the user-provided compiler class SubmodCompiler(torch.fx.interpreter.Interpreter): @@ -192,6 +171,7 @@ def run_node(self, n: Node) -> Any: self.module.delete_submodule(n.target) n.target = "compiled_" + n.target self.module.add_submodule(n.target, compiled_submod) + # then we execute the modified node using the usual logic return getattr(self, n.op)(n.target, args, kwargs) @@ -200,8 +180,8 @@ def run_node(self, n: Node) -> Any: split_gm.recompile() if self.debug: - print("\n---final graph---") - print(str(split_gm.graph)) - print("---------------") + with open("debug_ddp_optimizer.log", "a") as dump_file: + dump_file.write("\n---final graph---") + dump_file.write(str(split_gm.graph)) return split_gm From f43dc9e5f9a47d53f3e679918f2afcd69cebfa36 Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Fri, 21 Oct 2022 23:01:17 +0000 Subject: [PATCH 0044/1922] Enable inductor CI for TIMM (#87462) cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87462 Approved by: https://github.com/anijain2305 --- .github/ci_commit_pins/timm.txt | 2 +- .github/workflows/inductor.yml | 12 ++++++++++-- .jenkins/pytorch/test.sh | 26 +++++++++++++++++++++++--- benchmarks/dynamo/common.py | 1 + benchmarks/dynamo/timm_models.py | 3 ++- 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/.github/ci_commit_pins/timm.txt b/.github/ci_commit_pins/timm.txt index 4b199567e9a7b..cdda1d14775c6 100644 --- a/.github/ci_commit_pins/timm.txt +++ b/.github/ci_commit_pins/timm.txt @@ -1 +1 @@ 
-ebee0a27940adfbb30444d83387b9ea0f1173f40 +6635bc3f7d06c6a0d0481803b24d6ad0004b61ac diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index a5aa7acaec0b9..da27466b60e90 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -22,8 +22,16 @@ jobs: cuda-arch-list: 8.6 test-matrix: | { include: [ - { config: "inductor", shard: 1, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 1, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 3, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 4, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 5, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 6, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 7, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 8, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 9, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 10, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} linux-bionic-cuda11_6-py3_10-gcc7-inductor-test: diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index adcaf82ffdd38..2b0de2ec35a6f 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -277,6 +277,19 @@ test_inductor_huggingface_shard() { python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_huggingface_"$1".csv } +test_inductor_timm_shard() { + if [[ -z "$NUM_TEST_SHARDS" ]]; then + echo "NUM_TEST_SHARDS must be defined to run a Python test shard" + exit 1 + fi + TEST_REPORTS_DIR=/tmp/test-reports 
+ mkdir -p "$TEST_REPORTS_DIR" + python benchmarks/dynamo/timm_models.py --ci --training --accuracy \ + --device cuda --inductor --float32 --total-partitions 8 --partition-id "$1" \ + --output "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv + python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv +} + test_python_gloo_with_tls() { source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh" assert_git_not_dirty @@ -729,17 +742,24 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR install_filelock install_triton test_dynamo_shard 2 -elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then +elif [[ "${TEST_CONFIG}" == *inductor* && $SHARD_NUMBER -lt 9 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision install_filelock install_triton - test_inductor -elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then + install_timm + id=$((SHARD_NUMBER-1)) + test_inductor_timm_shard $id +elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 9 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision install_filelock install_triton install_huggingface test_inductor_huggingface_shard 0 +elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 10 && $NUM_TEST_SHARDS -gt 1 ]]; then + install_torchvision + install_filelock + install_triton + test_inductor elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then test_without_numpy install_torchvision diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 01793b01e0e03..b1f8bbd993f3b 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -149,6 +149,7 @@ def set_model_name(name): "convit_base", # fp64_OOM "gernet_l", # accuracy "gluon_xception65", + "hrnet_w18", # accuracy "lcnet_0500", # accuracy "levit_128", # levit_128 "rexnet_100", # accuracy diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py 
index f7ff2559cbb8a..34b2078d23e36 100755 --- a/benchmarks/dynamo/timm_models.py +++ b/benchmarks/dynamo/timm_models.py @@ -205,7 +205,8 @@ def load_model( drop_rate=0.0, drop_path_rate=None, drop_block_rate=None, - pretrained=True, + # Skip downloading pretrained models for speedy CI + pretrained=not self.args.ci, # global_pool=kwargs.pop('gp', 'fast'), # num_classes=kwargs.pop('num_classes', None), # drop_rate=kwargs.pop('drop', 0.), From 2b0af59a61ca4330388161725ff2966759ed7158 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 22 Oct 2022 06:00:59 +0000 Subject: [PATCH 0045/1922] [BE] Delete BUILD_SPLIT_CUDA option (#87502) As we are linking with cuDNN and cuBLAS dynamically for all configs anyway, as statically linked cuDNN is different library than dynamically linked one, increases default memory footprint, etc, and libtorch_cuda even if compiled for all GPU architectures is no longer approaching 2Gb binary size limit, so BUILD_SPLIT_CUDA can go away. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87502 Approved by: https://github.com/atalman --- .../actions/test-pytorch-binary/action.yml | 1 - .github/workflows/_binary-build-linux.yml | 6 - .jenkins/pytorch/build.sh | 2 - .jenkins/pytorch/test.sh | 4 - .../win-test-helpers/build_pytorch.bat | 4 - .jenkins/pytorch/win-test.sh | 4 - CMakeLists.txt | 7 - aten/src/ATen/native/cuda/Bucketization.cu | 6 - aten/src/ATen/native/native_functions.yaml | 11 -- caffe2/CMakeLists.txt | 128 ++---------------- .../check_forward_backward_compatibility.py | 1 + torch/csrc/jit/codegen/cuda/nvfuser.cmake | 4 +- torch/utils/cpp_extension.py | 23 +--- 13 files changed, 20 insertions(+), 181 deletions(-) diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index bc2c546f57b28..be2090db533db 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -15,7 +15,6 @@ runs: -e BINARY_ENV_FILE \ -e 
BUILDER_ROOT \ -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ -e DESIRED_CUDA \ -e DESIRED_DEVTOOLSET \ -e DESIRED_PYTHON \ diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index a665f53bab5e0..6bd2ccd691918 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -167,11 +167,6 @@ jobs: git clean -fxd working-directory: builder - - name: Set BUILD_SPLIT_CUDA - if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && startsWith(inputs.GPU_ARCH_VERSION, '11') }} - shell: bash - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: @@ -184,7 +179,6 @@ jobs: -e BINARY_ENV_FILE \ -e BUILDER_ROOT \ -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ -e DESIRED_CUDA \ -e DESIRED_DEVTOOLSET \ -e DESIRED_PYTHON \ diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 58cdc1227ac2d..13ee6309c0655 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -41,8 +41,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then - # enable split torch_cuda build option in CMake - export BUILD_SPLIT_CUDA=ON if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then # TODO: there is a linking issue when building with UCC using clang, # disable it for now and to be fix later. 
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 2b0de2ec35a6f..b263c1949c10f 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -97,10 +97,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" fi -if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then - export BUILD_SPLIT_CUDA=ON -fi - if [[ "$TEST_CONFIG" == *crossref* ]]; then export PYTORCH_TEST_WITH_CROSSREF=1 fi diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index b85dad0616cd7..9c9c9cd64290b 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -135,10 +135,6 @@ if "%REBUILD%" == "" ( if not errorlevel 0 exit /b ) ) -:: tests if BUILD_ENVIRONMENT contains cuda11 as a substring -if not x%BUILD_ENVIRONMENT:cuda11=%==x%BUILD_ENVIRONMENT% ( - set BUILD_SPLIT_CUDA=ON -) python setup.py bdist_wheel && sccache --show-stats && python -c "import os, glob; os.system('python -mpip install ' + glob.glob('dist/*.whl')[0] + '[opt-einsum]')" ( if "%BUILD_ENVIRONMENT%"=="" ( diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index dc28521204878..560b039dbf679 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -39,10 +39,6 @@ fi export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers -if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then - export BUILD_SPLIT_CUDA=ON -fi - if [[ "$TEST_CONFIG" = "force_on_cpu" ]]; then # run the full test suite for force_on_cpu test export USE_CUDA=0 diff --git a/CMakeLists.txt b/CMakeLists.txt index dae1dd4bc14fb..e2e3bf0e3f8d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -187,13 +187,6 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address Sanitizer" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) option(USE_CUDA "Use CUDA" ON) -# 
BUILD_SPLIT_CUDA must also be exported as an environment variable before building, with -# `export BUILD_SPLIT_CUDA=1` because cpp_extension.py can only work properly if this variable -# also exists in the environment. -# This option is incompatible with CUDA_SEPARABLE_COMPILATION. -cmake_dependent_option( - BUILD_SPLIT_CUDA "Split torch_cuda library into torch_cuda_cu and torch_cuda_cpp" OFF - "USE_CUDA AND NOT CUDA_SEPARABLE_COMPILATION" OFF) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) option(USE_FAST_NVCC "Use parallel NVCC build" OFF) diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index 2a3d5730d7860..21c582216628e 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -10,7 +10,6 @@ #include #include #else -#include #include #include #include @@ -191,11 +190,6 @@ Tensor searchsorted_cuda( return result; } -// See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml -Tensor _torch_cuda_cu_linker_symbol_op_cuda(const Tensor& self) { - return self; -} - Tensor searchsorted_cuda( const Tensor& sorted_sequence, const Scalar& self, diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ba1d38aa350b5..b827999cf54e9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9930,17 +9930,6 @@ CPU: searchsorted_cpu CUDA: searchsorted_cuda -# [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] -# This is a DUMMY function to force the linking against torch_cuda_cu on Windows. -# Otherwise, the Windows linker will optimize and not include torch_cuda_cu even when we -# want it to be included. 
This is similar to what we do with warp_size for torch_cuda_cpp, -# described as the solution to this issue: https://github.com/pytorch/pytorch/issues/31611 -# This op should NOT be used or exposed or edited or else Windows builds (with BUILD_SPLIT_CUDA) will break. -- func: _torch_cuda_cu_linker_symbol_op(Tensor self) -> Tensor - dispatch: - CUDA: _torch_cuda_cu_linker_symbol_op_cuda - autogen: _torch_cuda_cu_linker_symbol_op.out - - func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: searchsorted_out_cpu diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 64d53de5a64bb..aa6dfd2841bac 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -883,10 +883,6 @@ file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT}) # Wrapper library for people who link against torch and expect both CPU and CUDA support # Contains "torch_cpu" and "torch_cuda" add_library(torch ${DUMMY_EMPTY_FILE}) -if(BUILD_SPLIT_CUDA) - # When we split torch_cuda, we want a dummy torch_cuda library that contains both parts - add_library(torch_cuda ${DUMMY_EMPTY_FILE}) -endif() if(HAVE_SOVERSION) set_target_properties(torch PROPERTIES VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) @@ -926,37 +922,19 @@ elseif(USE_CUDA) ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY}) set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF) target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key) - elseif(BUILD_SPLIT_CUDA) - add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}) - add_library(torch_cuda_cu ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY}) else() add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY} ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY}) endif() set(CUDA_LINK_LIBRARIES_KEYWORD) - if(BUILD_SPLIT_CUDA) - 
torch_compile_options(torch_cuda_cpp) # see cmake/public/utils.cmake - torch_compile_options(torch_cuda_cu) # see cmake/public/utils.cmake - target_compile_definitions(torch_cuda_cpp PRIVATE BUILD_SPLIT_CUDA) - target_compile_definitions(torch_cuda_cpp PRIVATE USE_CUDA) - target_compile_definitions(torch_cuda_cu PRIVATE BUILD_SPLIT_CUDA) - target_compile_definitions(torch_cuda_cu PRIVATE USE_CUDA) - else() - torch_compile_options(torch_cuda) # see cmake/public/utils.cmake - target_compile_definitions(torch_cuda PRIVATE USE_CUDA) - endif() - if(USE_NCCL AND BUILD_SPLIT_CUDA) - target_link_libraries(torch_cuda_cpp PRIVATE __caffe2_nccl) - target_compile_definitions(torch_cuda_cpp PRIVATE USE_NCCL) - elseif(USE_NCCL) + torch_compile_options(torch_cuda) # see cmake/public/utils.cmake + target_compile_definitions(torch_cuda PRIVATE USE_CUDA) + if(USE_NCCL) target_link_libraries(torch_cuda PRIVATE __caffe2_nccl) target_compile_definitions(torch_cuda PRIVATE USE_NCCL) endif() - if(USE_UCC AND BUILD_SPLIT_CUDA) - target_link_libraries(torch_cuda_cpp PRIVATE __caffe2_ucc) - target_compile_definitions(torch_cuda_cpp PRIVATE USE_UCC) - elseif(USE_UCC) + if(USE_UCC) target_link_libraries(torch_cuda PRIVATE __caffe2_ucc) target_compile_definitions(torch_cuda PRIVATE USE_UCC) endif() @@ -998,13 +976,8 @@ elseif(USE_CUDA) endif() if(USE_PRECOMPILED_HEADERS) - if(BUILD_SPLIT_CUDA) - target_precompile_headers(torch_cuda_cpp PRIVATE - "$<$:ATen/core/ATen_pch.h>") - else() - target_precompile_headers(torch_cuda PRIVATE - "$<$:ATen/core/ATen_pch.h>") - endif() + target_precompile_headers(torch_cuda PRIVATE + "$<$:ATen/core/ATen_pch.h>") endif() endif() @@ -1085,12 +1058,7 @@ if(NOT NO_API) ${TORCH_SRC_DIR}/csrc/api/include) endif() -if(BUILD_SPLIT_CUDA AND MSVC) - # -INCLUDE is used to ensure torch_cuda_cpp/cu are linked against in a project that relies on them. 
- target_link_libraries(torch_cuda_cpp INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ") - # See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml - target_link_libraries(torch_cuda_cu INTERFACE "-INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z") -elseif(USE_CUDA AND MSVC) +if(USE_CUDA AND MSVC) # -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on them. # Related issue: https://github.com/pytorch/pytorch/issues/31611 target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ") @@ -1320,27 +1288,16 @@ if(USE_DISTRIBUTED) if(USE_UCC AND USE_C10D_UCC) target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC) if(USE_CUDA) - if(BUILD_SPLIT_CUDA) - target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_UCC) - else() - target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC) - endif() + target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC) endif() endif() if(USE_NCCL AND USE_C10D_NCCL) if(USE_ROCM) target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL) else() - if(BUILD_SPLIT_CUDA) - target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL) - if(USE_NCCL_WITH_UCC) - target_compile_definitions(torch_cuda_cpp PUBLIC USE_NCCL_WITH_UCC) - endif() - else() - target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL) - if(USE_NCCL_WITH_UCC) - target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC) - endif() + target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL) + if(USE_NCCL_WITH_UCC) + target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC) endif() endif() endif() @@ -1423,14 +1380,7 @@ torch_set_target_props(torch_cpu) target_compile_options(torch_cpu PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") -if(BUILD_SPLIT_CUDA) - target_compile_options(torch_cuda_cu PRIVATE "-DTORCH_CUDA_CU_BUILD_MAIN_LIB") - target_compile_options(torch_cuda_cpp PRIVATE "-DTORCH_CUDA_CPP_BUILD_MAIN_LIB") - # NB: This must be 
target_compile_definitions, not target_compile_options, - # as the latter is not respected by nvcc - target_compile_definitions(torch_cuda_cu PRIVATE "-DTORCH_CUDA_CU_BUILD_MAIN_LIB") - target_compile_definitions(torch_cuda_cpp PRIVATE "-DTORCH_CUDA_CPP_BUILD_MAIN_LIB") -elseif(USE_CUDA) +if(USE_CUDA) target_compile_options(torch_cuda PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB") # NB: This must be target_compile_definitions, not target_compile_options, # as the latter is not respected by nvcc @@ -1441,10 +1391,7 @@ elseif(USE_ROCM) endif() if(USE_EXPERIMENTAL_CUDNN_V8_API) - if(BUILD_SPLIT_CUDA) - target_compile_definitions(torch_cuda_cu PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API") - target_compile_definitions(torch_cuda_cpp PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API") - elseif(USE_CUDA) + if(USE_CUDA) target_compile_definitions(torch_cuda PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API") endif() endif() @@ -1534,10 +1481,6 @@ caffe2_interface_library(torch_cpu torch_cpu_library) if(USE_CUDA) caffe2_interface_library(torch_cuda torch_cuda_library) - if(BUILD_SPLIT_CUDA) - caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library) - caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library) - endif() elseif(USE_ROCM) caffe2_interface_library(torch_hip torch_hip_library) endif() @@ -1548,10 +1491,6 @@ install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${ if(USE_CUDA) install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") - if(BUILD_SPLIT_CUDA) - install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") - install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") - endif() elseif(USE_ROCM) install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() @@ -1561,11 +1500,6 @@ target_link_libraries(torch PUBLIC torch_cpu_library) if(USE_CUDA) 
target_link_libraries(torch PUBLIC torch_cuda_library) - if(BUILD_SPLIT_CUDA) - # NS: Library order is important here to prevent cudnn double linking - target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library) - target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library) - endif() elseif(USE_ROCM) target_link_libraries(torch PUBLIC torch_hip_library) endif() @@ -1578,10 +1512,7 @@ endif() # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) - if(BUILD_SPLIT_CUDA) - install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) - install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) - elseif(USE_CUDA) + if(USE_CUDA) install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) elseif(USE_ROCM) install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) @@ -1589,36 +1520,7 @@ if(MSVC AND BUILD_SHARED_LIBS) endif() # ---[ CUDA library. -if(BUILD_SPLIT_CUDA) - target_link_libraries(torch_cuda_cu INTERFACE torch::cudart) - target_link_libraries(torch_cuda_cpp INTERFACE torch::cudart) - target_link_libraries(torch_cuda_cu PUBLIC c10_cuda torch::nvtoolsext) - target_link_libraries(torch_cuda_cpp PUBLIC c10_cuda torch::nvtoolsext) - - target_include_directories( - torch_cuda_cu INTERFACE $) - target_include_directories( - torch_cuda_cpp INTERFACE $) - target_include_directories( - torch_cuda_cu PRIVATE ${Caffe2_GPU_INCLUDE}) - target_include_directories( - torch_cuda_cpp PRIVATE ${Caffe2_GPU_INCLUDE}) - target_link_libraries( - torch_cuda_cu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS}) - target_link_libraries( - torch_cuda_cpp PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS}) - target_link_libraries(torch_cuda_cu PRIVATE torch_cuda_cpp) - if(USE_CUDNN) - target_link_libraries( - torch_cuda_cpp PRIVATE caffe2::cudnn-private) - endif() - - # These public dependencies must go after the previous dependencies, as the - # order of the libraries in the linker call matters here when 
statically - # linking; libculibos and cublas must be last. - target_link_libraries(torch_cuda_cpp PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) - target_link_libraries(torch_cuda_cu PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) -elseif(USE_CUDA) +if(USE_CUDA) target_link_libraries(torch_cuda INTERFACE torch::cudart) target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext) diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index f10fd14393580..5f13834ee77e0 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -290,6 +290,7 @@ ("aten::nested_to_padded_tensor", datetime.date(2022, 10, 1)), ("aten::nested_tensor", datetime.date(2022, 10, 15)), ("aten::_nested_tensor_layer_norm", datetime.date(2022, 10, 15)), + ("aten::_torch_cuda_cu_linker_symbol_op", datetime.date(2022, 11, 1)), ] diff --git a/torch/csrc/jit/codegen/cuda/nvfuser.cmake b/torch/csrc/jit/codegen/cuda/nvfuser.cmake index 526a674e4fb4c..147003054766b 100644 --- a/torch/csrc/jit/codegen/cuda/nvfuser.cmake +++ b/torch/csrc/jit/codegen/cuda/nvfuser.cmake @@ -1,6 +1,4 @@ -if(BUILD_SPLIT_CUDA) - set(TORCHLIB_FLAVOR torch_cuda_cu) # chose torch_cuda_cu here since JIT is in torch_cuda_cpp -elseif(USE_CUDA) +if(USE_CUDA) set(TORCHLIB_FLAVOR torch_cuda) elseif(USE_ROCM) set(TORCHLIB_FLAVOR torch_hip) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 9ab43e5ccdd67..36811bf22dedc 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -38,9 +38,6 @@ TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib') -BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') or (os.path.exists(os.path.join( - TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}')) and os.path.exists(os.path.join(TORCH_LIB_PATH, 
f'{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}'))) - SUBPROCESS_DECODE_ARGS = ('oem',) if IS_WINDOWS else () MINIMUM_GCC_VERSION = (5, 0, 0) MINIMUM_MSVC_VERSION = (19, 0, 24215) @@ -1060,11 +1057,7 @@ def CUDAExtension(name, sources, *args, **kwargs): else: libraries.append('cudart') libraries.append('c10_cuda') - if BUILD_SPLIT_CUDA: - libraries.append('torch_cuda_cu') - libraries.append('torch_cuda_cpp') - else: - libraries.append('torch_cuda') + libraries.append('torch_cuda') kwargs['libraries'] = libraries include_dirs = kwargs.get('include_dirs', []) @@ -1657,15 +1650,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone): if with_cuda: extra_ldflags.append('c10_cuda.lib') extra_ldflags.append('torch_cpu.lib') - if BUILD_SPLIT_CUDA and with_cuda: - extra_ldflags.append('torch_cuda_cu.lib') - # See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml - extra_ldflags.append('-INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z') - extra_ldflags.append('torch_cuda_cpp.lib') - # /INCLUDE is used to ensure torch_cuda_cpp is linked against in a project that relies on it. - # Related issue: https://github.com/pytorch/pytorch/issues/31611 - extra_ldflags.append('-INCLUDE:?warp_size@cuda@at@@YAHXZ') - elif with_cuda: + if with_cuda: extra_ldflags.append('torch_cuda.lib') # /INCLUDE is used to ensure torch_cuda is linked against in a project that relies on it. 
# Related issue: https://github.com/pytorch/pytorch/issues/31611 @@ -1682,9 +1667,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone): if with_cuda: extra_ldflags.append('-lc10_hip' if IS_HIP_EXTENSION else '-lc10_cuda') extra_ldflags.append('-ltorch_cpu') - if BUILD_SPLIT_CUDA and with_cuda: - extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda_cu -ltorch_cuda_cpp') - elif with_cuda: + if with_cuda: extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda') extra_ldflags.append('-ltorch') if not is_standalone: From be7d04682b8f987f37091eaa049e1639f6e355a7 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 22 Oct 2022 06:06:15 +0000 Subject: [PATCH 0046/1922] [CI] Run all MacOS builds on MacOS-12 (#87496) Not sure why we needed macos-10.15 for libtorch Pull Request resolved: https://github.com/pytorch/pytorch/pull/87496 Approved by: https://github.com/atalman, https://github.com/seemethere --- .../macos_binary_build_workflow.yml.j2 | 10 +------- ...rated-macos-arm64-binary-conda-nightly.yml | 3 +++ ...rated-macos-arm64-binary-wheel-nightly.yml | 4 ++++ .../generated-macos-binary-conda-nightly.yml | 4 ++++ ...acos-binary-libtorch-cxx11-abi-nightly.yml | 24 +++++++++---------- ...acos-binary-libtorch-pre-cxx11-nightly.yml | 24 +++++++++---------- .../generated-macos-binary-wheel-nightly.yml | 4 ++++ 7 files changed, 40 insertions(+), 33 deletions(-) diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 149c007daef9e..5e6b505664e60 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -58,17 +58,8 @@ jobs: {%- for config in build_configs %} !{{ config["build_name"] }}-build: if: ${{ github.repository_owner == 'pytorch' }} - {%- if config["package_type"] == "libtorch" %} - runs-on: macos-10.15 - {%- else %} runs-on: macos-12-xl - {%- endif %} -{%- if 
config["package_type"] == "libtorch" %} - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 -{%- else %} timeout-minutes: !{{ common.timeout_minutes }} -{%- endif %} !{{ upload.binary_env(config, true) }} # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -82,6 +73,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" !{{ common.checkout(deep_clone=False, directory="pytorch") }} !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Install sccache (only for non-forked PRs, and pushes to trunk) diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml index 52fe582aa59ee..5d47cc77cf3a7 100644 --- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml @@ -71,6 +71,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -180,6 +181,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -289,6 +291,7 @@ jobs: chmod +x 
"${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index fc5f84d9484ea..e58d153269b38 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -71,6 +71,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -180,6 +181,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -289,6 +291,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -398,6 +401,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo 
"DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml index 8fab29ddaed9f..079687e6ff951 100644 --- a/.github/workflows/generated-macos-binary-conda-nightly.yml +++ b/.github/workflows/generated-macos-binary-conda-nightly.yml @@ -69,6 +69,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -178,6 +179,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -287,6 +289,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -396,6 +399,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: diff --git 
a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml index ae63f95bc3189..dcb480b0a07ce 100644 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml +++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml @@ -34,9 +34,8 @@ concurrency: jobs: libtorch-cpu-shared-with-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -74,6 +73,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -149,9 +149,8 @@ jobs: uses: ./.github/workflows/_binary-upload.yml libtorch-cpu-shared-without-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -189,6 +188,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -264,9 +264,8 @@ jobs: uses: ./.github/workflows/_binary-upload.yml 
libtorch-cpu-static-with-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -304,6 +303,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -379,9 +379,8 @@ jobs: uses: ./.github/workflows/_binary-upload.yml libtorch-cpu-static-without-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -419,6 +418,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml index 39ad514a56702..5f02ea874b4e4 100644 --- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml +++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml @@ -34,9 +34,8 @@ concurrency: jobs: libtorch-cpu-shared-with-deps-pre-cxx11-build: if: ${{ github.repository_owner == 
'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -74,6 +73,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -149,9 +149,8 @@ jobs: uses: ./.github/workflows/_binary-upload.yml libtorch-cpu-shared-without-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -189,6 +188,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -264,9 +264,8 @@ jobs: uses: ./.github/workflows/_binary-upload.yml libtorch-cpu-static-with-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -304,6 +303,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo 
"${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -379,9 +379,8 @@ jobs: uses: ./.github/workflows/_binary-upload.yml libtorch-cpu-static-without-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 + runs-on: macos-12-xl + timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder @@ -419,6 +418,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml index 70d6783dbe881..081f470d6109f 100644 --- a/.github/workflows/generated-macos-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml @@ -69,6 +69,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -178,6 +179,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: 
Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -287,6 +289,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: @@ -396,6 +399,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: From b0ae042db3c33f38b91b7dcbd01d224f8a15af6d Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Sat, 22 Oct 2022 02:21:07 +0000 Subject: [PATCH 0047/1922] Fix meta for meta_fill_ (#87493) Existing meta_fill_ doesn't correctly reflect the aliasing relationship for aten.fill. A new MetaTensor should be returned instead. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87493 Approved by: https://github.com/eellison, https://github.com/bdhirsh --- test/test_meta.py | 10 ++++++++++ torch/_meta_registrations.py | 10 ++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/test/test_meta.py b/test/test_meta.py index 6b283da39cbe0..23e7025140138 100644 --- a/test/test_meta.py +++ b/test/test_meta.py @@ -1013,6 +1013,16 @@ def test_huber_loss_backward(self): self.assertEqual(r.device.type, 'meta') self.assertEqual(r.shape, inps[0].shape) + def test_fill_alias_relationship(self): + inps = torch.rand(2**52, device='meta') + r = torch.ops.aten.fill_(inps, 1.0) + # aten.fill_ returns an aliase + self.assertEqual(id(inps), id(r)) + + # aten.fill returns a new tensor + r2 = torch.ops.aten.fill(inps, 1.0) + self.assertNotEqual(id(inps), id(r2)) + def test_map_location_deserialize(self): import io diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index c17aa091120cc..7be63af9e051a 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1111,14 +1111,16 @@ def meta_zero_(self): return self -@register_meta( - [aten.fill.Tensor, aten.fill.Scalar, aten.fill_.Tensor, aten.fill_.Scalar], - register_dispatcher=False, -) +@register_meta([aten.fill_.Tensor, aten.fill_.Scalar], register_dispatcher=False) def meta_fill_(self, val): return self +@register_meta([aten.fill.Tensor, aten.fill.Scalar], register_dispatcher=False) +def meta_fill(self, val): + return self.new_empty(self.shape) + + @register_meta(aten.relu_.default, register_dispatcher=False) def meta_relu_(self): return self From 80f29948806963fc1e95048c14951473b83dea41 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Sat, 22 Oct 2022 17:59:25 +0000 Subject: [PATCH 0048/1922] Add xlogy and xlog1py references (#77712) * Add reference implementations for `xlogy` and `xlog1py` * Replace `_wrap_scalar` helper function with `scalar_tensor` prim Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/77712 Approved by: https://github.com/mruberry --- test/functorch/test_aotdispatch.py | 1 - test/test_proxy_tensor.py | 1 - torch/_decomp/decompositions.py | 1 - torch/_refs/__init__.py | 29 +++++++++++++-- torch/_refs/special/__init__.py | 36 +++++++++++++++++-- .../_internal/common_methods_invocations.py | 15 ++++++++ 6 files changed, 75 insertions(+), 8 deletions(-) diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index e9a46b0882e2e..c058b3618ecb1 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -1212,7 +1212,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _): xfail('sort', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('special.i1', ''), # aten.i0.default - couldn't find symbolic meta function/decomposition xfail('special.polygamma', 'special_polygamma_n_0'), # aten.polygamma.default - couldn't find symbolic ... - xfail('special.xlog1py', ''), # aten.special_xlog1py.default - couldn't find symbolic meta function/deco... xfail('split', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('std', ''), # Cannot call numel() on tensor with symbolic sizes/strides xfail('std_mean', ''), # Cannot call numel() on tensor with symbolic sizes/strides diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index 914261ae1c6ab..0092daa77ab49 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -1328,7 +1328,6 @@ def f(a, b, c, d, e): xfail('special.polygamma', 'special_polygamma_n_0'), # aten.polygamma.default - couldn't find symbolic meta function/... xfail('special.scaled_modified_bessel_k0', ''), # aten.special_scaled_modified_bessel_k0.default - couldn't find symbo... xfail('special.scaled_modified_bessel_k1', ''), # aten.special_scaled_modified_bessel_k1.default - couldn't find symbo... 
- xfail('special.xlog1py', ''), # aten.special_xlog1py.default - couldn't find symbolic meta function/decomposition xfail('split', ''), # 'torch._C.SymIntNode' and 'int' xfail('stft', ''), # argument 'size' must be tuple of ints, but found element of type torch._C.SymIntNode at... xfail('sum_to_size', ''), # aten.size.default - couldn't find symbolic meta function/decomposition diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 9e9c36104ddc5..2b4d2914fe858 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -1433,7 +1433,6 @@ def _to_copy( return x -@register_decomposition(aten.xlogy.Tensor) @pw_cast_for_int_to_real def xlogy(self: Tensor, other: Tensor) -> Tensor: return aten.where( diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 08e1361c76220..d6a8f476b3176 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -162,12 +162,10 @@ "rsub", "rtruediv", "rfloordiv", - # # special.xlog1py - # # special.zeta "sub", "true_divide", "trunc_divide", - # 'xlogy', # where?, log, mul + "xlogy", # # Elementwise Ternary References # @@ -1546,6 +1544,31 @@ def sub( ) +@register_decomposition(torch.ops.aten.xlogy) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def xlogy(a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]): + utils.check( + isinstance(a, TensorLike) or isinstance(b, TensorLike), + lambda: 'Expected either argument a or b to be a Tensor"', + ) + + # Operations like eq and log do not handle scalar values, so we convert them to scalar_tensors. 
+ if isinstance(b, TensorLike) and isinstance(a, Number): + a = scalar_tensor(a, dtype=b.dtype, device=b.device) + elif isinstance(a, TensorLike) and isinstance(b, Number): + b = scalar_tensor(b, dtype=a.dtype, device=a.device) + + # mypy: expected "Tensor" + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + rhs = torch.where(torch.eq(a, 0), 0, torch.mul(a, torch.log(b))) + return torch.where(torch.isnan(b), float("nan"), rhs) + + def _trunc_divide( a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] ): diff --git a/torch/_refs/special/__init__.py b/torch/_refs/special/__init__.py index fae9f9d12dbe6..1227a2631475b 100644 --- a/torch/_refs/special/__init__.py +++ b/torch/_refs/special/__init__.py @@ -1,5 +1,5 @@ import math -from typing import Optional +from typing import Optional, Union import torch import torch._prims as prims @@ -8,7 +8,13 @@ from torch import Tensor from torch._decomp import register_decomposition -from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND, TensorLikeType +from torch._prims_common import ( + ELEMENTWISE_TYPE_PROMOTION_KIND, + Number, + NumberType, + TensorLike, + TensorLikeType, +) from torch._prims_common.wrappers import elementwise_type_promotion_wrapper, out_wrapper from torch._refs import ( _make_elementwise_binary_reference, @@ -33,6 +39,7 @@ "ndtri", "softmax", "spherical_bessel_j0", + "xlog1py", "zeta", ] @@ -134,6 +141,31 @@ def logit(self: TensorLikeType, eps: Optional[float] = None) -> TensorLikeType: return torch.log(torch.true_divide(self, torch.sub(1, self))) +@register_decomposition(torch.ops.aten.special_xlog1py) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def xlog1py(a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]): + utils.check( + isinstance(a, TensorLike) or isinstance(b, TensorLike), + lambda: 'Expected either argument a 
or b to be a Tensor"', + ) + + # Operations like eq and log do not handle scalar values, so we convert them to scalar_tensors. + if isinstance(a, TensorLike) and isinstance(b, Number): + b = refs.scalar_tensor(b, dtype=a.dtype, device=a.device) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = refs.scalar_tensor(a, dtype=b.dtype, device=b.device) + + # mypy: expected "Tensor" + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + rhs = torch.where(torch.eq(a, 0), 0, torch.mul(a, refs.log1p(b))) + return torch.where(torch.isnan(b), float("nan"), rhs) + + @register_decomposition(torch.ops.aten.mvlgamma) @out_wrapper() @elementwise_type_promotion_wrapper( diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 34f54f2fb5ae1..f637339f16d24 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -17461,6 +17461,21 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=(torch.uint8,), device_type="cpu"), ) ), + ElementwiseBinaryPythonRefInfo( + "_refs.xlogy", + torch_opinfo_name="xlogy", + supports_one_python_scalar=True, + supports_nvfuser=False, + ), + # + # Elementwise Binary Special OpInfos + # + ElementwiseBinaryPythonRefInfo( + "_refs.special.xlog1py", + torch_opinfo_name="special.xlog1py", + supports_one_python_scalar=True, + supports_nvfuser=False, + ), # # Data Conversion & Data Movement Opinfos # From f7b42a07cda47d3c2b1478004638b3bcb102fd55 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Sun, 23 Oct 2022 03:18:57 +0000 Subject: [PATCH 0049/1922] [vision hash update] update the pinned vision hash (#87528) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml). Update the pinned vision hash. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87528 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/vision.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index db0aa4e7d73c4..02a12c728a3a5 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -7a62a545ce76f43ccc5cfe0009131f7db14ae7b5 +9c112935abe400222cca8f9fbc2d8386e0f25e80 From 3c6063a281b024666ca4c1f2fdea843d83233d7f Mon Sep 17 00:00:00 2001 From: Horace He Date: Sun, 23 Oct 2022 02:53:37 +0000 Subject: [PATCH 0050/1922] Fix stupid N^2 naming behavior in FX and removed assert that slows things a lot sometimes (#87533) cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87533 Approved by: https://github.com/ezyang, https://github.com/voznesenskym --- torch/_dynamo/variables/lists.py | 5 ++++- torch/fx/graph.py | 10 +++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py index e1c0d584073e4..f63283819f350 100644 --- a/torch/_dynamo/variables/lists.py +++ b/torch/_dynamo/variables/lists.py @@ -395,7 +395,10 @@ class ListIteratorVariable(VariableTracker): def __init__(self, items, index: int = 0, **kwargs): super(ListIteratorVariable, self).__init__(**kwargs) assert isinstance(items, list) - assert all(isinstance(x, VariableTracker) for x in items) + # Removing this check as it slows things down too much + # https://github.com/pytorch/pytorch/pull/87533#issuecomment-1287574492 + + # assert all(isinstance(x, VariableTracker) for x in items) self.items = items self.index = index diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 9397050bc29a5..3b8c96b6a43bf 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -1,3 +1,4 @@ +from collections import defaultdict from 
.node import Node, Argument, Target, map_arg, _type_repr, _get_qualified_name import torch.utils._pytree as pytree from . import _pytree as fx_pytree @@ -120,7 +121,8 @@ class _Namespace: def __init__(self): self._obj_to_name: Dict[Any, str] = {} self._unassociated_names = set() - self._used_names: Dict[str, int] = {} + self._used_names: Set[str] = set() + self._base_count: Dict[str, int] = defaultdict(int) self._illegal_char_regex = re.compile('[^0-9a-zA-Z_]+') self._name_suffix_regex = re.compile(r"(.*)_(\d+)$") @@ -150,13 +152,15 @@ def create_name(self, candidate: str, obj: Optional[Any]) -> str: num = int(num_str) candidate = base if num is None else f'{base}_{num}' - num = num if num else 0 + if not num: + num = self._base_count[base] while candidate in self._used_names or self._is_illegal_name(candidate, obj): num += 1 candidate = f'{base}_{num}' - self._used_names.setdefault(candidate, 0) + self._used_names.add(candidate) + self._base_count[base] = num if obj is None: self._unassociated_names.add(candidate) else: From aeb5f0f0e7efc61fac5a2820ddc41b71eb840cfa Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Sat, 22 Oct 2022 17:37:54 -0700 Subject: [PATCH 0051/1922] [Profiler] Use parameter as key for optimizer state recording. (#86753) While optimizer can store state however it likes, in practice most optimizer state corresponds to a particular parameter. (This is the case for all `torch.optim` optimizers.) Thus, it turns out to be ergonomic to collect using that structure. Note that this doesn't lock us into anything; we can always collect state with non Tensor keys if the use case arises. One simplification that arises is that Module and Optimizer collection has very similar structure. So similar, in fact, that it is possible to use a common template for config. I also found that a lot of the `check_and_store` logic could be simplified and inlined by this joining of collected optimizer state. 
Differential Revision: [D40210703](https://our.internmc.facebook.com/intern/diff/D40210703/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/86753 Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi --- test/profiler/test_profiler.py | 27 ++-- torch/csrc/autograd/profiler_python.cpp | 207 +++++++++++------------- torch/csrc/profiler/collection.h | 27 ++-- torch/csrc/profiler/python/init.cpp | 46 ++---- torch/csrc/profiler/python/init.h | 5 + 5 files changed, 144 insertions(+), 168 deletions(-) diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 5f3d7621dcfb3..09379bb02a531 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -1574,7 +1574,7 @@ def flat_out_extrafields(nodes, out=None): out = [] for node in nodes: if isinstance(node.extra_fields, _ExtraFields_PyCall) and node.extra_fields.module: - if node.extra_fields.module.params: + if node.extra_fields.module.parameters: out.append(node.extra_fields.module) flat_out_extrafields(node.children, out) return out @@ -1589,7 +1589,7 @@ def flat_out_extrafields(nodes, out=None): modules = flat_out_extrafields(p.profiler.kineto_results.experimental_event_tree()) self.assertEqual(len(modules), 2, f"Expected two parameter list, but got {len(modules)}") - params = [(n, p.storage_data_ptr, g.storage_data_ptr) for module in modules for (n, p, g) in module.params] + params = [(n, p.storage_data_ptr, g.storage_data_ptr) for module in modules for (n, p, g) in module.parameters] expected = [(name, val.storage().data_ptr(), val.grad.storage().data_ptr()) for name, val in net.fc1._parameters.items()] expected += [(name, val.storage().data_ptr(), val.grad.storage().data_ptr()) for name, val in net.fc2._parameters.items()] self.assertEqual(expected, params, f"{expected} vs. 
{params}") @@ -1599,29 +1599,34 @@ def _flat_out_extrafields(self, nodes, out=None): out = [] for node in nodes: if (isinstance(node.extra_fields, _ExtraFields_PyCall) and - node.extra_fields.optimizer and node.extra_fields.optimizer.param_addrs): + node.extra_fields.optimizer and node.extra_fields.optimizer.parameters): # avoiding OptInfo duplicates from iterations - addr = node.extra_fields.optimizer.param_addrs[0].storage_data_ptr - if not [o for o in out if addr == o.param_addrs[0].storage_data_ptr]: + addr = node.extra_fields.optimizer.parameters[0][0].storage_data_ptr + if not [o for o in out if addr == o.parameters[0][0].storage_data_ptr]: out.append(node.extra_fields.optimizer) self._flat_out_extrafields(node.children, out) return out def _check_results(self, opt, opts, check_items=False): self.assertEqual(len(opts), 1, f"Expected 1 optimizer: len(opts): {len(opts)}") - self.assertEqual(id(opt), opts[0].self, f"Optimizer addr ({id(opt)}) vs. profiled addr ({opts[0].self})") + self.assertEqual(id(opt), opts[0].self_ptr, f"Optimizer addr ({id(opt)}) vs. 
profiled addr ({opts[0].self_ptr})") if check_items: self.assertEqual(len(opt.param_groups), len(opts)) for group, opt_ in zip(opt.param_groups, opts): self.assertEqual( [(v.storage().data_ptr()) for v in group.get("params", [])], - [(o.storage_data_ptr) for o in opt_.param_addrs] + [(o.storage_data_ptr) for (o, _, _) in opt_.parameters] ) for opt_ in opts: - self.assertEqual( - [(name, val.storage().data_ptr()) for dic in opt.state.values() for name, val in dic.items()], - [(n, p.storage_data_ptr) for (n, p) in opt_.opt_state] - ) + observed_state = { + p.storage_data_ptr: {name: s.storage_data_ptr for name, s in state} + for (p, _, state) in opt_.parameters + } + for parameter, parameter_state in opt.state.items(): + self.assertEqual( + {name: value.storage().data_ptr() for name, value in parameter_state.items()}, + observed_state.get(parameter.storage().data_ptr(), []) + ) def test_optimizer(self): inputs = torch.rand(10) diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index aee3702b8b105..d971336ae0c4d 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -204,20 +205,36 @@ struct Config { static constexpr EventType event_type = EventType::PyCall; }; -template <> -struct Config { - using key_t = PyModuleSelf; - using cls_t = PyModuleCls; +template +struct ExtendedPyCallConfig { + using key_t = Key; + using cls_t = Cls; using ephemeral_t = PyFrameObject*; - using info_t = std::pair>; - struct cache_t { - c10::optional location_; // nn.Module.forward; - ska::flat_hash_map modules_and_params_; + + struct ClsAndParameters { + cls_t cls_; + std::vector parameters_; + }; + + struct Cache { + // `nn.Module.forward` or `optim.Optimizer._optimizer_step_code` + c10::optional location_; + ska::flat_hash_map cls_and_parameters_; ska::flat_hash_map cls_names_; }; + using cache_t = Cache; + static 
constexpr EventType event_type = EventType::PyCall; }; +template <> +struct Config + : ExtendedPyCallConfig {}; + +template <> +struct Config + : ExtendedPyCallConfig {}; + template <> struct Config { using key_t = PyMethod; @@ -226,25 +243,6 @@ struct Config { static constexpr EventType event_type = EventType::PyCCall; }; -template <> -struct Config { - using key_t = PyOptimizerSelf; - using cls_t = PyOptimizerCls; - using ephemeral_t = PyFrameObject*; - struct info_t { - cls_t cls_; - std::vector params_; - std::vector> states_; - }; - struct cache_t { - c10::optional - location_; // optim.Optimizer._optimizer_step_code; - ska::flat_hash_map optimizer_data_; - ska::flat_hash_map cls_names_; - }; - static constexpr EventType event_type = EventType::PyCall; -}; - // ============================================================================ // == Callsite & ValueCache: Storage during profiling ========================= // ============================================================================ @@ -269,52 +267,6 @@ class Callsite { Config::key_t caller_; }; -void check_and_store( - const pybind11::handle& name, - const pybind11::handle& param_handle, - std::vector& storeroom) { - auto param_ptr = param_handle.ptr(); - if (py::isinstance(name) && THPVariable_CheckExact(param_ptr)) { - const auto& param = THPVariable_Unpack(param_ptr); - auto grad_ptr = py::getattr(param_handle, "grad", py::none()).ptr(); - c10::optional grad_metadata; - - if (THPVariable_CheckExact(grad_ptr)) { - grad_metadata = c10::optional( - TensorMetadata(THPVariable_Unpack(grad_ptr))); - } else { - grad_metadata = c10::nullopt; - } - - storeroom.push_back( - {name.cast(), TensorMetadata(param), grad_metadata}); - } -} - -void check_and_store( - const pybind11::handle& name, - const pybind11::handle& param_handle, - std::vector, TensorMetadata>>& - storeroom) { - auto param_ptr = param_handle.ptr(); - if (py::isinstance(name) && THPVariable_CheckExact(param_ptr)) { - const auto& param = 
THPVariable_Unpack(param_ptr); - - storeroom.emplace_back(name.cast(), param); - } -} - -void check_and_store( - const pybind11::handle& param_handle, - std::vector& storeroom) { - auto param_ptr = param_handle.ptr(); - if (THPVariable_CheckExact(param_ptr)) { - const auto& param = THPVariable_Unpack(param_ptr); - - storeroom.emplace_back(param); - } -} - // ============================================================================ // == Type specific store and load implementations. =========================== // ============================================================================ @@ -374,6 +326,28 @@ typename Config::cls_t set_class( return cls; } +auto toTensorMetadata(PyObject* self) { + TORCH_INTERNAL_ASSERT(THPVariable_CheckExact(self)); + return TensorMetadata{THPVariable_Unpack(self)}; +} + +auto recordIfTensor(py::handle p) { + return THPVariable_CheckExact(p.ptr()) + ? c10::optional{toTensorMetadata(p.ptr())} + : c10::nullopt; +} + +auto unpackTensorMap(py::dict tensor_map) { + std::vector> out; + for (auto& it : tensor_map) { + auto* value = it.second.ptr(); + if (py::isinstance(it.first) && THPVariable_CheckExact(value)) { + out.push_back({py::cast(it.first), toTensorMetadata(value)}); + } + } + return out; +} + template <> void ValueCache::store(const PyCallKey& key, no_ephemeral_t) { auto& locations = std::get(state_); @@ -397,16 +371,22 @@ void ValueCache::store( Config::ephemeral_t frame) { auto& cache = std::get(state_); if (C10_UNLIKELY( - cache.modules_and_params_.find(key) == - cache.modules_and_params_.end())) { + cache.cls_and_parameters_.find(key) == + cache.cls_and_parameters_.end())) { auto cls = set_class(this, cache, key, frame); py::dict params = py::handle((PyObject*)key).attr("_parameters"); - std::vector params_; + std::vector params_; for (auto& it : params) { - check_and_store(it.first, it.second, params_); + auto* p = it.second.ptr(); + if (py::isinstance(it.first) && THPVariable_CheckExact(p)) { + params_.push_back( + 
{it.first.cast(), + toTensorMetadata(p), + recordIfTensor(py::getattr(it.second, "grad", py::none()))}); + } } - cache.modules_and_params_[key] = make_pair(cls, params_); + cache.cls_and_parameters_[key] = {cls, params_}; } } @@ -415,45 +395,45 @@ ExtraFields::args_t ValueCache::load( const PyModuleCallKey& key) const { auto& cache = std::get(state_); TORCH_INTERNAL_ASSERT(cache.location_.has_value()); - auto cls = cache.modules_and_params_.at(key).first; - auto fwd = std::get(state_).at(*cache.location_); + const auto& cls_and_parameters = cache.cls_and_parameters_.at(key); + const auto& cls = cls_and_parameters.cls_; + NNModuleInfo info{ + key, cls, cache.cls_names_.at(cls), cls_and_parameters.parameters_}; return { - fwd, - NNModuleInfo{ - key, - cls, - cache.cls_names_.at(cls), - cache.modules_and_params_.at(key).second}}; + /*frame_state_=*/std::get(state_).at(*cache.location_), + /*module_info_=*/std::move(info), + /*optimizer_info_=*/c10::nullopt}; } + template <> void ValueCache::store( const PyOptimizerCallKey& key, Config::ephemeral_t frame) { auto& cache = std::get(state_); if (C10_UNLIKELY( - cache.optimizer_data_.find(key) == cache.optimizer_data_.end())) { + cache.cls_and_parameters_.find(key) == + cache.cls_and_parameters_.end())) { auto cls = set_class(this, cache, key, frame); - py::list param_groups_handle = - py::handle((PyObject*)key).attr("param_groups"); - std::vector params_; - // param_groups is a list of dict - for (auto& param_group : param_groups_handle) { - for (auto& param : - py::cast(param_group).attr("get")("params")) { - check_and_store(param, params_); - } - } - std::vector> states_; - py::dict state_handle = py::handle((PyObject*)key).attr("state"); - for (auto& it : state_handle) { - TORCH_INTERNAL_ASSERT( - py::isinstance(it.second), "Expects a dict type element"); - for (auto& state_elem : py::cast(it.second)) { - check_and_store(state_elem.first, state_elem.second, states_); + const py::handle self{(PyObject*)key}; + 
std::vector params; + + for (const auto& i : (py::list)self.attr("param_groups")) { + for (auto& param : py::cast(i).attr("get")("params")) { + if (THPVariable_CheckExact(param.ptr())) { + // While `self.state` is permitted to store data in an arbitrary way, + // all generic optimizers (SGD, Adam, etc) use param as the key since + // the state in question is tied to particular parameters. We can + // relax this assumption if the need arises. + params.push_back( + {toTensorMetadata(param.ptr()), + recordIfTensor(py::getattr(param, "grad", py::none())), + unpackTensorMap(py::cast(self.attr("state")) + .attr("get")(param, py::dict()))}); + } } } - cache.optimizer_data_[key] = {cls, params_, states_}; + cache.cls_and_parameters_[key] = {cls, params}; } } @@ -461,17 +441,14 @@ template <> ExtraFields::args_t ValueCache::load< CallType::PyOptimizerCall>(const PyOptimizerCallKey& key) const { auto& cache = std::get(state_); - auto cls = cache.optimizer_data_.at(key).cls_; - auto frame_state = std::get(state_).at(*cache.location_); + const auto& cls_and_parameters = cache.cls_and_parameters_.at(key); + auto cls = cls_and_parameters.cls_; + OptimizerInfo info{ + key, cls, cache.cls_names_.at(cls), cls_and_parameters.parameters_}; return { - frame_state, - c10::nullopt, - OptimizerInfo{ - key, - cls, - cache.cls_names_.at(cls), - cache.optimizer_data_.at(key).params_, - cache.optimizer_data_.at(key).states_}}; + /*frame_state_=*/std::get(state_).at(*cache.location_), + /*module_info_=*/c10::nullopt, + /*optimizer_info_=*/std::move(info)}; } template <> diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h index 9c979df8ff61b..5402e613eb858 100644 --- a/torch/csrc/profiler/collection.h +++ b/torch/csrc/profiler/collection.h @@ -238,29 +238,34 @@ using PyMethod = strong_t; using PyOptimizerSelf = strong_t; using PyOptimizerCls = strong_t; -struct ParameterInfo { - std::string param_name_; - TensorMetadata param_; - c10::optional grad_; -}; - struct 
NNModuleInfo { + struct ParameterInfo { + std::string name_; + TensorMetadata metadata_; + c10::optional grad_metadata_; + }; + PyModuleSelf self_; PyModuleCls cls_; at::StringView cls_name_; - std::vector params_; + std::vector parameters_; // Indicates that `self_` is the kth instance of `cls_` observed. size_t id_{std::numeric_limits::max()}; }; struct OptimizerInfo { + struct ParameterInfo { + TensorMetadata metadata_; + c10::optional grad_metadata_; + std::vector> state_; + }; + PyOptimizerSelf self_; - PyOptimizerCls opt_; - at::StringView opt_name_; + PyOptimizerCls cls_; + at::StringView cls_name_; - std::vector params_addr_; - std::vector> opt_state_; + std::vector parameters_; }; struct PyExtraFieldsBase { diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp index be25e5fc1b95c..63d893d4ef8d0 100644 --- a/torch/csrc/profiler/python/init.cpp +++ b/torch/csrc/profiler/python/init.cpp @@ -181,49 +181,33 @@ void initPythonBindings(PyObject* module) { py::class_(m, "_NNModuleInfo") .def_property_readonly( - "params", + "parameters", [](const NNModuleInfo& s) { - py::list list; - for (auto& p : s.params_) { - list.append(std::tuple< - std::string, - TensorMetadata, - c10::optional>( - p.param_name_, p.param_, p.grad_)); + py::list out; + for (const auto& p : s.parameters_) { + out.append( + py::make_tuple(p.name_, p.metadata_, p.grad_metadata_)); } - return list; + return out; }) .def_property_readonly( "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); }); - py::class_(m, "_OptInfo") - .def_property_readonly( - "self", - [](const OptimizerInfo& a) { - return reinterpret_cast(a.self_.value_of()); - }) - .def_property_readonly( - "param_addrs", - [](const OptimizerInfo& s) { - py::list params_addrs; - for (auto& addr : s.params_addr_) { - params_addrs.append(addr); - } - return params_addrs; - }) - .def_property_readonly("opt_state", [](const OptimizerInfo& s) { - py::list states; - for (auto& a : s.opt_state_) { 
- states.append(std::make_pair(a.first, a.second)); + py::class_(m, "_OptimizerInfo") + .def_readonly("self_ptr", &OptimizerInfo::self_) + .def_property_readonly("parameters", [](const OptimizerInfo& s) { + py::list out; + for (const auto& p : s.parameters_) { + out.append(py::make_tuple(p.metadata_, p.grad_metadata_, p.state_)); } - return states; + return out; }); py::class_>(m, "_ExtraFields_PyCall") - .def_readonly("optimizer", &ExtraFields::optimizer_) .def_readonly("callsite", &ExtraFields::callsite_) .def_readonly("caller", &ExtraFields::caller_) - .def_readonly("module", &ExtraFields::module_); + .def_readonly("module", &ExtraFields::module_) + .def_readonly("optimizer", &ExtraFields::optimizer_); py::class_>(m, "_ExtraFields_PyCCall") .def_readonly("caller", &ExtraFields::caller_); diff --git a/torch/csrc/profiler/python/init.h b/torch/csrc/profiler/python/init.h index d04b398d4554b..fac39fc62574a 100644 --- a/torch/csrc/profiler/python/init.h +++ b/torch/csrc/profiler/python/init.h @@ -7,6 +7,7 @@ namespace pybind11 { namespace detail { +using torch::profiler::impl::PyOptimizerSelf; using torch::profiler::impl::StorageImplData; using torch::profiler::impl::TensorID; using torch::profiler::impl::TensorImplAddress; @@ -19,6 +20,10 @@ template <> struct type_caster : public strong_pointer_type_caster {}; +template <> +struct type_caster + : public strong_pointer_type_caster {}; + template <> struct type_caster : public strong_uint_type_caster {}; } // namespace detail From eb6185041ab6de7d201fc1941e8460bdccaf9a2b Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Sat, 22 Oct 2022 17:37:55 -0700 Subject: [PATCH 0052/1922] [Profiler] Tensor IDs for Module and Optimizer variables (#86754) More sophisticated profiling will increasingly rely on python tracer to contextualize observed results. This PR adds Tensors which are observed by the python tracer to the identity assignment loop. 
Differential Revision: [D39852885](https://our.internmc.facebook.com/intern/diff/D39852885/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/86754 Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi --- test/profiler/test_profiler.py | 80 +++++++++++++++++++++++-- torch/csrc/autograd/profiler_python.cpp | 16 +++-- torch/csrc/profiler/collection.cpp | 38 ++++++++++-- 3 files changed, 120 insertions(+), 14 deletions(-) diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 09379bb02a531..3831b6bd1247d 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -1278,12 +1278,14 @@ def test_nested_tensor_with_shapes(self): def find_node_with_name(nodes, name): - for node in nodes: + for node in _utils.traverse_dfs(nodes): if node.name == name: return node - result = find_node_with_name(node.children, name) - if result is not None: - return result + +def find_node_with_regex(nodes, pattern): + for node in _utils.traverse_dfs(nodes): + if re.search(pattern, node.name): + return node class SimpleNet(nn.Module): @@ -1368,6 +1370,73 @@ def get_fields(op_name, index): self.assertEqual(c_id, c_id_new) self.assertEqual(d_id, c_id_new) + def test_module_and_optimizer_ids(self) -> None: + model = torch.nn.Linear(2, 1, bias=True) + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) + + def check(cold_start: bool) -> None: + with profile(with_stack=True, profile_memory=True, record_shapes=True) as p: + x = torch.ones((1, 2)) + _ = x.sin() # Mark `x` + model(x).backward() + optimizer.step() + _ = optimizer.state[model.weight]["momentum_buffer"].cos() # Mark weight momentum + _ = model.weight.grad.tan() # Mark weight gradient + + nodes = p.profiler.kineto_results.experimental_event_tree() + + def get_fields(op_name, index): + return self._get_tensor_fields( + find_node_with_name(nodes, op_name), + index) + + # Marked Tensors act as ground truth for python tracer IDs. 
+ _, _, x_id = get_fields("aten::sin", 0) + _, _, weight_momenumtum_id = get_fields("aten::cos", 0) + _, _, weight_grad_id = get_fields("aten::tan", 0) + self.assertNotEqual(x_id, weight_momenumtum_id) + self.assertNotEqual(x_id, weight_grad_id) + self.assertNotEqual(weight_momenumtum_id, weight_grad_id) + + # Use linear op to identify weight ground truth. + linear_op_node = find_node_with_name(nodes, "aten::linear") + self.assertIsNotNone(linear_op_node) + x_metadata, weight_metadata, _ = linear_op_node.extra_fields.inputs.tensor_metadata + self.assertEqual(x_id, x_metadata.id) + + # Module + linear_module_node = find_node_with_name(nodes, "nn.Module: Linear_0") + self.assertIsNotNone(linear_module_node) + self.assertIsNotNone(linear_module_node.extra_fields.module) + self.assertIsNone(linear_module_node.extra_fields.optimizer) + + linear_parameters = linear_module_node.extra_fields.module.parameters + name, weight, weight_grad = linear_parameters[0] + self.assertEqual(name, "weight") + self.assertEqual(weight.id, weight_metadata.id) + + self.assertEqual(weight_grad is None, cold_start) + if not cold_start: + self.assertEqual(weight_grad.id, weight_grad_id) + + # Optimizer + step_node = find_node_with_regex(nodes, "_optimizer_step_code") + self.assertIsNotNone(step_node) + self.assertIsNone(step_node.extra_fields.module) + self.assertIsNotNone(step_node.extra_fields.optimizer) + optimizer_parameters = step_node.extra_fields.optimizer.parameters + self.assertEqual(len(optimizer_parameters), 2) # Weight and bias + weight, weight_grad, state = optimizer_parameters[0] + self.assertEqual(weight.id, weight_metadata.id) + self.assertEqual(weight_grad.id, weight_grad_id) + self.assertEqual(len(state), 1) + self.assertEqual(state[0][0], "momentum_buffer") + self.assertEqual(state[0][1].id, weight_momenumtum_id) + + # Check that we handle first step (lazy initalization) and steady state. 
+ check(cold_start=True) + check(cold_start=False) + def _test_allocation_ids(self, before_fn, after_fn) -> None: with profile(profile_memory=True, record_shapes=True) as p: # Introduce other operations and allocations to check robustness @@ -1622,6 +1691,9 @@ def _check_results(self, opt, opts, check_items=False): p.storage_data_ptr: {name: s.storage_data_ptr for name, s in state} for (p, _, state) in opt_.parameters } + + # Make sure the profiler collected all optimizer state and check + # that the address recorded by the profiler is correct. for parameter, parameter_state in opt.state.items(): self.assertEqual( {name: value.storage().data_ptr() for name, value in parameter_state.items()}, diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index d971336ae0c4d..308dcdcde49c8 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -205,7 +205,7 @@ struct Config { static constexpr EventType event_type = EventType::PyCall; }; -template +template struct ExtendedPyCallConfig { using key_t = Key; using cls_t = Cls; @@ -213,7 +213,7 @@ struct ExtendedPyCallConfig { struct ClsAndParameters { cls_t cls_; - std::vector parameters_; + std::vector parameters_; }; struct Cache { @@ -228,12 +228,16 @@ struct ExtendedPyCallConfig { }; template <> -struct Config - : ExtendedPyCallConfig {}; +struct Config : ExtendedPyCallConfig< + PyModuleSelf, + PyModuleCls, + NNModuleInfo::ParameterInfo> {}; template <> -struct Config - : ExtendedPyCallConfig {}; +struct Config : ExtendedPyCallConfig< + PyOptimizerSelf, + PyOptimizerCls, + OptimizerInfo::ParameterInfo> {}; template <> struct Config { diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp index 51fd2f9fd3df0..305cef5ffdf5d 100644 --- a/torch/csrc/profiler/collection.cpp +++ b/torch/csrc/profiler/collection.cpp @@ -850,14 +850,20 @@ void calculate_unique_tensor_ids(std::vector& sorted_results) { }; 
ska::flat_hash_set tensor_set; + auto insert_tensor = [&lookup, &tensors, &tensor_set](TensorMetadata& m) { + if (m.impl_ && m.data_) { + const auto id = lookup(m.data_); + tensor_set.insert(id); + tensors.emplace_back(TensorStoragePair{m.impl_, id, m.id_}); + } + }; + for (auto& result : sorted_results) { result->visit(c10::overloaded( [&](ExtraFields& torch_op) { for (auto& m : torch_op.inputs_.tensor_metadata_) { - if (m.has_value() && m->impl_ && m->data_) { - auto id = lookup(m->data_); - tensor_set.insert(id); - tensors.emplace_back(TensorStoragePair{m->impl_, id, m->id_}); + if (m.has_value()) { + insert_tensor(*m); } } }, @@ -874,6 +880,30 @@ void calculate_unique_tensor_ids(std::vector& sorted_results) { live_storage.erase(StorageImplData(alloc_op.ptr_)); } }, + [&](ExtraFields& py_call) { + // torch.nn.Module + if (py_call.module_.has_value()) { + for (auto& p : py_call.module_->parameters_) { + insert_tensor(p.metadata_); + if (p.grad_metadata_.has_value()) { + insert_tensor(*p.grad_metadata_); + } + } + } + + // torch.optim.Optimizer + if (py_call.optimizer_.has_value()) { + for (auto& p : py_call.optimizer_->parameters_) { + insert_tensor(p.metadata_); + if (p.grad_metadata_.has_value()) { + insert_tensor(*p.grad_metadata_); + } + for (auto& state_i : p.state_) { + insert_tensor(state_i.second); + } + } + } + }, [](const auto&) {})); } From 310683fb3fc428170b9ebfbb8f303104f11bce55 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Sat, 22 Oct 2022 17:37:57 -0700 Subject: [PATCH 0053/1922] [Profiler][Trivial] Add Module cls and self bindings and type_caster macro (#86755) Just a bit of clean up. We will need `self` and `cls` for memory profiling, and the type_caster specializations were getting quite verbose. 
Differential Revision: [D39920728](https://our.internmc.facebook.com/intern/diff/D39920728/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/86755 Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi --- torch/_C/_profiler.pyi | 4 ++++ torch/csrc/profiler/python/init.cpp | 4 +++- torch/csrc/profiler/python/init.h | 22 +++++++++------------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi index 6d6c2893f4554..da6cfb165fb36 100644 --- a/torch/_C/_profiler.pyi +++ b/torch/_C/_profiler.pyi @@ -148,6 +148,10 @@ class _NNModuleInfo: @property def params(self) -> List[Tuple[str, int]]: ... @property + def self_ptr(self) -> int: ... + @property + def cls_ptr(self) -> int: ... + @property def cls_name(self) -> str: ... class _ExtraFields_PyCCall: diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp index 63d893d4ef8d0..8a800a3d5f82b 100644 --- a/torch/csrc/profiler/python/init.cpp +++ b/torch/csrc/profiler/python/init.cpp @@ -191,7 +191,9 @@ void initPythonBindings(PyObject* module) { return out; }) .def_property_readonly( - "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); }); + "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); }) + .def_readonly("self_ptr", &NNModuleInfo::self_) + .def_readonly("cls_ptr", &NNModuleInfo::cls_); py::class_(m, "_OptimizerInfo") .def_readonly("self_ptr", &OptimizerInfo::self_) diff --git a/torch/csrc/profiler/python/init.h b/torch/csrc/profiler/python/init.h index fac39fc62574a..226bf1a3f3bb3 100644 --- a/torch/csrc/profiler/python/init.h +++ b/torch/csrc/profiler/python/init.h @@ -7,22 +7,18 @@ namespace pybind11 { namespace detail { -using torch::profiler::impl::PyOptimizerSelf; -using torch::profiler::impl::StorageImplData; using torch::profiler::impl::TensorID; -using torch::profiler::impl::TensorImplAddress; -template <> -struct type_caster - : public 
strong_pointer_type_caster {}; - -template <> -struct type_caster - : public strong_pointer_type_caster {}; +#define STRONG_POINTER_TYPE_CASTER(T) \ + template <> \ + struct type_caster : public strong_pointer_type_caster {}; -template <> -struct type_caster - : public strong_pointer_type_caster {}; +STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::StorageImplData); +STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::TensorImplAddress); +STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyModuleSelf); +STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyModuleCls); +STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyOptimizerSelf); +#undef STRONG_POINTER_TYPE_CASTER template <> struct type_caster : public strong_uint_type_caster {}; From dd23736655c5d83c7f18db6e78b5f8fe141dcdd9 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Sat, 22 Oct 2022 17:37:58 -0700 Subject: [PATCH 0054/1922] [Profiler] Regularize `AccumulateGrad` name (#86909) Memory profiler will use AccumulateGrad when detecting gradients. The name difference between Windows and other platforms has already cropped up with profiler trees so it makes sense to address it at the source. 
Differential Revision: [D40347550](https://our.internmc.facebook.com/intern/diff/D40347550/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/86909 Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi --- test/profiler/test_profiler_tree.py | 5 ----- torch/csrc/profiler/collection.cpp | 13 +++++++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py index f0097985f2940..21c3826c4a9cd 100644 --- a/test/profiler/test_profiler_tree.py +++ b/test/profiler/test_profiler_tree.py @@ -138,11 +138,6 @@ def flatten(nodes, depth=0, out=None): @staticmethod def fmt_name(name: str) -> str: - # torch::autograd::Node relies on c10::demangle to generate names, and - # Windows demangles to include `struct` in the name. - if IS_WINDOWS: - name = name.replace('struct torch::autograd::AccumulateGrad', 'torch::autograd::AccumulateGrad') - match = re.match(r"^(.*)\.py\(([0-9]+)\): (.*)$", name) if match: filename, _, fn = match.groups() diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp index 305cef5ffdf5d..e76cfd5946db9 100644 --- a/torch/csrc/profiler/collection.cpp +++ b/torch/csrc/profiler/collection.cpp @@ -296,6 +296,19 @@ void ThreadLocalSubqueue::TorchOpStorage::materialize( } } + // `AccumulateGrad` is an important marker for profile analysis; however the + // annotation relies on `c10::demangle` which is platform dependent. In + // particular, Windows will add a "struct " prefix. 
+ const std::string accumulate_grad = "torch::autograd::AccumulateGrad"; + const std::string windows_pattern = std::string("struct ") + accumulate_grad; + for (auto& event : op_events_) { + auto& name = event.basic_fields_.name_; + auto position = name.find(windows_pattern); + if (position != std::string::npos) { + name.replace(position, windows_pattern.size(), accumulate_grad); + } + } + auto input_getter = inputs_outputs_.getNextShapesAndDtypes(); // TODO: CTAD will take care of template args when we move to C++17 From f93a861cc445175160ab96415586dd95e0d0eb73 Mon Sep 17 00:00:00 2001 From: efiks <5167930+efiks@users.noreply.github.com> Date: Sun, 23 Oct 2022 19:29:25 +0000 Subject: [PATCH 0055/1922] [torch] Unify batch_box_cox implementations into perfkernels folder (#86569) Summary: 1) Adding MKL/AVX2 based implementation into perfkernels. This implementation is similar to caffe2/operators/batch_box_cox_op.cc 2) Migrating batch_box_cox_op of caffe2 to use this implementation Test Plan: CI Differential Revision: D40208074 Pull Request resolved: https://github.com/pytorch/pytorch/pull/86569 Approved by: https://github.com/hyuen --- caffe2/operators/batch_box_cox_op.cc | 300 ++--------------------- caffe2/operators/batch_box_cox_op.h | 60 +---- caffe2/perfkernels/batch_box_cox.cc | 113 +++++++++ caffe2/perfkernels/batch_box_cox.h | 35 +++ caffe2/perfkernels/batch_box_cox_avx2.cc | 299 ++++++++++++++++++++++ caffe2/perfkernels/common.h | 3 + 6 files changed, 478 insertions(+), 332 deletions(-) create mode 100644 caffe2/perfkernels/batch_box_cox.cc create mode 100644 caffe2/perfkernels/batch_box_cox.h create mode 100644 caffe2/perfkernels/batch_box_cox_avx2.cc diff --git a/caffe2/operators/batch_box_cox_op.cc b/caffe2/operators/batch_box_cox_op.cc index aa444330969b5..6e2bb4d9a8d9d 100644 --- a/caffe2/operators/batch_box_cox_op.cc +++ b/caffe2/operators/batch_box_cox_op.cc @@ -2,72 +2,34 @@ #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" - -#ifdef 
CAFFE2_USE_MKL -#include -#endif // CAFFE2_USE_MKL +#include "caffe2/perfkernels/batch_box_cox.h" namespace caffe2 { -#ifdef CAFFE2_USE_MKL namespace { - -// Helpers for copying parameters. template -void TileArrayIntoVector(const T* a, int D, int K, vector* b) { - b->resize(K * D); - for (int k = 0; k < K; k++) { - std::copy(a, a + D, b->begin() + k * D); - } -} - -void TileIndicesInPlace(vector* v, int D, int K) { - int n = v->size(); - v->resize(K * n); - for (int k = 1; k < K; k++) { - for (int j = 0; j < n; j++) { - (*v)[k * n + j] = (*v)[j] + k * D; +void BoxCoxNaive( + int64_t N, + int64_t D, + const T* data_ptr, + const T* lambda1_ptr, + const T* lambda2_ptr, + T* output_ptr) { + constexpr T k_eps = static_cast(1e-6); + for (int64_t i = 0; i < N; i++) { + for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) { + T lambda1_v = lambda1_ptr[j]; + T lambda2_v = lambda2_ptr[j]; + T tmp = std::max(*data_ptr + lambda2_v, k_eps); + if (lambda1_v == 0) { + *output_ptr = std::log(tmp); + } else { + *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v; + } } } } - -// MKL VML function templates. 
-template -void PackV(const int N, const T* a, const int* ia, T* y); -template -void UnpackV(const int N, const T* a, T* y, const int* iy); -template -void Pow(const int N, const T* a, const T* b, T* y); - -#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc) \ - template <> \ - void PackV(const int N, const T* a, const int* ia, T* y) { \ - OriginalFunc(N, a, ia, y); \ - } -DELEGATE_PACKV_FUNCTION(float, vsPackV) -DELEGATE_PACKV_FUNCTION(double, vdPackV) -#undef DELEGATE_PACKV_FUNCTION - -#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc) \ - template <> \ - void UnpackV(const int N, const T* a, T* y, const int* iy) { \ - OriginalFunc(N, a, y, iy); \ - } -DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV) -DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV) -#undef DELEGATE_UNPACKV_FUNCTION - -#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \ - template <> \ - void Funcname(const int N, const T* a, const T* b, T* y) { \ - OriginalFunc(N, a, b, y); \ - } -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow) -DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow) -#undef DELEGATE_SIMPLE_BINARY_FUNCTION - -} // namespace -#endif // CAFFE2_USE_MKL +} template <> template @@ -93,227 +55,19 @@ bool BatchBoxCoxOp::DoRunWithType() { const auto* lambda1_ptr = lambda1.template data(); const auto* lambda2_ptr = lambda2.template data(); - const T k_eps = static_cast(1e-6); - #ifdef CAFFE2_USE_MKL if (min_block_size_ < 1) { - BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr); - } else { - // Find zero-valued columns, since they get special treatment. - nonzeros_.clear(); - zeros_.clear(); - nonzeros_.reserve(D); - zeros_.reserve(D); - for (int64_t j = 0; j < D; j++) { - if (lambda1_ptr[j] == 0) { - zeros_.push_back(j); - } else { - nonzeros_.push_back(j); - } - } - - // Process K rows at a time for effective vectorization with small rows. 
- const int K = std::min(N, (min_block_size_ + D - 1) / D); - - // Avoid copying data if all lambda1 values are zero, or if all are nonzero. - // In each of the three cases here, when K > 1, first process batches of K - // rows by replicating the input parameters K times. Then finish row-by-row. - TypedCachedBuffers& b = GetBuffers(); - if (nonzeros_.size() == D) { - int64_t i = 0; - if (K > 1) { - TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_); - TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_); - TORCH_DCHECK_EQ(K * D, b.lambda1_.size()); - TORCH_DCHECK_EQ(K * D, b.lambda2_.size()); - for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) { - BoxCoxNonzeroLambda( - K * D, - data_ptr, - b.lambda1_.data(), - b.lambda2_.data(), - k_eps, - output_ptr); - } - } - for (; i < N; i++, data_ptr += D, output_ptr += D) { - BoxCoxNonzeroLambda( - D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr); - } - } else if (zeros_.size() == D) { - int64_t i = 0; - if (K > 1) { - TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_); - TORCH_DCHECK_EQ(K * D, b.lambda2_z_.size()); - for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) { - BoxCoxZeroLambda( - K * D, data_ptr, b.lambda2_z_.data(), k_eps, output_ptr); - } - } - for (; i < N; i++, data_ptr += D, output_ptr += D) { - BoxCoxZeroLambda(D, data_ptr, lambda2_ptr, k_eps, output_ptr); - } - } else { // General case of mixed zero and non-zero lambda1 values. - int n = nonzeros_.size(); - if (K > 1) { - TileIndicesInPlace(&nonzeros_, 0, K); - TileIndicesInPlace(&zeros_, 0, K); - } - - // Gather parameter values into contiguous memory. 
- b.lambda1_.resize(nonzeros_.size()); - b.lambda2_.resize(nonzeros_.size()); - b.lambda2_z_.resize(zeros_.size()); - PackV(nonzeros_.size(), lambda1_ptr, nonzeros_.data(), b.lambda1_.data()); - PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data()); - PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data()); - - int64_t i = 0; - b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size())); - if (K > 1) { - // Truncate to original size, and re-tile with offsets this time. - nonzeros_.resize(n); - zeros_.resize(D - n); - TileIndicesInPlace(&nonzeros_, D, K); - TileIndicesInPlace(&zeros_, D, K); - TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda1_.size()); - TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda2_.size()); - TORCH_DCHECK_EQ(zeros_.size(), b.lambda2_z_.size()); - for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) { - BoxCoxMixedLambda( - data_ptr, - nonzeros_, - zeros_, - b.lambda1_.data(), - b.lambda2_.data(), - b.lambda2_z_.data(), - k_eps, - b.accumulator_.data(), - output_ptr); - } - // Truncate to original size. 
- nonzeros_.resize(n); - zeros_.resize(D - n); - } - for (; i < N; i++, data_ptr += D, output_ptr += D) { - BoxCoxMixedLambda( - data_ptr, - nonzeros_, - zeros_, - b.lambda1_.data(), - b.lambda2_.data(), - b.lambda2_z_.data(), - k_eps, - b.accumulator_.data(), - output_ptr); - } - } + BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr); + return true; } -#else // CAFFE2_USE_MKL - BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr); -#endif // CAFFE2_USE_MKL + caffe2::compute_batch_box_cox( + N, D, min_block_size_, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr); +#else + BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr); +#endif return true; } -template <> -template -void BatchBoxCoxOp::BoxCoxNaive( - int64_t N, - int64_t D, - const T* data_ptr, - const T* lambda1_ptr, - const T* lambda2_ptr, - T k_eps, - T* output_ptr) { - for (int64_t i = 0; i < N; i++) { - for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) { - T lambda1_v = lambda1_ptr[j]; - T lambda2_v = lambda2_ptr[j]; - T tmp = std::max(*data_ptr + lambda2_v, k_eps); - if (lambda1_v == 0) { - *output_ptr = std::log(tmp); - } else { - *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v; - } - } - } -} - -#ifdef CAFFE2_USE_MKL - -template <> -template -void BatchBoxCoxOp::BoxCoxNonzeroLambda( - int64_t D, - const T* data_ptr, - const T* lambda1, - const T* lambda2, - T k_eps, - T* out) { - caffe2::math::Add(D, data_ptr, lambda2, out, &context_); - for (int64_t j = 0; j < D; j++) { - out[j] = std::max(out[j], k_eps); - } - Pow(D, out, lambda1, out); - for (int64_t j = 0; j < D; j++) { - out[j] -= 1.0; - } - caffe2::math::Div(D, out, lambda1, out, &context_); -} - -template <> -template -void BatchBoxCoxOp::BoxCoxZeroLambda( - int64_t D, - const T* data_ptr, - const T* lambda2, - T k_eps, - T* output_ptr) { - caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_); - for (int64_t j = 0; j < D; j++) { - output_ptr[j] = std::max(output_ptr[j], 
k_eps); - } - caffe2::math::Log(D, output_ptr, output_ptr, &context_); -} - -template <> -template -void BatchBoxCoxOp::BoxCoxMixedLambda( - const T* data_ptr, - const vector& nonzeros, - const vector& zeros, - const T* lambda1, - const T* lambda2, - const T* lambda2_z, - T k_eps, - T* buffer, - T* output_ptr) { - PackV(nonzeros.size(), data_ptr, nonzeros.data(), buffer); - BoxCoxNonzeroLambda(nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer); - UnpackV(nonzeros.size(), buffer, output_ptr, nonzeros.data()); - - PackV(zeros.size(), data_ptr, zeros.data(), buffer); - BoxCoxZeroLambda(zeros.size(), buffer, lambda2_z, k_eps, buffer); - UnpackV(zeros.size(), buffer, output_ptr, zeros.data()); -} - -// Helpers to access cached buffers. -#define DEFINE_CACHED_BUFFERS(T, tag) \ - template <> \ - template <> \ - BatchBoxCoxOp::TypedCachedBuffers& \ - BatchBoxCoxOp::GetBuffers() { \ - if (!buffers_ || buffers_->type_ != tag) { \ - buffers_.reset(new BatchBoxCoxOp::TypedCachedBuffers()); \ - buffers_->type_ = tag; \ - } \ - return *static_cast*>(buffers_.get()); \ - } -DEFINE_CACHED_BUFFERS(float, 1); -DEFINE_CACHED_BUFFERS(double, 2); -#undef DEFINE_CACHED_BUFFERS - -#endif // CAFFE2_USE_MKL namespace { diff --git a/caffe2/operators/batch_box_cox_op.h b/caffe2/operators/batch_box_cox_op.h index baa9c955b6cac..a177131e9adee 100644 --- a/caffe2/operators/batch_box_cox_op.h +++ b/caffe2/operators/batch_box_cox_op.h @@ -29,65 +29,7 @@ class BatchBoxCoxOp final : public Operator { bool DoRunWithType(); protected: - template - void BoxCoxNaive( - int64_t N, - int64_t D, - const T* data_ptr, - const T* lambda1_ptr, - const T* lambda2_ptr, - T k_eps, - T* output_ptr); - -#ifdef CAFFE2_USE_MKL - template - void BoxCoxNonzeroLambda( - int64_t D, - const T* data_ptr, - const T* lambda1, - const T* lambda2, - T k_eps, - T* output_ptr); - - template - void BoxCoxZeroLambda( - int64_t D, - const T* data_ptr, - const T* lambda2, - T k_eps, - T* output_ptr); - - template - void 
BoxCoxMixedLambda( - const T* data_ptr, - const vector& nonzeros, - const vector& zeros, - const T* lambda1, - const T* lambda2, - const T* lambda2_z, - T k_eps, - T* buffer, - T* output_ptr); - - vector nonzeros_, zeros_; - - // Buffers used by the MKL version are cached across calls. - struct CachedBuffers { - virtual ~CachedBuffers() {} - int type_; - }; - template - struct TypedCachedBuffers : public CachedBuffers { - vector lambda1_, lambda2_, lambda2_z_; - vector accumulator_; - }; - template - TypedCachedBuffers& GetBuffers(); - unique_ptr buffers_; - -#endif // CAFFE2_USE_MKL - - int min_block_size_; + std::size_t min_block_size_; INPUT_TAGS(DATA, LAMBDA1, LAMBDA2); }; diff --git a/caffe2/perfkernels/batch_box_cox.cc b/caffe2/perfkernels/batch_box_cox.cc new file mode 100644 index 0000000000000..3e840d8fa04d3 --- /dev/null +++ b/caffe2/perfkernels/batch_box_cox.cc @@ -0,0 +1,113 @@ +#include "caffe2/perfkernels/common.h" + +#include +#include +#include + +namespace caffe2 { + +namespace { +template +void BoxCoxNaive( + std::size_t N, + std::size_t D, + const T* data_ptr, + const T* __restrict lambda1_ptr, + const T* __restrict lambda2_ptr, + T* output_ptr) { + constexpr T k_eps = static_cast(1e-6); + + for (int64_t i = 0; i < N; i++) { + for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) { + T lambda1_v = lambda1_ptr[j]; + T lambda2_v = lambda2_ptr[j]; + T tmp = std::max(*data_ptr + lambda2_v, k_eps); + if (lambda1_v == 0) { + *output_ptr = std::log(tmp); + } else { + T lambda_1 = 1 / lambda1_v; + T pow = std::pow(tmp, lambda1_v); + *output_ptr = lambda_1 * pow - lambda_1; + } + } + } + +} +} + +#if defined(CAFFE2_PERF_WITH_AVX2) && defined(CAFFE2_PERF_USE_MKL) +namespace details { +template +void compute_batch_box_cox__avx2_fma( + std::size_t N, + std::size_t D, + std::size_t block_size, + const T* data_ptr, + const T* __restrict lambda1_ptr, + const T* __restrict lambda2_ptr, + T* output_ptr); + +extern template +void 
compute_batch_box_cox__avx2_fma( + std::size_t N, + std::size_t D, + std::size_t block_size, + const float* self_data, + const float* __restrict lambda1_data, + const float* __restrict lambda2_data, + float* output_data); + +extern template +void compute_batch_box_cox__avx2_fma( + std::size_t N, + std::size_t D, + std::size_t block_size, + const double* self_data, + const double* __restrict lambda1_data, + const double* __restrict lambda2_data, + double* output_data); +} // namespace detail +#endif + +template +void compute_batch_box_cox( + std::size_t N, + std::size_t D, + std::size_t block_size, + const T* data, + const T* lambda1_data, + const T* lambda2_data, + T* output_data) { +#ifdef CAFFE2_PERF_WITH_AVX2 + AVX2_FMA_DO( + details::compute_batch_box_cox, + N, + D, + block_size, + data, + lambda1_data, + lambda2_data, + output_data); +#endif + BoxCoxNaive(N, D, data, lambda1_data, lambda2_data, output_data); +} + +template void compute_batch_box_cox( + std::size_t N, + std::size_t D, + std::size_t block_size, + const float* data, + const float* lambda1_data, + const float* lambda2_data, + float* output_data); + +template void compute_batch_box_cox( + std::size_t N, + std::size_t D, + std::size_t block_size, + const double* data, + const double* lambda1_data, + const double* lambda2_data, + double* output_data); + +} // namespace caffe2 diff --git a/caffe2/perfkernels/batch_box_cox.h b/caffe2/perfkernels/batch_box_cox.h new file mode 100644 index 0000000000000..60c973bbf8ea1 --- /dev/null +++ b/caffe2/perfkernels/batch_box_cox.h @@ -0,0 +1,35 @@ +// Impmenets BoxCox operator for CPU +#pragma once +#include + +namespace caffe2 { + +template +void compute_batch_box_cox( + std::size_t N, + std::size_t D, + std::size_t block_size, + const T* self_data, + const T* lambda1_data, + const T* lambda2_data, + T* output_data); + +extern template void compute_batch_box_cox( + std::size_t N, + std::size_t D, + std::size_t block_size, + const float* data, + const float* 
lambda1_data, + const float* lambda2_data, + float* output_data); + +extern template void compute_batch_box_cox( + std::size_t N, + std::size_t D, + std::size_t block_size, + const double* data, + const double* lambda1_data, + const double* lambda2_data, + double* output_data); + +} // namespace caffe2 diff --git a/caffe2/perfkernels/batch_box_cox_avx2.cc b/caffe2/perfkernels/batch_box_cox_avx2.cc new file mode 100644 index 0000000000000..cf0801b4733ef --- /dev/null +++ b/caffe2/perfkernels/batch_box_cox_avx2.cc @@ -0,0 +1,299 @@ +#ifdef CAFFE2_PERF_USE_MKL +#include +#include +#include + +#include +#include +#include + +#include + +namespace caffe2::details { + +// MKL VML function templates. +template +void PackV(const int N, const T* a, const int* ia, T* y); +template +void UnpackV(const int N, const T* a, T* y, const int* iy); +template +void Pow(const int N, const T* a, const T* b, T* y); +template +void Add(const int N, const T* a, const T* b, T* y); +template +void Div(const int N, const T* a, const T* b, T* y); +template +void Ln(const int N, const T* a, T* y); + +#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc) \ + template <> \ + void PackV(const int N, const T* a, const int* ia, T* y) { \ + OriginalFunc(N, a, ia, y); \ + } +DELEGATE_PACKV_FUNCTION(float, vsPackV) +DELEGATE_PACKV_FUNCTION(double, vdPackV) +#undef DELEGATE_PACKV_FUNCTION + +#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc) \ + template <> \ + void UnpackV(const int N, const T* a, T* y, const int* iy) { \ + OriginalFunc(N, a, y, iy); \ + } +DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV) +DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV) +#undef DELEGATE_UNPACKV_FUNCTION + +#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \ + template <> \ + void Funcname(const int N, const T* a, const T* b, T* y) { \ + OriginalFunc(N, a, b, y); \ + } +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow) +DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, 
Add, vsAdd) +DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) +DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) +#undef DELEGATE_SIMPLE_BINARY_FUNCTION + +#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc) \ + template <> \ + void Funcname(const int N, const T* a, T* y) { \ + OriginalFunc(N, a, y); \ + } +DELEGATE_SIMPLE_UNARY_FUNCTION(float, Ln, vsLn) +DELEGATE_SIMPLE_UNARY_FUNCTION(double, Ln, vdLn) +#undef DELEGATE_SIMPLE_UNARY_FUNCTION + +template +void box_cox_zero_lambda( + size_t D, + const T* const self_data, + const T* const lambda2_data, + T k_eps, + T* const output_data) { + Add(D, self_data, lambda2_data, output_data); + for (const auto j : c10::irange(D)) { + output_data[j] = std::max(output_data[j], k_eps); + } + + Ln(D, output_data, output_data); +} + +template +void box_cox_nonzero_lambda( + size_t D, + const T* const self_data, + const T* const lambda1_data, + const T* const lambda2_data, + T k_eps, + T* const output_data) { + Add(D, self_data, lambda2_data, output_data); + for (const auto j : c10::irange(D)) { + output_data[j] = std::max(output_data[j], k_eps); + } + + // output = output ^ lambda1 + Pow(D, output_data, lambda1_data, output_data); + // output = (output - 1)/ lambda1 + for (const auto j : c10::irange(D)) { + output_data[j] -= 1.0; + } + Div(D, output_data, lambda1_data, output_data); +} + +template +void box_cox_mixed_lambda( + const T* const self_data, + const std::vector& nonzeros, + const std::vector& zeros, + const T* const lambda1, + const T* const lambda2, + const T* const lambda2_z_, + T k_eps, + T* const buffer, + T* const output_data) { + PackV(nonzeros.size(), self_data, nonzeros.data(), buffer); + box_cox_nonzero_lambda( + nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer); + UnpackV(nonzeros.size(), buffer, output_data, nonzeros.data()); + + PackV(zeros.size(), self_data, zeros.data(), buffer); + box_cox_zero_lambda( + zeros.size(), buffer, 
lambda2_z_, k_eps, buffer); + UnpackV(zeros.size(), buffer, output_data, zeros.data()); +} + +template +void TileArrayIntoVector( + const T* const a, + const size_t D, + const int K, + std::vector& b) { + b.resize(K * D); + for (const auto k : c10::irange(K)) { + std::copy(a, a + D, b.begin() + k * D); + } +} + +void TileIndicesInPlace(std::vector& v, const std::size_t D, const std::size_t K) { + auto n = v.size(); + v.resize(K * n); + for (const auto k : c10::irange(1, K)) { + for (const auto j : c10::irange(n)) { + v[k * n + j] = v[j] + k * D; + } + } +} + +template +void compute_batch_box_cox__avx2_fma( + std::size_t N, + std::size_t D, + std::size_t block_size, + const T* self_data, + const T* __restrict lambda1_data, + const T* __restrict lambda2_data, + T* output_data) { + constexpr T k_eps = static_cast(1e-6); + + FOLLY_DECLARE_REUSED(zeros, std::vector); + FOLLY_DECLARE_REUSED(nonzeros, std::vector); + // Don't bother calling reserve; calls after the first will get a + // correctly-sized allocation anyway. + for (const auto j : c10::irange(D)) { + if (lambda1_data[j] == 0) { + zeros.push_back(j); + } else { + nonzeros.push_back(j); + } + } + + // Process K rows at a time for effective vectorization with small rows. 
+ const auto K = std::min(N, (block_size + D - 1) / D); + + FOLLY_DECLARE_REUSED(lambda1_, std::vector); + FOLLY_DECLARE_REUSED(lambda2_, std::vector); + FOLLY_DECLARE_REUSED(lambda2_z_, std::vector); + + if (nonzeros.size() == D) { + // ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0 + size_t i = 0; + if (K > 1) { + TileArrayIntoVector(lambda1_data, D, K, lambda1_); + TileArrayIntoVector(lambda2_data, D, K, lambda2_); + DCHECK_EQ(K * D, lambda1_.size()); + DCHECK_EQ(K * D, lambda2_.size()); + for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { + box_cox_nonzero_lambda( + K * D, + self_data, + lambda1_.data(), + lambda2_.data(), + k_eps, + output_data); + } + } + for (; i < N; i++, self_data += D, output_data += D) { + box_cox_nonzero_lambda( + D, self_data, lambda1_data, lambda2_data, k_eps, output_data); + } + } else if (zeros.size() == D) { + // ln(x + lambda2), if lambda1 == 0 + size_t i = 0; + if (K > 1) { + TileArrayIntoVector(lambda2_data, D, K, lambda2_z_); + DCHECK_EQ(K * D, lambda2_z_.size()); + for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { + box_cox_zero_lambda( + K * D, self_data, lambda2_z_.data(), k_eps, output_data); + } + } + for (; i < N; i++, self_data += D, output_data += D) { + box_cox_zero_lambda( + D, self_data, lambda2_data, k_eps, output_data); + } + } else { + // mix zeros and nonzeros + const size_t n = nonzeros.size(); + if (K > 1) { + TileIndicesInPlace(nonzeros, 0, K); + TileIndicesInPlace(zeros, 0, K); + } + + FOLLY_DECLARE_REUSED(buffer, std::vector); + + buffer.resize(std::max(nonzeros.size(), zeros.size())); + lambda1_.resize(nonzeros.size()); + lambda2_.resize(nonzeros.size()); + lambda2_z_.resize(zeros.size()); + PackV(nonzeros.size(), lambda1_data, nonzeros.data(), lambda1_.data()); + PackV(nonzeros.size(), lambda2_data, nonzeros.data(), lambda2_.data()); + PackV(zeros.size(), lambda2_data, zeros.data(), lambda2_z_.data()); + + size_t i = 0; + if (K > 1) { + // Truncate to 
original size, and re-tile with offsets this time. + nonzeros.resize(n); + DCHECK_GT(D, n); + zeros.resize(D - n); + TileIndicesInPlace(nonzeros, D, K); + TileIndicesInPlace(zeros, D, K); + DCHECK_EQ(nonzeros.size(), lambda1_.size()); + DCHECK_EQ(nonzeros.size(), lambda2_.size()); + DCHECK_EQ(zeros.size(), lambda2_z_.size()); + + for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { + box_cox_mixed_lambda( + self_data, + nonzeros, + zeros, + lambda1_.data(), + lambda2_.data(), + lambda2_z_.data(), + k_eps, + buffer.data(), + output_data); + } + // Truncate to original size. + nonzeros.resize(n); + zeros.resize(D - n); + } + for (; i < N; i++, self_data += D, output_data += D) { + box_cox_mixed_lambda( + self_data, + nonzeros, + zeros, + lambda1_.data(), + lambda2_.data(), + lambda2_z_.data(), + k_eps, + buffer.data(), + output_data); + } + } +}; + + +template +void compute_batch_box_cox__avx2_fma( + std::size_t N, + std::size_t D, + std::size_t block_size, + const float* self_data, + const float* __restrict lambda1_data, + const float* __restrict lambda2_data, + float* output_data); + +template +void compute_batch_box_cox__avx2_fma( + std::size_t N, + std::size_t D, + std::size_t block_size, + const double* self_data, + const double* __restrict lambda1_data, + const double* __restrict lambda2_data, + double* output_data); + +} // namespace caffe2::detail +#endif diff --git a/caffe2/perfkernels/common.h b/caffe2/perfkernels/common.h index fb960dbe5dc3c..6fed9e1d6d06c 100644 --- a/caffe2/perfkernels/common.h +++ b/caffe2/perfkernels/common.h @@ -62,7 +62,10 @@ In foo.cc, do: #pragma once +#if defined(CAFFE2_PERF_WITH_AVX512) || defined(CAFFE2_PERF_WITH_AVX2) \ + || defined(CAFFE2_PERF_WITH_AVX) #include +#endif // DO macros: these should be used in your entry function, similar to foo() // above, that routes implementations based on CPU capability. 
From c23583033f753c645a9143b5cc6e8497e1efc84b Mon Sep 17 00:00:00 2001 From: eqy Date: Sun, 23 Oct 2022 21:17:12 +0000 Subject: [PATCH 0056/1922] Fix 64bit indexing in `vol2col` (#87527) Surfaced from #87354 CC @ngimel @ptrblck @maybeLee Pull Request resolved: https://github.com/pytorch/pytorch/pull/87527 Approved by: https://github.com/ngimel --- aten/src/ATen/native/cuda/vol2col.cuh | 58 +++++++++++++-------------- test/test_nn.py | 10 +++++ 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/aten/src/ATen/native/cuda/vol2col.cuh b/aten/src/ATen/native/cuda/vol2col.cuh index 7ab719bc819eb..51dbe1c744053 100644 --- a/aten/src/ATen/native/cuda/vol2col.cuh +++ b/aten/src/ATen/native/cuda/vol2col.cuh @@ -15,7 +15,7 @@ using namespace at::cuda::detail; // Kernel for fast unfold+copy on volumes template __global__ void vol2col_kernel( - const int n, + const int64_t n, const T* data_vol, const int depth, const int height, @@ -37,16 +37,16 @@ __global__ void vol2col_kernel( const int width_col, T* data_col) { CUDA_KERNEL_LOOP(index, n) { - int w_out = index % width_col; + auto w_out = index % width_col; index /= width_col; - int h_out = index % height_col; + auto h_out = index % height_col; index /= height_col; - int t_out = index % depth_col; - int channel_in = index / depth_col; - int channel_out = channel_in * ksize_t * ksize_h * ksize_w; - int t_in = t_out * stride_t - pad_t; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; + auto t_out = index % depth_col; + auto channel_in = index / depth_col; + auto channel_out = channel_in * ksize_t * ksize_h * ksize_w; + auto t_in = t_out * stride_t - pad_t; + auto h_in = h_out * stride_h - pad_h; + auto w_in = w_out * stride_w - pad_w; data_col += ((channel_out * depth_col + t_out) * height_col + h_out) * width_col + w_out; @@ -54,9 +54,9 @@ __global__ void vol2col_kernel( for (int i = 0; i < ksize_t; ++i) { for (int j = 0; j < ksize_h; ++j) { for (int k = 0; k < ksize_w; ++k) { - int t 
= t_in + i * dilation_t; - int h = h_in + j * dilation_h; - int w = w_in + k * dilation_w; + auto t = t_in + i * dilation_t; + auto h = h_in + j * dilation_h; + auto w = w_in + k * dilation_w; *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && w < width) ? data_vol @@ -126,7 +126,7 @@ void vol2col( template __global__ void vol2im_kernel( - const unsigned n, + const int64_t n, const T* data_col, const unsigned depth, const unsigned height, @@ -150,30 +150,30 @@ __global__ void vol2im_kernel( T* data_vol) { CUDA_KERNEL_LOOP(index, n) { accT val = static_cast(0); - const unsigned w_im = index % width + pad_w; - const unsigned h_im = (index / width) % height + pad_h; - const unsigned t_im = (index / width / height) % depth + pad_t; - const unsigned c_im = index / (width * height * depth); - unsigned kernel_extent_w = (kernel_w - 1) * dilation_w + 1; - unsigned kernel_extent_h = (kernel_h - 1) * dilation_h + 1; - unsigned kernel_extent_t = (kernel_t - 1) * dilation_t + 1; + const auto w_im = index % width + pad_w; + const auto h_im = (index / width) % height + pad_h; + const auto t_im = (index / width / height) % depth + pad_t; + const auto c_im = index / (width * height * depth); + auto kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + auto kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + auto kernel_extent_t = (kernel_t - 1) * dilation_t + 1; // compute the start and end of the output - const unsigned w_col_start = + const auto w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; - const unsigned w_col_end = std::min(w_im / stride_w + 1, width_col); - const unsigned h_col_start = + const auto w_col_end = std::min(w_im / stride_w + 1, width_col); + const auto h_col_start = (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; - const unsigned h_col_end = std::min(h_im / stride_h + 1, height_col); - const unsigned t_col_start = + const auto h_col_end = std::min(h_im / stride_h + 1, height_col); + const auto t_col_start = (t_im < kernel_extent_t) ? 0 : (t_im - kernel_extent_t) / stride_t + 1; - const unsigned t_col_end = std::min(t_im / stride_t + 1, depth_col); + const auto t_col_end = std::min(t_im / stride_t + 1, depth_col); // TODO: use LCM of stride and dilation to avoid unnecessary loops for (unsigned t_col = t_col_start; t_col < t_col_end; t_col += 1) { for (unsigned h_col = h_col_start; h_col < h_col_end; h_col += 1) { for (unsigned w_col = w_col_start; w_col < w_col_end; w_col += 1) { - unsigned t_k = (t_im - t_col * stride_t); - unsigned h_k = (h_im - h_col * stride_h); - unsigned w_k = (w_im - w_col * stride_w); + uint64_t t_k = (t_im - t_col * stride_t); + uint64_t h_k = (h_im - h_col * stride_h); + uint64_t w_k = (w_im - w_col * stride_w); if (t_k % dilation_t == 0 && h_k % dilation_h == 0 && w_k % dilation_w == 0) { t_k /= dilation_t; diff --git a/test/test_nn.py b/test/test_nn.py index 6c7f1e82ccd63..5fbdcacd641d8 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -15159,6 +15159,16 @@ def test_conv_large_nosplit(self, device): input_large = torch.randn(1, 1, 2048, 1024 , dtype=dtype, device=device) conv2(input_large) + @onlyCUDA + @largeTensorTest('40GB') + @largeTensorTest('24GB', 'cpu') + def test_conv3d_64bit_indexing(self, device): + x = torch.rand(1, 32, 512, 512, 256) + m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False) + yref = m(x) + y = m.to(device=device)(x.to(device=device)) + self.assertEqual(yref, y) + def test_conv_noncontig_weights(self, device): for dim in (1, 2, 3): for grouped in (False, True): From 315967c5bc944293ee47b3ae3169095f98443dc0 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Sat, 22 Oct 2022 14:50:45 +0000 Subject: [PATCH 0057/1922] Improvements for DDP Optimizer 
(#87549) - adds support for 'first_bucket_cap' arg, to align bucketing more precisely with DDP, which may start a smaller first bucket - refactors the bucket splitting logic to be cleaner - adds pretty-print for bucket info, and a way to access bucket info from the DDPOptimizer class from a test case or benchmark - dumps debug logs to stdout cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87549 Approved by: https://github.com/soumith --- test/distributed/test_dynamo_distributed.py | 9 +- torch/_dynamo/optimizations/distributed.py | 129 ++++++++++++-------- 2 files changed, 84 insertions(+), 54 deletions(-) diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 0fefd4ec507a7..43a4a23039175 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -18,8 +18,8 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5): super().__init__() self.net = nn.Sequential( *[nn.Linear(in_feat, hidden_feat), nn.ReLU()] - + [nn.Linear(5000, 5000), nn.ReLU()] * num_hidden - + [nn.Linear(5000, 5), nn.ReLU()] + + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden + + [nn.Linear(hidden_feat, 5), nn.ReLU()] ) def forward(self, inputs): @@ -160,7 +160,10 @@ def test_no_split(self): Ensures the DDPOptimizer returns a correct, compiled module without introducing graph splits. 
(Based on model parmeters fitting in the bucket) """ - m, inputs, correct_outputs = self.get_model() + # DDP will always do a 'first bucket' with a really small size; so only a tiny model will escape this + m = ToyModel(hidden_feat=5).to(self.device) + inputs = torch.randn(20, 10).to(self.device) + correct_outputs = m(inputs) ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250) check_splits_compiler = CheckSplitsCompiler() diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py index f65c16483aec6..bd3f07b635f3f 100644 --- a/torch/_dynamo/optimizations/distributed.py +++ b/torch/_dynamo/optimizations/distributed.py @@ -1,4 +1,5 @@ -from typing import Any, List +from dataclasses import dataclass, field +from typing import Any, List, Optional import torch import torch.fx.traceback as fx_traceback @@ -18,6 +19,28 @@ def args_str(args): return str(args) +@dataclass +class Bucket: + size: int = 0 + params: List[str] = field(default_factory=list) + nodes: List[fx.Node] = field(default_factory=list) + + +def pretty_print_buckets(buckets: List[Bucket]): + headers = ("Index", "Size (b)", "Param Names") + rows = [] + for idx, bucket in enumerate(reversed(buckets)): + rows.append((idx, bucket.size, bucket.params[0])) + for param in bucket.params[1:]: + rows.append((None, None, param)) + try: + from tabulate import tabulate + + print(tabulate(rows, headers=headers, tablefmt="simple_grid")) + except ImportError: + print("Please `pip install tabulate` in order to pretty-print ddp bucket sizes") + + class DDPOptimizer: def __init__( self, @@ -25,8 +48,20 @@ def __init__( parameters_to_ignore: List[str], backend_compile_fn, debug=False, + first_bucket_cap: Optional[int] = None, ): + if first_bucket_cap is not None: + self.first_bucket_cap = first_bucket_cap + elif torch.distributed.is_available(): + # this constant comes from C10D lib which is not always built + self.first_bucket_cap = 
torch.distributed._DEFAULT_FIRST_BUCKET_BYTES + else: + self.first_bucket_cap = bucket_bytes_cap + self.bucket_bytes_cap = bucket_bytes_cap + assert ( + self.first_bucket_cap <= self.bucket_bytes_cap + ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP" self.parameters_to_ignore = parameters_to_ignore self.backend_compile_fn = backend_compile_fn self.debug = debug @@ -35,76 +70,69 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]): """ TODO: - handle params_and_buffers_to_ignore - - handle kwargs """ # 1: compute the partition map according to DDP bucket logic - bucket_bytes = 0 - bucket_actual_sizes = [] - node_splits = [[]] + buckets = [Bucket()] # (size, param_names) for node in reversed(gm.graph.nodes): - if node.op == "output" or node.op == "placeholder": + if node.op in ("output", "placeholder"): continue - if bucket_bytes >= self.bucket_bytes_cap: - bucket_actual_sizes.insert(0, bucket_bytes) - bucket_bytes = 0 - node_splits.insert(0, []) + if ( + buckets[0].size >= self.bucket_bytes_cap + or len(buckets) == 1 + and buckets[0].size >= self.first_bucket_cap + ): + buckets.insert(0, Bucket()) - elif node.op == "call_module": + if node.op == "call_module": target = gm.get_submodule(node.target) - params_size_b = sum( - [ - p.storage().nbytes() - for p in target.parameters() - if p.requires_grad - ] - ) - bucket_bytes += params_size_b - # print(f"accumulated {params_size_b} b from {node}") + for name, p in target.named_parameters(): + if p.requires_grad: + buckets[0].size += p.storage().nbytes() + # TODO correct FQ name? + buckets[0].params.append(f"{node}_{name}") elif node.op == "get_attr": maybe_param = getattr(gm, node.target) if maybe_param.requires_grad: - bucket_bytes += maybe_param.storage().nbytes() - else: - # TODO(whc) confirm this: - # (e.g. 
call_method, call_function aren't expected to 'have' parameters) - pass - - node_splits[0].append(node) - - if len(node_splits) == 1: - if self.debug: - print( - "DDPOptimizer did not split graphs." - f" Accumulated {bucket_bytes} bytes, and bucket cap is {self.bucket_bytes_cap}" - ) - return self.backend_compile_fn(gm, example_inputs) + buckets[0].size += maybe_param.storage().nbytes() + buckets[0].params.append(node.target) - if len(bucket_actual_sizes) < len(node_splits): - bucket_actual_sizes.insert(0, bucket_bytes) + # All nodes have to be mapped to a bucket, even if they don't have their own params + buckets[0].nodes.append(node) + # stash buckets for testing/debugging purposes + self.buckets = buckets if self.debug: print( - f"DDPOptimizer used bucket cap {self.bucket_bytes_cap}" - f" and split graphs into parameter sizes {', '.join([str(b) for b in bucket_actual_sizes])}" + f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:" ) + pretty_print_buckets(buckets) + + if len(buckets) == 1: + # bypass split/fuse logic if there is only one bucket + return self.backend_compile_fn(gm, example_inputs) # 2: partition the graphmodule according to bucket capacity partition_map = {} - for p, nodes in enumerate(node_splits): - for node in nodes: - partition_map[node] = p + for idx, b in enumerate(buckets): + for node in b.nodes: + partition_map[node] = idx split_gm = fx.passes.split_module.split_module( gm, None, lambda node: partition_map[node] ) if self.debug: - with open("debug_ddp_optimizer.log", "w") as dump_file: - dump_file.write("---orig graph---") - dump_file.write(str(gm.graph)) - dump_file.write("\n---split graph---") - dump_file.write(str(split_gm.graph)) + print("---orig graph---") + print(str(gm.graph)) + print("\n---split graph---") + print(str(split_gm.graph)) + for name, module in split_gm.named_modules(): + if "." 
not in name: + # only print the submod graphs, not their children + print(f"\n---{name} graph---") + print(str(module.graph)) + print("---------------") # 3: compile each of the partitioned submodules using the user-provided compiler class SubmodCompiler(torch.fx.interpreter.Interpreter): @@ -171,7 +199,6 @@ def run_node(self, n: Node) -> Any: self.module.delete_submodule(n.target) n.target = "compiled_" + n.target self.module.add_submodule(n.target, compiled_submod) - # then we execute the modified node using the usual logic return getattr(self, n.op)(n.target, args, kwargs) @@ -180,8 +207,8 @@ def run_node(self, n: Node) -> Any: split_gm.recompile() if self.debug: - with open("debug_ddp_optimizer.log", "a") as dump_file: - dump_file.write("\n---final graph---") - dump_file.write(str(split_gm.graph)) + print("\n---final graph---") + print(str(split_gm.graph)) + print("---------------") return split_gm From 58fba3d0bca7f74f8c76645ccea598a240f955cb Mon Sep 17 00:00:00 2001 From: lezcano Date: Sun, 23 Oct 2022 20:38:41 +0000 Subject: [PATCH 0058/1922] Improve readability of the extra message errors in assertEqual (#87202) Goes from (note the `linspace.default` is very difficult to find) ``` Mismatched elements: 15 / 50 (30.0%) Greatest absolute difference: 1 at index (17,) Greatest relative difference: 1.0 at index (17,) : linspace.default args = (0, -3, 50) kwargs = {'dtype': torch.int16, 'device': device(type='cpu'), 'pin_memory': False} ``` to ``` Mismatched elements: 15 / 50 (30.0%) Greatest absolute difference: 1 at index (17,) Greatest relative difference: 1.0 at index (17,) linspace.default args = (0, -3, 50) kwargs = {'dtype': torch.int16, 'device': device(type='cpu'), 'pin_memory': False} ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87202 Approved by: https://github.com/ezyang --- test/test_testing.py | 2 +- torch/testing/_internal/common_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_testing.py 
b/test/test_testing.py index fad72ab91de0a..3ad6ff06c771e 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -68,7 +68,7 @@ def test_assertEqual_longMessage(self): self.longMessage = True extra_msg = "sentinel" - with self.assertRaisesRegex(AssertionError, re.escape(f"{default_msg} : {extra_msg}")): + with self.assertRaisesRegex(AssertionError, re.escape(f"{default_msg}\n{extra_msg}")): self.assertEqual(actual, expected, msg=extra_msg) finally: self.longMessage = long_message diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 77887574e1888..5da1ffefaba91 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -2497,7 +2497,7 @@ def to_list(input): # This emulates unittest.TestCase's behavior if a custom message passed and # TestCase.longMessage (https://docs.python.org/3/library/unittest.html#unittest.TestCase.longMessage) # is True (default) - msg=(lambda generated_msg: f"{generated_msg} : {msg}") if isinstance(msg, str) and self.longMessage else msg, + msg=(lambda generated_msg: f"{generated_msg}\n{msg}") if isinstance(msg, str) and self.longMessage else msg, ) def assertNotEqual(self, x, y, msg: Optional[str] = None, *, # type: ignore[override] From 4f6bdefd4e166ae19ef1fdf4ea9e0ce0e438e56c Mon Sep 17 00:00:00 2001 From: lezcano Date: Sun, 23 Oct 2022 20:38:41 +0000 Subject: [PATCH 0059/1922] [PrimTorch] Add maker for *_copy variants of view functions (#87278) Implements `diagonal_copy` as an example. This PR also fixes a number of correcness issues with `diagonal_copy`. 
cc @ezyang @mruberry @ngimel @Lezcano @fdrocha Pull Request resolved: https://github.com/pytorch/pytorch/pull/87278 Approved by: https://github.com/mruberry --- .../functorch/BatchRulesDecompositions.cpp | 1 + aten/src/ATen/native/TensorShape.cpp | 12 ++++++++-- test/functorch/test_aotdispatch.py | 1 + tools/autograd/gen_variable_type.py | 1 + torch/_refs/__init__.py | 23 +++++++++++++++++++ .../_internal/common_methods_invocations.py | 16 +++++++++++-- 6 files changed, 50 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index 157fbf23bf6fd..2ea8b7fac4546 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -257,6 +257,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { OP_DECOMPOSE(frobenius_norm); OP_DECOMPOSE(type_as); OP_DECOMPOSE(linalg_diagonal); + OP_DECOMPOSE(diagonal_copy); m.impl("pad", native::pad_symint); m.impl("_pad_circular", native::_pad_circular_symint); OP_DECOMPOSE(t_); diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index a72fba7ac12e0..05d1f53515c14 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -3709,8 +3709,16 @@ at::Tensor& _sparse_broadcast_to_copy_out(const at::Tensor & self, at::IntArrayR at::Tensor& diagonal_copy_out(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) { - auto tmp = self.diagonal(offset, dim1, dim2); - out.copy_(tmp); + TORCH_CHECK( + out.device() == self.device(), + "diagonal_copy: Expected out and self tensors to be on the same device, but got ", + "out on ", out.device(), " and self on ", self.device()); + auto result = self.diagonal(offset, dim1, dim2); + at::native::resize_output(out, result.sizes()); + TORCH_CHECK( + canCast(result.scalar_type(), out.scalar_type()), + "diagonal_copy: result type ", 
result.scalar_type(), " can't be cast to the desired out= type ", out.scalar_type()); + out.copy_(result); return out; } diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index c058b3618ecb1..57013636eeabf 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -1003,6 +1003,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _): xfail('deg2rad', ''), # aten.deg2rad.default - couldn't find symbolic meta function/decomposition xfail('diag', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('diagonal', ''), # Cannot call sizes() on tensor with symbolic sizes/strides + xfail('diagonal_copy', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('diagonal_scatter', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('diff', ''), # aten.zeros_like.default - couldn't find symbolic meta function/decomposition xfail('digamma', ''), # aten.polygamma.default - couldn't find symbolic meta function/decomposition diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 53bd60b76e6bd..2feb84bbd088d 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -295,6 +295,7 @@ "reflection_pad3d", "linalg_cholesky_ex", "linalg_eig", + "diagonal_copy", "select_backward", "diagonal_backward", "slice_backward", diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index d6a8f476b3176..6169e5af06d9a 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -220,6 +220,7 @@ "contiguous", "diag_embed", "diagonal", + "diagonal_copy", "dsplit", "dstack", "expand", @@ -2001,6 +2002,25 @@ def _reduction( return result +def _make_copy_from_view(fn): + """ + Given a view function (e.g. torch.diagonal) generates its copy variant (e.g. 
torch.diagonal_copy) + """ + name = fn.__name__ + fn = out_wrapper()(fn) + + def _fn(*args, out=None, **kwargs): + result = fn(*args, out=out, **kwargs) + if out is None: + return result.clone(memory_format=torch.contiguous_format) + return result + + copy_name = f"{name}_copy" + _fn.__name__ = copy_name + _fn = register_decomposition(getattr(torch.ops.aten, copy_name))(_fn) + return _fn + + # Saves Python all py_all = all @@ -3505,6 +3525,9 @@ def diagonal( return result +diagonal_copy = _make_copy_from_view(diagonal) + + @register_decomposition(torch.ops.aten.diag_embed) def diag_embed( t: TensorLikeType, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index f637339f16d24..95b1df24a9512 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -5138,7 +5138,7 @@ def reference_inputs_diagonal_diag_embed(op_info, device, dtype, requires_grad, samples3d = product(shapes3d, kwargs3d) for shape, kwargs in chain(samples1d, samples2d, samples3d): - if op_info.name in ('diagonal', '_refs.diagonal'): + if 'diagonal' in op_info.name: # these are error inputs for diagonal if shape in ((0,), (1,)): continue @@ -5174,7 +5174,7 @@ def error_inputs_diagonal_diag_embed(op_info, device, **kwargs): dim1 = kwargs.get('dim1') dim2 = kwargs.get('dim2') - if op_info.name in ('diagonal', '_refs.diagonal'): + if 'diagonal' in op_info.name: num_dim = arg.dim() elif op_info.name in ('diag_embed', '_refs.diag_embed'): # these are valid inputs for diag_embed @@ -9185,6 +9185,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=sample_inputs_diagonal_diag_embed, reference_inputs_func=reference_inputs_diagonal_diag_embed, error_inputs_func=error_inputs_diagonal_diag_embed), + OpInfo('diagonal_copy', + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16, torch.chalf), + supports_forward_ad=True, + 
supports_fwgrad_bwgrad=True, + sample_inputs_func=sample_inputs_diagonal_diag_embed, + reference_inputs_func=reference_inputs_diagonal_diag_embed, + error_inputs_func=error_inputs_diagonal_diag_embed), OpInfo('diagonal_scatter', dtypes=all_types_and(torch.bool, torch.bfloat16, torch.float16), supports_out=False, @@ -17694,6 +17701,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1): torch_opinfo_name="diagonal", supports_nvfuser=False, ), + PythonRefInfo( + "_refs.diagonal_copy", + torch_opinfo_name="diagonal_copy", + supports_nvfuser=False, + ), PythonRefInfo( "_refs.diag_embed", torch_opinfo_name="diag_embed", From 6d90ea6627714417066203f8840d5892cf190097 Mon Sep 17 00:00:00 2001 From: lezcano Date: Sun, 23 Oct 2022 20:38:41 +0000 Subject: [PATCH 0060/1922] Simplify a few diagonal-related functions (#87180) `diag` was unnecessarily implemented as a kernel rather than as a composite function, which made it unnecessarily difficult (explicit backward + all it entails). We also change a few uses of `diag` on 2D tensors for `diagonal()`. The latter returns a view rather than creating a new tensor. We also upgrade its meta implementation to a fully-fledged decomposition I tried implementing the backwards of `diagonal()` via `diag_scatter` (or better `diag_scatter_` to keep the perf) but functionalisation was failing and I was not sure how to fix this, so I moved on. It may be possible to simplify that one as well if @soulitzer or someone knows how to do this. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87180 Approved by: https://github.com/ngimel, https://github.com/albanD, https://github.com/mruberry --- .../functorch/BatchRulesDecompositions.cpp | 1 - aten/src/ATen/functorch/BatchRulesViews.cpp | 2 +- aten/src/ATen/native/Correlation.cpp | 2 +- aten/src/ATen/native/TensorShape.cpp | 79 ++----- aten/src/ATen/native/cuda/TriangularOps.cu | 130 +----------- .../native/mps/operations/TriangularOps.mm | 192 ------------------ aten/src/ATen/native/native_functions.yaml | 15 +- aten/src/ATen/native/ts_native_functions.yaml | 1 + .../check_forward_backward_compatibility.py | 1 + test/functorch/test_vmap.py | 3 + test/lazy/test_ts_opinfo.py | 1 + test/test_autograd.py | 13 -- test/test_decomp.py | 2 + tools/autograd/derivatives.yaml | 4 - torch/_meta_registrations.py | 16 -- torch/_refs/__init__.py | 17 ++ .../csrc/jit/runtime/static/generated_ops.cpp | 19 -- .../lazy/ts_backend/ts_native_functions.cpp | 9 + .../_internal/common_methods_invocations.py | 22 +- 19 files changed, 69 insertions(+), 460 deletions(-) diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index 2ea8b7fac4546..f1108bac25a0a 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -81,7 +81,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { OP_DECOMPOSE2(dsplit, int); OP_DECOMPOSE2(dsplit, array); OP_DECOMPOSE(det); - m.impl("diag_backward", native::diag_backward_symint); OP_DECOMPOSE(diff); OP_DECOMPOSE(dstack); OP_DECOMPOSE(einsum); diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 9dd014a4307f9..b8c3727d15dcc 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -506,7 +506,7 @@ std::tuple> diag_embed_batch_rule(const Tensor& self, } Tensor trace_decomp(const Tensor& tensor) { - 
return tensor.diag().sum(); + return tensor.diagonal().sum(); } TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { diff --git a/aten/src/ATen/native/Correlation.cpp b/aten/src/ATen/native/Correlation.cpp index 204e4f2cb5688..9aca753c78ca5 100644 --- a/aten/src/ATen/native/Correlation.cpp +++ b/aten/src/ATen/native/Correlation.cpp @@ -139,7 +139,7 @@ Tensor corrcoef(const Tensor& self) { } // normalize covariance - const auto d = c.diag(); + const auto d = c.diagonal(); const auto stddev = at::sqrt(d.is_complex() ? at::real(d) : d); c = c / stddev.view({-1, 1}); c = c / stddev.view({1, -1}); diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 05d1f53515c14..6543509d3dcb8 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -3390,72 +3390,29 @@ Tensor unfold(const Tensor& self, int64_t d, int64_t size, int64_t step) { return self.as_strided(sizes, strides); } -template -void apply_diag(Tensor& result, const Tensor& self, int64_t dimension) { - TORCH_CHECK(self.dim() == 1 || self.dim() == 2, "matrix or a vector expected"); - - auto self_data = self.data_ptr(); - if (self.dim() == 1) { - auto self_size = self.size(0); - auto self_stride = self.stride(0); - int64_t sz = self_size + std::abs(dimension); - - at::native::resize_output(result, {sz, sz}); - result.zero_(); - auto r_data = result.data_ptr(); - auto r_stride_0 = result.stride(0); - auto r_stride_1 = result.stride(1); - r_data += (dimension >= 0 ? dimension*r_stride_1 : -dimension*r_stride_0); - - for (const auto i : c10::irange(self_size)) { - r_data[i * (r_stride_0 + r_stride_1)] = self_data[i * self_stride]; - } +Tensor diag(const Tensor& self, int64_t offset) { + auto ndim = self.dim(); + TORCH_CHECK(ndim == 1 || ndim == 2, "diag(): Supports 1D or 2D tensors. 
Got ", self.dim(), "D"); + if (ndim == 1) { + return at::diag_embed(self, offset); } else { - auto self_stride_0 = self.stride(0); - auto self_stride_1 = self.stride(1); - - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t sz; - if (dimension >= 0) { - sz = std::min(self.size(0), self.size(1) - dimension); - } else { - sz = std::min(self.size(0) + dimension, self.size(1)); - } - - at::native::resize_output(result, {sz}); - result.zero_(); - auto r_data = result.data_ptr(); - auto r_stride_0 = result.stride(0); - self_data += (dimension >= 0 ? dimension * self_stride_1 : -dimension * self_stride_0); - for (const auto i : c10::irange(sz)) { - r_data[i * r_stride_0] = self_data[i * (self_stride_0 + self_stride_1)]; - } + // We return a copy of the diagonal + return at::diagonal_copy(self, offset); } } -Tensor diag(const Tensor& self, int64_t dimension) { - Tensor result = at::empty({0}, self.options()); - at::diag_out(result, self, dimension); - return result; -} - -Tensor& diag_cpu_out(const Tensor& self, int64_t dimension, Tensor &result) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kBool, self.scalar_type(), "diag", [&] { - apply_diag(result, self, dimension); - }); - return result; -} - -Tensor diag_backward_symint(const Tensor& grad, SymIntArrayRef input_sizes, int64_t diagonal) { - auto ndimension = input_sizes.size(); - AT_ASSERT(ndimension == 1 || ndimension == 2); - - if (ndimension == 1 || input_sizes[0] == input_sizes[1]) { - return grad.diag(diagonal); +Tensor& diag_out(const Tensor& self, int64_t offset, Tensor& out) { + auto ndim = self.dim(); + TORCH_CHECK(ndim == 1 || ndim == 2, "Supports 1D or 2D tensors. 
Got ", self.dim(), "D"); + if (ndim == 1) { + TORCH_CHECK( + canCast(self.scalar_type(), out.scalar_type()), + "diag: result type ", self.scalar_type(), " can't be cast to the desired out= type ", + out.scalar_type()); + return at::diag_embed_out(out, self, offset); + } else { + return at::diagonal_copy_out(out, self, offset); } - - // Input was a matrix but was not square - return at::diagonal_backward_symint(grad, input_sizes, diagonal, 0, 1); } Tensor diagonal_backward_symint(const Tensor & grad, SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) { diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index f87d821f396ce..a079ec6849888 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -102,137 +102,9 @@ TORCH_IMPL_FUNC(triu_cuda)(const Tensor& self, int64_t k, const Tensor &result) } } -// Copy the kth diagonal of a matrix B to a vector A. -template -C10_LAUNCH_BOUNDS_1(1024) -__global__ void copy_from_diagonal_kernel( - scalar_t* a, - scalar_t* b, - std::ptrdiff_t start, - std::ptrdiff_t size, - std::ptrdiff_t strideSum, - std::ptrdiff_t strideA) { - for (std::ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < size; - linearIndex += gridDim.x * blockDim.x) { - const std::ptrdiff_t bOffset = start + strideSum * linearIndex; - a[strideA * linearIndex] = b[bOffset]; - } -} - -// Copy vector B to the kth diagonal of a matrix A -template -C10_LAUNCH_BOUNDS_1(1024) -__global__ void copy_to_diagonal_kernel( - scalar_t* a, - scalar_t* b, - std::ptrdiff_t start, - std::ptrdiff_t size, - std::ptrdiff_t strideSum, - std::ptrdiff_t strideB) { - for (std::ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < size; - linearIndex += gridDim.x * blockDim.x) { - const std::ptrdiff_t aOffset = start + strideSum * linearIndex; - a[aOffset] = b[strideB * linearIndex]; - } -} - -template -Tensor& 
apply_diag(Tensor& result, const Tensor& self, int64_t dimension) { - TORCH_CHECK( - self.dim() == 1 || self.dim() == 2, "matrix or a vector expected"); - - TensorArg result_arg{result, "result", 1}; - TensorArg self_arg{self, "self", 2}; - checkAllSameGPU(__func__, {result_arg, self_arg}); - checkSameType(__func__, result_arg, self_arg); - - int nDimension = self.dim(); - if (nDimension == 2) { - auto self_stride_0 = self.stride(0); - auto self_stride_1 = self.stride(1); - - int sz; - if (dimension > 0) { - sz = std::min(self.size(0), self.size(1) - dimension); - } else { - sz = std::min(self.size(0) + dimension, self.size(1)); - } - - at::native::resize_output(result, {sz}); - if (sz > 0) { - at::assert_no_internal_overlap(result); - auto result_stride = result.stride(0); - const dim3 threads(std::min( - int(sz), - int(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock))); - const dim3 grid( - std::min(int(1024), ceil_div(int(sz), int(threads.x)))); - auto start = - (dimension >= 0 ? dimension * self_stride_1 - : -dimension * self_stride_0); - - // Kernel Launch - copy_from_diagonal_kernel - <<>>( - result.data_ptr(), - self.data_ptr(), - start, - sz, - self_stride_0 + self_stride_1, - result_stride); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - } else { - auto n_elems = self.numel(); - auto sz = (dimension > 0) ? n_elems + dimension : n_elems - dimension; - auto self_stride = self.stride(0); - at::native::resize_output(result, {sz, sz}); - result.zero_(); - if (sz > 0) { - at::assert_no_internal_overlap(result); - auto result_stride_0 = result.stride(0); - auto result_stride_1 = result.stride(1); - const dim3 threads(std::min( - int(sz), at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock)); - const dim3 grid( - std::min(int(1024), ceil_div(int(sz), int(threads.x)))); - auto start = - (dimension >= 0 ? 
dimension * result_stride_1 - : -dimension * result_stride_0); - - // Kernel Launch - copy_to_diagonal_kernel - <<>>( - result.data_ptr(), - self.data_ptr(), - start, - n_elems, - result_stride_0 + result_stride_1, - self_stride); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - } - - return result; -} - -Tensor& diag_cuda_out(const Tensor& self, int64_t dimension, Tensor& result) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( - kComplexHalf, ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, - self.scalar_type(), "diag_cuda", - [&] { - apply_diag(result, self, dimension); - }); - return result; -} - Tensor trace_cuda(const Tensor& self) { TORCH_CHECK(self.dim() == 2, "expected a matrix"); - int dimension = 0; - auto result = at::diag(self, dimension); - return result.sum(); + return self.diagonal().sum(); } } // namespace native diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm index fb6e1c52ba49e..c276707964997 100644 --- a/aten/src/ATen/native/mps/operations/TriangularOps.mm +++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm @@ -172,197 +172,5 @@ } -Tensor& diag_mps_out(const Tensor& self, - int64_t diagonal, - Tensor &output) { - - // Do checks, resize output - IntArrayRef input_size = self.sizes(); - auto num_input_dims = input_size.size(); - // Input can only be 1D or 2D - TORCH_CHECK(num_input_dims == 1 || num_input_dims == 2, - "diag_mps_out: Input tensor must be 1D or 2D") - - if(num_input_dims == 1) { - auto n = input_size[0]; - if(diagonal > 0) - n += diagonal; - else if(diagonal < 0) - n -= diagonal; - - output.resize_({n, n}); - } - else if(num_input_dims == 2) { - auto num_diag_elements = std::min(input_size[0], input_size[1]); - if(diagonal > 0) { - TORCH_CHECK(input_size[1] - diagonal > 0, "Matrix not big enough for requested diagonal") - num_diag_elements = std::min(input_size[0], input_size[1] - diagonal); - } - else if(diagonal < 0) { - TORCH_CHECK(input_size[0] + diagonal 
> 0, "Matrix not big enough for requested diagonal") - num_diag_elements = std::min(input_size[0] + diagonal, input_size[1]); - } - - output.resize_({num_diag_elements}); - } - - using namespace mps; - MPSStream* stream = getCurrentMPSStream(); - - // Derive from MPSCachedGraph - struct CachedGraph : public MPSCachedGraph - { - CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} - MPSGraphTensor *inputTensor_ = nil; - MPSGraphTensor *outputTensor_ = nil; - }; - - MPSGraphCache* cache_ = MPSGraphCache::getInstance(); - - @autoreleasepool { - - MPSShape* input_shape = getMPSShape(self); - MPSShape* output_shape = getMPSShape(output); - NSNumber* num_input_cols = nil; - NSNumber* num_output_cols = nil; - NSMutableArray* flat_input_shape = nil; - NSMutableArray* flat_output_shape = nil; - if(num_input_dims == 1) { - num_output_cols = output_shape[1]; - flat_output_shape = [NSMutableArray arrayWithCapacity:1]; - flat_output_shape[0] = [NSNumber numberWithInt:[output_shape[0] intValue] * [output_shape[1] intValue]]; - } - else if(num_input_dims == 2) { - num_input_cols = input_shape[1]; - flat_input_shape = [NSMutableArray arrayWithCapacity:1]; - flat_input_shape[0] = [NSNumber numberWithInt:[input_shape[0] intValue] * [input_shape[1] intValue]]; - } - NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; - string key = "diag_mps_out:" + getMPSTypeString(self.scalar_type()) + ":" + std::to_string(diagonal) - + ":" + string([ns_shape_key UTF8String]); - CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); - - if(!cachedGraph) { - MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { - CachedGraph *newCachedGraph = nil; - - @autoreleasepool { - MPSGraph* mpsGraph = make_mps_graph(); - newCachedGraph = new CachedGraph(mpsGraph); - - // TODO: Accept this as the flat version in 2D case - MPSGraphTensor* inputTensor = nil; - if(num_input_dims == 1) - inputTensor = 
mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type())); - else - inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), flat_input_shape); - - MPSGraphTensor* outputTensor = nil; - - MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0 - dataType:MPSDataTypeInt32]; - MPSGraphTensor* numDiagElementsRange = nil; - MPSGraphTensor* diagOffset = nil; - MPSGraphTensor* rowMultiplier = nil; - MPSGraphTensor* rowIndices = nil; - MPSGraphTensor* colIndices = nil; - MPSGraphTensor* indicesTensor = nil; - - if(num_input_dims == 1) { - int shape_data[1] = {[input_shape[0] intValue]}; - MPSGraphTensor* inputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)] - shape:@[@1] - dataType:MPSDataTypeInt32]; - numDiagElementsRange = [mpsGraph coordinateAlongAxisTensor: zeroTensor - withShapeTensor: inputShapeTensor - name: nil]; - diagOffset = [mpsGraph constantWithScalar:diagonal - dataType:MPSDataTypeInt32]; - rowMultiplier = [mpsGraph constantWithScalar:[num_output_cols intValue] - dataType:MPSDataTypeInt32]; - } - else { - int shape_data[1] = {[output_shape[0] intValue]}; - MPSGraphTensor* outputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)] - shape:@[@1] - dataType:MPSDataTypeInt32]; - numDiagElementsRange = [mpsGraph coordinateAlongAxisTensor: zeroTensor - withShapeTensor: outputShapeTensor - name: nil]; - diagOffset = [mpsGraph constantWithScalar:diagonal - dataType:MPSDataTypeInt32]; - rowMultiplier = [mpsGraph constantWithScalar:[num_input_cols intValue] - dataType:MPSDataTypeInt32]; - } - - if(diagonal >= 0) { - rowIndices = numDiagElementsRange; - colIndices = [mpsGraph additionWithPrimaryTensor:numDiagElementsRange - secondaryTensor:diagOffset - name:nil]; - } - else { - rowIndices = [mpsGraph subtractionWithPrimaryTensor:numDiagElementsRange - secondaryTensor:diagOffset - name:nil];; - colIndices = numDiagElementsRange; - } - - 
indicesTensor = [mpsGraph multiplicationWithPrimaryTensor:rowIndices - secondaryTensor:rowMultiplier - name:nil]; - indicesTensor = [mpsGraph additionWithPrimaryTensor:indicesTensor - secondaryTensor:colIndices - name:nil]; - - if(num_input_dims == 1) { - // TODO: Scatter mode doesn't matter, so what should I set it to be? - outputTensor = [mpsGraph scatterWithUpdatesTensor:inputTensor - indicesTensor:indicesTensor - shape:flat_output_shape - axis:0 - mode:MPSGraphScatterModeAdd - name:nil]; - outputTensor = [mpsGraph reshapeTensor:outputTensor - withShape:output_shape - name:nil]; - } - else if(num_input_dims == 2) { - outputTensor = [mpsGraph gatherWithUpdatesTensor:inputTensor - indicesTensor:indicesTensor - axis:0 - batchDimensions:0 - name:nil]; - } - - newCachedGraph->inputTensor_ = inputTensor; - newCachedGraph->outputTensor_ = outputTensor; - } - return newCachedGraph; - }); - cachedGraph = static_cast(tmpCachedGraph); - } - - Placeholder selfPlaceholder = Placeholder(); - if(num_input_dims == 1) - selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - else - selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, flat_input_shape); - - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); - } - - return output; -} - } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index b827999cf54e9..faab6371c8af1 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1815,7 +1815,7 @@ - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor variants: function, method 
dispatch: - CompositeExplicitAutograd: diag_embed + CompositeExplicitAutogradNonFunctional: diag_embed autogen: diag_embed.out - func: diagflat(Tensor self, int offset=0) -> Tensor @@ -7698,21 +7698,10 @@ - func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: diag_cpu_out - CUDA: diag_cuda_out - MPS: diag_mps_out + CPU, CUDA: diag_out - func: diag(Tensor self, int diagonal=0) -> Tensor variants: method, function - dispatch: - CompositeExplicitAutograd: diag - -- func: diag_backward(Tensor grad, SymInt[] input_sizes, int diagonal) -> Tensor - variants: function - device_check: NoCheck - device_guard: False - dispatch: - CompositeImplicitAutograd: diag_backward_symint - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) diff --git a/aten/src/ATen/native/ts_native_functions.yaml b/aten/src/ATen/native/ts_native_functions.yaml index fc287045dc9dd..3cb290f1004bf 100644 --- a/aten/src/ATen/native/ts_native_functions.yaml +++ b/aten/src/ATen/native/ts_native_functions.yaml @@ -189,6 +189,7 @@ supported: # after functionalization, # but their implementations call view operators (which we need to functionalize away). 
- block_diag + - diag_embed - diagonal_backward - slice_backward - new_empty_strided diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index 5f13834ee77e0..30e398dbf1e0d 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -95,6 +95,7 @@ ("aten::_linalg_inv_out_helper", datetime.date(2022, 10, 1)), ("aten::col2im_backward", datetime.date(2022, 12, 1)), ("aten::im2col_backward", datetime.date(2022, 12, 1)), + ("aten::diag_backward", datetime.date(2022, 12, 1)), ("aten::solve", datetime.date(9999, 1, 1)), ("aten::solve.solution", datetime.date(9999, 1, 1)), ("aten::_solve_helper", datetime.date(9999, 1, 1)), diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py index 2ee0bc8537604..be457cfe25fcd 100644 --- a/test/functorch/test_vmap.py +++ b/test/functorch/test_vmap.py @@ -3293,6 +3293,8 @@ def test(): @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)}) @skipOps('TestVmapOperatorsOpInfo', 'test_vmap_exhaustive', vmap_fail.union({ xfail('native_batch_norm'), + # The error inputs are vectors, that pass when batched as they are treated as a matrix + xfail('trace'), })) def test_vmap_exhaustive(self, device, dtype, op): # needs to be fixed @@ -3349,6 +3351,7 @@ def test_vmap_exhaustive(self, device, dtype, op): xfail('resize_'), xfail('view_as_complex'), xfail('matrix_exp'), + xfail('trace'), # Does not support batched tensors xfail('bucketize'), xfail('fft.ihfft2'), xfail('fft.ihfftn'), diff --git a/test/lazy/test_ts_opinfo.py b/test/lazy/test_ts_opinfo.py index f5974ec9f6c2c..2e67035581477 100644 --- a/test/lazy/test_ts_opinfo.py +++ b/test/lazy/test_ts_opinfo.py @@ -59,6 +59,7 @@ def init_lists(): # but run functionalized versions of the composite kernels in core. 
# This means that we don't expect the ops to show directly in the LTC metrics. FUNCTIONAL_DECOMPOSE_LIST = set([ + 'diag_embed', 'block_diag', 'new_empty_strided', 'narrow_copy', diff --git a/test/test_autograd.py b/test/test_autograd.py index bcb42449c349f..03cc78dc242fb 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3510,19 +3510,6 @@ def test_out_variant_raises_when_inputs_require_grad(self): # we should throw an exception if the output requires grad self.assertRaisesRegex(RuntimeError, 'out=', lambda: torch.mul(a, b, out=x)) - # TODO: see if this test can be OpInfo'd or moved to diagonal's test suite - def test_diagonal_derivative_requires_grad(self): - # test that the backward requires grad - # we do this is because diagonal_backward uses inplace - # operations and gradgradcheck does not catch whether - # they works as expected (it will succeed even if - # the gradient has requires_grad == False - a = torch.randn(5, 6, requires_grad=True) - b = torch.diagonal(a)**2 - c = b.sum() - d, = torch.autograd.grad(c, a, retain_graph=True, create_graph=True) - self.assertTrue(d.requires_grad) - def test_anomaly_detect_nan(self): size = 10 diff --git a/test/test_decomp.py b/test/test_decomp.py index dbc754147858f..27ad870a2adb1 100644 --- a/test/test_decomp.py +++ b/test/test_decomp.py @@ -292,6 +292,8 @@ def normalize_op_input_output(f, sample, requires_grad=True): # See https://github.com/pytorch/pytorch/issues/81669 (None, None, "nn.functional.relu6"), (None, None, "meshgrid"), + # diag was not decomposed (it just registers a decomp for diag_out, torch.diag is CompImplicit) + (None, None, "diag"), } CROSS_REF_BACKWARD_EXCLUDE_SET = { diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 7ddbe8dd6cf70..853faeb1b2033 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -526,10 +526,6 @@ self: grad.diagonal(offset, dim1, dim2) result: auto_linear -- name: diag(Tensor self, int diagonal=0) 
-> Tensor - self: diag_backward_symint(grad, self.sym_sizes(), diagonal) - result: auto_linear - - name: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) self: diagonal_backward_symint(grad, self.sym_sizes(), offset, dim1, dim2) result: auto_linear diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index 7be63af9e051a..22ceaaf0a18b0 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1019,22 +1019,6 @@ def is_fast_path(src, scale, output, padding_idx): return output, offset2bag, bag_size, max_indices -@register_meta([aten.diag.default, aten.diag.out]) -@out_wrapper() -def meta_diag(self, dim=0): - check(self.dim() in (1, 2), lambda: "matrix or a vector expected") - if self.dim() == 1: - sz = self.size(0) + abs(dim) - return self.new_empty((sz, sz)) - - # case: dim is 2 - if dim >= 0: - sz = min(self.size(0), self.size(1) - dim) - else: - sz = min(self.size(0) + dim, self.size(1)) - return self.new_empty((sz,)) - - @register_meta(aten._embedding_bag_forward_only.default) def meta_embedding_bag_forward_only(weight, indices, offsets, *args): output, offset2bag, bag_size, max_indices = meta_embedding_bag( diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 6169e5af06d9a..3e2f6c45768a6 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -219,6 +219,7 @@ "constant_pad_nd", "contiguous", "diag_embed", + "diag", "diagonal", "diagonal_copy", "dsplit", @@ -3483,6 +3484,22 @@ def vsplit( return tensor_split(a, split_sizes, 0) +@register_decomposition(torch.ops.aten.diag.out) +@out_wrapper() +def diag( + self: TensorLikeType, + offset: int = 0, +) -> TensorLikeType: + ndim = self.dim() + utils.check( + ndim in (1, 2), lambda: f"diag(): Supports 1D or 2D tensors. 
Got {ndim}D" + ) + if ndim == 1: + return torch.diag_embed(self, offset) + else: + return torch.diagonal_copy(self, offset) + + @register_decomposition(torch.ops.aten.diagonal, disable_meta=True) def diagonal( self: TensorLikeType, diff --git a/torch/csrc/jit/runtime/static/generated_ops.cpp b/torch/csrc/jit/runtime/static/generated_ops.cpp index 69cc98bf14ec6..bd9c8d553ab70 100644 --- a/torch/csrc/jit/runtime/static/generated_ops.cpp +++ b/torch/csrc/jit/runtime/static/generated_ops.cpp @@ -2431,25 +2431,6 @@ REGISTER_OPERATOR_FUNCTOR(aten::addbmm, aten_addbmm, [](Node* n) -> SROperator { return nullptr; }); -REGISTER_OPERATOR_FUNCTOR(aten::diag, aten_diag, [](Node* n) -> SROperator { - if (n->matches( - torch::schema("aten::diag(Tensor self, int diagonal=0) -> Tensor"))) { - return [](ProcessedNode* p_node) { - const auto& self = p_node->Input(0).toTensor(); - const auto diagonal = p_node->Input(1).toInt(); - if (p_node->Output(0).isNone()) { - p_node->Output(0) = at::native::diag(self, diagonal); - return; - } - auto& out = p_node->Output(0).toTensor(); - fastResizeToZero(out); - at::native::diag_cpu_out(self, diagonal, out); - }; - } - LogAndDumpSchema(n); - return nullptr; -}); - REGISTER_OPERATOR_FUNCTOR(aten::cross, aten_cross, [](Node* n) -> SROperator { if (n->matches(torch::schema( "aten::cross(Tensor self, Tensor other, int? 
dim=None) -> Tensor"))) { diff --git a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp index c718fd517b7b0..1bdc0aca8d9af 100644 --- a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp +++ b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp @@ -522,6 +522,15 @@ at::Tensor& LazyNativeFunctions::logsumexp_out( return out; } +at::Tensor LazyNativeFunctions::diag_embed( + const at::Tensor& self, + int64_t offset, + int64_t dim1, + int64_t dim2) { + return at::functionalization::functionalize_aten_op::call(self, offset, dim1, dim2); +} + at::Tensor LazyNativeFunctions::diagonal_backward_symint( const at::Tensor& grad_output, at::SymIntArrayRef input_sizes, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 95b1df24a9512..00f454bdf454a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -2170,10 +2170,10 @@ def error_inputs_ormqr(op_info, device, **kwargs): def error_inputs_diag(op_info, device, **kwargs): zero_d = torch.randn((), device=device) yield ErrorInput(SampleInput(zero_d, args=(0,)), error_type=RuntimeError, - error_regex="matrix or a vector expected") + error_regex="1D or 2D") zero_d = torch.randn(1, 1, 1, device=device) yield ErrorInput(SampleInput(zero_d, args=(0,)), error_type=RuntimeError, - error_regex="matrix or a vector expected") + error_regex="1D or 2D") def error_inputs_embedding(op_info, device, **kwargs): indices = torch.rand(2, 2, device=device).long() @@ -9157,10 +9157,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),), ), OpInfo('diag', - dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + ref=np.diag, + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), 
dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, + check_batched_forward_grad=False, sample_inputs_func=sample_inputs_diag, error_inputs_func=error_inputs_diag), OpInfo('diag_embed', @@ -16270,7 +16272,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "diagflat", ref=lambda input, offset=0: np.diagflat(input, k=offset), sample_inputs_func=sample_inputs_diagflat, - dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, @@ -17696,6 +17698,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1): torch_opinfo_name="dsplit", supports_nvfuser=False, ), + PythonRefInfo( + "_refs.diag", + torch_opinfo_name="diag", + supports_nvfuser=False, + ), PythonRefInfo( "_refs.diagonal", torch_opinfo_name="diagonal", @@ -18023,12 +18030,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): PythonRefInfo( "_refs.trace", torch_opinfo_name="trace", - decorators=( - # TODO: torch.diag is currently not supported by either refs, meta funcs, or NVFuser - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref'), - DecorateInfo(unittest.skip("diag is not supported by meta"), 'TestCommon', 'test_python_ref_meta'), - DecorateInfo(unittest.skip("diag is not supported by nvfuser"), 'TestCommon', 'test_python_ref_executor'), - ), + supports_nvfuser=False, ), PythonRefInfo( "_refs.norm", From f6c911fec58f9e423d8a3007f9bc11baf302d31b Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 24 Oct 2022 10:43:23 +0000 Subject: [PATCH 0061/1922] [xla hash update] update the pinned xla hash (#87590) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml). 
Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87590 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/xla.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index e7375040708bd..3ab9c4394d70b 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -eff277e81fcfdeccba71e75ff40b6e2f3e29e27b +0cb29daa04097c868d23ed666563a3439d67065c From bd1adb47a71eb844167863c7ff9694e9c2fdffd0 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Mon, 24 Oct 2022 03:39:38 +0000 Subject: [PATCH 0062/1922] [FSDP] Use `reduce_scatter_tensor()` (#87240) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let us silence some more warnings 👍🏼 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87240 Approved by: https://github.com/rohan-varma --- test/distributed/fsdp/test_fsdp_comm.py | 2 +- .../fsdp/test_fsdp_mixed_precision.py | 26 +++++++++---------- .../algorithms/_comm_hooks/default_hooks.py | 2 +- torch/testing/_internal/common_fsdp.py | 4 +-- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_comm.py b/test/distributed/fsdp/test_fsdp_comm.py index c9946a9dd5665..d19617e31acd3 100644 --- a/test/distributed/fsdp/test_fsdp_comm.py +++ b/test/distributed/fsdp/test_fsdp_comm.py @@ -221,7 +221,7 @@ def test_communication( # outside `no_sync()` num_iters = 3 with patch("torch.distributed.all_gather_into_tensor") as mock_all_gather, \ - patch("torch.distributed._reduce_scatter_base") as mock_reduce_scatter: + patch("torch.distributed.reduce_scatter_tensor") as mock_reduce_scatter: def reset_mocks(): mock_all_gather.reset_mock() mock_reduce_scatter.reset_mock() diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py index c803164bff4e5..4440e394179ab 100644 --- 
a/test/distributed/fsdp/test_fsdp_mixed_precision.py +++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py @@ -125,17 +125,17 @@ @contextlib.contextmanager def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype): """ - Patches dist._reduce_scatter_base with a new reduce_scatter_base and - restores upon exiting. Used for validation of mixed precision + Patches ``dist.reduce_scatter_tensor`` with ``new_reduce_scatter`` and + restores upon exiting. Used for validation of mixed precision. """ - orig_reduce_scatter = dist._reduce_scatter_base - dist._reduce_scatter_base = new_reduce_scatter + orig_reduce_scatter = dist.reduce_scatter_tensor + dist.reduce_scatter_tensor = new_reduce_scatter global _CURRENT_FULL_PRECISION_PARAM_DTYPE _CURRENT_FULL_PRECISION_PARAM_DTYPE = full_precision_param_dtype try: yield finally: - dist._reduce_scatter_base = orig_reduce_scatter + dist.reduce_scatter_tensor = orig_reduce_scatter _CURRENT_FULL_PRECISION_PARAM_DTYPE = None class LinearMixedPrecision(nn.Module): @@ -250,7 +250,7 @@ def _validate_mp_shard_freed(self, fsdp_model): for param in fsdp.params: self.assertEqual(0, param._mp_shard.storage().size()) - def _reduce_scatter_base_validate_mp( + def _reduce_scatter_validate_mp( self, orig_reduce_scatter, mp_config, @@ -258,9 +258,9 @@ def _reduce_scatter_base_validate_mp( **kwargs ): """ - Performs dist._reduce_scatter_base but verifies mixed precision settings - before. This is to test mixed precision is working as expected during - backward pass. In particular it ensures that the gradients were cast to the right type + Runs reduce-scatter but verifies mixed precision settings before. This + is to test mixed precision is working as expected during backward pass. + In particular it ensures that the gradients were cast to the right type and comm. is going to happen in the right type. 
""" tensors = [] @@ -355,9 +355,9 @@ def _run_test_mixed_precision_e2e( model.cuda() # Patch reduce_scatter to add validation for mixed precision types. - orig_reduce_scatter = dist._reduce_scatter_base + orig_reduce_scatter = dist.reduce_scatter_tensor test_reduce_scatter = partial( - self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config, + self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config, ) with patch_reduce_scatter(test_reduce_scatter, full_precision_param_dtype): scaler = ShardedGradScaler(enabled=enable_sharded_grad_scaler) @@ -516,9 +516,9 @@ def _test_mixed_precision_embedding_table(self, mp_config): # Basic test to ensure int inputs are not casted which would break # modules such as embedding tables. param_dtype = mp_config.param_dtype or torch.float32 - orig_reduce_scatter = dist._reduce_scatter_base + orig_reduce_scatter = dist.reduce_scatter_tensor test_reduce_scatter = partial( - self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config, + self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config, ) with patch_reduce_scatter(test_reduce_scatter, param_dtype): # TODO: `test_mp_embedding_reduce()` fails if we do not wrap the diff --git a/torch/distributed/algorithms/_comm_hooks/default_hooks.py b/torch/distributed/algorithms/_comm_hooks/default_hooks.py index 10dcce3197c72..7d2c845f4e63b 100644 --- a/torch/distributed/algorithms/_comm_hooks/default_hooks.py +++ b/torch/distributed/algorithms/_comm_hooks/default_hooks.py @@ -108,7 +108,7 @@ def reduce_scatter_hook(state: DefaultState, grad: torch.Tensor, output: torch.T # Average grad by pre-division factor. if state.gradient_predivide_factor > 1: grad.div_(state.gradient_predivide_factor) - dist._reduce_scatter_base( + dist.reduce_scatter_tensor( output, grad, group=state.process_group ) # Average grad's shard by post-division factor. 
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index ee5d580c29d27..f97cacb2a9a41 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -479,7 +479,7 @@ def get_loss(self, input, output): return loss def run_backward(self, loss): - orig_reduce_scatter = torch.distributed._reduce_scatter_base + orig_reduce_scatter = torch.distributed.reduce_scatter_tensor def _delayed_reduce_scatter(*args, **kwargs): if self.delay_before_reduction_ms > 0: @@ -489,7 +489,7 @@ def _delayed_reduce_scatter(*args, **kwargs): return orig_reduce_scatter(*args, **kwargs) with mock.patch( - "torch.distributed._reduce_scatter_base", _delayed_reduce_scatter + "torch.distributed.reduce_scatter_tensor", _delayed_reduce_scatter ): self.module.run_backward(loss) From 9eaf5878ab07c0d9a2b92a0b54071b27047929ca Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Mon, 24 Oct 2022 03:36:52 +0000 Subject: [PATCH 0063/1922] [FSDP] Rename streams (#86833) This time around, I decided to rename the "all_gather" stream to the "unshard" stream to emphasize that it includes both the actual all-gather op but also the corresponding memory allocations (and also now the unflattening as well). (A similar reasoning applies for the "pre-all-gather" stream becoming the "pre-unshard" stream.) This PR is definitely safe. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86833 Approved by: https://github.com/rohan-varma --- .../fsdp/fully_sharded_data_parallel.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index a51df5195f0fc..823cba20d42c4 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -1567,13 +1567,13 @@ def _unshard( if event: event.synchronize() any_ran_pre_unshard = False - with torch.cuda.stream(self._streams["pre_all_gather"]): + with torch.cuda.stream(self._streams["pre_unshard"]): for handle in handles: ran_pre_unshard = handle.pre_unshard() any_ran_pre_unshard = any_ran_pre_unshard or ran_pre_unshard if any_ran_pre_unshard: - self._streams["all_gather"].wait_stream(self._streams["pre_all_gather"]) - with torch.cuda.stream(self._streams["all_gather"]): + self._streams["unshard"].wait_stream(self._streams["pre_unshard"]) + with torch.cuda.stream(self._streams["unshard"]): for handle in handles: handle.unshard() handle.post_unshard() @@ -2006,12 +2006,15 @@ def _init_streams(self) -> None: computation. This should only be called on the root FSDP instance.""" assert self._is_root assert torch.cuda.is_available() - # Stream for all-gathering parameters. - self._streams["all_gather"] = torch.cuda.Stream() - # Stream for overlapping grad reduction with the backward pass. + # Stream for unshard logic, including allocating the all-gather + # destination tensors and the all-gathers themselves. + self._streams["unshard"] = torch.cuda.Stream() + # Stream for overlapping gradient reduction with the backward pass + # gradient computation. self._streams["post_backward"] = torch.cuda.Stream() - # Stream for pre-all-gather copies (e.g. H2D or precision cast). 
- self._streams["pre_all_gather"] = torch.cuda.Stream() + # Stream for pre-unshard logic, namely allocations and writes for + # CPU offloading (H2D copy) and mixed precision (low precision cast). + self._streams["pre_unshard"] = torch.cuda.Stream() def _wait_for_previous_optim_step(self) -> None: """ @@ -2022,11 +2025,11 @@ def _wait_for_previous_optim_step(self) -> None: if not self._is_root: return current_stream = torch.cuda.current_stream() - self._streams["all_gather"].wait_stream(current_stream) + self._streams["unshard"].wait_stream(current_stream) # Having the pre-all-gather stream wait for the current stream even if # we do not leverage the pre-all-gather stream is tolerable since this # only runs once per iteration - self._streams["pre_all_gather"].wait_stream(current_stream) + self._streams["pre_unshard"].wait_stream(current_stream) def _prefetch_handles( self, @@ -2893,7 +2896,7 @@ def _pre_forward_unshard( self._unshard(handles) handles_key = tuple(handles) self._needs_pre_forward_unshard[handles_key] = False - torch.cuda.current_stream().wait_stream(self._streams["all_gather"]) + torch.cuda.current_stream().wait_stream(self._streams["unshard"]) self._prefetch_handles(handles_key) def _post_forward( @@ -3137,7 +3140,7 @@ def _summon_full_params( self._clear_grads_if_needed() free_unsharded_flat_params = [handle.needs_unshard() for handle in self._handles] self._unshard(self._handles) - torch.cuda.current_stream().wait_stream(self._streams["all_gather"]) + torch.cuda.current_stream().wait_stream(self._streams["unshard"]) if with_grads: self._unshard_grads(self._handles) @@ -3444,7 +3447,7 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None: # If the handles have been prefetched, this `_unshard()` simply # switches to using the unsharded parameter self._unshard(_handles) - torch.cuda.current_stream().wait_stream(self._streams["all_gather"]) + torch.cuda.current_stream().wait_stream(self._streams["unshard"]) # Set this to 
`False` to ensure that a mistargeted prefetch # does not actually unshard these handles From 7644e19f0c7ae9ba6290a6580f28e6b65921af12 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Mon, 24 Oct 2022 03:31:34 +0000 Subject: [PATCH 0064/1922] [FSDP][Docs] Clarify warnings to mention collectives (#87478) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87478 Approved by: https://github.com/rohan-varma --- .../fsdp/fully_sharded_data_parallel.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 823cba20d42c4..3c8fa6817571b 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -2498,8 +2498,8 @@ def state_dict(self, *args, **kwargs): >>> local_dict.keys() >>> odict_keys(['flat_param', 'inner.flat_param']) - .. warning:: This needs to be called on all ranks, since synchronization - primitives may be used. + .. warning:: This needs to be called on all ranks since it uses + collective communications. """ # TODO (rohan-varma): separate these out once a state_dict pre-hook # is available. @@ -2795,8 +2795,8 @@ def load_state_dict( >>> local_dict.keys() >>> odict_keys(['flat_param', 'inner.flat_param']) - .. warning:: This needs to be called on all ranks, since synchronization - primitives may be used. + .. warning:: This needs to be called on all ranks since it uses + collective communications. """ return super().load_state_dict(state_dict, *args) @@ -3944,8 +3944,8 @@ def clip_grad_norm_( calling it for FSDP models would lead to different scaling being applied per subset of model parameters. - .. warning:: This needs to be called on all ranks, since synchronization - primitives will be used. + .. warning:: This needs to be called on all ranks since it uses + collective communications. 
""" self._lazy_init() self._wait_for_previous_optim_step() @@ -4020,10 +4020,10 @@ def full_optim_state_dict( and ``"param_groups"``. The flattened parameters in ``FSDP`` modules contained in ``model`` are mapped back to their unflattened parameters. - .. warning:: This needs to be called on all ranks since synchronization - primitives are used. However, if ``rank0_only=True``, then the - state dict is only populated on rank 0, and all other ranks return - an empty :class:`dict`. + .. warning:: This needs to be called on all ranks since it uses + collective communications. However, if ``rank0_only=True``, then + the state dict is only populated on rank 0, and all other ranks + return an empty :class:`dict`. .. warning:: Unlike ``torch.optim.Optimizer.state_dict()``, this method uses full parameter names as keys instead of parameter IDs. From d84cd1f217e2add112d72c3aad429469c50b2e5e Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Mon, 24 Oct 2022 03:31:34 +0000 Subject: [PATCH 0065/1922] [FSDP][1/N] Rework `clip_grad_norm_()` and tests (#87479) This PR reworks FSDP's `clip_grad_norm_()` and its unit tests. The unit tests in `test_fsdp_core.py` still need to be revisited and will be done in follow-up work. Some details in arbitrary order: - This renames `_calc_grad_norm()` to `_get_grad_norm()`. This is to simplify our verb usage in method names. Otherwise, we may diverge to different verbs like "compute", "calculate", "get", "find" etc. I am open to discussion here. - Because we call `torch.linalg.vector_norm()` as the underlying norm calculation subroutine, which can take infinity as input for the norm type, there is no reason to have a separate conditional branch for the infinity norm. - This removes a host-device synchronization point from `clip_grad_norm_()` by using the same trick from `torch.nn.utils.clip_grad_norm_()`. This may improve throughput for workloads like metaseq, which computes gradient norms regularly. 
- This returns the total norm from `clip_grad_norm_()` as mentioned in the docstring. Before nothing was returned. - This rewrites the unit tests, which were slightly problematic. Much of the logic to verify gradient norms were computed correctly were exactly the same as the logic used to compute them in FSDP (i.e. `^p`, sum via all-reduce, `^(1/p)`). This defeats the purpose of unit testing. There were some other oddities like `input = torch.rand(14, 2, device=self.rank); in_data = torch.tensor(input[self.rank], device=self.rank)`, where we materialize a full `(14, 2)` shape but only ever use the first two rows (assuming world size 2). Pull Request resolved: https://github.com/pytorch/pytorch/pull/87479 Approved by: https://github.com/rohan-varma --- .../fsdp/test_fsdp_clip_grad_norm.py | 192 ++++++++++++------ .../fsdp/fully_sharded_data_parallel.py | 87 ++++---- torch/testing/_internal/common_fsdp.py | 23 --- 3 files changed, 181 insertions(+), 121 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py index 9e39254ec423a..3af5a83cdde42 100644 --- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py +++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py @@ -1,31 +1,33 @@ # Owner(s): ["oncall: distributed"] +import functools +import itertools import sys -from math import inf +from typing import Union import torch +import torch.nn as nn from torch import distributed as dist +from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload from torch.distributed.fsdp.fully_sharded_data_parallel import ( FullyShardedDataParallel as FSDP, - CPUOffload, - _calc_grad_norm, ) -from torch.nn import utils as nn_utils +from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy +from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer +from torch.nn.parallel import DistributedDataParallel as DDP from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from 
torch.testing._internal.common_fsdp import ( - DeterministicModel, + CUDAInitMode, + FSDPInitMode, FSDPTest, - _collect_total_grad_norm_fsdp, - _collect_total_grad_norm_local, + TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - run_tests, - parametrize, instantiate_parametrized_tests, + run_tests, ) - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) @@ -39,67 +41,133 @@ class TestClipGradNorm(FSDPTest): - def _run_fsdp_one_iteration(self, norm_type, nested_fsdp, cpu_offload): - """Test FSDP with clip grad norm.""" - fsdp_model = DeterministicModel(nested_fsdp, cpu_offload=cpu_offload) - local_model = DeterministicModel(False) - input = torch.rand(14, 2, device=self.rank) - fsdp_model = FSDP(fsdp_model, cpu_offload=cpu_offload) - self.assertTrue(len(input) >= self.world_size) - out = local_model(input[: self.world_size]) - out.sum().backward() - in_data = torch.tensor(input[self.rank], device=self.rank) - out_fsdp = fsdp_model(in_data) - out_fsdp.sum().backward() - total_norms_fsdp = _collect_total_grad_norm_fsdp( - fsdp_model, norm_type, self.rank - ) - total_norms_local = _collect_total_grad_norm_local(local_model, norm_type) - total_norms_local /= self.world_size - norm_cap = total_norms_fsdp / 2.0 - self.assertEqual(total_norms_local, total_norms_fsdp) - fsdp_model.clip_grad_norm_(norm_cap, norm_type=norm_type) - nn_utils.clip_grad_norm_( - local_model.parameters(), norm_cap, norm_type=norm_type + """Tests :meth:`FullyShardedDataParallel.clip_grad_norm_`.""" + + @property + def world_size(self) -> int: + return 2 + + @skip_if_lt_x_gpu(2) + def test_non_root(self): + """ + Tests that calling ``clip_grad_norm_()`` on a non-root FSDP instance + raises an error. 
+ """ + class Model(nn.Module): + def __init__(self) -> None: + super().__init__() + self.lin1 = nn.Linear(5, 5) + self.lin2 = nn.Linear(5, 5) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.lin2(self.lin1(x)) + + model = Model().cuda() + model.lin2 = FSDP(model.lin2) + fsdp_model = FSDP(model) + fsdp_model(torch.randn((2, 5), device=torch.device("cuda"))).sum().backward() + error_regex = "should only be called on the root FSDP instance" + with self.assertRaisesRegex(RuntimeError, error_regex): + fsdp_model.lin2.clip_grad_norm_(max_norm=2) + + @skip_if_lt_x_gpu(2) + def test_ddp_parity(self): + """ + Tests FSDP with ``FullyShardedDataParallel.clip_grad_norm_()` against + DDP with ``torch.nn.utils.clip_grad_norm_()`. + """ + self.run_subtests( + { + "max_norm": [1, 2.5], + "norm_type": [1, 2, float("inf")], + "use_orig_params": [False, True], + "offload_params": [False, True], + }, + self._test_ddp_parity, ) - total_norms_after_clip_fsdp = _collect_total_grad_norm_fsdp( - fsdp_model, norm_type, self.rank + + def _test_ddp_parity( + self, + max_norm: Union[float, int], + norm_type: Union[float, int], + offload_params: bool, + use_orig_params: bool, + ): + local_model = TransformerWithSharedParams.init( + self.process_group, + FSDPInitMode.NO_FSDP, + CUDAInitMode.CUDA_BEFORE, + deterministic=True, ) - total_norms_after_clip_local = _collect_total_grad_norm_local( - local_model, norm_type + ddp_model = DDP(local_model, device_ids=[self.rank]) + fsdp_kwargs = { + "auto_wrap_policy": functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls={ + TransformerEncoderLayer, + TransformerDecoderLayer, + }, + ), + "cpu_offload": CPUOffload(offload_params=offload_params), + "use_orig_params": use_orig_params, + } + fsdp_model = TransformerWithSharedParams.init( + self.process_group, + FSDPInitMode.RECURSIVE, + CUDAInitMode.CUDA_BEFORE, + deterministic=True, + fsdp_kwargs=fsdp_kwargs, ) - self.assertTrue(total_norms_after_clip_fsdp <= 
norm_cap) - self.assertEqual(total_norms_after_clip_local, total_norms_after_clip_fsdp) + LR = 1e-2 + ddp_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR) + fsdp_optim = torch.optim.Adam(fsdp_model.parameters(), lr=LR) + device = torch.device("cuda") + LARGE_FACTOR = 100 + inp = ddp_model.module.get_input(device) + for model in (ddp_model, fsdp_model): + out = model(*inp) + loss = model.module.get_loss(inp, out) + loss.backward() - @skip_if_lt_x_gpu(2) - @parametrize("norm_type", [2.0, inf]) - @parametrize("nested_fsdp", [True, False]) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], - ) - def test_fsdp_clip_grad_norm(self, norm_type, nested_fsdp, cpu_offload): - """Test FSDP with clip grad norm.""" - self._run_fsdp_one_iteration(norm_type, nested_fsdp, cpu_offload) + # Multiply gradients by a large factor to ensure that gradients will + # actually be clipped + for param in itertools.chain(ddp_model.parameters(), fsdp_model.parameters()): + if param.grad is not None: # gradients may be `None` for `use_orig_params=True` + param.grad *= LARGE_FACTOR + orig_ddp_grads = [param.grad.detach().clone() for param in ddp_model.parameters()] + orig_fsdp_grads = [ + param.grad.detach().clone() if param.grad is not None else None + for param in fsdp_model.parameters() + ] + + ddp_total_norm = torch.nn.utils.clip_grad_norm_( + ddp_model.parameters(), max_norm=max_norm, norm_type=norm_type, + ) + fsdp_total_norm = fsdp_model.clip_grad_norm_(max_norm=max_norm, norm_type=norm_type) + self.assertEqual(ddp_total_norm, fsdp_total_norm) + # Check that the gradients were modified by `clip_grad_norm_()` + for param, orig_grad in zip(ddp_model.parameters(), orig_ddp_grads): + assert not torch.equal(param.grad, orig_grad) + for param, orig_grad in zip(fsdp_model.parameters(), orig_fsdp_grads): + if param.grad is None: + self.assertEqual(param.grad, orig_grad) # `None` + else: + assert not torch.equal(param.grad, orig_grad) 
-class TestCalcuGradNorm(FSDPTest): - @skip_if_lt_x_gpu(2) - @parametrize("norm_type", [2.0, inf, 1.3, 2.5]) - @parametrize("nested_fsdp", [True, False]) - def test_fsdp_calc_grad_norm(self, norm_type, nested_fsdp): - """Test grad norm cal API.""" - model = FSDP(DeterministicModel(nested_fsdp)) - input = torch.rand(15, 2, device=self.rank) - out = model(input) - out.sum().backward() - total_norm = _calc_grad_norm(model.params_with_grad, norm_type) - total_norm_expected = _collect_total_grad_norm_local(model, norm_type) - self.assertEqual(total_norm, total_norm_expected) + # Run an optimizer step to ensure gradients matched after clipping + ddp_optim.step() + fsdp_optim.step() + with FSDP.summon_full_params(fsdp_model): + for (n1, p1), (n2, p2) in zip( + ddp_model.module.named_parameters(), + fsdp_model.named_parameters(), + ): + self.assertEqual(n1, n2) + self.assertEqual(p1, p2) instantiate_parametrized_tests(TestClipGradNorm) -instantiate_parametrized_tests(TestCalcuGradNorm) if __name__ == "__main__": run_tests() diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 3c8fa6817571b..e80844ea232df 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -3923,11 +3923,11 @@ def params_with_grad(self) -> List[Parameter]: @torch.no_grad() def clip_grad_norm_( self, max_norm: Union[float, int], norm_type: Union[float, int] = 2.0 - ) -> None: + ) -> torch.Tensor: """ - Clip all gradients at this point in time. The norm is computed over all - gradients together, as if they were concatenated into a single vector. - Gradients are modified in-place. + Clips the gradient norm of all parameters. The norm is computed over + all parameters' gradients as viewed as a single vector, and the + gradients are modified in-place. 
Args: max_norm (float or int): max norm of the gradients @@ -3949,13 +3949,18 @@ def clip_grad_norm_( """ self._lazy_init() self._wait_for_previous_optim_step() - assert self._is_root, "clip_grad_norm should only be called on the root (parent) instance" + if not self._is_root: + raise RuntimeError( + "`clip_grad_norm_()` should only be called on the root FSDP instance" + ) self._assert_state(TrainingState_.IDLE) max_norm = float(max_norm) norm_type = float(norm_type) - # Computes the max norm for this shard's gradients and sync's across workers - local_norm = _calc_grad_norm(self.params_with_grad, norm_type).cuda() # type: ignore[arg-type] + # Compute the local gradient norm (only including this rank's shard + # of the gradients) + local_norm = _get_grad_norm(self.parameters(), norm_type).to(self.compute_device) + # Reconstruct the total gradient norm depending on the norm type if norm_type == math.inf: total_norm = local_norm dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group) @@ -3963,16 +3968,21 @@ def clip_grad_norm_( total_norm = local_norm ** norm_type dist.all_reduce(total_norm, group=self.process_group) total_norm = total_norm ** (1.0 / norm_type) - if self.cpu_offload.offload_params: total_norm = total_norm.cpu() - clip_coef = torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device) / (total_norm + 1e-6) - if clip_coef < 1: - # multiply by clip_coef, aka, (max_norm/total_norm). 
- for p in self.params_with_grad: - assert p.grad is not None - p.grad.detach().mul_(clip_coef.to(p.grad.device)) + clip_coef = ( + torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device) + / (total_norm + 1e-6) + ) + # Multiplying by the clamped coefficient is meaningless when it is + # equal to 1, but it avoids the host-device sync that would result from + # `if clip_coef < 1` + clip_coef_clamped = torch.clamp(clip_coef, max=1.0) + grads = [param.grad for param in self.parameters() if param.grad is not None] + for grad in grads: + grad.detach().mul_(clip_coef_clamped.to(grad.device)) + return total_norm @staticmethod def _warn_optim_input(optim_input): @@ -4625,30 +4635,35 @@ def _is_param_exec_order_prep_stage(self) -> bool: return is_prep_stage -def _calc_grad_norm(parameters: List[torch.nn.Parameter], p: float) -> torch.Tensor: - r"""Calculate gradient norm of an iterable of parameters. - Returns: - Total norm of the parameters (viewed as a single vector). +def _get_grad_norm( + params: List[nn.Parameter], + norm_type: float, +) -> torch.Tensor: """ - parameters = [p for p in parameters if p.grad is not None] - - if len(parameters) == 0: + Returns the gradient norm of parameters ``param`` s, where the gradients + are viewed as a single vector. 
+ """ + params_with_grad = [param for param in params if param.grad is not None] + if len(params_with_grad) == 0: return torch.tensor(0.0) - if p == math.inf: - local_norm = torch.tensor(max(par.grad.detach().abs().max() for par in parameters)) - else: - # Compute the norm in full precision no matter what - local_norm = torch.linalg.vector_norm( - torch.stack( - [ - torch.linalg.vector_norm(par.grad.detach(), p, dtype=torch.float32) - for par in parameters - ] - ), - p, - ) - local_norm.to(dtype=parameters[0].dtype) - return local_norm + grads = [param.grad for param in params_with_grad] + grad_dtypes = set(grad.dtype for grad in grads) + if len(grad_dtypes) != 1: + raise ValueError(f"Requires uniform dtype across all gradients but got {grad_dtypes}") + # Compute the gradient norm in FP32, where we treat the gradients as a + # single vector + grad_norm = torch.linalg.vector_norm( + torch.stack( + [ + torch.linalg.vector_norm(grad.detach(), norm_type, dtype=torch.float32) + for grad in grads + ], + ), + norm_type, + dtype=torch.float32, + ) + grad_norm = grad_norm.to(grads[0].dtype) + return grad_norm def _get_param_to_unflat_param_names( diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index f97cacb2a9a41..7fdbe573ed217 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -7,7 +7,6 @@ from contextlib import suppress from copy import deepcopy from enum import Enum, auto -from math import inf from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union from unittest import mock @@ -1058,25 +1057,3 @@ def forward(self, x): x = self.linear_skip(x) x = self.nested_linear(x) return x - - -def _collect_total_grad_norm_fsdp(model, norm_type, rank): - total_norm = _collect_total_grad_norm_local(model, norm_type) - op = torch.distributed.ReduceOp.SUM - if norm_type == inf: - op = torch.distributed.ReduceOp.MAX - norm_type = 1.0 - return_norm = torch.tensor(total_norm ** 
norm_type, device=rank) - dist.all_reduce(return_norm, op=op) - return return_norm ** (1.0 / norm_type) - - -def _collect_total_grad_norm_local(model, norm_type): - if norm_type == inf: - return max(p.grad.abs().max() for p in model.parameters()) - else: - total_norm = 0.0 - for p in model.parameters(): - local_norm = torch.linalg.vector_norm(p.grad, norm_type, dtype=torch.float32) - total_norm += local_norm ** norm_type - return total_norm ** (1.0 / norm_type) From 27dc00b83e64dc8dbce72067e800bf3652dbc8bb Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Mon, 24 Oct 2022 03:31:34 +0000 Subject: [PATCH 0066/1922] [FSDP][2/N] Remove `params_with_grad` (#87480) This PR removes the property `params_with_grad` from `FullyShardedDataParallel`. It was introduced when implementing `clip_grad_norm_()` but was not consistently used. Personally, I do not think it makes sense for `FullyShardedDataParallel` to expose this helper because it is not a common paradigm. This PR is technically BC-breaking. However, I checked that no one internally is using this API. 
cc @ezyang @gchanan Pull Request resolved: https://github.com/pytorch/pytorch/pull/87480 Approved by: https://github.com/rohan-varma --- torch/distributed/fsdp/fully_sharded_data_parallel.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index e80844ea232df..f6eead9406a1c 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -52,8 +52,6 @@ _sync_params_and_buffers, _to_kwargs, ) -from torch.nn.parameter import Parameter - from ._optim_utils import ( _broadcast_pos_dim_tensor_states, _broadcast_processed_optim_state_dict, @@ -3913,13 +3911,6 @@ def no_sync(self) -> Generator: ) m._sync_gradients = old_flag - @property - def params_with_grad(self) -> List[Parameter]: - """ - Recursively returns a list of all module parameters that have a gradient. - """ - return [p for p in self.parameters() if p.grad is not None] - @torch.no_grad() def clip_grad_norm_( self, max_norm: Union[float, int], norm_type: Union[float, int] = 2.0 From bd8ab66232207cc48d7065d27c00321e097ca393 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Fri, 21 Oct 2022 14:09:52 -0700 Subject: [PATCH 0067/1922] [Quant][docs] Add README for BackendConfig (#86523) Summary: This adds a README for `torch.ao.quantization.backend_config` that describes both the high level motivation and the specifications of the BackendConfig API. 
Reviewers: jerryzh168, vkuzo Subscribers: jerryzh168, vkuzo Pull Request resolved: https://github.com/pytorch/pytorch/pull/86523 Approved by: https://github.com/jerryzh168 --- .../ao/quantization/backend_config/README.md | 142 +++++++++++++++++- .../backend_config/backend_config.py | 17 ++- torch/ao/quantization/fx/README.md | 25 +-- 3 files changed, 155 insertions(+), 29 deletions(-) diff --git a/torch/ao/quantization/backend_config/README.md b/torch/ao/quantization/backend_config/README.md index a170581d5638b..b8d8ceb3e38de 100644 --- a/torch/ao/quantization/backend_config/README.md +++ b/torch/ao/quantization/backend_config/README.md @@ -1,10 +1,34 @@ -The patterns are we matching against is float modules types, functional operators and pytorch operators in reverse order: +## BackendConfig Overview + +BackendConfig allows PyTorch quantization to work with different backend or kernel libraries. These backends may have different sets of supported quantized operator patterns, and the same operator patterns may require different handling across different backends. To make quantization work with different backends and allow maximum flexibility, we strived to make all the parts of the quantization flow configurable with BackendConfig. Currently, it is only used by FX graph mode quantization. For more details on how it integrates with the FX graph mode quantization flow, refer to this [README](/torch/ao/quantization/fx/README.md). + +BackendConfig configures quantization behavior in terms of operator patterns. For each operator pattern, we need to specify what the supported data types are for the input and output activations, weights, and biases, and also specify the QAT modules, the reference quantized modules etc., which will be used in module swapping during the quantization passes. 
+ +Quantized backends can have different support in terms of the following aspects: +* Quantization scheme (symmetric vs asymmetric, per-channel vs per-tensor) +* Data type (float32, float16, int8, uint8, bfloat16, etc.) for input/output/weight/bias +* Quantized (and fused) mapping: Some quantized operators may have different numerics compared to a naive (dequant - float_op - quant) reference implementation. For weighted operators, such as conv and linear, we need to be able to specify custom reference modules and a mapping from the float modules +* QAT mapping: For weighted operators, we need to swap them with the Quantization Aware Training (QAT) versions that add fake quantization to the weights + +As an example, here is what fbgemm looks like: +| | fbgemm | +|-------------------------------------------|-----------------------------------------------------------------------| +| Quantization Scheme | activation: per tensor, weight: per tensor or per channel | +| Data Type | activation: quint8 (with qmin/qmax range restrictions), weight: qint8 | +| Quantized and Fused Operators and Mapping | e.g. torch.nn.Conv2d -> torch.ao.nn.quantized.reference.Conv2d | +| QAT Module Mapping | e.g. torch.nn.Conv2d -> torch.ao.nn.qat.Conv2d | + +Instead of hardcoding the fusion mappings, float to reference quantized module mappings, fusion patterns etc., we will derive everything from the BackendConfig throughout the code base. This allows PyTorch Quantization to work with all first-party (fbgemm and qnnpack) and third-party backends (TensorRT, executorch etc.) that may differ from native backends in different aspects. With the recent addition of xnnpack, integrated as part of the qnnpack backend in PyTorch, the BackendConfig is needed to define the new constraints required for xnnpack quantized operators. 
+ +## Pattern Specification + +The operator patterns used in BackendConfig are float modules, functional operators and pytorch operators specified in reverse order: ``` operator = module_type | functional | torch op | native op | MatchAllNode Pattern = (operator, Pattern, Pattern, ...) | operator ``` -where the first item for Pattern is the operator we want to match, and the rest are the patterns for the arguments of the operator. -For example, pattern (nn.ReLU, (operator.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))) would match the following graph: +where the first item for each Pattern is the operator, and the rest are the patterns for the arguments of the operator. +For example, the pattern (nn.ReLU, (operator.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))) would match the following graph: ``` tensor_1 tensor_2 | | @@ -17,4 +41,114 @@ tensor_1 tensor_2 nn.ReLU ``` -we’ll match the last node as the anchor point of the match, and we can retrieve the whole graph by tracing back from the node, e.g. in the example above, we matched nn.ReLU node, then node.args[0] is the operator.add node. +During prepare and convert, we’ll match the last node, which will be the anchor point of the match, and we can retrieve the whole graph by tracing back from the node. E.g. in the example above, we matched the `nn.ReLU` node, and `node.args[0]` is the `operator.add` node. + +## BackendConfig Implementation + +The BackendConfig is comprised of a list of BackendPatternConfigs, each of which define the specifications and the requirements for an operator pattern. 
Here is an example usage: + +``` +import torch +from torch.ao.quantization.backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, ObservationType +from torch.ao.quantization.fuser_method_mappings import reverse_sequential_wrapper2 + +weighted_int8_dtype_config = DTypeConfig( + input_dtype=torch.quint8, + output_dtype=torch.quint8, + weight_dtype=torch.qint8, + bias_dtype=torch.float) + +linear_config = BackendPatternConfig(torch.nn.Linear) \ + .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \ + .add_dtype_config(weighted_int8_dtype_config) \ + .set_root_module(torch.nn.Linear) \ + .set_qat_module(torch.ao.nn.qat.Linear) \ + .set_reference_quantized_module(torch.ao.nn.quantized.reference.Linear) + +conv_relu_config = BackendPatternConfig((torch.nn.ReLU, torch.nn.Conv2d)) \ + .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \ + .add_dtype_config(weighted_int8_dtype_config) \ + .set_fused_module(torch.ao.nn.intrinsic.ConvReLU2d) \ + .set_fuser_method(reverse_sequential_wrapper2(torch.ao.nn.intrinsic.ConvReLU2d)) + +backend_config = BackendConfig("my_backend") \ + .set_backend_pattern_config(linear_config) \ + .set_backend_pattern_config(conv_relu_config) +``` + +### Observer Insertion + +Relevant APIs: +* `set_observation_type` + +During the prepare phase, we insert observers (or QuantDeQuantStubs in the future) into the graph for this operator pattern based on the observation type, which specifies whether to use different observers for the inputs and the outputs of the pattern. For more detail, see `torch.ao.quantization.backend_config.ObservationType`. + +### Reference Quantized Patterns + +Relevant APIs: +* `set_root_module` +* `set_reference_quantized_module` + +During the convert phase, when we construct the reference quantized model, the root modules (e.g. `torch.nn.Linear` for `nni.LinearReLU` or `nniqat.LinearReLU`) will be swapped to the corresponding reference quantized modules (e.g.
`torch.ao.nn.quantized.reference.Linear`). This allows custom backends to specify custom reference quantized module implementations to match the numerics of their lowered operators. Since this is a one-to-one mapping, both the root module and the reference quantized module must be specified in the same BackendPatternConfig in order for the conversion to take place. + +### Fusion + +Relevant APIs: +* `set_fuser_method` +* `set_fused_module` +* `_set_root_node_getter` +* `_set_extra_inputs_getter` + +As an optimization, operator patterns such as (`torch.nn.ReLU`, `torch.nn.Linear`) may be fused into `nni.LinearReLU`. This is performed during the prepare phase according to the function specified in `set_fuser_method`, which replaces the pattern with the fused module. During the convert phase, these fused modules (identified by `set_fused_module`) will then be converted to the reference quantized versions of the modules. + +In FX graph mode quantization, we replace the corresponding nodes in the graph using two helper functions set by the user: `root_node_getter`, which returns the root node (typically the weighted module in the pattern like `torch.nn.Linear`) to replace the matched pattern in the graph, and `extra_inputs_getter`, which returns a list of extra input arguments that will be appended to the existing arguments of the fused module (copied over from the root node). See [this snippet](https://gist.github.com/jerryzh168/8bea7180a8ba3c279f2c9b050f2a69a6) for an example usage. + +### Data Type Restrictions + +Relevant APIs: +* `add_dtype_config` +* `set_dtype_configs` + +DTypeConfig specifies a set of supported data types for input/output/weight/bias along with the associated constraints, if any.
There are two ways of specifying `input_dtype`, `output_dtype`, and `weight_dtype`, as simple `torch.dtype`s or as `DTypeWithConstraints`, e.g.: + +``` +import torch +from torch.ao.quantization.backend_config import DTypeConfig, DTypeWithConstraints + +dtype_config = DTypeConfig( + input_dtype=torch.quint8, + output_dtype=torch.quint8, + weight_dtype=torch.qint8, + bias_dtype=torch.float) + +dtype_config_with_constraints = DTypeConfig( + input_dtype=DTypeWithConstraints( + dtype=torch.quint8, + quant_min_lower_bound=0, + quant_max_upper_bound=255, + scale_min_lower_bound=2 ** -12, + ), + output_dtype=DTypeWithConstraints( + dtype=torch.quint8, + quant_min_lower_bound=0, + quant_max_upper_bound=255, + scale_min_lower_bound=2 ** -12, + ), + weight_dtype=DTypeWithConstraints( + dtype=torch.qint8, + quant_min_lower_bound=-128, + quant_max_upper_bound=127, + scale_min_lower_bound=2 ** -12, + ), + bias_dtype=torch.float) +``` + +During the prepare phase of quantization, we will compare the data types specified in these DTypeConfigs to the ones specified in the matching QConfig for a given operator pattern. If the data types do not match (or the constraints are not satisfied) for all the DTypeConfigs specified for the operator pattern, then we will simply ignore the QConfig and skip quantizing this pattern. + +#### Quantization range + +The user's QConfig may specify `quant_min` and `quant_max`, which are min and max restrictions on the quantization values. Here we set the lower bound for the `quant_min` and the upper bound for the `quant_max` to represent the limits of the backend. If a QConfig exceeds these limits in either direction, it will be treated as violating this constraint. + +#### Scale range + +Similarly, the user's QConfig may specify a minimum value for the quantization scale (currently exposed as `eps` but will change in the future to better reflect the semantics). Here we set the lower bound for the `scale_min` to represent the limits of the backend.
If a QConfig's min scale value falls below this limit, the QConfig will be treated as violating this constraint. Note that `scale_max_upper_bound` is currently not used, because there is no corresponding mechanism to enforce this on the observer yet. diff --git a/torch/ao/quantization/backend_config/backend_config.py b/torch/ao/quantization/backend_config/backend_config.py index e0d7e0b9d7428..2f491b1624048 100644 --- a/torch/ao/quantization/backend_config/backend_config.py +++ b/torch/ao/quantization/backend_config/backend_config.py @@ -341,9 +341,17 @@ def __init__(self, pattern: Pattern): def set_observation_type(self, observation_type: ObservationType) -> BackendPatternConfig: """ - Set how observers should be inserted for this pattern. - See :class:`~torch.ao.quantization.backend_config.ObservationType` for details + Set how observers should be inserted in the graph for this pattern. + There are two observation types: + `OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT` (default): the output observer instance will be + different from the input. This is the most common observation type. + + `OUTPUT_SHARE_OBSERVER_WITH_INPUT`: the output observer instance will be the same as the input. + This is useful for operators like `cat`. + + Note: This will be renamed in the near future, since we will soon insert QuantDeQuantStubs with + observers (and fake quantizes) attached instead of observers themselves. """ self.observation_type = observation_type return self @@ -395,6 +403,11 @@ def set_fused_module(self, fused_module: Type[torch.nn.Module]) -> BackendPatter def set_fuser_method(self, fuser_method: Callable) -> BackendPatternConfig: """ Set the function that specifies how to fuse the pattern for this pattern. + + The first argument of this function should be `is_qat`, and the rest of the arguments + should be the items in the tuple pattern, e.g. (`torch.nn.ReLU`, `torch.nn.Linear`) + will have a function with three arguments, `is_qat`, `relu`, and `linear`. 
+ The return value of this function should be the resulting fused module. """ self.fuser_method = fuser_method return self diff --git a/torch/ao/quantization/fx/README.md b/torch/ao/quantization/fx/README.md index 0ee5c5ec7e3f5..cba11e9d36413 100644 --- a/torch/ao/quantization/fx/README.md +++ b/torch/ao/quantization/fx/README.md @@ -329,7 +329,7 @@ backend_config configurations used in this step: BackendConfig(nniqat.LinearReLU) .set_root_module(nn.Linear) .set_reference_quantized_module_for_root(nnqr.Linear) - .set_fused_module(nni.Linear) + .set_fused_module(nni.LinearReLU) ``` Pattern in this case is the same as before, it defines the pattern for the subgraph we are dealing with @@ -376,26 +376,5 @@ There are no configurations related to lowering in `backend_config` since it is However, for some operator based backends, like the current pytorch native backends including fbgemm and qnnpack. We could interpret `backend_config` in terms of configurations for operators as well. e.g. configuring `input_dtype`=quint8, `weight_dtype`=qint8, `output_dtype`=torch.quint8 for nn.Linear is saying that the quantized linear will take a quint8 activation and qint8 weight as input and outputs a quint8 activation. But there is no guarantee that this interpretation will always work in the future, especially when we add new flavors of quantized operators. ## Extensibility -Different backend or kernel libraries may have different support for quantization. They may have different quantized operators, and the quantized operators might work for Tensors with different dtypes, the observers may need to be placed in different places. To make quantization work for different backends, and allow maximum flexibility, we also strived to make all the parts of the flow configurable with backend_config. -backend_config configures quantization behavior in terms of operator patterns. 
We need to define a operator pattern and specify what are the supported dtypes for input/output/weight/bias for the pattern, and also specify the qat modules, reference modules etc. for the pattern, which will be used in module swapping during the quantization passes. - -Quantized Backends can have different support in the following aspects: -* Quantization Scheme (symmetric vs asymmetric, per-channel vs per-tensor) -* Data Type (float32, float16, int8, uint8, bfloat16, etc) for input/output/weight/bias -* Quantized (and Fused) Operators and Mapping The quantized operators supported by the backend. For example: quantized conv2d, quantized linear etc. Some quantized operators may have different numerics compared to a naive (dequant - float_op - quant) implementation For weighted operators (conv and linear) we need to define a reference module and a mapping -* QAT Module Mapping For modules with weights, e.g. Conv2d and Linear, we need to swap them with qat (quantization aware training) module that adds fake quantization to the weights - -As an example, here is what fbgemm looks like: -+-------------------------------------------+-----------------------------------------------------------------------+ -| | fbgemm | -|-------------------------------------------|-----------------------------------------------------------------------| -| Quantization Scheme | activation: per tensor, weight: per tensor or per channel | -| Data Type | activation: quint8 (with qmin/qmax range restrictions), weight: qint8 | -| Quantized and Fused Operators and Mapping | e.g. nn.Conv2d -> torch.ao.nn.quantized.reference.Conv2d | -| QAT Module Mapping | nn.Conv -> torch.ao.nn.qat.Conv2d | -+-------------------------------------------+-----------------------------------------------------------------------+ - -So instead of hardcoding the fusion mappings, float to quantized module mappings, fusion patterns etc. we will derive everything through `backend_config` throughout the code base. 
This allows PyTorch Quantization to work for all first-party or third-party backends that may differ from native backends in different aspects. - -For use cases, we will use TensorRT as an example use case and have a tutorial talking about `backend_config`, pytorch native backends fbgemm and qnnpack will be using this to define their behaviors as well, especially with the recent addition of xnnpack (integrated as a part of qnnpack backend in pytorch), the `backend_config` api is needed to define the new constraints from xnnpack. +FX graph mode quantization can be extended to work with different backends, which may have different sets of supported quantized operator patterns and different requirements for each pattern. For more detail, please refer to the [BackendConfig README](/torch/ao/quantization/backend_config/README.md). From ee7f2abdd8ef41ae55d1a6d4cab0b6cd3db08e15 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Mon, 24 Oct 2022 11:37:26 +0000 Subject: [PATCH 0068/1922] [FSDP] `summon_full_params()` in computation stream (#86836) This should help with memory usage. In particular, this allows FSDP to use caching allocator blocks from the computation stream for the `summon_full_params()` all-gathers, which should help avoid over-allocating blocks to the unshard stream. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86836 Approved by: https://github.com/rohan-varma --- .../fsdp/fully_sharded_data_parallel.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index f6eead9406a1c..e241c26d1e1f1 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -1549,6 +1549,8 @@ def _register_param_handle(self, handle: FlatParamHandle) -> None: def _unshard( self, handles: List[FlatParamHandle], + unshard_stream: torch.cuda.Stream, + pre_unshard_stream: torch.cuda.Stream, ) -> None: """ Unshards the handles in ``handles``. If the handles are in @@ -1565,13 +1567,13 @@ def _unshard( if event: event.synchronize() any_ran_pre_unshard = False - with torch.cuda.stream(self._streams["pre_unshard"]): + with torch.cuda.stream(pre_unshard_stream): for handle in handles: ran_pre_unshard = handle.pre_unshard() any_ran_pre_unshard = any_ran_pre_unshard or ran_pre_unshard if any_ran_pre_unshard: - self._streams["unshard"].wait_stream(self._streams["pre_unshard"]) - with torch.cuda.stream(self._streams["unshard"]): + unshard_stream.wait_stream(pre_unshard_stream) + with torch.cuda.stream(unshard_stream): for handle in handles: handle.unshard() handle.post_unshard() @@ -2043,7 +2045,7 @@ def _prefetch_handles( for handles_key in handles_to_prefetch: # Prefetch the next set of handles without synchronizing to allow # the sync to happen as late as possible to maximize overlap - self._unshard(handles_key) + self._unshard(handles_key, self._streams["unshard"], self._streams["pre_unshard"]) self._handles_prefetched[handles_key] = True def _get_handles_to_prefetch( @@ -2891,7 +2893,7 @@ def _pre_forward_unshard( ) -> None: """Unshards parameters in the pre-forward.""" if handles: - self._unshard(handles) + self._unshard(handles, 
self._streams["unshard"], self._streams["pre_unshard"]) handles_key = tuple(handles) self._needs_pre_forward_unshard[handles_key] = False torch.cuda.current_stream().wait_stream(self._streams["unshard"]) @@ -3137,8 +3139,10 @@ def _summon_full_params( self._clear_grads_if_needed() free_unsharded_flat_params = [handle.needs_unshard() for handle in self._handles] - self._unshard(self._handles) - torch.cuda.current_stream().wait_stream(self._streams["unshard"]) + # No need to call `wait_stream()` since we unshard in the computation + # stream directly + computation_stream = torch.cuda.current_stream() + self._unshard(self._handles, computation_stream, computation_stream) if with_grads: self._unshard_grads(self._handles) @@ -3444,7 +3448,7 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None: # If the handles have been prefetched, this `_unshard()` simply # switches to using the unsharded parameter - self._unshard(_handles) + self._unshard(_handles, self._streams["unshard"], self._streams["pre_unshard"]) torch.cuda.current_stream().wait_stream(self._streams["unshard"]) # Set this to `False` to ensure that a mistargeted prefetch From c2cd0d89c244d1d7bf1baa5520612979ba1ca528 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Mon, 24 Oct 2022 15:09:40 +0000 Subject: [PATCH 0069/1922] [BE][einsum] add small comment explaining an invariant (#87264) Tiny followup from https://github.com/pytorch/pytorch/pull/87135#discussion_r998488064 and another typo i noticed while doing the autograd lab Pull Request resolved: https://github.com/pytorch/pytorch/pull/87264 Approved by: https://github.com/soulitzer --- aten/src/ATen/native/Linear.cpp | 3 +++ tools/autograd/derivatives.yaml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 7192cc6e1138c..b9b3abe3c7cae 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -545,6 +545,9 @@ Tensor 
einsum(c10::string_view equation, TensorList operands, at::OptionalIntArr // Sum out contraction dims if (perm_index - out_num_dim > 0) { + // if there were ops to contract, we would have already done so + // in the previous loop and all the dims to sum are now 1 + // NB: use view instead of squeeze (or sum) for faster (mps) performance if (num_ops > 1) { auto sizes = ops[0].sym_sizes().vec(); for (auto dim = perm_index - 1; dim >= out_num_dim; --dim) { diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 853faeb1b2033..c77f63e8c8e73 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -200,7 +200,7 @@ # preferable since it would be less efficient. # # NB: The parameter names here MUST be consistent with the parameter names -# in Decalarations.yaml +# in native_functions.yaml - name: abs(Tensor self) -> Tensor self: grad * self.sgn() result: handle_r_to_c(result.scalar_type(), self_t.conj() * self_p.sgn()) From 03328460d184f071531e5b436fe6d2104d5696e7 Mon Sep 17 00:00:00 2001 From: RangiLyu Date: Mon, 24 Oct 2022 16:03:11 +0000 Subject: [PATCH 0070/1922] sync AveragedModel buffers when use_buffers=False (#84054) Fixes #84053 As described in the issue, the AveragedModel will deep copy the model during initialization, which means that the buffers in the averaged model cannot be updated together with the model. One solution is to make the buffers equal to the source model every time when calling `update_parameters`. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84054 Approved by: https://github.com/samdow --- test/test_optim.py | 7 +++++++ torch/optim/swa_utils.py | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/test/test_optim.py b/test/test_optim.py index 104bdb046d345..a55a74d5d8667 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -2833,6 +2833,7 @@ def test_averaged_model_exponential(self): # Test AveragedModel with EMA as avg_fn dnn = torch.nn.Sequential( torch.nn.Conv2d(1, 5, kernel_size=3), + torch.nn.BatchNorm2d(5, momentum=0.3), torch.nn.Linear(5, 10) ) alpha = 0.9 @@ -2851,11 +2852,17 @@ def avg_fn(p_avg, p, n_avg): else: updated_averaged_params.append((p_avg * alpha + p * (1 - alpha)).clone()) + for b in dnn.buffers(): + if b.size() != torch.Size([]): + b.detach_().add_(torch.randn_like(b)) + averaged_dnn.update_parameters(dnn) averaged_params = updated_averaged_params for p_avg, p_swa in zip(averaged_params, averaged_dnn.parameters()): self.assertEqual(p_avg, p_swa) + for b_avg, b_swa in zip(dnn.buffers(), averaged_dnn.module.buffers()): + self.assertEqual(b_avg, b_swa) def test_averaged_model_exponential_buffers(self): # Test AveragedModel with EMA as avg_fn and use_buffers as True. diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index 4d2743a278c2e..f7a530f5ad0f1 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -132,6 +132,11 @@ def update_parameters(self, model): else: p_swa.detach().copy_(self.avg_fn(p_swa.detach(), p_model_, self.n_averaged.to(device))) + if not self.use_buffers: + # If running averages are not applied to the buffers, + # keep the buffers in sync with the source model. + for b_swa, b_model in zip(self.module.buffers(), model.buffers()): + b_swa.detach().copy_(b_model.detach().to(device)) self.n_averaged += 1 From fce53be82fc24abf564a1144ee272a1405e1076c Mon Sep 17 00:00:00 2001 From: "Edward Z.
Yang" Date: Mon, 24 Oct 2022 11:47:40 -0400 Subject: [PATCH 0071/1922] Fix accuracy minifier (#87606) Signed-off-by: Edward Z. Yang cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu Pull Request resolved: https://github.com/pytorch/pytorch/pull/87606 Approved by: https://github.com/anjali411, https://github.com/anijain2305, https://github.com/albanD, https://github.com/soumith, https://github.com/malfet --- torch/_dynamo/debug_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py index 1134267c5f60d..ea5671a81d02f 100644 --- a/torch/_dynamo/debug_utils.py +++ b/torch/_dynamo/debug_utils.py @@ -326,7 +326,7 @@ def nvfuser_fails(fx_g, args, check_str=None): def inductor_accuracy_fails(fx_g, args, check_str=None): - from torchinductor.compile_fx import compile_fx_inner + from torch._inductor.compile_fx import compile_fx_inner return backend_aot_accuracy_fails(fx_g, args, compile_fx_inner) @@ -874,10 +874,11 @@ def dynamo_minifier_backend(gm, example_inputs, compiler_name): @register_backend def dynamo_accuracy_minifier_backend(gm, example_inputs, compiler_name): from functorch.compile import minifier - from torchdynamo.optimizations.backends import BACKENDS + + from torch._dynamo.optimizations.backends import BACKENDS if compiler_name == "inductor": - from torchinductor.compile_fx import compile_fx + from torch._inductor.compile_fx import compile_fx compiler_fn = compile_fx else: From d7d37e38b39a6b5bfb15270fc938342954889c9a Mon Sep 17 00:00:00 2001 From: Will Constable Date: Sun, 23 Oct 2022 14:18:48 +0000 Subject: [PATCH 0072/1922] Add distributed dynamo benchmarking utils (#87419) Util for convenient local benchmarking/debugging of distributed models. Not to be confused with the 'real' distributed benchmark script we use for torchbench experiments on slurm. 
Tries to be simple/hackable and let you use different combinations of DDP/FSDP with models and dynamo backends. Example usage `python benchmarks/dynamo/distributed.py --toy_model --dynamo inductor --ddp` `--dynamo` flag accepts normal dynamo backends (plus 'print' which literally prints graphs to screen) `--torchbench_model ` works in place of `--toy_model` `--fsdp` is WIP cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87419 Approved by: https://github.com/jansel --- benchmarks/dynamo/dist_util.py | 147 +++++++++++++++++++++++++++ benchmarks/dynamo/distributed.py | 164 +++++++++++++++++++++++++++++++ 2 files changed, 311 insertions(+) create mode 100644 benchmarks/dynamo/dist_util.py create mode 100644 benchmarks/dynamo/distributed.py diff --git a/benchmarks/dynamo/dist_util.py b/benchmarks/dynamo/dist_util.py new file mode 100644 index 0000000000000..9e2f086ca8b70 --- /dev/null +++ b/benchmarks/dynamo/dist_util.py @@ -0,0 +1,147 @@ +import argparse +import functools +import importlib +import os + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch._dynamo.testing import reduce_to_scalar_loss +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + apply_activation_checkpointing, + checkpoint_wrapper, + CheckpointImpl, +) +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy + +try: + from .torchbench import setup_torchbench_cwd +except ImportError: + from torchbench import setup_torchbench_cwd + + +def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + dist.init_process_group("nccl", rank=rank, world_size=world_size) + + +def cleanup(): + dist.destroy_process_group() + + +class CustomLinear(torch.nn.Module): + def __init__(self, a, b): + super(CustomLinear, 
self).__init__() + self.weight = nn.Parameter(torch.randn(a, b)) + + def forward(self, x): + return torch.mm(x, self.weight) + + +class MyModule(torch.nn.Module): + def __init__(self, a, b): + super(MyModule, self).__init__() + self.net = nn.Sequential( + nn.Linear(a, b), + nn.ReLU(), + ) + + def forward(self, x): + return self.net(x) + + +class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.net = nn.Sequential( + *[nn.Linear(10, 10000), nn.ReLU()] + + [nn.Linear(10000, 10000), nn.ReLU()] + + [MyModule(10000, 10000)] + + [MyModule(10000, 1000)] + + [MyModule(1000, 1000)] + + [MyModule(1000, 1000)] + + [MyModule(1000, 1000)] + + [MyModule(1000, 1000)] + + [MyModule(1000, 1000)] + + [MyModule(1000, 1000)] + + [MyModule(1000, 1000)] + + [nn.Linear(1000, 5)] + ) + + def forward(self, x): + return self.net(x) + + +def model_iter_fn(model, example_inputs, collect_outputs=False): + outputs = model(*example_inputs) + loss = reduce_to_scalar_loss(outputs) + loss.backward() + if collect_outputs: + return outputs + + +def get_model(args): + if args.torchbench_model: + old_cwd = setup_torchbench_cwd() + module = importlib.import_module( + f"torchbenchmark.models.{args.torchbench_model}" + ) + benchmark_cls = getattr(module, "Model", None) + bm = benchmark_cls( + test="train", device=args.device, jit=False, batch_size=args.batch_size + ) + model, inputs = bm.get_module() + elif args.toy_model: + model = ToyModel() + inputs = (torch.randn(20, 10),) + else: + raise argparse.ArgumentError( + args.torchbench_model, message="Must specify a model" + ) + + return model, inputs + + +def fsdp_checkpointing_base(model, blocks): + """apply activation checkpointing to model + returns None as model is updated directly + """ + non_reentrant_wrapper = functools.partial( + checkpoint_wrapper, + offload_to_cpu=False, + checkpoint_impl=CheckpointImpl.NO_REENTRANT, + ) + + def check_fn(submodule): + return isinstance(submodule, blocks) + + 
apply_activation_checkpointing( + model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn + ) + + +# from transformers.models.t5.modeling_t5 import T5Block + +MODEL_FSDP_WRAP = { + ToyModel: (MyModule,) + # TODO T5: (T5Block,) +} + + +def apply_fsdp(model, use_checkpointing=False, use_wrap_policy=True): + blocks = MODEL_FSDP_WRAP[model.__class__] + + wrap_policy = None + if use_wrap_policy: + # transformer policy is really a generic policy that wraps modules of specified classes + wrap_policy = functools.partial( + transformer_auto_wrap_policy, transformer_layer_cls=blocks + ) + + model = FSDP(model, auto_wrap_policy=wrap_policy) + if use_checkpointing: + fsdp_checkpointing_base(model, blocks) + + return model diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py new file mode 100644 index 0000000000000..b4332556c7bb3 --- /dev/null +++ b/benchmarks/dynamo/distributed.py @@ -0,0 +1,164 @@ +import argparse +from functools import partial + +import numpy as np +import tabulate +import torch + +import torch._dynamo as dynamo +import torch.multiprocessing as mp +import torch.utils._pytree as pytree +from torch._dynamo.testing import reduce_to_scalar_loss +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.profiler import profile, ProfilerActivity, record_function + +try: + from .common import timed + from .dist_util import apply_fsdp, cleanup, get_model, model_iter_fn, setup +except ImportError: + from common import timed + from dist_util import apply_fsdp, cleanup, get_model, model_iter_fn, setup + + +def profile_model(args, model, inputs, rank): + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + for i in range(args.repeat): + with record_function("Forward"): + outputs = model(*inputs) + loss = reduce_to_scalar_loss(outputs) + with record_function("Backward"): + loss.backward() + if rank == 0: + prof.export_chrome_trace(args.trace_file) + + +def run_model(args, model, inputs, 
rank, world_size, key, result_q): + setup(rank, world_size) + if args.device == "cuda": + # needed for FSDP + torch.cuda.set_device(rank) + + dev_rank = f"{args.device}:{rank}" + model = model.to(dev_rank) + + def move_tensor(maybe_tensor): + if torch.is_tensor(maybe_tensor): + return maybe_tensor.to(dev_rank) + return maybe_tensor + + inputs = pytree.tree_map(move_tensor, inputs) + + if args.fsdp: + model = apply_fsdp( + model, + use_checkpointing=args.fsdp_checkpoint, + use_wrap_policy=args.fsdp_wrap, + ) + elif args.ddp: + model = DDP(model) + + if args.verbose: + print(model) + + if args.dynamo: + if args.verbose: + dynamo.config.verbose = True + + def print_compile(gm, ex): + print( + f"print_compile:\n{str(gm.graph)}\n-----------------------------------------" + ) + return gm + + dynamo_ctx = dynamo.optimize( + print_compile if args.dynamo == "print" else args.dynamo + ) + model = dynamo_ctx(model) + + # warmup + _ = timed(model, model_iter_fn, inputs, times=3, return_result=False) + times = [] + t_total = timed( + model, model_iter_fn, inputs, times=args.repeat, return_result=False + ) + times.append(t_total / args.repeat) + + if rank == 0: + result_q.put(times) + + if args.profile: + profile_model(args, model, inputs, rank) + + cleanup() + + +def experiment(fn, key, world_size, results): + key = f"{key}_{world_size}" + dynamo.reset() + ctx = mp.get_context("spawn") + result_q = ctx.SimpleQueue() + f_args = (world_size, key, result_q) + if world_size > 1: + mp.spawn( + fn, + args=f_args, + nprocs=world_size, + join=True, + ) + else: + # rank 0 + fn(0, *f_args) + times = result_q.get() + + results.append((key, np.median(times))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--device", default="cuda") + parser.add_argument( + "--dynamo", + default=None, + help="if set to a str, uses dynamo[str] backend. 
else, eager", + ) + parser.add_argument("--verbose", action="store_true") + parser.add_argument("--batch_size", default=None) + parser.add_argument("--profile", action="store_true", help="Run the profiler") + parser.add_argument("--trace_file", default="profile.json", help="Run the profiler") + parser.add_argument("--repeat", default=10, help="Repeats for timing run") + parser.add_argument( + "--world_size", type=int, default=2, help="Number of ranks/gpus for experiments" + ) + parser.add_argument( + "--fsdp_checkpoint", + action="store_true", + help="whether to use gradient checkpointing via model-specific policy", + ) + parser.add_argument( + "--fsdp_wrap", + action="store_true", + help="whether to apply fsdp to submodules via model-specific policy", + ) + + dist_arg = parser.add_mutually_exclusive_group() + dist_arg.add_argument("--ddp", action="store_true") + dist_arg.add_argument("--fsdp", action="store_true") + + model_arg = parser.add_mutually_exclusive_group(required=True) + model_arg.add_argument( + "--torchbench_model", help="name of torchbench model, e.g. hf_Bert" + ) + model_arg.add_argument( + "--toy_model", action="store_true", help="use toy model instead" + ) + args = parser.parse_args() + + model_name = "ToyModel" if args.toy_model else args.torchbench_model + model, inputs = get_model(args) + + fn = partial(run_model, args, model, inputs) + + times = [] + experiment(fn, model_name, args.world_size, times) + print("\nExperiment Results:") + print(tabulate.tabulate(times, headers=("key", "time"))) From dc8e654390da54ecd377a68a2ff88840d041bddf Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Mon, 24 Oct 2022 14:48:33 +0000 Subject: [PATCH 0073/1922] [FSDP] Fix `use_orig_params=True` + AC (#87413) Without this change, the post-backward hooks do not run when using reentrant activation checkpointing. 
**Explanation** FSDP registers the original parameters as plain `Tensor`s in the forward pass so that their ops are tracked by autograd to ensure proper gradient propagation into the `FlatParameter`s. FSDP registers the post-backward hooks in its pre-forward. For `use_orig_params=True`, FSDP replaces the plain `Tensor`s with the sharded `nn.Parameter`s in the post-forward when resharding. This differs from `use_orig_params=False`, which keeps the plain `Tensor`s registered as attributes, except their data are freed, meaning that accessing them between forward and backward errors. Before this PR, for `use_orig_params=True`, FSDP simply restores the unsharded original parameter data in the pre-backward to enable correct gradient computation. However, this does not suffice for reentrant activation checkpointing (AC), where the recomputed forward happens after FSDP's pre-backward and the ops in the recomputed forward must be tracked by autograd. My initial solution was to simply have FSDP restore the original parameters as plain `Tensor`s again in the pre-backward so that they would be tracked by autograd exactly like the normal forward. However, this seems to not suffice in general. The `FlatParameter`'s `AccumulateGrad` object may change after the original pre-forward when performing a recomputed forward. The new approach in this PR is to follow the `use_orig_params=False` way -- namely, to preserve the plain `Tensor` variables across forward and backward. I achieved this by saving the variables explicitly in the forward and restoring them in the pre-backward. I clear them in the post-backward to avoid the dangling references (though, I do not think this is strictly necessary). An alternative approach I considered is using forward hooks. However, this does not change the order of operations across FSDP, checkpoint, and the wrapped module, so it does not work. 
(As long as the order is FSDP(checkpoint(module)), then registered hooks still happen either before or after the checkpoint recomputation -- we cannot insert logic to run inside the checkpoint recomputation.) **Test Plan** I augmented the existing reentrant checkpointing unit tests to also test `use_orig_params=True`. I also verified that the pycls model does not error (even with the new approach). Pull Request resolved: https://github.com/pytorch/pytorch/pull/87413 Approved by: https://github.com/rohan-varma --- test/distributed/fsdp/test_fsdp_checkpoint.py | 32 ++++++++--- torch/distributed/fsdp/flat_param.py | 55 ++++++++++++++++--- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py index 14456df92f84f..b75fa17f86bf5 100644 --- a/test/distributed/fsdp/test_fsdp_checkpoint.py +++ b/test/distributed/fsdp/test_fsdp_checkpoint.py @@ -111,16 +111,23 @@ def _verify_parity(self, losses, outputs, models): [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], ) @parametrize("offload_activations", [True, False]) - def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations): + @parametrize("use_orig_params", [False, True]) + def test_checkpoint_fsdp_wrapping( + self, + cpu_offload: CPUOffload, + offload_activations: bool, + use_orig_params: bool, + ): # Test checkpoint(FSDP(layer1), FSDP(layer2), ....) if offload_activations: wrapper_to_use = offload_wrapper else: wrapper_to_use = checkpoint_wrapper + fsdp_kwargs = {"cpu_offload": cpu_offload, "use_orig_params": use_orig_params} ckpt_sequential_wrapped_fsdp = wrapper_to_use( TestFSDPCheckpoint.SequentialModule( - wrap_fsdp=True, cpu_offload=cpu_offload + wrap_fsdp=True, **fsdp_kwargs, ), ) # Test FSDP(checkpoint(layer1)), FSDP(checkpoint(layer2)), .... 
@@ -128,11 +135,11 @@ def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations): checkpoint_layer=True, offload_activations=offload_activations, wrap_fsdp=True, - cpu_offload=cpu_offload, + **fsdp_kwargs, ) baseline = TestFSDPCheckpoint.SequentialModule( - wrap_fsdp=True, cpu_offload=cpu_offload + wrap_fsdp=True, **fsdp_kwargs, ) # note that reentrant-based checkpointing requires inputs to have grad @@ -168,12 +175,19 @@ def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations): [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], ) @parametrize("offload_activations", [True, False]) - def test_basic_checkpoint_end_to_end(self, cpu_offload, offload_activations): + @parametrize("use_orig_params", [False, True]) + def test_basic_checkpoint_end_to_end( + self, + cpu_offload: CPUOffload, + offload_activations: bool, + use_orig_params: bool, + ): + fsdp_kwargs = {"cpu_offload": cpu_offload, "use_orig_params": use_orig_params} global _save_on_cpu_called with patch_save_on_cpu(get_patched_save_on_cpu()): seq = TestFSDPCheckpoint.SequentialModule().to(torch.cuda.current_device()) # Runs FSDP with no checkpointing - fsdp_only_seq = FSDP(deepcopy(seq), cpu_offload=cpu_offload) + fsdp_only_seq = FSDP(deepcopy(seq), **fsdp_kwargs) # Runs checkpoint-wrapped FSDP if offload_activations: wrapper_to_use = offload_wrapper @@ -181,15 +195,15 @@ def test_basic_checkpoint_end_to_end(self, cpu_offload, offload_activations): wrapper_to_use = checkpoint_wrapper checkpointed_fsdp = wrapper_to_use( - FSDP(deepcopy(seq), cpu_offload=cpu_offload), + FSDP(deepcopy(seq), **fsdp_kwargs), ) # Runs FSDP-wrapped checkpointed module fsdp_wrapped_checkpoint = FSDP( wrapper_to_use(deepcopy(seq)), - cpu_offload=cpu_offload, + **fsdp_kwargs, ) # Runs FSDP with manual calls to checkpoint. 
- fsdp_call_checkpoint = FSDP(deepcopy(seq), cpu_offload=cpu_offload) + fsdp_call_checkpoint = FSDP(deepcopy(seq), **fsdp_kwargs) # note that reentrant-based checkpointing requires inputs to have grad # flag set. diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py index 1e34510bd0225..266dc80b4ed42 100644 --- a/torch/distributed/fsdp/flat_param.py +++ b/torch/distributed/fsdp/flat_param.py @@ -212,6 +212,13 @@ class FlatParameter(nn.Parameter): _shared_params (Optional[List[nn.Parameter]]): The original shared parameter variables if ``use_orig_params=True`` and ``None`` otherwise. + _tensors (Optional[List[Optional[Tensor]]]): This saves the ``Tensor`` + views created in the forward and tracked by autograd when + ``use_orig_params=True`` and is ``None`` otherwise. This is to + preserve those ``Tensor`` variables for the backward to ensure that + the ``FlatParameter`` 's ``AccumulateGrad`` object does not change + in which case the post-backward hook does not run. This is relevant + for cases like reentrant activation checkpointing. _is_grad_none (Optional[List[bool]]): A mask over the original parameters' gradients indicating if it is logically ``None`` or not if ``use_orig_params=True`` and ``None`` otherwise. 
This is needed @@ -273,10 +280,14 @@ def _init_metadata( self._is_grad_none: Optional[List[bool]] = [ False for _ in range(len(params)) ] + self._tensors: Optional[List[Optional[Tensor]]] = [ + None for _ in range(len(self._params)) + ] else: self._params = None self._shared_params = None self._is_grad_none = None + self._tensors = None self._unpadded_unsharded_size = self.size() _set_fsdp_flattened(self) @@ -835,11 +846,15 @@ def _use_unsharded_flat_param( unsharded_size ) # this `.view()` is not autograd visible in_forward = self._training_state == HandleTrainingState.FORWARD + in_pre_backward = self._training_state == HandleTrainingState.BACKWARD_PRE if self._use_orig_params: - # NOTE: When not in the forward, `as_params=True` suffices since we - # only need to restore the tensor *values* for backward computation - # and do not fresh `Tensor` views. - self._use_unsharded_views(as_params=(not in_forward)) + # We use `Tensor` views in the forward so that they are tracked by + # autograd. We use them in the pre-backward as well to support + # reentrant activation checkpointing, which needs the views to be + # tracked by autograd in the backward pass's recomputed forward. 
+ self._use_unsharded_views( + as_params=(not in_forward and not in_pre_backward) + ) elif in_forward: self._use_unsharded_views(as_params=False) @@ -903,7 +918,9 @@ def unshard_grad(self): self._check_sharded(flat_param.grad) flat_param._saved_grad_shard = flat_param.grad # type: ignore[attr-defined] sharded_grad = flat_param._saved_grad_shard # type: ignore[attr-defined] - dist.all_gather_into_tensor(padded_unsharded_grad, sharded_grad, self.process_group) + dist.all_gather_into_tensor( + padded_unsharded_grad, sharded_grad, self.process_group + ) unsharded_size = self.flat_param._unpadded_unsharded_size flat_param.grad = padded_unsharded_grad[: unsharded_size.numel()].view( unsharded_size @@ -1198,8 +1215,27 @@ def _use_unsharded_views(self, as_params: bool) -> None: param.data = view elif as_params: module.register_parameter(param_name, nn.Parameter(view)) - else: - setattr(module, param_name, view) + else: # `as_params=False` + param_var: Tensor = view + if self._use_orig_params: + if self._training_state == HandleTrainingState.FORWARD: + assert self.flat_param._tensors is not None + # Save the `Tensor` for the pre-backward + self.flat_param._tensors[i] = view # save for pre-backward + elif self._training_state == HandleTrainingState.BACKWARD_PRE: + # Use the saved `Tensor` variable from the forward to + # preserve the autograd graph so that the post-backward + # hook fires (e.g. 
for reentrant AC) + assert self.flat_param._tensors is not None # mypy + tensor = self.flat_param._tensors[i] + p_assert( + tensor is not None, + "Expects `Tensor` to have been saved in forward", + ) + tensor.data = view # type: ignore[union-attr] + assert tensor is not None # mypy + param_var = tensor + setattr(module, param_name, param_var) for i, ( param_name, module, @@ -1341,6 +1377,11 @@ def _use_sharded_views(self) -> None: setattr(module, param_name, param) prim_param = getattr(prim_module, prim_param_name) param.data = prim_param # could be both empty and non-empty + if self._training_state == HandleTrainingState.BACKWARD_POST: + assert self.flat_param._tensors is not None # mypy + # Clear the saved `Tensor`s since they are unneeded now + for i in range(len(self.flat_param._tensors)): + self.flat_param._tensors[i] = None # type: ignore[index] @torch.no_grad() def _use_sharded_grad_views(self) -> None: From dfcbd4d98f243ae51c795130507ff3d39b774776 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Mon, 24 Oct 2022 18:41:38 +0000 Subject: [PATCH 0074/1922] attempted fix for nvrtc with lovelace (#87611) Fixes #87595 (maybe?) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87611 Approved by: https://github.com/malfet, https://github.com/atalman --- aten/src/ATen/native/cuda/jit_utils.cpp | 2 ++ torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index b292d488708bf..a1266fb1b5044 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -893,6 +893,8 @@ void codegenOutputQuery( max_dev_version = CUDAVersion(7, 5); } else if (nvrtc_version == CUDAVersion(11, 0)) { // 11.0 supports 3-8.0 max_dev_version = CUDAVersion(8, 0); + } else if (nvrtc_major == 11 && nvrtc_minor < 8) { + max_dev_version = CUDAVersion(8, 6); } else { // If the driver version is unknown (i.e. 
newer than this code) // assume the driver supports this device diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp index 85bd74bfdbae4..85de541f4ba78 100644 --- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp @@ -64,6 +64,8 @@ void codegenOutputQuery( max_dev_version = CudaVersion(7, 5); } else if (nvrtc_version == CudaVersion(11, 0)) { // 11.0 supports 3-8.0 max_dev_version = CudaVersion(8, 0); + } else if (nvrtc_version.first == 11 && nvrtc_version.second < 8) { + max_dev_version = CudaVersion(8, 6); } else { // If the driver version is unknown (i.e. newer than this code) // assume the driver supports this device From 8fc7e2cfa059fa78aae0b5c1f48ba36461cd4023 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 24 Oct 2022 18:48:46 +0000 Subject: [PATCH 0075/1922] [dynamo] Support class members in nn modules (#87531) Fixes https://github.com/pytorch/torchdynamo/issues/1740 @voznesenskym cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu Pull Request resolved: https://github.com/pytorch/pytorch/pull/87531 Approved by: https://github.com/jansel --- test/dynamo/test_repros.py | 17 +++++++++++++++++ torch/_dynamo/variables/nn_module.py | 11 ++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index ffc71741d72c2..52802f32ad1e8 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -1719,6 +1719,23 @@ def forward(self, getitem_1, getitem_2, add): ] self.assertTrue(same_two_models(mod, opt_mod, args)) + def test_class_member(self): + class Foo(torch.nn.Module): + a = 4 + b = torch.ones(3, 4) + + def __init__(self): + super().__init__() + self.c = 4 + + def forward(self, x): + return x.cos() + self.a + self.b + self.c + + mod = Foo() + opt_mod = torch._dynamo.optimize("eager", nopython=True)(mod) + args = 
(torch.randn(3, 4),) + self.assertTrue(same(mod(*args), opt_mod(*args))) + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py index 4da389bbd8c47..87a94565e180a 100644 --- a/torch/_dynamo/variables/nn_module.py +++ b/torch/_dynamo/variables/nn_module.py @@ -14,7 +14,13 @@ from ..guards import GuardBuilder from ..mutation_guard import GenerationTracker from ..source import AttrSource, GetItemSource, NNModuleSource, NotNNModuleSource -from ..utils import is_lazy_module, istype, proxy_args_kwargs +from ..utils import ( + is_lazy_module, + is_safe_constant, + istensor, + istype, + proxy_args_kwargs, +) from .base import MutableLocal, typestr, VariableTracker from .functions import invoke_and_store_as_constant from .lists import SliceVariable @@ -139,6 +145,9 @@ def var_getattr(self, tx, name): return variables.UserFunctionVariable(subobj.__get__(base), **options) elif istype(subobj, types.FunctionType): return variables.UserMethodVariable(subobj, self, **options) + elif is_safe_constant(subobj) or istensor(subobj): + # Support possibly common cases of class members + return VariableBuilder(tx, NNModuleSource(source))(subobj) else: unimplemented(f"class property {typestr(base)} {typestr(subobj)}") From 66522e6fd9e2474cb84ed7da44c101050cafcaf0 Mon Sep 17 00:00:00 2001 From: atalman Date: Mon, 24 Oct 2022 19:38:07 +0000 Subject: [PATCH 0076/1922] Fix distributed issue by including distributed files (#87615) This fixes a regression in distributed headers installation.
Caused by following PR: https://github.com/pytorch/pytorch/pull/85953 which removed the inclusions Fixes #87173 Test plan from wheel build by this CI: https://github.com/pytorch/pytorch/actions/runs/3314742519 ``` [ec2-user@ip-10-0-9-132 c10d]$ pwd /home/ec2-user/actions-runner/_work/_temp/artifacts/torch/include/torch/csrc/distributed/c10d [ec2-user@ip-10-0-9-132 c10d]$ ls -las total 300 4 drwxr-xr-x 2 ec2-user ec2-user 4096 Oct 24 19:12 . 0 drwxr-xr-x 4 ec2-user ec2-user 29 Oct 24 19:12 .. 12 -rw-r--r-- 1 ec2-user ec2-user 9051 Oct 24 17:28 Backend.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 216 Oct 24 17:28 c10d.h 4 -rw-r--r-- 1 ec2-user ec2-user 3880 Oct 24 17:28 comm.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 604 Oct 24 17:28 debug.h 4 -rw-r--r-- 1 ec2-user ec2-user 1717 Oct 24 17:28 default_comm_hooks.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1316 Oct 24 17:28 error.h 4 -rw-r--r-- 1 ec2-user ec2-user 962 Oct 24 17:28 exception.h 4 -rw-r--r-- 1 ec2-user ec2-user 1461 Oct 24 17:28 FileStore.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 771 Oct 24 17:28 GlooDeviceFactory.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1154 Oct 24 17:28 HashStore.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 4058 Oct 24 17:28 logger.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2059 Oct 24 17:28 logging.h 8 -rw-r--r-- 1 ec2-user ec2-user 7979 Oct 24 17:28 NCCLUtils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2756 Oct 24 17:28 Ops.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1814 Oct 24 17:28 ParamCommsUtils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1478 Oct 24 17:28 PrefixStore.hpp 16 -rw-r--r-- 1 ec2-user ec2-user 13235 Oct 24 17:28 ProcessGroupGloo.hpp 12 -rw-r--r-- 1 ec2-user ec2-user 11298 Oct 24 17:28 ProcessGroup.hpp 12 -rw-r--r-- 1 ec2-user ec2-user 8645 Oct 24 17:28 ProcessGroupMPI.hpp 28 -rw-r--r-- 1 ec2-user ec2-user 26526 Oct 24 17:28 ProcessGroupNCCL.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 3805 Oct 24 17:28 ProcessGroupRoundRobin.hpp 12 -rw-r--r-- 1 ec2-user ec2-user 10361 Oct 24 17:28 ProcessGroupUCC.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 5062 
Oct 24 17:28 ProcessGroupWrapper.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 4201 Oct 24 17:28 PyProcessGroup.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1072 Oct 24 17:28 python_comm_hook.h 24 -rw-r--r-- 1 ec2-user ec2-user 23859 Oct 24 17:28 reducer.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2330 Oct 24 17:28 reducer_timer.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 1683 Oct 24 17:28 sequence_num.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2108 Oct 24 17:28 socket.h 4 -rw-r--r-- 1 ec2-user ec2-user 2589 Oct 24 17:28 Store.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 3264 Oct 24 17:28 TCPStore.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 6944 Oct 24 17:28 TraceUtils.h 8 -rw-r--r-- 1 ec2-user ec2-user 4539 Oct 24 17:28 Types.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 580 Oct 24 17:28 UCCForNCCL.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 2301 Oct 24 17:28 UCCTracing.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 4933 Oct 24 17:28 UCCUtils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 584 Oct 24 17:28 UnixSockUtils.hpp 24 -rw-r--r-- 1 ec2-user ec2-user 20796 Oct 24 17:28 Utils.hpp 4 -rw-r--r-- 1 ec2-user ec2-user 575 Oct 24 17:28 WinSockUtils.hpp 8 -rw-r--r-- 1 ec2-user ec2-user 4259 Oct 24 17:28 Work.hpp ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87615 Approved by: https://github.com/malfet --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f844c690b74fb..e3eb3ced6005b 100644 --- a/setup.py +++ b/setup.py @@ -1097,7 +1097,8 @@ def main(): 'include/torch/csrc/autograd/generated/*.h', 'include/torch/csrc/autograd/utils/*.h', 'include/torch/csrc/cuda/*.h', - 'include/torch/csrc/distributed/c10d/exception.h', + 'include/torch/csrc/distributed/c10d/*.h', + 'include/torch/csrc/distributed/c10d/*.hpp', 'include/torch/csrc/distributed/rpc/*.h', 'include/torch/csrc/jit/*.h', 'include/torch/csrc/jit/backends/*.h', From 06db644a8c3a2d20f549a1af5c68083f57859bbd Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 24 Oct 2022 19:41:53 +0000 Subject: [PATCH 0077/1922] 
[Vulkan][TCC] Implement tests for hardtanh, hardtanh_, relu and relu_ (#87506) Summary: Implement Vulkan tests for these untested functions in Clamp.cpp: - hardtanh - hardtanh_ - relu - relu_ Test Plan: ```cd ~/fbsource buck run //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64``` Reviewed By: kirklandsign Differential Revision: D40603655 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87506 Approved by: https://github.com/salilsdesai --- aten/src/ATen/test/vulkan_api_test.cpp | 62 ++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index a0f00daed5742..d122438f67586 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -1445,6 +1445,36 @@ TEST_F(VulkanAPITest, hardshrink_) { } } +TEST_F(VulkanAPITest, hardtanh) { + const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 10; + const auto in_vulkan = in_cpu.vulkan(); + + const auto out_cpu = at::hardtanh(in_cpu, 3, 7); + const auto out_vulkan = at::hardtanh(in_vulkan, 3, 7); + + const auto check = almostEqual(out_cpu, out_vulkan.cpu()); + if (!check) { + showRtol(out_cpu, out_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, hardtanh_) { + auto a_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 10; + auto a_vulkan = a_cpu.vulkan(); + + at::hardtanh_(a_cpu, 3, 7); + at::hardtanh_(a_vulkan, 3, 7); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + TEST_F(VulkanAPITest, layer_norm_invalid_inputs) { c10::InferenceMode mode; @@ -2229,6 +2259,38 @@ TEST_F(VulkanAPITest, mul_to_scalar_wrapped) { ASSERT_TRUE(check); } +TEST_F(VulkanAPITest, relu) { + const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_vulkan = in_cpu.vulkan(); + + 
const auto out_cpu = at::relu(in_cpu); + const auto out_vulkan = at::relu(in_vulkan); + + const auto check = almostEqual(out_cpu, out_vulkan.cpu()); + + if (!check) { + showRtol(out_cpu, out_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, relu_) { + auto a_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + at::relu_(a_cpu); + at::relu_(a_vulkan); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + TEST_F(VulkanAPITest, reflection_pad2d) { const auto a_cpu = at::rand({2, 3, 47, 63}, at::device(at::kCPU).dtype(at::kFloat)); const auto a_vulkan = a_cpu.vulkan(); From d23007739747767ccdeceba58e2226ad23bd2b4c Mon Sep 17 00:00:00 2001 From: alexmsettle <37422826+alexmsettle@users.noreply.github.com> Date: Mon, 24 Oct 2022 20:02:56 +0000 Subject: [PATCH 0078/1922] New feature for issue #85575. (#86514) Introduced RECORD_OUTPUTS() macro that goes with RECORD_FUNCTION(). It is used to capture the output tensors from a kernel launch. The tensors automatically get passed to the profiler using record_function methods. This allows the profiler to track the tensors that flow into and out of each op. 
Fixes #85575 cc @robieta @chaekit @aaronenyeshi @ngimel @nbcsm @guotuofeng @guyang3532 @gaoteng-git @tiffzhaofb Pull Request resolved: https://github.com/pytorch/pytorch/pull/86514 Approved by: https://github.com/robieta --- aten/src/ATen/record_function.h | 10 ++++++++++ torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index d4c143211a21a..323dc5f888b87 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -611,6 +611,16 @@ void record_function_with_scope_and_debug_handle( RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs) +// Bookend to the RECORD_FUNCTION macros. Use this after the kernel +// launch to let the profiler bind the outputs to the op that produced +// them. Note that guard is declared by RECORD_FUNCTION so this macro +// needs to be called from the same scope as RECORD_FUNCTION +#define RECORD_OUTPUTS(outputs) \ + if (guard.needsOutputs()) { \ + guard.setOutputs( \ + std::vector(outputs.begin(), outputs.end())); \ + } + /** * addThreadLocalCallback adds a thread local callback to run with * RecordFunction, returns handle to use with removeThreadLocalCallback diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 85448dc8ac418..d4e4343e64d58 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -196,7 +196,15 @@ std::vector FusionExecutorCache::runFusionWithInputs( auto kernel_runtime = getKernelRuntimeFor(args); most_recent_runtime_ = kernel_runtime; + int seq_id = 0; + // Record kernel input and output tensors so profiler can construct + // the data flow graph + RECORD_FUNCTION( + "run_fused_kernel", + std::vector(inputs.begin(), inputs.end()), + seq_id); auto outputs = kernel_runtime->runWithInput(args); + RECORD_OUTPUTS(outputs); 
// permute output tensor returned by kernel execution. See Part_3 in Note [ // Permutation support in nvfuser ] From 683f68c2a30846fe4c5b5899ef91ce01a71ece6a Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Mon, 24 Oct 2022 12:46:27 -0700 Subject: [PATCH 0079/1922] Add codeowners for functorch (#86213) The list is for people who want to be notified on changes to the files in there. Review is not required from the list of names; I just want to be notified to keep track of what is going on. Let me know if you want your names added too in this PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/86213 Approved by: https://github.com/Chillee --- CODEOWNERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CODEOWNERS b/CODEOWNERS index 3bddc2f0373e4..8fdc5fc776632 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -96,6 +96,12 @@ test/test_binary_ufuncs.py @mruberry @ngimel test/test_reductions.py @mruberry @ngimel test/test_type_promotion.py @mruberry @ngimel +# functorch-related things +# This list is for people wanting to be notified every time there's a change +# Useful for e.g. auditing xfails that other folks add to tests +test/functorch/test_ops.py @zou3519 +test/functorch/test_vmap.py @zou3519 + # torch MPS test/test_mps.py @kulinseth aten/src/ATen/mps/ @kulinseth From 407353e7c29184c8c6d68c631d0671b023ed33a7 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 24 Oct 2022 20:21:16 +0000 Subject: [PATCH 0080/1922] cpp docs push fix (#87614) currently failing with ``` To https://github.com/pytorch/cppdocs + 2825b2745bb...80ec4daa657 HEAD -> pytorchbot/temp-branch-cpp (forced update) Branch 'master' set up to track remote branch 'pytorchbot/temp-branch-cpp' from 'origin'. ++ sleep 30 ++ git push -u origin fatal: The upstream branch of your current branch does not match the name of your current branch. 
To push to the upstream branch on the remote, use git push origin HEAD:pytorchbot/temp-branch-cpp To push to the branch of the same name on the remote, use git push origin HEAD ``` just checked the settings, master of pytorch/cppdocs does not have easy cla as a required check, so we don't need the temp branch Pull Request resolved: https://github.com/pytorch/pytorch/pull/87614 Approved by: https://github.com/huydhn --- .circleci/scripts/cpp_doc_push_script.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh index 6e66514ae93b9..4c22677e94bd3 100755 --- a/.circleci/scripts/cpp_doc_push_script.sh +++ b/.circleci/scripts/cpp_doc_push_script.sh @@ -98,9 +98,6 @@ git commit -m "Generate C++ docs from pytorch/pytorch@${GITHUB_SHA}" || true git status if [[ "${WITH_PUSH:-}" == true ]]; then - # push to a temp branch first to trigger CLA check and satisfy branch protections - git push -u origin HEAD:pytorchbot/temp-branch-cpp -f - sleep 30 git push -u origin fi From 4d19d5a6c416f7a7fbc04538cc81ceda59259571 Mon Sep 17 00:00:00 2001 From: shubhambhokare1 Date: Mon, 24 Oct 2022 20:48:29 +0000 Subject: [PATCH 0081/1922] [ONNX] Enable test_fill script test (#79555) For scripting mode, aten::clone requires input to be a TensorType. 
Hence if we encounter an IntType, FloatType or BoolType input, we set the input to the appropriate TensorType Pull Request resolved: https://github.com/pytorch/pytorch/pull/79555 Approved by: https://github.com/justinchuby, https://github.com/BowenBao, https://github.com/abock --- test/onnx/test_pytorch_onnx_onnxruntime.py | 9 ++++++++- .../passes/onnx/remove_inplace_ops_for_onnx.cpp | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 43e8d3579c192..e917e44ce21bd 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -11393,7 +11393,6 @@ def forward(self, x, y): self.run_test(M_ToDeviceDtype(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) - @skipScriptTest() def test_fill(self): class FillModule(torch.nn.Module): def forward(self, x, filled_value: int): @@ -11403,6 +11402,14 @@ def forward(self, x, filled_value: int): filled_value = 7 self.run_test(FillModule(), (x, filled_value)) + class FillFloatModule(torch.nn.Module): + def forward(self, x, filled_value: float): + return x.fill_(filled_value) + + x = torch.randn((4, 5, 6)) + filled_value = 7.5 + self.run_test(FillFloatModule(), (x, filled_value)) + class FillScalarModule(torch.nn.Module): def forward(self, x): res = x + 2 diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index db74dca360e3f..efb7686fae3fe 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -136,6 +136,21 @@ Node* addDummyClone( orig_data->type()->kind() == TypeKind::BoolType) { auto* noneNode = graph->create(prim::Constant); noneNode->output()->setType(NoneType::get()); + // For scripting mode, aten::clone requires input to be a TensorType + // Hence if we encounter an IntType, FloatType, or BoolType, 
+ // we set the input to the appropriate TensorType + if (orig_data->type()->kind() == TypeKind::IntType && + insertBefore == false) { + orig_data->setType(TensorType::fromNumberType(*IntType::get())); + } else if ( + orig_data->type()->kind() == TypeKind::FloatType && + insertBefore == false) { + orig_data->setType(TensorType::fromNumberType(*FloatType::get())); + } else if ( + orig_data->type()->kind() == TypeKind::BoolType && + insertBefore == false) { + orig_data->setType(TensorType::fromBoolType()); + } newNode = graph->create(aten::clone, /*num_outputs =*/1); newNode->addInput(orig_data); newNode->addInput(noneNode->output()); From 660d8ec815bf60e1c42824a333b58163a9ec3e4d Mon Sep 17 00:00:00 2001 From: albanD Date: Mon, 24 Oct 2022 21:03:58 +0000 Subject: [PATCH 0082/1922] small improvement to error message in fx interpreter (#87599) From https://github.com/pytorch/pytorch/pull/84246/files#r972537173 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87599 Approved by: https://github.com/ezyang --- torch/fx/interpreter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py index aac20b6e649d0..95218bf271657 100644 --- a/torch/fx/interpreter.py +++ b/torch/fx/interpreter.py @@ -134,7 +134,7 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p msg += f"\nOriginal traceback:\n{node.stack_trace}" e.args = (msg,) + e.args[1:] if isinstance(e, KeyError): - raise RuntimeError(*e.args) + raise RuntimeError(*e.args) from e raise if self.garbage_collect_values: From dd7c5d2b5d539e00df418414fa04b7014396778d Mon Sep 17 00:00:00 2001 From: Greg Hogan Date: Mon, 24 Oct 2022 21:25:36 +0000 Subject: [PATCH 0083/1922] ada lovelace (arch 8.9) support (#87436) changes required to be able to compile https://github.com/pytorch/vision and https://github.com/nvidia/apex for `sm_89` architecture Pull Request resolved: https://github.com/pytorch/pytorch/pull/87436 Approved by: 
https://github.com/ngimel --- .../upstream/FindCUDA/select_compute_arch.cmake | 11 +++++++++++ torch/utils/cpp_extension.py | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake index 7f22d476d2fbe..822c041ee5268 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake @@ -98,8 +98,19 @@ if(NOT CUDA_VERSION VERSION_LESS "11.1") list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6") set(CUDA_LIMIT_GPU_ARCHITECUTRE "8.6") + if(CUDA_VERSION VERSION_LESS "11.8") + set(CUDA_LIMIT_GPU_ARCHITECTURE "8.9") + endif() +endif() + +if(NOT CUDA_VERSION VERSION_LESS "11.8") + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ada") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9") + list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.9") + if(CUDA_VERSION VERSION_LESS "12.0") set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9+PTX") endif() endif() diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 36811bf22dedc..612ae9fdf0785 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -1729,10 +1729,11 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: ('Volta', '7.0+PTX'), ('Turing', '7.5+PTX'), ('Ampere', '8.0;8.6+PTX'), + ('Ada', '8.9+PTX'), ]) supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2', - '7.0', '7.2', '7.5', '8.0', '8.6'] + '7.0', '7.2', '7.5', '8.0', '8.6', '8.9'] valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches] # The default is sm_30 for CUDA 9.x and 10.x From 3cc34e9b0023a6a31f2ce544f7d1785cd9520836 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Mon, 24 Oct 2022 14:29:00 -0700 Subject: [PATCH 0084/1922] [dynamo] fix `explain` (#87640) Another casualty of the core move Pull Request 
resolved: https://github.com/pytorch/pytorch/pull/87640 Approved by: https://github.com/voznesenskym --- torch/_dynamo/eval_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index 40beba357b1cf..9895da4ad9bba 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -379,7 +379,7 @@ def toy_example(a, b): ) -@patch("torchdynamo.symbolic_convert.explain", True) +@patch("torch._dynamo.symbolic_convert.explain", True) def explain(f, *args, **kwargs): # TODO(voz): Do we want a decorator for this? from . import reset From 8efc2b557ae9325b5e17aedcc71f1fde548eb5de Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Mon, 24 Oct 2022 21:53:14 +0000 Subject: [PATCH 0085/1922] [inductor] Prevent aggressive fusion during inductor lowering (#87447) Fixes https://github.com/pytorch/torchdynamo/issues/1599 Inductor performs aggressive fusion of ops during the lowering of Fx graph into IR nodes. Note that this fusion is different from the fusion that we typically discuss in the context of Inductor, which refers to the fusion of SchedulerNodes (way after lowering). This PR, instead, ensures that we don't accumulate too many ops in the IR node to begin with. In the case of hf_t5_large backward graph, earlier we would generate a kernel with 100s of operators, causing that kernel to take ~350 seconds of compilation time. With this PR, we get it down from 350 seconds to 50 seconds. Note that this could affect performance. I doubt that it will lead to a really large dip though. In my toy examples, even if the lowering creates multiple IR nodes, if it's a simple fusion, later fusion still creates one node. I would like (1) test_torchinductor.py, (2) test_torchinductor_info.py, and (3) at least HF models to be enabled in CI before merging this one.
@ngimel @jansel @Chillee cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu Pull Request resolved: https://github.com/pytorch/pytorch/pull/87447 Approved by: https://github.com/jansel --- test/inductor/test_torchinductor.py | 86 +++++++++++++++++++++++++++++ torch/_inductor/config.py | 5 ++ torch/_inductor/graph.py | 10 ++++ torch/_inductor/ir.py | 21 ++++++- 4 files changed, 119 insertions(+), 3 deletions(-) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 52f36500b5025..e0501e0e8adef 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -1930,6 +1930,92 @@ def test_layer_norm(self): if self.device != "cpu": self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1) + def test_transpose_add(self): + def fn(a, b): + return a.t() + b + + self.common( + fn, (torch.randn([16, 32]), torch.randn([32, 16])), check_lowp=False + ) + if self.device != "cpu": + self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1) + + def test_softmax_one_kernel(self): + def fn(x): + dim = 1 + x_max = torch.amax(x, dim, keepdim=True) + unnormalized = torch.exp(x * x_max) + result = unnormalized / torch.sum(unnormalized, dim, keepdim=True) + return result + + self.common(fn, (torch.randn([16, 32]),), check_lowp=False) + if self.device != "cpu": + self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1) + + def test_cauchy(self): + def fn(x, y): + return torch.sum(1 / (torch.unsqueeze(x, -1) - y)) + + self.common( + fn, + ( + torch.randn(32), + torch.randn(32), + ), + # Absolute difference: 0.0003662109375 (up to 0.0001 allowed) + # Relative difference: 1.8804297408767818e-05 (up to 1e-05 allowed) + atol=5 * 1e-4, + rtol=5 * 1e-5, + check_lowp=False, + ) + if self.device != "cpu": + self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1) + + def test_gather_scatter(self): + def fn(node_feat, edge_index): + src_node_feat = 
node_feat[edge_index[0]] + dst_node_feat = node_feat[edge_index[1]] + edge_feat = src_node_feat - dst_node_feat + 1 + new_node_feat = torch.zeros_like(node_feat) + new_node_feat.scatter_add_( + 0, edge_index[1].unsqueeze(-1).expand_as(edge_feat), edge_feat + ) + return new_node_feat + + num_nodes = 16 + num_features = 32 + node_feat = torch.randn(num_nodes, num_features) + edge_index = torch.randint(0, num_nodes, size=(2, num_nodes * 5)) + self.common( + fn, + ( + node_feat, + edge_index, + ), + check_lowp=False, + ) + if self.device != "cpu": + self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2) + + @patch.object(torch._inductor.config, "max_fusion_size", 1) + def test_no_mega_fusion_during_lowering(self): + n = 50 + + def fn(*args): + x = args[0] + for i in range(n): + x = torch.add(x, args[i]) + return x + + self.common( + fn, + [torch.randn(64) for _ in range(n)], + check_lowp=False, + ) + print("-->", torch._inductor.metrics.generated_kernel_count) + if self.device != "cpu": + self.assertTrue(torch._inductor.metrics.generated_kernel_count > 1) + def test_move_arange(self): def fn(x): return torch.arange(len(x), device="cpu").to(x.device) + x diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index f4b847e50c820..910e6d20b4d6f 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -28,9 +28,14 @@ benchmark_harness = True # control store vs recompute heuristic +# For fanouts, rematearialization can lead to exponential blowup. 
So, have +# smaller threashold realize_reads_threshold = 4 realize_bytes_threshold = 2000 +# Threshold to prevent excessive accumulation of ops in one buffer during lowering +realize_acc_reads_threshold = 8 + # fallback to eager for random/dropout, this is slow but useful for debugging fallback_random = False diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 2a1619a822451..8a971020ac047 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -299,6 +299,9 @@ def finalize(self): def run_node(self, n: torch.fx.Node): with ir.IRNode.current_origins({n}): result = super().run_node(n) + + # Realize if (1) any user need inputs realized, or (2) there is + # already too many reads and rematerializing can be bad. num_users = len(set(n.users)) if num_users > 1 and isinstance(result, TensorBox): for user in n.users: @@ -307,6 +310,13 @@ def run_node(self, n: torch.fx.Node): # TODO(jansel): introduce a store vs inline choice result.mark_reuse(len(n.users)) + + # Realize if the IRNode already has accumulated lots of reads + if isinstance(result, TensorBox) and result.has_exceeded_max_reads(): + # Prevent excessive accumulation in a computed buffer, when + # there are multiple branches meach with small number of memory + # reads, but they converge to a user. 
+ result.realize_hint() return result def codegen(self): diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 13cf5d771a0c8..889e30bb54449 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -302,7 +302,7 @@ def inner_fn_str(self): with V.set_ops_handler(V.MockHandler()), patch.object( FlexibleLayout, "allow_indexing", True ): - return self.inner_fn(self._index(self.ranges)) + return str(self.inner_fn(self._index(self.ranges))) except Exception as e: return f"inner_fn(): {e}" @@ -419,8 +419,11 @@ def inner_fn_str(self): with V.set_ops_handler(V.MockHandler()), patch.object( FlexibleLayout, "allow_indexing", True ): - return self.inner_fn( - self._index(self.ranges), self._index(self.reduction_ranges, "r") + return str( + self.inner_fn( + self._index(self.ranges), + self._index(self.reduction_ranges, "r"), + ) ) except Exception as e: return f"inner_fn(): {e}" @@ -948,6 +951,9 @@ def get_name(self): def mark_reuse(self, users): return self.data.mark_reuse(users) + def has_exceeded_max_reads(self): + return self.data.has_exceeded_max_reads() + def realize(self): return self.data.realize() @@ -1422,6 +1428,9 @@ def get_device(self): def mark_reuse(self, users): pass + def has_exceeded_max_reads(self): + return False + def get_reads(self): return () @@ -3350,6 +3359,12 @@ def realize_hint(self): if isinstance(self.data, (Pointwise, Reduction)) and self.num_reads() > 1: self.realize() + def has_exceeded_max_reads(self): + return isinstance(self.data, Pointwise) and ( + self.num_reads() > config.realize_acc_reads_threshold + or len(self.inner_fn_str()) > config.realize_bytes_threshold + ) + def mark_reuse(self, users): """ A heuristic to decide if we should realize a tensor From 00928cade3daaa2bb7f8c93ba8610dacce5d9d1f Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 24 Oct 2022 22:24:44 +0000 Subject: [PATCH 0086/1922] Upgrade actions/upload-artifact to v3 (#87553) Upgrade a bunch of actions to get rid of the deprecation warnings, i.e. 
https://github.com/pytorch/pytorch/actions/runs/3304031186 * Upgrade actions/upload-artifact to v3 * Upgrade Windows actions/setup-python to v4 (left over) Note: Warnings coming from setup/cache will be fixed upstream by https://github.com/pytorch/test-infra/pull/941 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87553 Approved by: https://github.com/clee2000 --- .github/actions/setup-win/action.yml | 2 +- .github/templates/macos_binary_build_workflow.yml.j2 | 2 +- .github/workflows/_mac-build.yml | 4 ++-- .../generated-macos-arm64-binary-conda-nightly.yml | 6 +++--- .../generated-macos-arm64-binary-wheel-nightly.yml | 8 ++++---- .../workflows/generated-macos-binary-conda-nightly.yml | 8 ++++---- .../generated-macos-binary-libtorch-cxx11-abi-nightly.yml | 8 ++++---- .../generated-macos-binary-libtorch-pre-cxx11-nightly.yml | 8 ++++---- .../workflows/generated-macos-binary-wheel-nightly.yml | 8 ++++---- .github/workflows/run_torchbench.yml | 2 +- .github/workflows/scorecards.yml | 4 ++-- 11 files changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index c5f1cac550f68..d442343430c7d 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -55,7 +55,7 @@ runs: .circleci/scripts/windows_cudnn_install.sh - name: Setup Python3 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.x" cache: pip diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 5e6b505664e60..95802252a4f98 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -97,7 +97,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: 
always() with: name: !{{ config["build_name"] }} diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index 895b07164213e..557d3c7b292c1 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -131,7 +131,7 @@ jobs: zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json - name: Store PyTorch Build Artifacts on GHA - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' with: name: ${{ env.BUILD_ENVIRONMENT }} @@ -140,7 +140,7 @@ jobs: path: artifacts.zip - name: Upload sccache stats to GHA - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 # Only if sccache is installed, see above if: ${{ (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && steps.build.outcome != 'skipped' }} with: diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml index 5d47cc77cf3a7..ce32755e32098 100644 --- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml @@ -116,7 +116,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: conda-py3_8-cpu @@ -226,7 +226,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: conda-py3_9-cpu @@ -336,7 +336,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: 
actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: conda-py3_10-cpu diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index e58d153269b38..6bc3894a00be5 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -116,7 +116,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_7-cpu @@ -226,7 +226,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_8-cpu @@ -336,7 +336,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_9-cpu @@ -446,7 +446,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_10-cpu diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml index 079687e6ff951..ba3697e3fef91 100644 --- a/.github/workflows/generated-macos-binary-conda-nightly.yml +++ b/.github/workflows/generated-macos-binary-conda-nightly.yml @@ -114,7 +114,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - 
uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: conda-py3_7-cpu @@ -224,7 +224,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: conda-py3_8-cpu @@ -334,7 +334,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: conda-py3_9-cpu @@ -444,7 +444,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: conda-py3_10-cpu diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml index dcb480b0a07ce..381e0a4c73ad7 100644 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml +++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml @@ -118,7 +118,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: libtorch-cpu-shared-with-deps-cxx11-abi @@ -233,7 +233,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: libtorch-cpu-shared-without-deps-cxx11-abi @@ -348,7 +348,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" 
"${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: libtorch-cpu-static-with-deps-cxx11-abi @@ -463,7 +463,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: libtorch-cpu-static-without-deps-cxx11-abi diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml index 5f02ea874b4e4..55b28480a7545 100644 --- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml +++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml @@ -118,7 +118,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: libtorch-cpu-shared-with-deps-pre-cxx11 @@ -233,7 +233,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: libtorch-cpu-shared-without-deps-pre-cxx11 @@ -348,7 +348,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: libtorch-cpu-static-with-deps-pre-cxx11 @@ -463,7 +463,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: 
name: libtorch-cpu-static-without-deps-pre-cxx11 diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml index 081f470d6109f..f4baf9129b690 100644 --- a/.github/workflows/generated-macos-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml @@ -114,7 +114,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_7-cpu @@ -224,7 +224,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_8-cpu @@ -334,7 +334,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_9-cpu @@ -444,7 +444,7 @@ jobs: # shellcheck disable=SC1091 source "${RUNNER_TEMP}/anaconda/bin/activate" "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: always() with: name: wheel-py3_10-cpu diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml index 9a46a23af5bfc..2d1013abafc02 100644 --- a/.github/workflows/run_torchbench.yml +++ b/.github/workflows/run_torchbench.yml @@ -92,7 +92,7 @@ jobs: conda env remove --name pr-ci rm /tmp/pr-body.txt - name: Upload artifact - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: TorchBench result path: ~/.torchbench/bisection/pr${{ github.event.number }} diff --git a/.github/workflows/scorecards.yml 
b/.github/workflows/scorecards.yml index 516998bfa95be..d896864349fe4 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -25,7 +25,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@2541b1294d2704b0964813337f33b291d3f8596b # tag=v3.0.2 + uses: actions/checkout@v3 with: persist-credentials: false @@ -42,7 +42,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # tag=v3.1.0 + uses: actions/upload-artifact@v3 with: name: SARIF file path: results.sarif From 48443f8275f66a51e32bc87809854226c03b2390 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Mon, 24 Oct 2022 22:43:11 +0000 Subject: [PATCH 0087/1922] [functorch] dont compute expected output multiple times (#86202) Fixes https://github.com/pytorch/functorch/issues/1028 Description: We update `get_fallback_and_vmap_exhaustive` to compute expected output only once as described in the issue. NOTE: This doesn't take care of the repeated computation in `test_vmap_exhaustive` and will be followed up later. TODO: * [x] Benchmark and see how much difference does this make. 
(Comparison Table Below: [Link](https://github.com/pytorch/pytorch/pull/86202#issuecomment-1285477653)) Pull Request resolved: https://github.com/pytorch/pytorch/pull/86202 Approved by: https://github.com/zou3519 --- test/functorch/common_utils.py | 50 +++++++++++++++++++++++++++------- test/functorch/test_ops.py | 4 ++- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/test/functorch/common_utils.py b/test/functorch/common_utils.py index 1d7356b6ca7e5..c082340d7882e 100644 --- a/test/functorch/common_utils.py +++ b/test/functorch/common_utils.py @@ -222,10 +222,11 @@ def clone_if_tensor(x): return x.clone() return x - -def compute_quantities_for_vmap_test( +# Helper function to compare output of `vmap` against the +# `for-loop` version. +def _compute_quantities_for_vmap_test( op, orig_batched_args, orig_kwarg_values, in_dims, - out_dim=0, batch_size=2, compute_loop_out=True, + out_dim, batch_size, compute_loop_out=True, clone_inputs=False): def maybe_clone_inputs(): @@ -236,10 +237,12 @@ def maybe_clone_inputs(): return orig_batched_args, orig_kwarg_values batched_args, kwarg_values = maybe_clone_inputs() + if compute_loop_out: loop_out = loop(op, in_dims, out_dim, batch_size, *batched_args, **kwarg_values) else: loop_out = None + # Used for debugging the resulting operations # from functorch import make_fx # def f(a): @@ -248,7 +251,6 @@ def maybe_clone_inputs(): # print(in_dims, [arg.shape for arg in batched_args], kwarg_values) batched_args, kwarg_values = maybe_clone_inputs() batched_out = vmap(op, in_dims=in_dims, out_dims=out_dim)(*batched_args, **kwarg_values) - yield (loop_out, batched_out) # Tests case where we dispatch to a batching rule with no bdims # This should be handled by autogenerated plumbing. 
For vmap support @@ -262,24 +264,52 @@ def f(dummy, *args, **kwargs): return op(*args, **kwargs) dummy = torch.ones(batch_size, 1) - expected = pytree.tree_map(add_bdim_if_tensor, batched_out) + vmapvmap_expected = pytree.tree_map(add_bdim_if_tensor, batched_out) inner_in_dims = (0,) + pytree.tree_map(lambda x: None, in_dims) outer_in_dims = (0,) + in_dims batched_args, kwarg_values = maybe_clone_inputs() - output = vmap(vmap(f, inner_in_dims), outer_in_dims)(dummy, *batched_args, **kwarg_values) - yield (expected, output) + vmapvmap_output = vmap(vmap(f, inner_in_dims), outer_in_dims)(dummy, *batched_args, **kwarg_values) + + yield (batched_out, loop_out, vmapvmap_output, vmapvmap_expected) + + +# Function with more friendly return types +# compared to `_compute_quantities_for_vmap_test` +def compute_quantities_for_vmap_test( + op, orig_batched_args, orig_kwarg_values, in_dims, + out_dim=0, batch_size=2, compute_loop_out=True, + clone_inputs=False): + for quantities in _compute_quantities_for_vmap_test(op, orig_batched_args, orig_kwarg_values, in_dims, + out_dim, batch_size, compute_loop_out, clone_inputs): + yield (quantities[0], quantities[1]) + yield (quantities[2], quantities[3]) def get_fallback_and_vmap_exhaustive(op, arg_values, kwarg_values, is_batch_norm_and_training=False, compute_loop_out=True): out_dim = 0 batch_size = 2 + def make_batched(t): + if isinstance(t, torch.Tensor): + shape = list(t.shape) + shape.insert(out_dim, batch_size) + return t.expand(*shape) + return t + + # Inputs generated by `generate_vmap_inputs` just copy/expand the unbatched inputs + # over the batched dimension. Thus we can compute the expected value once and just + # expand it based on the `out_dim` and `batch_size`. 
+ expected_unbatched = op(*arg_values, **kwarg_values) + expected_batched = pytree.tree_map(make_batched, expected_unbatched) generator = generate_vmap_inputs(arg_values, kwarg_values, is_batch_norm_and_training) for batched_args, in_dims, kwarg_values in generator: - for quantities in compute_quantities_for_vmap_test( - op, batched_args, kwarg_values, in_dims, out_dim, batch_size, compute_loop_out): - yield quantities + for quantities in _compute_quantities_for_vmap_test( + op, batched_args, kwarg_values, in_dims, out_dim, batch_size, + compute_loop_out=False): + assert quantities[1] is None + yield (quantities[0], expected_batched) + yield (quantities[2], quantities[3]) def opinfo_in_dict(opinfo, d): diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py index 5dfe76b3e2877..bda05d970a5e9 100644 --- a/test/functorch/test_ops.py +++ b/test/functorch/test_ops.py @@ -9,7 +9,8 @@ import itertools import unittest -from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors, IS_ARM64, parametrize, TEST_WITH_ASAN +from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors, IS_MACOS, \ + IS_ARM64, parametrize, TEST_WITH_ASAN import torch from torch import Tensor import functools @@ -823,6 +824,7 @@ def test_vmapvjp(self, device, dtype, op): # ---------------------------- BUGS ------------------------------------ # The following are bugs that we should fix decorate('nn.functional.conv2d', decorator=unittest.skipIf(IS_ARM64, "Fails on M1")), + decorate('linalg.det', 'singular', decorator=unittest.skipIf(IS_MACOS, "Fails on x86 MacOS CI")), skip('nn.functional.max_pool1d'), # fails on cpu, runs on cuda xfail('masked.mean'), # silent incorrectness (nan difference) From 8d0f13b649b35c5429cea831fb6f1cfdacabc79d Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 24 Oct 2022 22:44:42 +0000 Subject: [PATCH 0088/1922] Add some common tools to docker base (#86993) I always need to install these 2 tools 
whenever I use Docker manually to debug build and test issues: * unzip is to extract the zipped artifacts from PyTorch CI * gdb is to do you know what :) IMO, it makes sense to have them as part of the container image Pull Request resolved: https://github.com/pytorch/pytorch/pull/86993 Approved by: https://github.com/ZainRizvi --- .circleci/docker/common/install_base.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index 6724031c0a447..5bca9f6dd3335 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -68,7 +68,9 @@ install_ubuntu() { sudo \ vim \ jq \ - libtool + libtool \ + unzip \ + gdb # Should resolve issues related to various apt package repository cert issues # see: https://github.com/pytorch/pytorch/issues/65931 @@ -126,7 +128,9 @@ install_centos() { opencv-devel \ sudo \ wget \ - vim + vim \ + unzip \ + gdb # Cleanup yum clean all From dbb53bf3fb008ba698e8eb85cf121b07d93ff167 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 24 Oct 2022 23:05:14 +0000 Subject: [PATCH 0089/1922] [CI] Fix triton wheel build (#87461) If one uses the auto-install llvm mechanism, one somehow ends up with a few unresolved symbols when building on the manylinux image. Workaround by installing llvm from OS repos.
Also, add an upload job, which is executed only on trunk Fixes https://github.com/pytorch/torchdynamo/issues/1733 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87461 Approved by: https://github.com/msaroufim --- .github/workflows/build-triton-wheel.yml | 53 +++++++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 074d53498faa6..f602eaa30af4d 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,7 +3,8 @@ name: Build Triton wheels on: push: branches: - main + - main + - master paths: - .github/workflows/build-triton-wheel.yml - .github/scripts/build_triton_wheel.py @@ -84,7 +85,7 @@ jobs: ;; esac - docker exec -t "${container_name}" yum install -y zlib-devel + docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" /pytorch/.github/scripts/build_triton_wheel.py docker exec -t "${container_name}" chown -R 1000.1000 /artifacts @@ -98,3 +99,51 @@ jobs: - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() + upload-wheel: + runs-on: linux.20_04.4x + needs: build-wheel + container: + image: continuumio/miniconda3:4.12.0 + env: + GITHUB_TOKEN: ${{ secrets.github-token }} + steps: + - name: Download Build Artifacts (3.7) + uses: actions/download-artifact@v3 + with: + name: "pytorch-triton-3.7" + path: "${{ runner.temp }}/artifacts/" + - name: Download Build Artifacts (3.8) + uses: actions/download-artifact@v3 + with: + name: "pytorch-triton-3.8" + path: "${{ runner.temp }}/artifacts/" + - name: Download Build Artifacts (3.9) + uses: actions/download-artifact@v3 + with: + name: "pytorch-triton-3.9" + path: "${{ runner.temp }}/artifacts/" + - name: Download Build Artifacts (3.10) + uses: actions/download-artifact@v3 + with: + name: 
"pytorch-triton-3.10" + path: "${{ runner.temp }}/artifacts/" + - name: Download Build Artifacts (3.11) + uses: actions/download-artifact@v3 + with: + name: "pytorch-triton-3.11" + path: "${{ runner.temp }}/artifacts/" + - name: Upload binaries + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || github.event.ref == 'refs/heads/main') }} + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.aws-access-key-id }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.aws-pytorch-uploader-secret-access-key }} + UPLOAD_BUCKET: "s3://pytorch" + run: | + set -ex + pip install -q awscli + s3_dir="${UPLOAD_BUCKET}/whl/nightly/" + for pkg in "${PKG_DIR}/"*.whl; do + aws s3 cp --no-progress --acl public-read "${pkg}" "${s3_dir}" + done From 69a4a386cf1e796c438b9ffdff7f0f4aed8c3468 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 17 Oct 2022 18:57:06 +0100 Subject: [PATCH 0090/1922] ATen/native (3/6): Use per-operator headers (#75573) Differential Revision: [D40126701](https://our.internmc.facebook.com/intern/diff/D40126701) Pull Request resolved: https://github.com/pytorch/pytorch/pull/75573 Approved by: https://github.com/malfet --- aten/src/ATen/native/Histogram.cpp | 22 +++- aten/src/ATen/native/Histogram.h | 2 - aten/src/ATen/native/Im2Col.cpp | 15 ++- aten/src/ATen/native/IndexingUtils.cpp | 1 + aten/src/ATen/native/Integration.cpp | 17 ++- aten/src/ATen/native/Itertools.cpp | 19 ++- aten/src/ATen/native/Lerp.cpp | 9 ++ aten/src/ATen/native/Linear.cpp | 31 ++++- aten/src/ATen/native/LinearAlgebra.cpp | 119 ++++++++++++++++-- aten/src/ATen/native/Loss.cpp | 57 ++++++++- aten/src/ATen/native/LossCTC.cpp | 26 +++- aten/src/ATen/native/LossMulti.h | 8 +- aten/src/ATen/native/LossMultiLabelMargin.cpp | 15 ++- aten/src/ATen/native/LossMultiMargin.cpp | 14 ++- aten/src/ATen/native/LossNLL.cpp | 23 +++- aten/src/ATen/native/LossNLL2d.cpp | 17 ++- 
aten/src/ATen/native/MathBitsFallback.h | 9 +- aten/src/ATen/native/MaxPooling.cpp | 13 +- aten/src/ATen/native/MaxUnpooling.cpp | 13 +- aten/src/ATen/native/Memory.cpp | 13 +- aten/src/ATen/native/NNPACK.cpp | 13 +- .../native/NaiveConvolutionTranspose2d.cpp | 15 ++- .../native/NaiveConvolutionTranspose3d.cpp | 16 ++- .../ATen/native/NaiveDilatedConvolution.cpp | 15 ++- aten/src/ATen/native/NamedTensor.cpp | 28 ++++- aten/src/ATen/native/vol2col.h | 4 +- 26 files changed, 470 insertions(+), 64 deletions(-) diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index c3a007f2c2dcb..89ede6bea35c1 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -1,10 +1,28 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/Histogram.h b/aten/src/ATen/native/Histogram.h index 9df0aafafc18d..3305cc5e315fb 100644 --- a/aten/src/ATen/native/Histogram.h +++ b/aten/src/ATen/native/Histogram.h @@ -3,8 +3,6 @@ #include #include -#include - namespace at { namespace native { using histogramdd_fn = void(*)(const Tensor&, const c10::optional&, bool, Tensor&, const TensorList&); diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index 7cb5133eef9ad..416e77e9ff199 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -1,12 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include -#include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/IndexingUtils.cpp 
b/aten/src/ATen/native/IndexingUtils.cpp index e91eff03ab856..c5f5ff6fbcc07 100644 --- a/aten/src/ATen/native/IndexingUtils.cpp +++ b/aten/src/ATen/native/IndexingUtils.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include namespace at { namespace native { diff --git a/aten/src/ATen/native/Integration.cpp b/aten/src/ATen/native/Integration.cpp index 7ca01bae18a57..09e444476d1fd 100644 --- a/aten/src/ATen/native/Integration.cpp +++ b/aten/src/ATen/native/Integration.cpp @@ -1,12 +1,23 @@ -#include -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp index 265b05054b0a3..8d6ff506a43f8 100644 --- a/aten/src/ATen/native/Itertools.cpp +++ b/aten/src/ATen/native/Itertools.cpp @@ -1,5 +1,20 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif #include diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp index bfac91a881ae0..2e67dec35033f 100644 --- a/aten/src/ATen/native/Lerp.cpp +++ b/aten/src/ATen/native/Lerp.cpp @@ -1,5 +1,14 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS #include +#else +#include +#endif namespace at { namespace meta { diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index b9b3abe3c7cae..591289a726ac8 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,17 +1,36 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include -#include 
#include -#include +#include +#include #include #include #include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include -#include #include #include #include diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index c658d4427c97d..8c5a6fc8f1955 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1,27 +1,132 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include #include #include -#include #include #include #include #include #include -#include -#include #include +#include +#include +#include #include -#include #include #include #include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff 
--git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 52569ba6b4995..78b7d70236207 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -1,15 +1,62 @@ -#include -#include -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include +#include +#include +#include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + constexpr float EPSILON = 1e-12; namespace { diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 1ddb8f2285640..dcfad968cad79 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -5,16 +5,36 @@ // 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf // We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. 
// Graves et al call the probabilities y, we use log_probs (also calling them inputs) +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include +#include #include #include -#include +#include +#include #include #include #include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/LossMulti.h b/aten/src/ATen/native/LossMulti.h index 54736bcc123b2..148615e7e14f1 100644 --- a/aten/src/ATen/native/LossMulti.h +++ b/aten/src/ATen/native/LossMulti.h @@ -1,8 +1,8 @@ -#include -#include -#include - #pragma once +#include +#include +#include +#include namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index f59de5c8817a4..26d7a748df8d4 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -1,10 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index c7ab53f1d211b..110520cf8f950 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -1,9 +1,19 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 79e98c877548a..8e5864b68728d 100644 --- 
a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -1,13 +1,32 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 6950cb2805e9e..ab7c084eb80df 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -1,12 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h index 4e9c2d9e98b18..84e72aa724d0e 100644 --- a/aten/src/ATen/native/MathBitsFallback.h +++ b/aten/src/ATen/native/MathBitsFallback.h @@ -1,12 +1,17 @@ -#include +#include #include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { // This fallback should only be used for operations that are self inverse and have a corresponding tensor diff --git a/aten/src/ATen/native/MaxPooling.cpp b/aten/src/ATen/native/MaxPooling.cpp index 0f05eeac7d3e9..e809c75ba21d6 100644 --- a/aten/src/ATen/native/MaxPooling.cpp +++ b/aten/src/ATen/native/MaxPooling.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include @@ -6,6 +7,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace 
native { diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index 33cc4dc7a61ce..adab802d65cd5 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -1,8 +1,17 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index df6949b2d7d95..2b66f08933934 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -1,6 +1,17 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp index 3df0a0623e437..544641f091a35 100644 --- a/aten/src/ATen/native/NNPACK.cpp +++ b/aten/src/ATen/native/NNPACK.cpp @@ -1,10 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #if !AT_NNPACK_ENABLED() namespace at { diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp index ea604c426c3b4..a9cf36a004f4c 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp @@ -1,5 +1,5 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include @@ -8,6 +8,17 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp 
b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index 3d34091fd036a..cf60f56f9df44 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -1,11 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/NaiveDilatedConvolution.cpp b/aten/src/ATen/native/NaiveDilatedConvolution.cpp index fa7b30f5977ef..827bf204b093f 100644 --- a/aten/src/ATen/native/NaiveDilatedConvolution.cpp +++ b/aten/src/ATen/native/NaiveDilatedConvolution.cpp @@ -1,14 +1,25 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include #include #include #include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/NamedTensor.cpp b/aten/src/ATen/native/NamedTensor.cpp index d725c26a14631..6ee2f095b6d09 100644 --- a/aten/src/ATen/native/NamedTensor.cpp +++ b/aten/src/ATen/native/NamedTensor.cpp @@ -1,8 +1,30 @@ -#include -#include - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/vol2col.h b/aten/src/ATen/native/vol2col.h index 12718a8f00afc..2b2ee3b57b0c4 100644 --- a/aten/src/ATen/native/vol2col.h +++ b/aten/src/ATen/native/vol2col.h @@ -1,8 +1,6 @@ #pragma once -#include -#include -#include +#include namespace at { namespace native { From 
f7344e69c037cd76ec238e0796140d79ec8c57c6 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 17 Oct 2022 18:57:07 +0100 Subject: [PATCH 0091/1922] ATen/native (4/6): Use per-operator headers (#75574) Differential Revision: [D40126697](https://our.internmc.facebook.com/intern/diff/D40126697) Pull Request resolved: https://github.com/pytorch/pytorch/pull/75574 Approved by: https://github.com/malfet --- aten/src/ATen/PadNd.h | 2 + aten/src/ATen/native/NegateFallback.cpp | 1 + aten/src/ATen/native/Normalization.cpp | 44 +++++++- aten/src/ATen/native/Onehot.cpp | 12 ++- aten/src/ATen/native/PackedSequence.cpp | 17 ++- aten/src/ATen/native/PadNd.cpp | 23 +++- aten/src/ATen/native/PixelShuffle.cpp | 17 ++- aten/src/ATen/native/PointwiseOps.cpp | 15 ++- aten/src/ATen/native/Pooling.cpp | 27 ++++- aten/src/ATen/native/Pow.cpp | 15 ++- aten/src/ATen/native/QuantizedLinear.cpp | 26 +++-- aten/src/ATen/native/RNN.cpp | 48 ++++++++- aten/src/ATen/native/RangeFactories.cpp | 16 ++- aten/src/ATen/native/ReduceAllOps.cpp | 15 ++- aten/src/ATen/native/ReduceOps.cpp | 107 +++++++++++++++++-- aten/src/ATen/native/ReflectionPad.cpp | 21 +++- aten/src/ATen/native/Repeat.cpp | 13 ++- aten/src/ATen/native/ReplicationPadding.cpp | 19 +++- aten/src/ATen/native/Resize.cpp | 11 +- aten/src/ATen/native/RowwisePrune.cpp | 11 +- aten/src/ATen/native/Scalar.cpp | 12 ++- aten/src/ATen/native/SegmentReduce.cpp | 15 ++- aten/src/ATen/native/SobolEngineOps.cpp | 16 ++- aten/src/ATen/native/SobolEngineOpsUtils.cpp | 1 + aten/src/ATen/native/SobolEngineOpsUtils.h | 10 +- aten/src/ATen/native/SoftMax.cpp | 27 ++++- aten/src/ATen/native/Sorting.cpp | 36 ++++++- aten/src/ATen/native/SpectralOps.cpp | 69 ++++++++++-- aten/src/ATen/native/SummaryOps.cpp | 11 +- aten/src/ATen/native/TensorDimApply.h | 3 +- 30 files changed, 584 insertions(+), 76 deletions(-) diff --git a/aten/src/ATen/PadNd.h b/aten/src/ATen/PadNd.h index 2c0d67e9d5d3f..573d1a7b88ab7 100644 --- a/aten/src/ATen/PadNd.h +++ 
b/aten/src/ATen/PadNd.h @@ -1,4 +1,6 @@ #pragma once +#include +#include namespace at { diff --git a/aten/src/ATen/native/NegateFallback.cpp b/aten/src/ATen/native/NegateFallback.cpp index a2b134a91e40e..0a34b4f4331d6 100644 --- a/aten/src/ATen/native/NegateFallback.cpp +++ b/aten/src/ATen/native/NegateFallback.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 6911d780c1d0e..5169c5e58e9ad 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -1,18 +1,52 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include #include -#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index a0c061062174b..41b7a69618636 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -1,4 +1,14 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index 736829eb6d118..19b12b0819607 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -1,5 +1,20 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include #include +#else 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif #include diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index c6b18c1257b51..9421d537717c8 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -1,8 +1,29 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) { diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp index 2a100321a6400..e535909a73429 100644 --- a/aten/src/ATen/native/PixelShuffle.cpp +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -1,10 +1,21 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include -#include -#include #include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +#include +#include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/PointwiseOps.cpp b/aten/src/ATen/native/PointwiseOps.cpp index a99bc959eb958..8259135ce14a3 100644 --- a/aten/src/ATen/native/PointwiseOps.cpp +++ b/aten/src/ATen/native/PointwiseOps.cpp @@ -1,12 +1,17 @@ // Ternary and higher-order pointwise operations +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include -#include -#include +#include +#include +#include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif namespace at { namespace meta { diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 724c53fdd0c00..fcbe741ab0ea0 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -1,12 +1,31 @@ -#include - 
-#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include -#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include namespace at { namespace native { diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp index 4326853a8165a..7050524acebf2 100644 --- a/aten/src/ATen/native/Pow.cpp +++ b/aten/src/ATen/native/Pow.cpp @@ -1,11 +1,20 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include -#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index af7643ec18b6c..002bb1adc4386 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -1,20 +1,28 @@ -#include -#include -#include -#include -#include -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #ifdef USE_FBGEMM diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 670395893d8ef..52efc6929f54e 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -1,8 +1,10 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include -#include +#include +#include +#include +#include #include #include #include @@ -10,6 +12,46 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + int register_linear_params(); namespace at { namespace native { diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp index 038da93456edb..408bf0a27e6fe 100644 --- a/aten/src/ATen/native/RangeFactories.cpp +++ b/aten/src/ATen/native/RangeFactories.cpp @@ -1,13 +1,23 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include #include -#include #include -#include +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/ReduceAllOps.cpp b/aten/src/ATen/native/ReduceAllOps.cpp index 1ef5e9b93733c..e1d51a1666af2 100644 --- a/aten/src/ATen/native/ReduceAllOps.cpp +++ b/aten/src/ATen/native/ReduceAllOps.cpp @@ -1,8 +1,21 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include -#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include #include +#else +#include +#include +#include +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 2bb01abd51b5f..2fe5eee4a286d 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -1,21 +1,114 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include -#include -#include +#include #include #include #include +#include +#include +#include #include #include -#include -#include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include @@ -24,9 +117,7 @@ #include #include #include -#include #include -#include #include namespace at { diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index 7824de63805f3..3a6ad683d0457 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -1,9 +1,26 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index b6e5c04f77026..b671a2232044b 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -1,8 +1,19 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + template static void compute_cpu( index_t* repeat_ptr, diff --git a/aten/src/ATen/native/ReplicationPadding.cpp 
b/aten/src/ATen/native/ReplicationPadding.cpp index 40fdb788a4ffa..d0a4ea919acbf 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -1,9 +1,24 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index 08286f3983cc9..bd47a25e69601 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -1,9 +1,16 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/RowwisePrune.cpp b/aten/src/ATen/native/RowwisePrune.cpp index 40ae2215cbccc..c27707c4d3075 100644 --- a/aten/src/ATen/native/RowwisePrune.cpp +++ b/aten/src/ATen/native/RowwisePrune.cpp @@ -1,8 +1,17 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include +#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 7342c4806d44c..f8932ea03bb2e 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -1,5 +1,15 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include #include +#else +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 3e562b7cf859f..1e5e28dab86b2 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -1,10 +1,23 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/SobolEngineOps.cpp b/aten/src/ATen/native/SobolEngineOps.cpp index 48366976a2e70..187faeba16a7b 100644 --- a/aten/src/ATen/native/SobolEngineOps.cpp +++ b/aten/src/ATen/native/SobolEngineOps.cpp @@ -1,11 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.cpp b/aten/src/ATen/native/SobolEngineOpsUtils.cpp index ef7cbb1faae92..709d5c06d3c97 100644 --- a/aten/src/ATen/native/SobolEngineOpsUtils.cpp +++ b/aten/src/ATen/native/SobolEngineOpsUtils.cpp @@ -1,4 +1,5 @@ /// This file contains tensor-agnostic SoboleEngine constants +#define 
TORCH_ASSERT_ONLY_METHOD_OPERATORS #include /* diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.h b/aten/src/ATen/native/SobolEngineOpsUtils.h index d3d7a362f2e87..495a43ed8a7cf 100644 --- a/aten/src/ATen/native/SobolEngineOpsUtils.h +++ b/aten/src/ATen/native/SobolEngineOpsUtils.h @@ -1,6 +1,14 @@ /// This file contains some tensor-agnostic operations to be used in the /// core functions of the `SobolEngine` -#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index d9d1b90534d73..0e3dafb24e9e8 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -1,13 +1,36 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include #include #include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 66b9daf7fad8c..3b50d7744aa28 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -1,8 +1,16 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include #include #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -11,6 +19,32 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/SpectralOps.cpp 
b/aten/src/ATen/native/SpectralOps.cpp index f39eeaccf9d4f..2840b1651dba5 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -1,16 +1,67 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include #include -#include -#include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include -#include -#include namespace at { namespace native { @@ -157,7 +208,7 @@ Tensor fft_c2r(c10::string_view function_name, const auto norm = norm_from_string(norm_str, forward); if (forward) { // FIXME: _fft does not support complex_output=false with inverse=false - input = at::conj(input); + input = input.conj(); } return fft_c2r_maybe_out( function_name, out, input, dim, static_cast(norm), n); @@ -192,7 +243,7 @@ Tensor fft_r2c(c10::string_view function_name, if (!forward) { // FIXME: _fft_r2c doesn't support native r2c IFFT - return out.defined() ? at::conj_physical_out(out, ret) : at::conj(ret); + return out.defined() ? at::conj_physical_out(out, ret) : ret.conj(); } else { return ret; } @@ -521,7 +572,7 @@ static Tensor fft_hfftn_impl( } const auto last_dim = desc.dim.back(); - tmp = at::conj(tmp); + tmp = tmp.conj(); return fft_c2r_maybe_out(fname, out, tmp, last_dim, norm, last_dim_size); } @@ -559,7 +610,7 @@ static Tensor fft_ihfftn_impl( const auto last_dim = desc.dim.back(); auto tmp = at::_fft_r2c(x, last_dim, norm, /*onesided=*/true); if (desc.dim.size() == 1) { - return out.defined() ? 
at::conj_physical_out(tmp, out) : at::conj(tmp); + return out.defined() ? at::conj_physical_out(tmp, out) : tmp.conj(); } tmp = at::conj_physical(tmp); diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp index e7dbe72576721..ae0b38c96efa7 100644 --- a/aten/src/ATen/native/SummaryOps.cpp +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -1,10 +1,17 @@ // Returns the frequency of elements of input non-negative integer tensor. +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include +#include #include #include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/TensorDimApply.h b/aten/src/ATen/native/TensorDimApply.h index ad9ca857eeab8..e75cd40caf48b 100644 --- a/aten/src/ATen/native/TensorDimApply.h +++ b/aten/src/ATen/native/TensorDimApply.h @@ -1,4 +1,5 @@ -#include +#pragma once +#include #include namespace at { From 90123495d826964d165d01bd70a3b6fec8e86374 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 17 Oct 2022 18:57:07 +0100 Subject: [PATCH 0092/1922] ATen/native (5/6): Use per-operator headers (#75575) Differential Revision: [D40126696](https://our.internmc.facebook.com/intern/diff/D40126696) Pull Request resolved: https://github.com/pytorch/pytorch/pull/75575 Approved by: https://github.com/malfet --- .../ATen/native/TensorAdvancedIndexing.cpp | 74 ++++++- .../ATen/native/TensorAdvancedIndexingUtils.h | 2 +- aten/src/ATen/native/TensorCompare.cpp | 74 ++++++- aten/src/ATen/native/TensorConversions.cpp | 44 ++++- aten/src/ATen/native/TensorFactories.cpp | 90 +++++++-- aten/src/ATen/native/TensorFactories.h | 5 +- aten/src/ATen/native/TensorIteratorReduce.cpp | 11 +- aten/src/ATen/native/TensorProperties.cpp | 27 ++- aten/src/ATen/native/TensorShape.cpp | 181 +++++++++++++++++- aten/src/ATen/native/TensorShape.h | 7 - .../src/ATen/native/TensorTransformations.cpp | 21 +- aten/src/ATen/native/TestOps.cpp | 19 +- 
aten/src/ATen/native/TriangularOps.cpp | 18 +- aten/src/ATen/native/TriangularOpsUtils.h | 2 +- aten/src/ATen/native/TypeProperties.cpp | 26 ++- 15 files changed, 537 insertions(+), 64 deletions(-) diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 2f7dbaf45252f..3004dc1b31c79 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -47,31 +47,93 @@ // ...) // // where & and * represent the C-style address-of and indirection operations. +// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include -#include -#include +#include +#include +#include +#include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include #include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index 3e786bf7db4fc..0c0db4b83f351 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 856d684c52e85..5d3ee7d98d803 100644 --- 
a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -1,19 +1,73 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include +#include #include #include -#include -#include -#include #include -#include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif namespace at { namespace meta { diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 2af35c66a0b9e..ec699bf1bf7fa 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -1,8 +1,50 @@ +// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include +#include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif #include #include diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 2e01f7e8699ad..9d1c6d8a36333 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ 
b/aten/src/ATen/native/TensorFactories.cpp @@ -1,31 +1,99 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include #include #include #include +#include #include #include -#include #include +#include +#include +#include #include -#include -#include -#include -#include #include -#include #include #include -#include -#include +#include + #ifndef AT_PER_OPERATOR_HEADERS #include +#include #else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #endif #include -#include -#include #include #include #include diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index 35e058df4b3ab..2c0665518a9e3 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -1,10 +1,9 @@ #pragma once #include -#include +#include +#include #include -#include -#include #ifndef AT_PER_OPERATOR_HEADERS #include diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index ea772bfe7e641..606a442226876 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -1,11 +1,14 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include -#include -#include -#include -#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include /// 
Contains the implementation of parallel reductions in TensorIterator. diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 7941f2e3b758c..6a703cbe07f90 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -1,12 +1,27 @@ -#include -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include -#include -#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif -#include #include + namespace at { namespace native { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 6543509d3dcb8..e1f9835184cbd 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,12 +1,17 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include #include +#include #include #include #include #include -#include #include #include +#include #include #include #include @@ -26,6 +31,178 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/TensorShape.h b/aten/src/ATen/native/TensorShape.h index 21d0ba78261ec..60e2533e9b538 100644 --- a/aten/src/ATen/native/TensorShape.h +++ b/aten/src/ATen/native/TensorShape.h @@ -53,11 +53,4 @@ inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t di return num_splits; } -/// -/// For more information, see -/// https://pytorch.org/docs/master/generated/torch.Tensor.unfold.html#torch.Tensor.unfold -/// - -Tensor unfold(const Tensor& self, int64_t dimension, int64_t size, int64_t step); - }} // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index f0e2c0f02caa7..028b05e66930e 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -1,14 +1,31 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include // for flip_stub -#include -#include #include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index a8c30f5c3ba61..f36765436991e 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -1,10 +1,25 @@ // Copyright 2004-present Facebook. All Rights Reserved. +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include +#include #include -#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index f98018d7fe5a5..fbdd204f64307 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -1,14 +1,24 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/TriangularOpsUtils.h b/aten/src/ATen/native/TriangularOpsUtils.h index c5bce42ed3fd7..e380a510bddeb 100644 --- a/aten/src/ATen/native/TriangularOpsUtils.h +++ b/aten/src/ATen/native/TriangularOpsUtils.h @@ -1,4 +1,4 @@ -#include +#include #include namespace at { diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp index feceb75631cec..36354c133a98e 100644 --- a/aten/src/ATen/native/TypeProperties.cpp +++ b/aten/src/ATen/native/TypeProperties.cpp @@ -1,8 +1,26 @@ -#include -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif namespace at { namespace native { From 6c80bbdd6f603731292e3b96c07cb06269111d0f Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 17 Oct 2022 18:57:07 +0100 Subject: [PATCH 0093/1922] ATen/native (6/6): Use per-operator headers (#75576) Differential Revision: [D40126699](https://our.internmc.facebook.com/intern/diff/D40126699) Pull Request resolved: https://github.com/pytorch/pytorch/pull/75576 Approved by: https://github.com/malfet --- aten/src/ATen/native/ComplexHelper.h | 9 +- aten/src/ATen/native/UnaryOps.cpp | 176 ++++++++++++++++-- aten/src/ATen/native/Unfold2d.cpp | 1 + aten/src/ATen/native/Unfold3d.cpp | 4 +- aten/src/ATen/native/UnfoldBackward.h | 5 +- aten/src/ATen/native/Unique.cpp | 21 ++- aten/src/ATen/native/UpSample.cpp | 1 + aten/src/ATen/native/UpSampleBicubic2d.cpp | 20 +- aten/src/ATen/native/UpSampleBilinear2d.cpp | 19 +- aten/src/ATen/native/UpSampleLinear1d.cpp | 16 +- aten/src/ATen/native/UpSampleNearest1d.cpp | 20 +- aten/src/ATen/native/UpSampleNearest2d.cpp | 19 +- aten/src/ATen/native/UpSampleNearest3d.cpp | 19 +- aten/src/ATen/native/UpSampleTrilinear3d.cpp | 15 +- aten/src/ATen/native/VariableMethodStubs.cpp | 20 +- aten/src/ATen/native/WeightNorm.cpp | 20 +- .../ATen/native/cpu/UnfoldBackwardKernel.cpp | 1 + .../ATen/native/cuda/UnfoldBackwardKernel.cu | 1 + aten/src/ATen/native/group_norm.cpp | 23 ++- aten/src/ATen/native/layer_norm.cpp | 25 ++- .../src/ATen/native/prim_native_functions.cpp | 9 +- 21 files changed, 388 insertions(+), 56 deletions(-) diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index 88668d13145c5..8d69f6292772c 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -1,8 +1,15 @@ #pragma once -#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + // WARNING: this header 
contains non-inline functions and should be only // included from ONE cpp file diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index c301d8ecc26a2..845610ce373e7 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -1,26 +1,174 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include +#include +#include +#include +#include +#include #include -#include -#include -#include #include #include -#include -#include #include -#include -#include -#include -#include -#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif -#include +#include namespace at { diff --git a/aten/src/ATen/native/Unfold2d.cpp b/aten/src/ATen/native/Unfold2d.cpp index 0a3b760a33fda..60bbc8a777121 100644 --- a/aten/src/ATen/native/Unfold2d.cpp +++ b/aten/src/ATen/native/Unfold2d.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include namespace at { namespace native { diff --git a/aten/src/ATen/native/Unfold3d.cpp b/aten/src/ATen/native/Unfold3d.cpp index 3495f92dc3ce6..1a2d0ea2ae1f9 100644 --- a/aten/src/ATen/native/Unfold3d.cpp +++ b/aten/src/ATen/native/Unfold3d.cpp @@ -1,5 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 1f6c8fa1b289c..cb4856ec2718e 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -1,10 +1,9 @@ #pragma once #include -#include +#include #include -#include -#include +#include #ifndef AT_PER_OPERATOR_HEADERS #include diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index f418611e08644..92b48c9f388ca 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -1,8 +1,27 @@ // Returns unique elements of input tensor. +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include +#include #include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif #include #include diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp index db75b7e99fdb1..1a6af75260300 100644 --- a/aten/src/ATen/native/UpSample.cpp +++ b/aten/src/ATen/native/UpSample.cpp @@ -1,4 +1,5 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 5bf7ba6a53666..3a0fa941a4d4a 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -1,8 +1,24 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index 527555a066abb..69c856f06fcbf 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -1,11 +1,26 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp index b100450c2b6a7..048d4b5a3d9c1 100644 --- a/aten/src/ATen/native/UpSampleLinear1d.cpp +++ b/aten/src/ATen/native/UpSampleLinear1d.cpp @@ -1,10 +1,22 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include +#include +#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 
83121ed3be45b..5cc53dea349b7 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -1,7 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index ee5dce4a02eff..14c7a7d1b74f0 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -1,9 +1,24 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index 0e4040980ae26..73948f66fa769 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -1,8 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index 73fffbe5afe79..76bc4da85addb 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -1,11 +1,22 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include +#include +#include #include 
#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace meta { diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index ce5432e677af2..6191717930aec 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -1,5 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include #include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif // The stubs in here are used by dynamic dispatch. It just redirects everything // to the Tensor method we manually bind in TensorBody.h. diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp index bf258d80a0fb3..8291120f19603 100644 --- a/aten/src/ATen/native/WeightNorm.cpp +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -1,11 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include -#include -#include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp index 129ab3a973e3a..cf934586c74e7 100644 --- a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp +++ b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu index 90f5238d0180d..7865a7f61545f 100644 --- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu @@ -1,6 +1,7 @@ #define 
TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 24a23577e490e..5b38b02702828 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -1,15 +1,24 @@ -#include -#include -#include -#include -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include -#include #include #include diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp index 80a7bb6111f23..71dc42da380b2 100644 --- a/aten/src/ATen/native/layer_norm.cpp +++ b/aten/src/ATen/native/layer_norm.cpp @@ -1,17 +1,26 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include -#include -#include -#include +#include #include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif #include -#include -#include #include #include diff --git a/aten/src/ATen/native/prim_native_functions.cpp b/aten/src/ATen/native/prim_native_functions.cpp index 8f82345c19058..4e79c112d7fc6 100644 --- a/aten/src/ATen/native/prim_native_functions.cpp +++ b/aten/src/ATen/native/prim_native_functions.cpp @@ -1,4 +1,11 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif namespace at { namespace native { From cf709008be0404e86789432108c2f94125275b0f Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 24 Oct 2022 16:36:25 -0700 Subject: [PATCH 0094/1922] Fix bernoulli functionalization. (#87573) For testing, see https://github.com/pytorch/pytorch/issues/87571 Signed-off-by: Edward Z. 
Yang cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87573 Approved by: https://github.com/albanD --- .github/ci_commit_pins/xla.txt | 2 +- aten/src/ATen/native/native_functions.yaml | 2 ++ torch/_inductor/decomposition.py | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 3ab9c4394d70b..6d16c6159e998 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -0cb29daa04097c868d23ed666563a3439d67065c +cf5dea047d1c9c63a201fb1b97b690416b683dde diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index faab6371c8af1..d514cae670855 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -985,6 +985,8 @@ device_check: NoCheck # TensorIterator variants: function, method tags: nondeterministic_seeded + dispatch: + CompositeExplicitAutogradNonFunctional: bernoulli - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py index 6fed9ca691240..c22a8406b9b61 100644 --- a/torch/_inductor/decomposition.py +++ b/torch/_inductor/decomposition.py @@ -304,6 +304,12 @@ def bernoulli(self, *, generator=None): return torch.rand_like(self, dtype=torch.float32) < self +@register_decomposition([aten.bernoulli.p]) +def bernoulli_p(self, p=0.5, *, generator=None): + assert generator is None + return torch.rand_like(self, dtype=torch.float32) < p + + """ Some decomps result in differences from eager related to randomness. We put these decomps in a separate table `extra_random_decomps` to allow From d444835764f2e1e609a248c0d9f9be81587cdb40 Mon Sep 17 00:00:00 2001 From: "Edward Z. 
Yang" Date: Mon, 24 Oct 2022 19:40:19 -0400 Subject: [PATCH 0095/1922] Make me codeowner of test_aotdispatch.py (#87624) Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87624 Approved by: https://github.com/albanD --- CODEOWNERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CODEOWNERS b/CODEOWNERS index 8fdc5fc776632..3d030ad4d9e45 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -112,3 +112,6 @@ torch/csrc/autograd/profiler* @robieta torch/autograd/profiler* @robieta torch/csrc/profiler/ @robieta torch/profiler/ @robieta + +# AOTDispatch tests +test/functorch/test_aotdispatch.py @ezyang @Chillee From 5f987d9af2f328ddf2760e1a6c214647a5aa08a6 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 24 Oct 2022 23:52:44 +0000 Subject: [PATCH 0096/1922] Fix typo under docs directory (#87583) This PR fixes typo in `.rst` files under docs directory Pull Request resolved: https://github.com/pytorch/pytorch/pull/87583 Approved by: https://github.com/kit1980 --- docs/cpp/source/notes/tensor_cuda_stream.rst | 2 +- docs/source/nested.rst | 2 +- docs/source/notes/modules.rst | 2 +- docs/source/notes/numerical_accuracy.rst | 2 +- docs/source/quantization-accuracy-debugging.rst | 2 +- docs/source/quantization.rst | 2 +- docs/source/sparse.rst | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/cpp/source/notes/tensor_cuda_stream.rst b/docs/cpp/source/notes/tensor_cuda_stream.rst index b80615e8f7f10..bdb66361d9a70 100644 --- a/docs/cpp/source/notes/tensor_cuda_stream.rst +++ b/docs/cpp/source/notes/tensor_cuda_stream.rst @@ -206,7 +206,7 @@ CUDA Stream Usage Examples // sum() on tensor0 uses default CUDA stream as current CUDA stream on device 0 tensor0.sum(); - // sum() on tensor1 uses defualt CUDA stream as current CUDA stream on device 1 + // sum() on tensor1 uses default CUDA stream as current CUDA stream on device 1 tensor1.sum(); .. 
attention:: diff --git a/docs/source/nested.rst b/docs/source/nested.rst index 4cfb5bdf701ae..21ff980256911 100644 --- a/docs/source/nested.rst +++ b/docs/source/nested.rst @@ -199,7 +199,7 @@ NestedTensor and any constraints they have. :func:`torch.add`; "Supports elementwise addition of two nested tensors. Supports addition of a scalar to a nested tensor." :func:`torch.mul`; "Supports elementwise multiplication of two nested tensors. - Supports multipication of a nested tensor by a scalar." + Supports multiplication of a nested tensor by a scalar." :func:`torch.select`; "Supports selecting along ``dim=0`` only (analogously ``nt[i]``)." :func:`torch.clone`; "Behavior is the same as on regular tensors." :func:`torch.detach`; "Behavior is the same as on regular tensors." diff --git a/docs/source/notes/modules.rst b/docs/source/notes/modules.rst index 7eea02dfa857f..49b27a0ae0142 100644 --- a/docs/source/notes/modules.rst +++ b/docs/source/notes/modules.rst @@ -599,7 +599,7 @@ PyTorch provides two types of hooks for modules: * **Forward hooks** are called during the forward pass. They can be installed for a given module with :func:`~torch.nn.Module.register_forward_pre_hook` and :func:`~torch.nn.Module.register_forward_hook`. These hooks will be called respectively just before the forward function is called and just after it is called. - Alternatively, these hooks can be installed globally for all modules with the analagous + Alternatively, these hooks can be installed globally for all modules with the analogous :func:`~torch.nn.modules.module.register_module_forward_pre_hook` and :func:`~torch.nn.modules.module.register_module_forward_hook` functions. * **Backward hooks** are called during the backward pass. 
They can be installed with diff --git a/docs/source/notes/numerical_accuracy.rst b/docs/source/notes/numerical_accuracy.rst index b1d05f9460419..fad14ed912027 100644 --- a/docs/source/notes/numerical_accuracy.rst +++ b/docs/source/notes/numerical_accuracy.rst @@ -34,7 +34,7 @@ even though mathematically it's an identical computation. Similarly, an operation applied to a tensor slice is not guaranteed to produce results that are identical to the slice of the result of the same operation applied to the full tensor. E.g. let -``A`` be a 2-dimentional tensor. ``A.sum(-1)[0]`` is not guaranteed to be bitwise equal to +``A`` be a 2-dimensional tensor. ``A.sum(-1)[0]`` is not guaranteed to be bitwise equal to ``A[:,0].sum()``. Extremal values diff --git a/docs/source/quantization-accuracy-debugging.rst b/docs/source/quantization-accuracy-debugging.rst index 69bda8706cc67..0fa590abd2f0c 100644 --- a/docs/source/quantization-accuracy-debugging.rst +++ b/docs/source/quantization-accuracy-debugging.rst @@ -6,7 +6,7 @@ accuracy. If a quantized model has error compared to the original model, we can categorize the error into: 1. **data insensitive error** - caused by intrinsic model quantization error, - large portion of input data has large errror + large portion of input data has large error 2. **data sensitive error** - caused by outlier input data, small portion of input data has large error 3. **implementation error** - quantized kernel is not matching reference implementation diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index 34cbad9b52cc3..e00720570a1a3 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -258,7 +258,7 @@ PTSQ API Example:: # attach a global qconfig, which contains information about what kind # of observers to attach. Use 'fbgemm' for server inference and # 'qnnpack' for mobile inference. 
Other quantization configurations such - # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # as selecting symmetric or asymmetric quantization and MinMax or L2Norm # calibration techniques can be specified here. model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm') diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst index 2da6a6faaee55..29790312cb3b8 100644 --- a/docs/source/sparse.rst +++ b/docs/source/sparse.rst @@ -117,7 +117,7 @@ Operator overview Fundamentally, operations on Tensor with sparse storage formats behave the same as operations on Tensor with strided (or other) storage formats. The particularities of storage, that is the physical layout of the data, influences the performance of -an operation but shhould not influence the semantics. +an operation but should not influence the semantics. We are actively increasing operator coverage for sparse tensors. Users should not From fd80049e44992fec873d887e864f442d13e6d9e5 Mon Sep 17 00:00:00 2001 From: albanD Date: Mon, 24 Oct 2022 15:37:20 -0400 Subject: [PATCH 0097/1922] Improve argument printing (#87601) No more "expected tuple but got tuple". We appropriately grovel in the list/tuple for the element that mismatched and report what exactly twinged the failure. invalid_arguments.cpp is a shitshow so I did something slapdash to get it not completely horrible. See https://github.com/pytorch/pytorch/issues/87514 for more context. Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87601 Approved by: https://github.com/Chillee --- test/test_native_functions.py | 42 +++++++++++++- torch/csrc/utils/invalid_arguments.cpp | 29 +++++++++- torch/csrc/utils/python_arg_parser.cpp | 77 ++++++++++++++++++++------ torch/csrc/utils/python_arg_parser.h | 3 +- 4 files changed, 130 insertions(+), 21 deletions(-) diff --git a/test/test_native_functions.py b/test/test_native_functions.py index 831998cbf6be2..ba7889e10f4c5 100644 --- a/test/test_native_functions.py +++ b/test/test_native_functions.py @@ -19,6 +19,46 @@ def forward(self, values, incr: Optional[List[int]]): class TestNativeFunctions(TestCase): + def _lists_with_str(self): + return [ + ("foo",), + (2, "foo"), + ("foo", 3), + ["foo"], + [2, "foo"], + ["foo", 3], + "foo", + ] + + def _test_raises_str_typeerror(self, fn): + for arg in self._lists_with_str(): + self.assertRaisesRegex(TypeError, "str", lambda: fn(arg)) + try: + fn(arg) + except TypeError as e: + print(e) + + def test_symintlist_error(self): + x = torch.randn(1) + self._test_raises_str_typeerror(lambda arg: torch._C._nn.pad(x, arg)) + + def test_vararg_symintlist_error(self): + self._test_raises_str_typeerror(lambda arg: torch.rand(arg)) + self._test_raises_str_typeerror(lambda arg: torch.rand(*arg)) + + def test_symintlist_error_with_overload_but_is_unique(self): + x = torch.randn(1) + y = torch.randn(1) + self._test_raises_str_typeerror(lambda arg: x.set_(y, 0, arg)) + + def test_symintlist_error_with_overload(self): + x = torch.randn(1) + self._test_raises_str_typeerror(lambda arg: x.view(arg)) + + def test_intlist_error_with_overload(self): + x = torch.randn(1) + self._test_raises_str_typeerror(lambda arg: torch._C._nn.pad(x, arg)) + # # optional float list # @@ -113,7 +153,7 @@ def fake_module(values, const): self.do_test_optional_intlist_with_module(fake_module) def test_optional_intlist_invalid(self): - with self.assertRaisesRegex(TypeError, "must be .* not"): + 
with self.assertRaisesRegex(TypeError, "must be .* but found"): IntListWrapperModule()(torch.zeros(1), [0.5]) with self.assertRaisesRegex(RuntimeError, "value of type .* instead found type"): diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index e76b9cf22ff50..9ff3e71fdc960 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -272,7 +272,34 @@ std::string _formattedArgDesc( result += red; if (is_kwarg) result += option.arguments[i].name + "="; - result += py_typename(arg); + bool is_tuple = PyTuple_Check(arg); + if (is_tuple || PyList_Check(arg)) { + result += py_typename(arg) + " of "; + auto num_elements = PySequence_Length(arg); + if (is_tuple) { + result += "("; + } else { + result += "["; + } + for (const auto i : c10::irange(num_elements)) { + if (i != 0) { + result += ", "; + } + result += py_typename( + py::reinterpret_steal(PySequence_GetItem(arg, i)) + .ptr()); + } + if (is_tuple) { + if (num_elements == 1) { + result += ","; + } + result += ")"; + } else { + result += "]"; + } + } else { + result += py_typename(arg); + } if (is_matching) result += reset_green; else diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 177346614704f..f338d3f196adc 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -664,7 +664,10 @@ bool is_float_or_complex_list(PyObject* obj) { return true; } -static bool is_int_list(PyObject* obj, int broadcast_size) { +static bool is_int_list( + PyObject* obj, + int broadcast_size, + int64_t* failed_idx = nullptr) { if (PyTuple_Check(obj) || PyList_Check(obj)) { auto len = PySequence_Size(obj); if (len == 0) { @@ -684,6 +687,9 @@ static bool is_int_list(PyObject* obj, int broadcast_size) { for (int i = 1; i < len; i++) { if (torch::is_symint_node( py::reinterpret_steal(PySequence_GetItem(obj, i)))) { + if (failed_idx != nullptr) { + *failed_idx = i; + } 
return false; } } @@ -694,9 +700,13 @@ static bool is_int_list(PyObject* obj, int broadcast_size) { // NOTE: JIT tracer allows arbitrary scalar tensors to act as ints // in an intlist argument. Even float or complex scalar tensors. - return ( - jit::tracer::isTracing() && THPVariable_Check(item.ptr()) && - THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{}); + bool r = + (jit::tracer::isTracing() && THPVariable_Check(item.ptr()) && + THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{}); + if (!r && failed_idx != nullptr) { + *failed_idx = 0; + } + return r; } // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single // int @@ -711,7 +721,10 @@ static bool is_int_or_symint(PyObject* obj) { return torch::is_symint_node(py::handle(obj)) || THPUtils_checkIndex(obj); } -static bool is_int_or_symint_list(PyObject* obj, int broadcast_size) { +static bool is_int_or_symint_list( + PyObject* obj, + int broadcast_size, + int64_t* failed_idx = nullptr) { if (PyTuple_Check(obj) || PyList_Check(obj)) { if (PySequence_Size(obj) == 0) { return true; @@ -723,9 +736,13 @@ static bool is_int_or_symint_list(PyObject* obj, int broadcast_size) { } // NOTE: JIT tracer allows arbitrary scalar tensors to act as ints // in an intlist argument. Even float or complex scalar tensors. - return ( - jit::tracer::isTracing() && THPVariable_Check(item.ptr()) && - THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{}); + bool r = + (jit::tracer::isTracing() && THPVariable_Check(item.ptr()) && + THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{}); + if (!r && failed_idx != nullptr) { + *failed_idx = 0; + } + return r; } // if a size is specified (e.g. 
IntArrayRef[2]) we also allow passing a single // int @@ -736,7 +753,8 @@ static bool is_int_or_symint_list(PyObject* obj, int broadcast_size) { auto FunctionParameter::check( PyObject* obj, std::vector& overloaded_args, - int argnum) -> bool { + int argnum, + int64_t* failed_idx) -> bool { switch (type_) { case ParameterType::TENSOR: { if (is_tensor_and_append_overloaded(obj, &overloaded_args)) { @@ -793,7 +811,7 @@ auto FunctionParameter::check( obj, &overloaded_args, argnum, true /* throw_error */); } case ParameterType::INT_LIST: - return is_int_list(obj, size); + return is_int_list(obj, size, failed_idx); case ParameterType::FLOAT_LIST: return is_float_or_complex_list(obj); case ParameterType::GENERATOR: @@ -824,12 +842,13 @@ auto FunctionParameter::check( case ParameterType::SYM_INT: return is_int_or_symint(obj); case ParameterType::SYM_INT_LIST: - return is_int_or_symint_list(obj, size); + return is_int_or_symint_list(obj, size, failed_idx); default: throw std::runtime_error("unknown parameter type"); } } +// WARNING: these strings are parsed invalid_arguments.cpp std::string FunctionParameter::type_name() const { switch (type_) { case ParameterType::TENSOR: @@ -837,9 +856,10 @@ std::string FunctionParameter::type_name() const { case ParameterType::SCALAR: return "Number"; case ParameterType::INT64: - return "int"; + // NB: SymInt is intentionally not mentioned here, as conventional user + // use will only know about ints case ParameterType::SYM_INT: - return "SymInt"; + return "int"; case ParameterType::DOUBLE: return "float"; case ParameterType::COMPLEX: @@ -877,7 +897,7 @@ std::string FunctionParameter::type_name() const { case ParameterType::SCALAR_LIST: return "tuple of Scalars"; case ParameterType::SYM_INT_LIST: - return "tuple of SymInts"; + return "tuple of ints"; default: throw std::runtime_error("unknown parameter type"); } @@ -1341,6 +1361,8 @@ bool FunctionSignature::parse( is_kwd = true; } + int64_t failed_idx = -1; + bool varargs_eligible = 
allow_varargs_intlist && arg_pos == 0 && !is_kwd; if ((!obj && param.optional) || (obj == Py_None && param.allow_none)) { dst[i++] = nullptr; } else if (!obj) { @@ -1349,15 +1371,16 @@ bool FunctionSignature::parse( missing_args(*this, i); } return false; - } else if (param.check(obj, this->overloaded_args, i)) { + } else if (param.check(obj, this->overloaded_args, i, &failed_idx)) { dst[i++] = obj; // XXX: the Variable check is necessary because sizes become tensors when // tracer is enabled. This behavior easily leads to ambiguities, and we // should avoid having complex signatures that make use of it... } else if ( - allow_varargs_intlist && arg_pos == 0 && !is_kwd && - ((int_list_overload ? is_int_list(args, param.size) - : is_int_or_symint_list(args, param.size)))) { + varargs_eligible && + ((int_list_overload + ? is_int_list(args, param.size, &failed_idx) + : is_int_or_symint_list(args, param.size, &failed_idx)))) { // take all positional arguments as this parameter // e.g. permute(1, 2, 3) -> permute((1, 2, 3)) dst[i++] = args; @@ -1374,6 +1397,24 @@ bool FunctionSignature::parse( Py_TYPE(obj)->tp_name); } else { // foo(): argument 'other' (position 2) must be str, not int + if (failed_idx != -1) { + if (!(PyTuple_Check(obj) || PyList_Check(obj))) { + TORCH_INTERNAL_ASSERT(varargs_eligible); + obj = args; + } + TORCH_INTERNAL_ASSERT(failed_idx < PySequence_Size(obj)); + throw TypeError( + "%s(): argument '%s' (position %ld) must be %s, but found element of type %s at pos %ld", + name.c_str(), + param.name.c_str(), + static_cast(arg_pos + 1), + param.type_name().c_str(), + Py_TYPE(py::reinterpret_steal( + PySequence_GetItem(obj, failed_idx)) + .ptr()) + ->tp_name, + static_cast(failed_idx)); + } throw TypeError( "%s(): argument '%s' (position %ld) must be %s, not %s", name.c_str(), diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index a08441369db82..acb830addf8f7 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ 
b/torch/csrc/utils/python_arg_parser.h @@ -382,7 +382,8 @@ struct FunctionParameter { bool check( PyObject* obj, std::vector& overloaded_args, - int argnum); + int argnum, + int64_t* failed_idx = nullptr); void set_default_str(const std::string& str); std::string type_name() const; From 484ef6bd0663cac0d044efee9e32a2952dee40f7 Mon Sep 17 00:00:00 2001 From: albanD Date: Mon, 24 Oct 2022 15:37:20 -0400 Subject: [PATCH 0098/1922] Fix a PyObject leak (#87608) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87608 Approved by: https://github.com/ezyang --- torch/csrc/utils/invalid_arguments.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index 9ff3e71fdc960..49f591d1a64b3 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -82,7 +82,9 @@ struct SequenceType : public Type { return false; auto num_elements = PySequence_Length(object); for (const auto i : c10::irange(num_elements)) { - if (!type->is_matching(PySequence_GetItem(object, i))) + if (!type->is_matching( + py::reinterpret_steal(PySequence_GetItem(object, i)) + .ptr())) return false; } return true; From 67fe3ec534eac27bf8dbbb79aa10007ce47fb777 Mon Sep 17 00:00:00 2001 From: albanD Date: Mon, 24 Oct 2022 15:37:20 -0400 Subject: [PATCH 0099/1922] Add /= to c10::SymInt (#87603) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87603 Approved by: https://github.com/bdhirsh --- c10/core/SymInt.cpp | 4 ++++ c10/core/SymInt.h | 1 + 2 files changed, 5 insertions(+) diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp index 03f39078b406c..5ef576b3af1b0 100644 --- a/c10/core/SymInt.cpp +++ b/c10/core/SymInt.cpp @@ -155,6 +155,10 @@ void SymInt::operator*=(SymInt sci) { *this = *this * sci; } +void SymInt::operator/=(SymInt sci) { + *this = *this / sci; +} + void SymInt::operator+=(SymInt sci) { *this = *this + sci; } diff --git 
a/c10/core/SymInt.h b/c10/core/SymInt.h index f5c2ddf00998e..6934a607ccbff 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -169,6 +169,7 @@ class C10_API SymInt { bool operator>=(SymInt sci) const; void operator*=(SymInt sci); void operator+=(SymInt sci); + void operator/=(SymInt sci); SymInt min(SymInt sci) const; SymInt max(SymInt sci) const; From b2537ede13c5f736c848a9fab9af4347fcd8530c Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 25 Oct 2022 00:00:57 +0000 Subject: [PATCH 0100/1922] Fix typo in secrets name (#87655) They are case sensitive and should be all uppercase Pull Request resolved: https://github.com/pytorch/pytorch/pull/87655 Approved by: https://github.com/kit1980, https://github.com/weiwangmeta --- .github/workflows/build-triton-wheel.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index f602eaa30af4d..e3f02e6b77b36 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -137,8 +137,8 @@ jobs: env: PKG_DIR: "${{ runner.temp }}/artifacts" # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.aws-access-key-id }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.aws-pytorch-uploader-secret-access-key }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_UPDATE_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_UPDATE_SECRET_ACCESS_KEY }} UPLOAD_BUCKET: "s3://pytorch" run: | set -ex From b4b2b9ee327f507ab1929f0a8bdf43b1d8e0987f Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Tue, 25 Oct 2022 00:11:50 +0000 Subject: [PATCH 0101/1922] [docs] `batch_isend_irecv` and `P2POp` of torch.distributed (#86438) Reopening https://github.com/pytorch/pytorch/pull/79722 cc @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @H-Huang @kwen2501 @awgu Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/86438 Approved by: https://github.com/kit1980 --- docs/source/distributed.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index 8b1186fb4ceec..530ff88721048 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -350,6 +350,10 @@ as they should never be created manually, but they are guaranteed to support two .. autofunction:: irecv +.. autofunction:: batch_isend_irecv + +.. autoclass:: P2POp + Synchronous and asynchronous collective operations -------------------------------------------------- Every collective operation function supports the following two kinds of operations, From e7fc9e1138db5f58f20c4b0fa1907d1c113700b2 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 25 Oct 2022 00:18:31 +0000 Subject: [PATCH 0102/1922] Fix TensorShape.cpp compilation (#87654) Build failure introduced by landrace while merging https://github.com/pytorch/pytorch/pull/75575 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87654 Approved by: https://github.com/albanD --- aten/src/ATen/native/TensorShape.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index e1f9835184cbd..d25113577b2d5 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -89,13 +89,14 @@ #include #include #include -#include +#include #include #include #include #include #include #include +#include #include #include #include From 7cdfb6fef8b6de1f02211a0ba5a3036506910ef0 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Tue, 25 Oct 2022 00:50:13 +0000 Subject: [PATCH 0103/1922] [Kineto][Profiler] Rename Profiler post processing Index Key (#87477) Summary: Rather than using the full name Profiler Event Index, use a shortened name, Ev Idx. In the future, we should address this by adding a lookup table from short names to long names.
Test Plan: CI Reviewed By: robieta, slgong-fb Differential Revision: D40328758 Pulled By: aaronenyeshi Pull Request resolved: https://github.com/pytorch/pytorch/pull/87477 Approved by: https://github.com/chaekit --- torch/csrc/profiler/collection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp index e76cfd5946db9..01b7c4024f269 100644 --- a/torch/csrc/profiler/collection.cpp +++ b/torch/csrc/profiler/collection.cpp @@ -522,7 +522,7 @@ void mark_finished(std::shared_ptr& r) { TORCH_INTERNAL_ASSERT(r->endTimeNS() >= r->start_time_ns_, r->name()); } -static constexpr const char* indexKey = "Profiler Event Index"; +static constexpr const char* indexKey = "Ev Idx"; void passEventsToKineto( const std::vector>& results, From 0752e41f6b48585512fe685b27ef1aae5329e932 Mon Sep 17 00:00:00 2001 From: erjia Date: Tue, 25 Oct 2022 01:27:56 +0000 Subject: [PATCH 0104/1922] [DataLoader2] Change serialization wrapper to iterator (#87459) This is a temporary fix for an internal SEV. We have run three different workflows to validate that this fix would unblock the internal SEV.
And here are a few follow-up tasks: - [ ] Create reproducible test for multithreading with generator - [ ] Figure out how to make fullsynciterator work properly with generator - [ ] Move Wrapper back to generator if needed Pull Request resolved: https://github.com/pytorch/pytorch/pull/87459 Approved by: https://github.com/NivekT --- torch/utils/data/datapipes/datapipe.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/torch/utils/data/datapipes/datapipe.py b/torch/utils/data/datapipes/datapipe.py index 43adc00bdddaf..42120148d0269 100644 --- a/torch/utils/data/datapipes/datapipe.py +++ b/torch/utils/data/datapipes/datapipe.py @@ -356,8 +356,17 @@ def __len__(self): class _IterDataPipeSerializationWrapper(_DataPipeSerializationWrapper, IterDataPipe): - def __iter__(self): - yield from self._datapipe + def __init__(self, datapipe: IterDataPipe[T_co]): + super().__init__(datapipe) + self._datapipe_iter: Optional[Iterator[T_co]] = None + + def __iter__(self) -> "_IterDataPipeSerializationWrapper": + self._datapipe_iter = iter(self._datapipe) + return self + + def __next__(self) -> T_co: + assert self._datapipe_iter is not None + return next(self._datapipe_iter) class _MapDataPipeSerializationWrapper(_DataPipeSerializationWrapper, MapDataPipe): From 9996730a0de852d076aba81dbcc44180012f0566 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 25 Oct 2022 01:45:23 +0000 Subject: [PATCH 0105/1922] Add cached conda env files for macos (arm64, x86) (#87541) So far, we only cache macos conda dependency for build workflow. All the test dependencies are still not cached and installed by the CI. This PR introduces a new `.github/requirements` directory in which I plan to explicitly include all the conda and pip build and test dependencies across all platforms.
This allows pip and conda installation to be consolidated in one place (and properly cached) Those conda dependencies come from https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/macos-common.sh. Once this PR is merged, I will follow up with another one to clean up all conda installation from that file (to make sure that nothing break along the way) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87541 Approved by: https://github.com/ZainRizvi --- .github/requirements/README.md | 19 +++++++++++++++++++ .github/requirements/conda-env-macOS-ARM64 | 16 ++++++++++++++++ .github/requirements/conda-env-macOS-X64 | 18 ++++++++++++++++++ .github/workflows/_mac-test.yml | 3 ++- 4 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 .github/requirements/README.md create mode 100644 .github/requirements/conda-env-macOS-ARM64 create mode 100644 .github/requirements/conda-env-macOS-X64 diff --git a/.github/requirements/README.md b/.github/requirements/README.md new file mode 100644 index 0000000000000..654bb04558b9b --- /dev/null +++ b/.github/requirements/README.md @@ -0,0 +1,19 @@ +### Cached requirements and consolidation of conda and pip installation + +At the moment, the installation of conda and pip dependencies happens at +different places in the CI depending at the whim of different +developers, which makes it very challenging to handle issues like +network flakiness or upstream dependency failures gracefully. So, this +center directory is created to gradually include all the conda enviroment +and pip requirement files that are used to setup CI jobs. Not only it +gives a clear picture of all the dependencies required by different CI +jobs, but it also allows them to be cached properly to improve CI +reliability. + +The list of support files are as follows: + +* Conda: + * conda-env-macOS-ARM64. This is used by MacOS (m1, arm64) build and + test jobs to setup the conda environment + * conda-env-macOS-X64. 
This is use by MacOS (x86-64) build and test + jobs to setup the conda environment diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64 new file mode 100644 index 0000000000000..6e7e4221a85ba --- /dev/null +++ b/.github/requirements/conda-env-macOS-ARM64 @@ -0,0 +1,16 @@ +numpy=1.22.3 +pyyaml=6.0 +setuptools=61.2.0 +cmake=3.22.1 +cffi=1.15.1 +typing_extensions=4.3.0 +dataclasses=0.8 +pip=22.2.2 +six=1.16.0 +pillow=9.2.0 +libuv=1.39.0 +pkg-config=0.29.2 +wheel=0.37.1 + +# Not pinning certifi so that we can always get the latest certificates +certifi diff --git a/.github/requirements/conda-env-macOS-X64 b/.github/requirements/conda-env-macOS-X64 new file mode 100644 index 0000000000000..81463d4b39d56 --- /dev/null +++ b/.github/requirements/conda-env-macOS-X64 @@ -0,0 +1,18 @@ +mkl=2021.2.0 +mkl-include=2021.2.0 +numpy=1.18.5 +pyyaml=5.3 +setuptools=46.0.0 +cmake=3.22.1 +cffi=1.15.1 +typing_extensions=4.3.0 +dataclasses=0.8 +pip=22.2.2 +six=1.16.0 +pillow=9.2.0 +libuv=1.40.0 +pkg-config=0.29.2 +wheel=0.37.1 + +# Not pinning certifi so that we can always get the latest certificates +certifi diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 72ee311498503..db524cae464b6 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -82,7 +82,6 @@ jobs: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@master - - name: Download build artifacts uses: ./.github/actions/download-build-artifacts with: @@ -94,12 +93,14 @@ jobs: uses: pytorch/test-infra/.github/actions/setup-miniconda@main with: python-version: 3.8 + environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} - name: Setup miniconda (arm64, py3.9) if: ${{ runner.arch == 'ARM64' }} uses: pytorch/test-infra/.github/actions/setup-miniconda@main with: python-version: 3.9 + environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }} 
- name: Start monitoring script id: monitor-script From 68a21b7ca5e2ac9f43df576212f953b3e30c3b7b Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Mon, 24 Oct 2022 21:52:12 +0000 Subject: [PATCH 0106/1922] Defer importing meta_table (#87630) This is needed to work around an internal test failure: https://www.internalfb.com/tasks/?t=135878641 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87630 Approved by: https://github.com/eellison, https://github.com/khabinov --- torch/_subclasses/fake_tensor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index bb6970303facd..56b6d4b826af7 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -9,7 +9,6 @@ from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union import torch -from torch._decomp import meta_table as meta_table from torch._ops import OpOverload from torch._subclasses.meta_utils import MetaConverter, WeakTensorRefKey from torch.fx.operator_schemas import normalize_function @@ -761,6 +760,8 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None): has_symbolic_sizes and func not in self.functions_with_cpp_meta_impl_that_support_symint ): + from torch._decomp import meta_table as meta_table + with no_dispatch(): if func == aten.size.default: sys.stderr.write( From c682fa471571ea279710785ef63162740983843a Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 25 Oct 2022 02:49:11 +0000 Subject: [PATCH 0107/1922] Fix incorrect param names in get_testing_overrides (#87625) This PR fixes incorrect parameter names for lambda in `get_testing_overrides()` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87625 Approved by: https://github.com/kit1980 --- torch/overrides.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/overrides.py b/torch/overrides.py index c463cf3ca94d4..95e7c66111b5b 100644 --- 
a/torch/overrides.py +++ b/torch/overrides.py @@ -416,7 +416,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.concatenate: lambda tensors, dim=0, out=None: -1, # alias for torch.concatenate torch.cdist: lambda x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary': -1, torch.ceil: lambda input, out=None: -1, - torch.celu: lambda input, alhpa=1., inplace=False: -1, + torch.celu: lambda input, alpha=1., inplace=False: -1, torch.chain_matmul: lambda *matrices, out=None: -1, torch.channel_shuffle: lambda input, groups : -1, torch.cholesky: lambda input, upper=False, out=None: -1, @@ -572,7 +572,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.grid_sampler_2d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1, torch.grid_sampler_3d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1, torch.group_norm: lambda input, num_groups, weight=None, bias=None, eps=1e-05, cudnn_enabled=True: -1, - torch.gru: lambda input, hx, params, has_biases, num_layers, gropout, train, bidirectional, batch_first: -1, + torch.gru: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1, torch.gru_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1, torch.gt: lambda input, other, out=None: -1, torch.greater: lambda input, other, out=None: -1, From e2d130fca32a1c04ab0692fb2c20f46e09b3145a Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 25 Oct 2022 03:22:27 +0000 Subject: [PATCH 0108/1922] [Inductor] Truncate function expr str if it's too long at RecordLoadStore (#87248) See context at https://github.com/pytorch/torchdynamo/issues/1352#issuecomment-1283131872 Fixes https://github.com/pytorch/torchdynamo/issues/1352 cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @penguinwu Pull Request resolved: https://github.com/pytorch/pytorch/pull/87248 Approved by: https://github.com/jansel --- test/inductor/test_torchinductor.py | 25 
+++++++++++++++++++++++++ torch/_inductor/dependencies.py | 10 ++++++++++ torch/_inductor/virtualized.py | 10 +++++++--- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index e0501e0e8adef..c106658b21c0e 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -3523,6 +3523,31 @@ def fn(a, b, c): ], ) + # From https://github.com/pytorch/torchdynamo/issues/1352 + def test_max_pool2d_with_indices_backward4(self): + def fn(a, b, c): + return aten.max_pool2d_with_indices_backward( + a, b, [5, 5], [1, 1], [2, 2], [1, 1], False, c + ) + + x = torch.randn([2, 64, 3, 4]) + result, indices = aten.max_pool2d_with_indices( + x, + [5, 5], + [1, 1], + 2, + 1, + False, + ) + self.common( + fn, + [ + torch.randn_like(result), + x, + indices, + ], + ) + def test_avg_pool2d_backward(self): def fn(a, b): return aten.avg_pool2d_backward( diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py index 6eee943b60074..27c92f82c07c9 100644 --- a/torch/_inductor/dependencies.py +++ b/torch/_inductor/dependencies.py @@ -7,6 +7,7 @@ import sympy +from . import config from .codegen.common import index_prevent_reordering from .utils import sympy_product, sympy_str, sympy_subs, sympy_symbol, VarRanges from .virtualized import V @@ -146,6 +147,15 @@ def __init__(self, var_ranges: VarRanges, normalize: bool): self._var_ranges: VarRanges = var_ranges self._normalize: bool = normalize + # Truncate the expr str by a threshold to prevent it's too long + # and cause process hanging. The result is not used. + # https://github.com/pytorch/torchdynamo/issues/1352 + @staticmethod + def truncate_expr(expr): + if len(expr) > config.realize_bytes_threshold: + expr = f"{expr[:config.realize_bytes_threshold]}..." 
+ return expr + def canonicalize( self, index: sympy.Expr ) -> Tuple[sympy.Expr, Tuple[sympy.Expr, ...]]: diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py index 64c221895a91b..5d40d05f751f9 100644 --- a/torch/_inductor/virtualized.py +++ b/torch/_inductor/virtualized.py @@ -60,13 +60,17 @@ def __getattr__(self, name): def inner(*args, **kwargs): fargs = [_arg_str(a) for a in args] fargs.extend(f"{k}={v}" for k, v in kwargs.items()) - return f"{name}({', '.join(fargs)})" + return self.truncate_expr(f"{name}({', '.join(fargs)})") return inner @staticmethod - def masked(mask, body, other): - return f"masked({mask}, {body()}, {other})" + def truncate_expr(expr): + return expr + + @classmethod + def masked(cls, mask, body, other): + return cls.truncate_expr(f"masked({mask}, {body()}, {other})") @staticmethod def indirect_indexing(index_var): From f59154d6ed3c88ca04af23418afd32cd3495bea0 Mon Sep 17 00:00:00 2001 From: Tom Stein Date: Tue, 25 Oct 2022 04:07:16 +0000 Subject: [PATCH 0109/1922] [Python] refactor slices on sorted (#86995) Sometimes you want to query the small element of a set of elements and use `sorted(elements)[0]` without a second thought. However, this is not optimal, since the entire list must be sorted first `O(n log n)`. It would be better to use the `min(elements)` method provided for this purpose `O(n)`. Furthermore `sorted(elements)[::-1]` is not very efficient, because it would be better to use `sorted(elements, reverse=True)` to save the slice operation. **TLDR: using `sorted(elements)[0]` is slow and can be replaced with `min(elements)`.** I stumbled across these code snippets while playing around with CodeQL (see https://lgtm.com/query/4148064474379348546/). 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86995 Approved by: https://github.com/jansel --- tools/testing/test_selections.py | 6 +++--- torch/distributed/rpc/api.py | 2 +- torch/fx/experimental/symbolic_shapes.py | 4 ++-- torch/masked/maskedtensor/reductions.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index 3b33281781894..766ec0ff1fe7f 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -45,14 +45,14 @@ def calculate_shards( ] for test in sorted_tests: if must_serial(test): - min_sharded_job = sorted(sharded_jobs, key=lambda j: j.get_total_time())[0] + min_sharded_job = min(sharded_jobs, key=lambda j: j.get_total_time()) min_sharded_job.serial.append(test) else: - min_sharded_job = sorted(sharded_jobs, key=lambda j: j.get_total_time())[0] + min_sharded_job = min(sharded_jobs, key=lambda j: j.get_total_time()) min_sharded_job.parallel.append(test) # Round robin the unknown jobs starting with the smallest shard - index = sorted(range(num_shards), key=lambda i: sharded_jobs[i].get_total_time())[0] + index = min(range(num_shards), key=lambda i: sharded_jobs[i].get_total_time()) for test in unknown_tests: sharded_jobs[index].serial.append(test) index = (index + 1) % num_shards diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py index 8eda3a729c380..f5e544806822d 100644 --- a/torch/distributed/rpc/api.py +++ b/torch/distributed/rpc/api.py @@ -191,7 +191,7 @@ def _all_gather(obj, worker_names=None, timeout=UNSET_RPC_TIMEOUT): _ALL_WORKER_NAMES is not None ), "`_ALL_WORKER_NAMES` is not initialized for `def _all_gather`." 
worker_names = _ALL_WORKER_NAMES - leader_name = sorted(worker_names)[0] + leader_name = min(worker_names) self_name = _get_current_rpc_agent().get_worker_info().name diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index 3e80c8c9f4906..7615e410e2515 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -398,13 +398,13 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor): candidates[ex.size(i) * ex.stride()[i]] = size[i] * stride[i] if any(x is None for x in stride): # bind the smallest unbound stride to a new variable - val, i = sorted( + val, i = min( [ (ex.stride()[i], i) for i in range(len(stride)) if stride[i] is None ] - )[0] + ) stride[i] = self.create_symbol(val) assert all(x is not None for x in stride) return [self.create_symintnode(i) for i in size], [self.create_symintnode(i) for i in stride] # type: ignore[arg-type] diff --git a/torch/masked/maskedtensor/reductions.py b/torch/masked/maskedtensor/reductions.py index 137ae58e6e190..210af5d6c09cc 100644 --- a/torch/masked/maskedtensor/reductions.py +++ b/torch/masked/maskedtensor/reductions.py @@ -31,7 +31,7 @@ def _masked_all(*args, **kwargs): def _multidim_any(mask, dim, keepdim): if isinstance(dim, int): return _multidim_any(mask, [dim], keepdim) - for d in sorted(dim)[::-1]: + for d in sorted(dim, reverse=True): mask = torch.any(mask, dim=d, keepdim=keepdim) return mask From 9e6517a6bd5af20f082cc80dee6719db8df21d35 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 24 Oct 2022 12:30:45 -0700 Subject: [PATCH 0110/1922] Fix use after free in tensorpipe agent (#87627) Fixes #87359, which identifies use after free for reverse device maps. This is only in the dynamic RPC feature and not effecting stable RPC code path. Unfortunately the test `TensorPipeRpcTest.test_dynamic_rpc_existing_rank_can_communicate_with_new_rank_cuda` that is failing is also running into separate issue. 
I've temporarily disabled some of the test code to investigate the error asynchronously. Testing plan: - tested all the dynamic RPC tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/87627 Approved by: https://github.com/rohan-varma --- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 20 +++++++++------ .../_internal/distributed/rpc/rpc_test.py | 25 +++++++++++-------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 2480b21d105f1..c885713637421 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -1260,18 +1260,22 @@ void TensorPipeAgent::updateGroupMembership( workerNameToInfo_.erase(name); workerNameToURL_.erase(name); - for (const auto& it : reverseDeviceMaps_) { - if (reverseDeviceMaps.find(it.first) == reverseDeviceMaps.end()) { - reverseDeviceMaps_.erase(it.first); + // remove reverse device maps that are no longer used + for (auto it = reverseDeviceMaps_.begin(); + it != reverseDeviceMaps_.end();) { + if (reverseDeviceMaps.find(it->first) == reverseDeviceMaps.end()) { + it = reverseDeviceMaps_.erase(it); + } else { + it++; } } - auto iter = devices_.begin(); - while (iter != devices_.end()) { - if (std::find(devices.begin(), devices.end(), *iter) == devices.end()) { - iter = devices_.erase(iter); + // remove devices that are no longer used + for (auto it = devices_.begin(); it != devices_.end();) { + if (std::find(devices.begin(), devices.end(), *it) == devices.end()) { + it = devices_.erase(it); } else { - iter++; + it++; } } } diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 2c59629aec633..764117f43dbf6 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -5106,16 +5106,21 @@ def
test_dynamic_rpc_existing_rank_can_communicate_with_new_rank_cuda(self): rpc_backend_options=self.rpc_backend_options, ) - dist.barrier() - if self.rank == 0: - for i in range(1, self.world_size): - x = torch.ones(2) - result_on_device_0 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(0), 1)) - result_on_device_1 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(1), 1)) - self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_0) - self.assertEqual(torch.device('cuda:0'), result_on_device_0.device) - self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_1) - self.assertEqual(torch.device('cuda:1'), result_on_device_1.device) + # TODO: Cuda RPC is failing due to: + # terminate called after throwing an instance of 'c10::Error' + # what(): 0 <= device && static_cast(device) < device_allocator.size() + # INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":1937, + # please report a bug to PyTorch. Allocator not initialized for device 1: did you call init? 
+ # dist.barrier() + # if self.rank == 0: + # for i in range(1, self.world_size): + # x = torch.ones(2) + # result_on_device_0 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(0), 1)) + # result_on_device_1 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(1), 1)) + # self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_0) + # self.assertEqual(torch.device('cuda:0'), result_on_device_0.device) + # self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_1) + # self.assertEqual(torch.device('cuda:1'), result_on_device_1.device) # Barrier to ensure that all rpc_sync calls are finished dist.barrier() From a40a689d5438765760cf76ab44d44989eb86af00 Mon Sep 17 00:00:00 2001 From: Soof Golan <83900570+soof-golan@users.noreply.github.com> Date: Tue, 25 Oct 2022 04:43:07 +0000 Subject: [PATCH 0111/1922] Fix `tensor.stride()` type hint (#84177) `tensor.stride()` now hints at tuple of variable length instead of tuple with constant length of 1 Fixes #84176 Pull Request resolved: https://github.com/pytorch/pytorch/pull/84177 Approved by: https://github.com/Chillee --- tools/pyi/gen_pyi.py | 2 +- torch/_prims/__init__.py | 2 +- torch/fx/passes/shape_prop.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 79f97c4e9f30c..417d73f829a6e 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -597,7 +597,7 @@ def gen_pyi( "def size(self, dim: _int) -> _int: ...", ], "stride": [ - "def stride(self) -> Tuple[_int]: ...", + "def stride(self) -> Tuple[_int, ...]: ...", "def stride(self, _int) -> _int: ...", ], "new_ones": [ diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py index d724ac50e2839..eae38612a2237 100644 --- a/torch/_prims/__init__.py +++ b/torch/_prims/__init__.py @@ -1273,7 +1273,7 @@ def _collapse_view_helper( strides = (1,) else: shape = a.shape # type: ignore[assignment] - strides = a.stride() + strides = a.stride() # type: ignore[assignment] 
utils.validate_idx(len(shape), start) utils.validate_exclusive_idx(len(shape), end) diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py index 9c3a036e90bf4..2be996f714ce8 100644 --- a/torch/fx/passes/shape_prop.py +++ b/torch/fx/passes/shape_prop.py @@ -17,7 +17,7 @@ class TensorMetadata(NamedTuple): shape : torch.Size dtype : torch.dtype requires_grad : bool - stride : Tuple[int] + stride : Tuple[int, ...] memory_format : Optional[torch.memory_format] # Quantization metadata From faf7f3d455536b403f77c6894d932a0365368384 Mon Sep 17 00:00:00 2001 From: shynehr Date: Tue, 25 Oct 2022 04:45:52 +0000 Subject: [PATCH 0112/1922] remove unnecessary __syncthreads() in conv_depthwise2d_grad_weight_kernel (#84854) Threads within a thread block would be synchronized inside the function BlockReduceSum when intra-warp reduce finishes. It's unnecessary to synchronize threads before invoking function BlockReduceSum. Pull Request resolved: https://github.com/pytorch/pytorch/pull/84854 Approved by: https://github.com/ngimel --- aten/src/ATen/native/cuda/DepthwiseConv2d.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index 8f0f9b99903a7..20748837bbaf7 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -236,7 +236,6 @@ __global__ void conv_depthwise2d_grad_weight_kernel( } } } - __syncthreads(); // At this point each thread in the block has a local gradient, which we need to // accumulate prior to writing the global value From d2b1929f9ccf7ce39527e7ecbffed856bc49f475 Mon Sep 17 00:00:00 2001 From: Bill Schnurr Date: Tue, 25 Oct 2022 04:47:10 +0000 Subject: [PATCH 0113/1922] Fix torch.testing.assert_close not exported from module (#87619) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For pylance/pyright static typechecking "Imported symbols are considered private by
default. If they use the “import A as A” (a redundant module alias), “from X import A as A” (a redundant symbol alias)" https://github.com/microsoft/pyright/blob/main/docs/typed-libraries.md#library-interface torch.testing.assert_close not exported from module https://github.com/microsoft/pylance-release/issues/3526 Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/87619 Approved by: https://github.com/kit1980 --- torch/testing/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py index 130eaf672983c..ad69ef1d24901 100644 --- a/torch/testing/__init__.py +++ b/torch/testing/__init__.py @@ -1,4 +1,4 @@ -from ._comparison import assert_close -from torch._C import FileCheck -from ._creation import make_tensor +from ._comparison import assert_close as assert_close +from torch._C import FileCheck as FileCheck +from ._creation import make_tensor as make_tensor from ._deprecated import * # noqa: F403 From 661fe5d8251cd108dcd25d2b6f1a23004a84ebc1 Mon Sep 17 00:00:00 2001 From: Takeshi Watanabe Date: Tue, 25 Oct 2022 05:49:52 +0000 Subject: [PATCH 0114/1922] [JIT] Fix return types of inputs/outputs method in Graph (#86349) The C++ definition return `ArrayRef` but in python binding it returns iterator instead: https://github.com/pytorch/pytorch/blob/d04889323e2bc0b7321b76e564292565c88b9a5e/torch/csrc/jit/python/python_ir.cpp#L631 I've had hard time with mypy and there is also fixed version of stubs in pytorch-pfn-extras for my project: https://github.com/pfnet/pytorch-pfn-extras/blob/beeab3f30381fd1ed313bc09d561c567482784a1/stubs/torch/_C/__init__.pyi#L458 Pull Request resolved: https://github.com/pytorch/pytorch/pull/86349 Approved by: https://github.com/kit1980 --- torch/_C/__init__.pyi.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 5b9049e4bdc7d..792e231999163 100644 
--- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -527,8 +527,8 @@ class Value: # Defined in torch/csrc/jit/ir/ir.h class Block: - def inputs(self) -> List[Value]: ... - def outputs(self) -> List[Value]: ... + def inputs(self) -> Iterator[Value]: ... + def outputs(self) -> Iterator[Value]: ... def nodes(self) -> Iterator[Node]: ... def paramNode(self) -> Node: ... def returnNode(self) -> Node: ... @@ -542,11 +542,11 @@ class Node: def __getitem__(self, key: str) -> Any: ... def schema(self) -> str: ... def input(self) -> Value: ... - def inputs(self) -> List[Value]: ... + def inputs(self) -> Iterator[Value]: ... def inputsAt(self, idx: _int) -> Value: ... def inputsSize(self) -> _int: ... def output(self) -> Value: ... - def outputs(self) -> List[Value]: ... + def outputs(self) -> Iterator[Value]: ... def outputsAt(self, idx: _int) -> Value: ... def outputsSize(self) -> _int: ... def hasMultipleOutputs(self) -> _bool: ... @@ -622,8 +622,8 @@ class Node: # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: - def inputs(self) -> List[Value]: ... - def outputs(self) -> List[Value]: ... + def inputs(self) -> Iterator[Value]: ... + def outputs(self) -> Iterator[Value]: ... def nodes(self) -> Iterator[Node]: ... def param_node(self) -> Node: ... def return_node(self) -> Node: ... From c4c0491a4c962f76550bd164fd8d19c8a330af36 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Tue, 25 Oct 2022 06:14:54 +0000 Subject: [PATCH 0115/1922] [vision hash update] update the pinned vision hash (#87639) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml). Update the pinned vision hash. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87639 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/vision.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 02a12c728a3a5..88e283fa46ec9 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -9c112935abe400222cca8f9fbc2d8386e0f25e80 +0d7807d59520289b2065b4db4a138b7fba2f61fd From 4f2e3c4e69de54419e07317c8f6cf5813465adcb Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Tue, 25 Oct 2022 06:55:59 +0000 Subject: [PATCH 0116/1922] Intercept aten._reshape_alias for nvFuser (#87072) This would help forming larger fusion groups. If this won't end up executed by nvFuser then eager mode implementation would call into `.reshape`: https://github.com/pytorch/pytorch/blob/37e9e89afbc3554258545a026fab4cd9e1a4b85d/torch/_prims/nvfuser_prims.py#L552-L553 cc @kevinstephano @jjsjann123 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87072 Approved by: https://github.com/ngimel --- torch/_prims/context.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torch/_prims/context.py b/torch/_prims/context.py index fea3f17a5009b..2bcee069d146c 100644 --- a/torch/_prims/context.py +++ b/torch/_prims/context.py @@ -405,6 +405,12 @@ def __torch_function__( warn("view has ignored kwargs!") return torch.ops.nvprims.view(a, shape) + if orig_func == torch.ops.aten._reshape_alias.default: + a, shape, stride = args + if len(kwargs) > 0: + warn("view has ignored kwargs!") + return torch.ops.nvprims.view(a, shape) + if self._is_native_batch_norm(orig_func): return torch.ops.nvprims.native_batch_norm(*args, **kwargs) From 6ebb69c37f57fde73385c5fbcf0b787d4f13be84 Mon Sep 17 00:00:00 2001 From: Daniel Falbel Date: Tue, 25 Oct 2022 07:12:28 +0000 Subject: [PATCH 0117/1922] Support `signbit` in MPS. (#87214) Implements the signbit operator for MPS. 
Links to #77764 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87214 Approved by: https://github.com/kulinseth, https://github.com/kit1980 --- .../src/ATen/native/mps/operations/UnaryOps.mm | 18 ++++++++++++++++++ aten/src/ATen/native/native_functions.yaml | 1 + test/test_mps.py | 14 ++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index dd9c8176d0b7c..2763eff39f6a6 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -93,6 +93,24 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una { return mps::trunc_tensor(mpsGraph, inputTensor); }); } +TORCH_IMPL_FUNC(signbit_out_mps) (const Tensor& self, const Tensor& output) { + mps::unary_op(self, output, "signbit_out_mps", + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + MPSGraphTensor* output; + // signbit is not implemented for int64 type. 
+ // workaround for `Function signbitOp_i64 was not found in the library` + if ([inputTensor dataType] == MPSDataTypeInt64) { + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:inputTensor.dataType]; + output = [mpsGraph lessThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + } else { + output = [mpsGraph signbitWithTensor: inputTensor name: nil]; + } + return mps::castMPSTensor(mpsGraph, output, ScalarType::Bool); + }); +} + TORCH_IMPL_FUNC(sign_out_mps) (const Tensor& self, const Tensor& output) { mps::unary_op(self, output, "sign_out_mps", ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index d514cae670855..c1c2b363cb99b 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -8533,6 +8533,7 @@ dispatch: CPU: signbit_out CUDA: signbit_out + MPS: signbit_out_mps SparseCPU, SparseCUDA: signbit_sparse_out SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out diff --git a/test/test_mps.py b/test/test_mps.py index 8eeae7dbcaf7b..98df393c3e955 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -4154,6 +4154,20 @@ def helper(shape): helper((2, 8, 4, 5)) + def test_signbit(self): + def helper(shape, dtype): + cpu_x = torch.randn(shape, device='cpu').to(dtype) + x = cpu_x.clone().to('mps') + + signbit_result = torch.signbit(x) + signbit_result_cpu = torch.signbit(cpu_x) + + self.assertEqual(signbit_result, signbit_result_cpu) + + helper((2, 8, 4, 5), torch.int) + helper((2, 8, 4, 5), torch.float) + helper((2, 8, 4, 5), torch.int64) + # Test neg def test_neg(self): def helper(shape): From 3eae04586282d1452eaf090992742fbc4f4004c6 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Tue, 25 Oct 2022 07:17:44 +0000 Subject: [PATCH 0118/1922] [ROCm] [FakeTensorTest] Enable test_fallback_memory_prop (#85760) Signed-off-by: Jagadish Krishnamoorthy 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85760 Approved by: https://github.com/kit1980 --- test/test_fake_tensor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py index 0d81cdf10f82f..50a92436f406b 100644 --- a/test/test_fake_tensor.py +++ b/test/test_fake_tensor.py @@ -329,7 +329,6 @@ def fn( self.assertTrue(isinstance(ten, FakeTensor)) self.assertEqual(ten.device.type, 'cuda') - @skipIfRocm @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_fallback_memory_prop(self): m = nn.Conv2d(16, 33, 3, stride=2, device="cuda", dtype=torch.half) From 2e786e59ee1d29930535b14471d05cdc3a138eca Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 24 Oct 2022 12:57:57 -0700 Subject: [PATCH 0119/1922] [inductor] Trivial smoke-test (#87598) As we're bringing up dynamo+inductor on Meta-internal infra, I keep wanting a stupidly simple program to run to see if anything at all is working. This test is that program :-p. Obviously test_torchinductor.py is more comprehensive but it's also harder to tell exactly what's going on, whereas this test fits on one screen. Differential Revision: [D40595798](https://our.internmc.facebook.com/intern/diff/D40595798/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D40595798/)! 
cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87598 Approved by: https://github.com/anijain2305, https://github.com/brad-mengchi --- test/inductor/test_smoke.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 test/inductor/test_smoke.py diff --git a/test/inductor/test_smoke.py b/test/inductor/test_smoke.py new file mode 100644 index 0000000000000..64afbcf0254e3 --- /dev/null +++ b/test/inductor/test_smoke.py @@ -0,0 +1,30 @@ +# Owner(s): ["module: inductor"] +import logging +import unittest + +import torch +import torch._dynamo as torchdynamo +import torch._inductor.config as torchinductor_config + +torchdynamo.config.log_level = logging.INFO +torchdynamo.config.verbose = True +torchinductor_config.debug = True + + +class MLP(torch.nn.Module): + def __init__(self): + super(MLP, self).__init__() + self.l1 = torch.nn.Linear(1, 6) + self.l2 = torch.nn.Linear(6, 1) + + def forward(self, x=None): + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) + return x + + +class SmokeTest(unittest.TestCase): + def test_mlp(self): + mlp = torchdynamo.optimize("inductor")(MLP().cuda()) + for _ in range(3): + mlp(torch.randn(1, device="cuda")) From e34086840fae2787d36768d6c9f9536f2007dfcb Mon Sep 17 00:00:00 2001 From: Driss Guessous Date: Tue, 25 Oct 2022 14:44:05 +0000 Subject: [PATCH 0120/1922] Performance improvement to cumulative seq len (#87530) # Summary Performance improvement to calculating metadata needed for gluing in nested tensors to fused kernels.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87530 Approved by: https://github.com/cpuhrsch --- .../cuda/NestedTensorTransformerFunctions.cpp | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index a90af2fe0af32..4028c8d5c3e4b 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -234,7 +234,7 @@ std::tuple _scaled_dot_product_attention_forward_nested( return std::make_tuple(Tensor(), Tensor()); } } - +namespace{ /** * This function is used to calculate two pieces of metadata that are needed @@ -242,9 +242,10 @@ std::tuple _scaled_dot_product_attention_forward_nested( * cumulative sequence_length over a batch of sequences and the maximum sequence * length. * - * @return A tuple of cumulative sequence lengths and the maximum sequence length + * @return A tuple of cumulative sequence lengths and the maximum sequence length, + * and the last element in the cumulative_sequence_lengths */ -std::tuple cumulative_and_max_seq_len(Tensor qkv) { +std::tuple cumulative_and_max_seq_len(Tensor qkv) { TORCH_CHECK( qkv.is_nested(), "QKV must be nested for flash cumulative_seq_len calculation.") @@ -274,7 +275,7 @@ std::tuple cumulative_and_max_seq_len(Tensor qkv) { // Send to GPU, this is pretty light weight calc for normal batch size // but maybe this needs to be on gpu cumulative_seqlen = cumulative_seqlen.to(TensorOptions().device(at::kCUDA)); - return std::tuple{cumulative_seqlen, max_seqlen}; + return std::tuple{cumulative_seqlen, max_seqlen, sum}; } /** @@ -337,6 +338,7 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) { return true; } +} // namespace std::tuple mem_efficient_helper_nested_unpacked( const Tensor& query, const Tensor& key, @@ -354,19 
+356,19 @@ std::tuple mem_efficient_helper_nested_unpacked( Tensor k_t = key.transpose(1, 2); Tensor v_t = value.transpose(1, 2); - auto cumulative_and_max_q = cumulative_and_max_seq_len(q_t); - auto cumulative_and_max_k = cumulative_and_max_seq_len(k_t); + auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(q_t); + auto cumulative_and_max_k_and_nnz_k = cumulative_and_max_seq_len(k_t); // K and V have to have the same Nnz, should probably torch_check // assume in order to not iterate over v - Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q); - Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k); + Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q); + Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k_and_nnz_k); - const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q); + const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q); - const int64_t Nnz_q = cumulative_sequence_length_q[-1].item(); - const int64_t Nnz_kv = cumulative_sequence_length_k[-1].item(); + const int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q); + const int64_t Nnz_kv = std::get<2>(cumulative_and_max_k_and_nnz_k); Tensor query_buffer_reshaped; Tensor key_buffer_reshaped; @@ -460,15 +462,15 @@ Tensor flash_attention_helper( int64_t head_dim{query.size(-1)}; int64_t num_heads{query.size(-2)}; - auto cumulative_and_max_q = cumulative_and_max_seq_len(query); - Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q); - int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q); + auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(query); + Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q); + int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q); TORCH_CHECK( key.is_same(key) && query.is_same(value), "Key and Value must be the same tensor"); - int64_t 
Nnz_q{cumulative_sequence_length_q[-1].item()}; + int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q); // For the packed case we need to set the output size for dim 2 to 1 auto atten_size = get_nested_size_tensor(query).clone(); From 53292a7d9760f47c0c2353321ff973c646db26dd Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Tue, 25 Oct 2022 14:45:12 +0000 Subject: [PATCH 0121/1922] Revert "Intercept aten._reshape_alias for nvFuser (#87072)" This reverts commit 163a829caa82559e7f938f65c1b647a5d50663c3. Reverted https://github.com/pytorch/pytorch/pull/87072 on behalf of https://github.com/malfet due to Looks like it broke test_indexing in dynamo shard, see https://github.com/pytorch/pytorch/actions/runs/3318778609/jobs/5483248042 --- torch/_prims/context.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/torch/_prims/context.py b/torch/_prims/context.py index 2bcee069d146c..fea3f17a5009b 100644 --- a/torch/_prims/context.py +++ b/torch/_prims/context.py @@ -405,12 +405,6 @@ def __torch_function__( warn("view has ignored kwargs!") return torch.ops.nvprims.view(a, shape) - if orig_func == torch.ops.aten._reshape_alias.default: - a, shape, stride = args - if len(kwargs) > 0: - warn("view has ignored kwargs!") - return torch.ops.nvprims.view(a, shape) - if self._is_native_batch_norm(orig_func): return torch.ops.nvprims.native_batch_norm(*args, **kwargs) From c82324336d214f3073379b455e4286941ead46d4 Mon Sep 17 00:00:00 2001 From: AllenTiTaiWang Date: Mon, 24 Oct 2022 21:14:18 +0000 Subject: [PATCH 0122/1922] [ONNX] Add Support on 0d tensor Broadcast (#87211) I am not sure if this will break things ... Although 0d tensor is an undefined behavior in ONNX spec, I did some experiments and found that ONNX shape inference actually provides 0d as inference from 0d and 1d Op calculations, and the bug happened in Broadcast function. But still, if this breaks things really bad, I think we can put 0d tensor handling on hold, as it's not very common usage on models? 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87211 Approved by: https://github.com/jcwchen, https://github.com/BowenBao --- .../csrc/jit/passes/onnx/shape_type_inference.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 248733f746a63..d2873ddf464cb 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -700,18 +700,25 @@ std::vector<::c10::ShapeSymbol> Broadcast( const c10::ShapeSymbol& ss_shape_1 = input_shape_value_1[rank_1 - 1 - idx]; bool is_static_0 = ss_shape_0.is_static(); bool is_static_1 = ss_shape_1.is_static(); + size_t shape_idx = rank_max - 1 - idx; if (is_static_0 && is_static_1) { int64_t static_0_sz = ss_shape_0.static_size(); int64_t static_1_sz = ss_shape_1.static_size(); - final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize( - std::max(static_0_sz, static_1_sz)); + // condition for corner case of 0d tensor + // 0d tensor with 1d tensor would give us 0d tensor + if (std::min(static_0_sz, static_1_sz) == 0) { + final_shape[shape_idx] = ::c10::ShapeSymbol::fromStaticSize( + std::min(static_0_sz, static_1_sz)); + } else { + final_shape[shape_idx] = ::c10::ShapeSymbol::fromStaticSize( + std::max(static_0_sz, static_1_sz)); + } } else if (!is_static_0 && !is_static_1) { if (ss_shape_0.value() == ss_shape_1.value()) { - final_shape[rank_max - 1 - idx] = ss_shape_0; + final_shape[shape_idx] = ss_shape_0; } } } - if (rank_0 < rank_1) { for (size_t idx = rank_min; idx < rank_max; idx++) { size_t shape_idx = rank_max - 1 - idx; From b4e180d93f71c5278b080abd472df0f7a9fae9d9 Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Tue, 25 Oct 2022 04:46:42 +0000 Subject: [PATCH 0123/1922] Prefer python meta function over c++ meta function (#87426) This is a policy update for meta registration. 
**We now prefer python meta implementation over C++ meta function.** This is a flip of the previous policy, where we preferred C++ meta function over python meta function if they both exist. Here's the meta registration process: 1. register_meta and register_decomposition will place the python meta/decomp functions into the `global_decomp_table`. However, they will NOT register them into dispatcher. 2. After global_decomp_table is populated, we will compile an `active_meta_table`. For a given op, we pick the most specific decomp function from `global_decomp_table` in the preference order of Meta > PostAutograd > PreAutograd. 3. We will unconditionally register all of them into python dispatcher. And register them into C++ dispatcher, unless it is one of the following 3 cases - 1. the op is a CompositeImplicitAutograd, and should rely on decomposed op's meta - 2. the op is a view op, as the MetaTensor doesn't support aliased storage - 3. the op is in the blocklist (due to UT failures, and we will burn down this list op by op) Over the long run, we wish to implement all meta functions in python. With this PR, 321 op_overloads will have cpp meta overridden by python meta. There are still 400 op_overloads using cpp meta.
The exact list can be found here https://gist.github.com/SherlockNoMad/d20bb736178df8eebd3b054c8bb7cdc5 cc @ngimel @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87426 Approved by: https://github.com/ezyang, https://github.com/jansel --- aten/src/ATen/core/dispatch/OperatorEntry.cpp | 18 ++- test/test_ops.py | 2 - torch/_decomp/__init__.py | 96 +++++--------- torch/_decomp/decompositions.py | 14 +- torch/_inductor/decomposition.py | 4 +- torch/_meta_registrations.py | 123 +++++++++++++----- torch/_ops.py | 4 + torch/_refs/__init__.py | 30 ++--- torch/_subclasses/fake_tensor.py | 3 +- torch/library.py | 5 +- 10 files changed, 162 insertions(+), 137 deletions(-) diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 822924a602533..5bd5d8abf54dc 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -147,13 +147,17 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel( #else if (k.size() > 0) { #endif - TORCH_WARN("Overriding a previously registered kernel for the same operator and the same dispatch key\n", - " operator: ", (schema_.has_value() ? toString(schema_->schema) : toString(name_)), "\n", - " ", (this->schema_.has_value() ? this->schema_->debug : "no debug info"), "\n", - " dispatch key: ", toString(dispatch_key), "\n", - " previous kernel: ", (cpp_signature_.has_value() ? cpp_signature_->debug : (sym_cpp_signature_.has_value() ? sym_cpp_signature_->debug : "no debug info")), "\n", - " new kernel: ", debug - ); + // Suppress the warning for Meta key as we are overriding C++ meta functions with python meta functions + // for some ops + if (dispatch_key != DispatchKey::Meta) { + TORCH_WARN("Overriding a previously registered kernel for the same operator and the same dispatch key\n", + " operator: ", (schema_.has_value() ? 
toString(schema_->schema) : toString(name_)), "\n", + " ", (this->schema_.has_value() ? this->schema_->debug : "no debug info"), "\n", + " dispatch key: ", toString(dispatch_key), "\n", + " previous kernel: ", (cpp_signature_.has_value() ? cpp_signature_->debug : (sym_cpp_signature_.has_value() ? sym_cpp_signature_->debug : "no debug info")), "\n", + " new kernel: ", debug + ); + } } #ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY diff --git a/test/test_ops.py b/test/test_ops.py index c63de0a4778d3..5e9371e982341 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1891,8 +1891,6 @@ def test_refs_are_in_decomp_table(self, op): "svd_lowrank", "sgn", "cholesky", - "linalg.eigh", - "symeig", } fake_backward_xfails = {xfail(stride_skip) for stride_skip in fake_backward_xfails} | { diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py index 2dcda014cea30..d50f33933da49 100644 --- a/torch/_decomp/__init__.py +++ b/torch/_decomp/__init__.py @@ -26,16 +26,34 @@ pre_autograd_decomposition_table = global_decomposition_table["pre_autograd"] meta_table = global_decomposition_table["meta"] -meta_lib = torch.library.Library("aten", "IMPL", "Meta") - -# decompositions which have been disabled as meta kernel implementations, -# usually due to mismatching strides, aliasing, or other inconsistent property -_disabled_meta_decomps = set() +def _add_op_to_registry(registry, op, fn): + """ + This is an internal API for adding an op to the decomposition table. -def register_decomposition( - aten_op, registry=None, *, type="post_autograd", disable_meta: bool = False -): + If op is OpOverload, it will be added to the registry directly. + If op is OpOverloadPacket, all the valid op_overloads in the packet will be added to the registry. 
+ """ + overloads = [] + if isinstance(op, OpOverload): + overloads.append(op) + else: + assert isinstance(op, OpOverloadPacket) + for ol in op.overloads(): + overloads.append(getattr(op, ol)) + + for op_overload in overloads: + if op_overload in registry: + raise RuntimeError(f"duplicate registrations for {op_overload}") + + # TorchScript dumps a bunch of extra nonsense overloads + # which don't have corresponding dispatcher entries, we need + # to filter those out, e.g aten.add.float_int + if torch._C._dispatch_has_kernel(op_overload.name()): + registry[op_overload] = fn + + +def register_decomposition(aten_op, registry=None, *, type="post_autograd"): """ A decorator to register a function as a decomposition to the Python decomposition table. Use it like this:: @@ -52,9 +70,8 @@ def clamp_min(x): autograd) and not just backend tracing, where we then need to know if a decomposition can be used to simulate a transform. - By default, if the decomposition is for an operator that doesn't have - a Meta implementation, we will register it to the dispatcher. Use - `disable_meta` to disable this behavior. + By default, we also will register it to the Meta key of dispatcher, + and replace the c++ Meta implementation if there is already one. 
""" assert type in {"post_autograd", "pre_autograd", "meta"} @@ -106,62 +123,11 @@ def _fn(*args, **kwargs): if registry is None: registry = global_decomposition_table[type] - def add_op_to_table(aten_op): - overloads = [] - if isinstance(aten_op, OpOverload): - overloads.append(aten_op) - else: - assert isinstance(aten_op, OpOverloadPacket) - for ol in aten_op.overloads(): - overloads.append(getattr(aten_op, ol)) - for op_overload in overloads: - if op_overload in registry: - raise RuntimeError(f"duplicate registrations for {op_overload}") - registry[op_overload] = fn - op_overload.py_impl(torch._C.DispatchKey.Meta)(fn) - # TODO: factor this logic into OpOverload or Library API - name = op_overload._schema.name - if op_overload._schema.overload_name: - name += "." + op_overload._schema.overload_name - - if disable_meta: - global _disabled_meta_decomps - _disabled_meta_decomps.add(op_overload) - - if ( - not disable_meta - # TorchScript dumps a bunch of extra nonsense overloads - # which don't have corresponding dispatcher entries, we need - # to filter those out - and torch._C._dispatch_has_kernel(name) - # Don't register a python meta kernel to any operator that has - # should already work with meta tensors today. - # We can check that by seeing if the "computed table" for the operator - # has a registration to Meta; - # either through a direct registration, or an indirect one through - # an alias dispatch key (e.g. CompositeImplicitAutograd) - and not torch._C._dispatch_has_computed_kernel_for_dispatch_key( - name, "Meta" - ) - ): - if any( - a.alias_info is not None and not a.alias_info.is_write - for a in op_overload._schema.arguments - ): - raise RuntimeError( - f""" -Attempting to register a python meta kernel for a view operator: {str(op_overload)}. -We shouldn't do this, because the output will report as not having aliased storages. -All view ops have meta kernels in C++ today, so we should use those instead. 
- -If you're registering an operator through the `@register_decomposition` decorator, -Please set `disable_meta=True`. - """ - ) - meta_lib.impl(op_overload, fn) + def register(op): + _add_op_to_registry(registry, op, fn) # To handle allowing multiple aten_ops at once - tree_map(add_op_to_table, aten_op) + tree_map(register, aten_op) return fn return decomposition_decorator diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 2b4d2914fe858..234e43d12bf81 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -1073,7 +1073,7 @@ def prod(x: List[int]): return r -@register_decomposition(aten.split_with_sizes, disable_meta=True) +@register_decomposition(aten.split_with_sizes) def split_with_sizes( self: Tensor, split_sizes: List[int], dim: int = 0 ) -> List[Tensor]: @@ -1087,7 +1087,7 @@ def split_with_sizes( return splits -@register_decomposition(aten.split.Tensor, disable_meta=True) +@register_decomposition(aten.split.Tensor) def split(self: Tensor, split_size: int, dim: int = 0) -> List[Tensor]: input_sizes = self.shape dim_size = input_sizes[dim] @@ -1131,7 +1131,7 @@ def normalize(input, norm_dims, eps): return out, mean, rstd -@register_decomposition(aten.native_group_norm.default, disable_meta=True) +@register_decomposition(aten.native_group_norm.default) def native_group_norm( input: Tensor, weight: Optional[Tensor], @@ -1500,7 +1500,7 @@ def std_decomposition( # Questionable decompositions # This is only valid if we're running the graph without autograd, such as if the backward pass has been traced. 
# Note that this decomposition causes issues with in-place ops -@register_decomposition([aten.detach, aten.lift, aten.lift_fresh], disable_meta=True) +@register_decomposition([aten.detach, aten.lift, aten.lift_fresh]) def nop_decomposition(x): return aten.alias(x) @@ -1666,7 +1666,7 @@ def cudnn_batch_norm_backward( ) -@register_decomposition(aten._adaptive_avg_pool2d, disable_meta=True) +@register_decomposition(aten._adaptive_avg_pool2d) @pw_cast_for_opmath def adaptive_avg_pool2d(input: Tensor, output_size: Tuple[int, int]): # Preconditions @@ -1928,7 +1928,7 @@ def is_same_size(a: Tensor, b: Tensor) -> bool: return a.shape == b.shape -@register_decomposition([aten._reshape_alias, aten._unsafe_view], disable_meta=True) +@register_decomposition([aten._reshape_alias, aten._unsafe_view]) def _reshape_alias(x, shape, *args): return aten.view(x, shape) @@ -2194,7 +2194,7 @@ def mv(self, vec): return (self * vec).sum(dim=1) -@register_decomposition(aten.dot, disable_meta=True) +@register_decomposition(aten.dot) @out_wrapper() @pw_cast_for_opmath def dot(self, other): diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py index c22a8406b9b61..b4c8087537c6f 100644 --- a/torch/_inductor/decomposition.py +++ b/torch/_inductor/decomposition.py @@ -109,7 +109,7 @@ def register_decomposition(ops): for op in [ops] if callable(ops) else ops: if op in decompositions: log.warning(f"duplicate decomp: {ops}") - return decomp.register_decomposition(ops, decompositions, disable_meta=True) + return decomp.register_decomposition(ops, decompositions) @register_decomposition([aten.clamp]) @@ -317,7 +317,7 @@ def bernoulli_p(self, p=0.5, *, generator=None): """ extra_random_decomps = get_decompositions([aten.native_dropout]) register_extra_random_decomp = functools.partial( - decomp.register_decomposition, registry=extra_random_decomps, disable_meta=True + decomp.register_decomposition, registry=extra_random_decomps ) diff --git a/torch/_meta_registrations.py 
b/torch/_meta_registrations.py index 22ceaaf0a18b0..cb961ff898790 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -4,7 +4,8 @@ import torch import torch._prims_common as utils from torch import Tensor -from torch._decomp import meta_table as meta_table +from torch._decomp import _add_op_to_registry, global_decomposition_table, meta_table +from torch._ops import OpOverload from torch._prims_common import ( check, corresponding_complex_dtype, @@ -21,27 +22,19 @@ from torch._subclasses.fake_tensor import check_no_bool_index_tensors from torch.utils._pytree import tree_map + aten = torch.ops.aten _meta_lib_dont_use_me_use_register_meta = torch.library.Library("aten", "IMPL", "Meta") -def register_meta(op, register_dispatcher=True): - def wrapper(f): - def add_func(op): - meta_table[op] = f - if register_dispatcher: - name = ( - op.__name__ - if op._overloadname != "default" - else op.overloadpacket.__name__ - ) - _meta_lib_dont_use_me_use_register_meta.impl(name, f) - - op.py_impl(torch._C.DispatchKey.Meta)(f) +def register_meta(op): + def wrapper(fn): + def register(op): + _add_op_to_registry(meta_table, op, fn) - tree_map(add_func, op) - return f + tree_map(register, op) + return fn return wrapper @@ -101,7 +94,7 @@ def meta_fft_c2r(self, dim, normalization, lastdim): return self.new_empty(output_sizes, dtype=toRealValueType(self.dtype)) -@register_meta(aten.copy_.default, register_dispatcher=False) +@register_meta(aten.copy_.default) def meta_copy_(self, src, non_blocking=False): return self @@ -241,7 +234,7 @@ def meta_pad2d(self, padding): return self.new_empty((nbatch, nplane, output_h, output_w)) -@register_meta(aten.bernoulli_.float, register_dispatcher=False) +@register_meta(aten.bernoulli_.float) def meta_bernoulli_(self, p=0.5, generator=None): return self @@ -283,7 +276,7 @@ def meta_dot(self, tensor): return self.new_empty(()) -@register_meta([aten.mm.default], register_dispatcher=False) +@register_meta([aten.mm.default]) 
def meta_mm(a, b): check(a.dim() == 2, lambda: "a must be 2D") check(b.dim() == 2, lambda: "b must be 2D") @@ -467,7 +460,7 @@ def check_dim_size(tensor, dim, dim_size, size): ) -@register_meta(aten.avg_pool2d.default, register_dispatcher=False) +@register_meta(aten.avg_pool2d.default) def meta_avg_pool2d( input, kernel_size, @@ -586,7 +579,7 @@ def avg_pool2d_backward_shape_check( # Don't override the C++ registration. -@register_meta(aten.avg_pool2d_backward.default, register_dispatcher=False) +@register_meta(aten.avg_pool2d_backward.default) def meta_avg_pool2d_backward( gradOutput_, input, @@ -731,7 +724,7 @@ def vdot(self, other): # of indexing shape inference is useful, # but not registering it to the dispatcher because we already # get shape inference through structured kernels -@register_meta(aten.index.Tensor, register_dispatcher=False) +@register_meta(aten.index.Tensor) def meta_index_Tensor(self, indices): check_no_bool_index_tensors(aten.index.Tensor, self, indices) check(indices, lambda: "at least one index must be provided") @@ -1090,42 +1083,42 @@ def meta_repeat(self, repeats): return self.new_empty(target_size) -@register_meta(aten.zero_.default, register_dispatcher=False) +@register_meta(aten.zero_.default) def meta_zero_(self): return self -@register_meta([aten.fill_.Tensor, aten.fill_.Scalar], register_dispatcher=False) +@register_meta([aten.fill_.Tensor, aten.fill_.Scalar]) def meta_fill_(self, val): return self -@register_meta([aten.fill.Tensor, aten.fill.Scalar], register_dispatcher=False) +@register_meta([aten.fill.Tensor, aten.fill.Scalar]) def meta_fill(self, val): return self.new_empty(self.shape) -@register_meta(aten.relu_.default, register_dispatcher=False) +@register_meta(aten.relu_.default) def meta_relu_(self): return self -@register_meta(aten.index_put.default, register_dispatcher=False) +@register_meta(aten.index_put.default) def meta_index_put(self, indices, values, accumulate=False): return self.new_empty(self.size()) 
-@register_meta(aten.masked_fill_.Scalar, register_dispatcher=False) +@register_meta(aten.masked_fill_.Scalar) def meta_masked_fill_(self, mask, value): return self -@register_meta(aten.index_put_.default, register_dispatcher=False) +@register_meta(aten.index_put_.default) def meta_index_put_(self, indices, values, accumulate=False): return self -@register_meta(aten.alias.default, register_dispatcher=False) +@register_meta(aten.alias.default) def meta_alias(self): return self.view(self.shape) @@ -1163,7 +1156,7 @@ def common_meta_baddbmm_bmm(batch1, batch2, is_bmm, self_baddbmm=None): return output -@register_meta(aten.bmm.default, register_dispatcher=False) +@register_meta(aten.bmm.default) def meta_bmm(self, mat2): return common_meta_baddbmm_bmm(self, mat2, True) @@ -1273,7 +1266,7 @@ def pool2d_shape_check( ) -@register_meta(aten.max_pool2d_with_indices.default, register_dispatcher=False) +@register_meta(aten.max_pool2d_with_indices.default) def meta_max_pool2d_with_indices( input, kernel_size, stride=(), padding=(0,), dilation=(1,), ceil_mode=False ): @@ -1471,7 +1464,7 @@ def gather_shape_check(self, dim, index): ) -@register_meta(aten.gather.default, register_dispatcher=False) +@register_meta(aten.gather.default) def meta_gather(self, dim, index, sparse_grad=False): wrapped_dim = maybe_wrap_dim(dim, self.dim()) is_index_empty = index.numel() == 0 @@ -1587,7 +1580,7 @@ def scatter_meta_impl(self, dim, index, src=None, reduce_=None, use_new_options= get_operator_enum(reduce_, use_new_options) -@register_meta(aten.scatter_add.default, register_dispatcher=False) +@register_meta(aten.scatter_add.default) def meta_scatter_add(self, dim, index, src): scatter_meta_impl(self, dim, index, src, "add") return self.new_empty(self.shape) @@ -1624,3 +1617,65 @@ def upsample_nearest2d_vec(input, output_size, scale_factors): import torch._refs import torch._refs.nn.functional import torch._refs.special + + +def activate_meta(): + + activate_meta_table = {} + + # For a given 
op, we pick the most specific decomp function from + # global_decomp_table in the precedence order of meta > post_autograd > pre_autograd + for type in ["meta", "post_autograd", "pre_autograd"]: + registry = global_decomposition_table[type] + + for opo in registry: + if opo not in activate_meta_table: + activate_meta_table[opo] = registry[opo] + + for op_overload, fn in activate_meta_table.items(): + assert isinstance(op_overload, OpOverload) + + op_overload.py_impl(torch._C.DispatchKey.Meta)(fn) + + if torch._C._dispatch_has_kernel_for_dispatch_key( + op_overload.name(), "CompositeImplicitAutograd" + ): + # Internally, we shouldn't be registering meta kernels for any operators that + # have CompositeImplicitAutograd kernels. + # Instead, we should be letting those decompositions run, and writing meta kernels + # only for the base operators. + pass + elif any( + a.alias_info is not None and not a.alias_info.is_write + for a in op_overload._schema.arguments + ): + # Attempting to register a python meta kernel for a view operator. + # We shouldn't do this, because the output will report as not having aliased storages. + # All view ops have meta kernels in C++ today, so we should use those instead. 
+ pass + elif op_overload.name() in { + "aten::empty_strided", # causing infinite recursion, test_meta.py + "aten::clone", # causing infinite recursion + "aten::_to_copy", # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite # noqa: B950 + "aten::randn", # pin_memory parameter is not supported!, test_proxy_tensor.py -k test_make_fx_symbolic_exhaustive_randn_cpu_float32 # noqa: B950 + "aten::zeros.names", # TypeError: zeros() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu # noqa: B950 + "aten::empty.names", # TypeError: empty() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu # noqa: B950 + "aten::add.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars # noqa: B950 + "aten::sub.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars # noqa: B950 + "aten::mul.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars # noqa: B950 + "aten::div.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! test_fake_tensor.py -k test_scalar_inputs # noqa: B950 + "aten::div.Tensor_mode", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_div8_cpu # noqa: B950 + "aten::diag_embed", # Stride mismatch! test_ops.py -k test_fake_autocast_diag_embed_cuda_float32 # noqa: B950 + "aten::copy_", # Exception not raiseed, test_torch.py -k test_storage_meta_errors_cpu_int64 # noqa: B950 + "aten::constant_pad_nd", # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32 # noqa: B950 + "aten::masked_fill.Scalar", # Stride mismatch! 
test_ops.py -k test_fake_crossref_backward_amp_nanquantile_cuda_float32 # noqa: B950 + "aten::tril", # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_ormqr_cuda_float32 # noqa: B950 + "aten::triu", # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_lu_solve_cuda_float32 # noqa: B950 + "aten::rot90", # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32 # noqa: B950 + }: + pass + else: + _meta_lib_dont_use_me_use_register_meta.impl(op_overload, fn) + + +activate_meta() diff --git a/torch/_ops.py b/torch/_ops.py index b3ebd401ab8a2..ed0276d0ada2f 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -296,6 +296,10 @@ def inner(fn): dispatch_key_or_mode != torch._C.DispatchKey.Python ), "Please register a mode for the torch._C.DispatchKey.Python key instead." + if dispatch_key_or_mode in self.py_kernels: + raise RuntimeError( + f"Trying to override a python impl for {dispatch_key_or_mode} on operator {self._name}" + ) self.py_kernels[dispatch_key_or_mode] = fn return fn diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 3e2f6c45768a6..ccb44c6367a50 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -379,7 +379,6 @@ def _make_elementwise_unary_reference( type_promotion_kind, *, aten_op=infer_aten_op, - disable_meta=False, extra_meta=None, ) -> Callable: def inner(prim: Callable): @@ -406,7 +405,7 @@ def _ref(a: TensorLikeType) -> TensorLikeType: if aten_op is infer_aten_op: aten_op = getattr(torch.ops.aten, prim.__name__) if aten_op is not None: - register_decomposition(aten_op, disable_meta=disable_meta)(_ref) + register_decomposition(aten_op)(_ref) return _ref @@ -853,7 +852,6 @@ def _make_elementwise_binary_reference( has_out=True, supports_lhs_python_scalar=True, supports_rhs_python_scalar=True, - disable_meta=False, ) -> Callable: @elementwise_type_promotion_wrapper( type_promoting_args=("a", "b"), @@ -876,7 +874,7 @@ def _ref( # TODO: enable this for operations 
that support it, like add if isinstance(a, Number) and isinstance(b, Number): raise ValueError( - "Receive two Number inputs to an elementwise binary operation!" + f"Receive two Number inputs to an elementwise binary operation {prim}!" ) a, b = _maybe_broadcast(a, b) @@ -888,7 +886,7 @@ def _ref( if aten_op is infer_aten_op: aten_op = getattr(torch.ops.aten, prim.__name__.split(".")[0]) if aten_op is not None: - register_decomposition(aten_op, disable_meta=disable_meta)(_ref) + register_decomposition(aten_op)(_ref) return _ref @@ -2628,7 +2626,7 @@ def dstack(tensors: TensorSequenceType) -> TensorLikeType: return cat(aligned_tensors, 2) -@register_decomposition(torch.ops.aten.expand, disable_meta=True) +@register_decomposition(torch.ops.aten.expand) def expand(a: Tensor, *shape) -> Tensor: # NOTE: cannot use utils.extract_shape_from_varargs here # because that also validates the shape, but the shape @@ -2849,7 +2847,7 @@ def native_layer_norm( # TODO: Adding this as a meta function causes functorch tests to fail when compiled with debug mode. 
# test/test_eager_transforms.py::TestFunctionalizeCPU::test_functionalize_fx_transpose_simple_cpu -@register_decomposition(torch.ops.aten.permute, disable_meta=True) +@register_decomposition(torch.ops.aten.permute) def permute(a: TensorLikeType, *dims) -> TensorLikeType: _permutation = utils.canonicalize_dims( a.ndim, utils.extract_dims_from_varargs(dims) @@ -3285,7 +3283,7 @@ def index_add( return x.clone().index_add_(dim, index, tensor, alpha=alpha) # type: ignore[arg-type] -@register_decomposition(torch.ops.aten.index_select, disable_meta=True) +@register_decomposition(torch.ops.aten.index_select) @out_wrapper() def index_select(x: TensorLike, dim: int, index: TensorLike): dim = utils.canonicalize_dims(x.ndim, dim) @@ -3304,7 +3302,7 @@ def index_select(x: TensorLike, dim: int, index: TensorLike): # Note: although squeeze is documented as having the out= kwarg it doesn't -@register_decomposition(torch.ops.aten.squeeze, disable_meta=True) +@register_decomposition(torch.ops.aten.squeeze) def squeeze(a: TensorLikeType, dim: Optional[int] = None) -> TensorLikeType: if dim is not None: dim = utils.canonicalize_dim(a.ndim, dim) @@ -3500,7 +3498,7 @@ def diag( return torch.diagonal_copy(self, offset) -@register_decomposition(torch.ops.aten.diagonal, disable_meta=True) +@register_decomposition(torch.ops.aten.diagonal) def diagonal( self: TensorLikeType, offset: int = 0, @@ -3613,7 +3611,7 @@ def dsplit(a: TensorLikeType, sections: DimsType) -> TensorSequenceType: return tensor_split(a, sections, 2) -@register_decomposition(torch.ops.aten.t.default, disable_meta=True) +@register_decomposition(torch.ops.aten.t.default) def t(a: TensorLikeType): # TODO: Add sparse support # if a.is_sparse: @@ -3644,7 +3642,7 @@ def T(a: TensorLikeType) -> TensorLikeType: return a.t() -@register_decomposition(torch.ops.aten.transpose, disable_meta=True) +@register_decomposition(torch.ops.aten.transpose) def transpose(a: TensorLikeType, dim0: int, dim1: int) -> TensorLikeType: _dim0, _dim1 = 
utils.canonicalize_dims(a.ndim, (dim0, dim1)) # type: ignore[misc] @@ -3674,7 +3672,9 @@ def unfold( @register_decomposition(torch.ops.aten.unfold_copy) @out_wrapper() def unfold_copy(self: TensorLikeType, dimension: int, size: int, step: int): - return self.unfold(dimension, size, step).clone() + return self.unfold(dimension, size, step).clone( + memory_format=torch.contiguous_format + ) @register_decomposition(torch.ops.aten.cumsum) @@ -3701,7 +3701,7 @@ def cumsum( return sum(masked_a, dim=dim, keepdim=keepdim, dtype=dtype, out=out) -@register_decomposition(torch.ops.aten.unsqueeze, disable_meta=True) +@register_decomposition(torch.ops.aten.unsqueeze) def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType: # Note that unsqueeze canonicalizes with rank + 1 because it allows # a new innermost dimension to be specified @@ -3714,7 +3714,7 @@ def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType: # Tensor.view(a, b, c) or Tensor.view((a, b, c)) Function call torch.view # doesn't support unpacked shapes # TODO: Turn this into a decomposition (currently fails on reshape meta tests) -@register_decomposition(torch.ops.aten.view, disable_meta=True) +@register_decomposition(torch.ops.aten.view) def view(a: TensorLikeType, *shape: ShapeType) -> TensorLikeType: return _reshape_view_helper(a, *shape, allow_copy=False) diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 56b6d4b826af7..2f2f07f3db378 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -747,7 +747,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None): # is written to must be invalidated self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs) - from torch._decomp import _disabled_meta_decomps, decomposition_table + from torch._decomp import decomposition_table with self: # Decomposes CompositeImplicitAutograd ops @@ -781,7 +781,6 @@ def __torch_dispatch__(self, func, types, args=(), 
kwargs=None): if ( func in decomposition_table and torch_decomp_decompositions(func) - and func not in _disabled_meta_decomps and all(not e.is_sparse for e in flat_arg_fake_tensors) ): with self: diff --git a/torch/library.py b/torch/library.py index e97ae57267812..d75427ea4c703 100644 --- a/torch/library.py +++ b/torch/library.py @@ -114,13 +114,12 @@ def impl(self, op_name, fn, dispatch_key=''): dispatcher_op_name = name if '::' not in dispatcher_op_name: dispatcher_op_name = f'{self.ns}::{dispatcher_op_name}' - # get a string containing the names of every dispatch key that the operator has a registration for. - dispatch_key_registration = torch._C._dispatch_dump(dispatcher_op_name) + # Internally, we shouldn't be registering meta kernels for any operators that # have CompositeImplicitAutograd kernels. # Instead, we should be letting those decompositions run, and writing meta kernels # only for the base operators. - if 'CompositeImplicitAutograd' in dispatch_key_registration: + if torch._C._dispatch_has_kernel_for_dispatch_key(dispatcher_op_name, "CompositeImplicitAutograd"): raise RuntimeError( f"We should not register a meta kernel directly to the operator '{name}'," " because it has a CompositeImplicitAutograd kernel in core." 
From 1abd76b39a281ec19edf76e570cd9ff709472f4b Mon Sep 17 00:00:00 2001 From: Horace He Date: Tue, 25 Oct 2022 04:04:16 +0000 Subject: [PATCH 0124/1922] Add get_guard_expr to symbolic_shapes which returns all guards in a single expression (#87665) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87665 Approved by: https://github.com/ezyang, https://github.com/voznesenskym --- test/test_dynamic_shapes.py | 3 +-- test/test_proxy_tensor.py | 9 ++++++++ torch/fx/experimental/symbolic_shapes.py | 29 ++++++++++++++---------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index 87b1dd9aa8217..b183b6169dd6b 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -327,8 +327,7 @@ def test_guard_int(self): shape_env = ShapeEnv() a0 = create_symint(shape_env, 2) self.assertEqual(a0.guard_int(), 2) - self.assertEqual(str(shape_env.guards[0][0]), "s0") - self.assertEqual(shape_env.guards[0][1], 2) + self.assertEqual(str(shape_env.guards[0][0]), "Eq(s0, 2)") @skipIfNoSympy def test_int_conversion(self): diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index 0092daa77ab49..3c2e818497a48 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -976,6 +976,15 @@ def f(x): return x.shape self._test_dynamic(f, [(5, 3)], [[(4, 6)]]) + def test_mega_guard(self): + def f(a, b): + assert a.shape[0] == b.shape[0] * 2 + assert b.shape[0] == 8 + return a.cos() + fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(16), torch.randn(8)) + self.assertExpectedInline(str(fx_g.shape_env.get_guard_expr()), "Eq(s1, 8) & Eq(s0, 2*s1)") + + def _assert_no_guards(self, fx_g, free_symbols): assert _get_free_symbols(fx_g.shape_env) == free_symbols, fx_g.shape_env.var_to_val assert len(fx_g.shape_env.get_nontrivial_guards()) == 0, fx_g.shape_env.format_guards() diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py 
index 7615e410e2515..29a49b50ba29b 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -439,26 +439,26 @@ def evaluate_guards_for_args(self, *args): # and wrap_fake_symbolic meta_converter = MetaConverter() pytree.tree_map_only(torch.Tensor, partial(meta_converter, shape_env=new_env), args) - return all(guard.xreplace(new_env.var_to_val) == value for guard, value, _ in self.guards) + return all(guard.xreplace(new_env.var_to_val) for guard, _ in self.guards) + + def get_guard_expr(self): + """ + Returns a sympy expression representing all of the shape env guards. + + NOTE: Does not include implicit 0/1 or duck-shaping guards + """ + return sympy.And(*[guard for guard, _ in self.guards]) def get_nontrivial_guards(self): - return [(self.simplify(guard), val) for guard, val, _ in self.guards if self._maybe_evaluate_static(guard) is None] + return [self.simplify(guard) for guard, _ in self.guards if self._maybe_evaluate_static(guard) is None] def format_guards(self, verbose=False): - def format_val(guard, val): - if val is sympy.true: - return str(guard) - elif val is sympy.false: - return f"Not({guard})" - else: - return f"Eq({guard}, {val})" - def format_tb(tb): if not verbose: return "" return f"\n Guarded at:\n{textwrap.indent(tb, ' ')}" - return '\n'.join(f" - {format_val(guard, val)}{format_tb(tb)}" for guard, val, tb in self.guards) + return '\n'.join(f" - {guard}{format_tb(tb)}" for guard, tb in self.guards) def get_shape_groups(self): shape_groups = collections.defaultdict(list) @@ -600,5 +600,10 @@ def evaluate_expr(self, expr: "sympy.Expr"): # NB: drop two frames; evaluate_expr and the Sym* function that # actually called us stack = ''.join(traceback.format_list(traceback.extract_stack()[:-2])) - self.guards.append((expr, concrete_val, stack)) + if concrete_val is sympy.true: + self.guards.append((expr, stack)) + elif concrete_val is sympy.false: + self.guards.append((sympy.Not(expr), stack)) + else: + 
self.guards.append((sympy.Eq(expr, concrete_val), stack)) return concrete_val From f567c08906d40f7bc5e4add7dc7d52f2e40d3516 Mon Sep 17 00:00:00 2001 From: stumpOS Date: Tue, 25 Oct 2022 17:00:27 +0000 Subject: [PATCH 0125/1922] consider numel args when identifying aligned args (#87394) Fixes #ISSUE_NUMBER https://github.com/pytorch/torchdynamo/issues/1527 cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu Pull Request resolved: https://github.com/pytorch/pytorch/pull/87394 Approved by: https://github.com/jansel --- test/inductor/test_torchinductor.py | 82 +++++++++++++++++++++++++++++ torch/_inductor/codegen/triton.py | 8 +-- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index c106658b21c0e..bec1ea197c078 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -6,6 +6,7 @@ import os import random import sys +import typing import unittest import weakref from unittest.mock import patch @@ -4402,6 +4403,87 @@ def forward(pred_objectness_logits_3_: torch.Tensor): result = forward(*args) assert same(result, torch.sort(args[0], descending=True, dim=1)[0]) + class TritonCodeGenTests(TestCase): + from torch._inductor.triton_ops.autotune import CachingAutotuner + + class NoOpCompilerBackend: + def __init__(self): + self.example_args = None + self.model = None + + def noop_backend( + self, + model_: torch.fx.GraphModule, + example_inputs_: typing.List[torch.Tensor], + ): + """ + The Noop backend does not compile the fx graph it is given. + Instead, it transforms the fx graph so that its functions are + aten operations. It then saves this graph. 
+ """ + from functorch._src.aot_autograd import Interpreter + from torch._inductor.decomposition import select_decomp_table + from torch._subclasses import FakeTensorMode + + fake_mode = FakeTensorMode() + + def interpret(*args, **kwargs): + return Interpreter(model_).run(*args[0:], **kwargs) + + fake_flat_tensor_args = [ + fake_mode.from_tensor(x) for x in example_inputs_ + ] + fw_module = make_fx(interpret, select_decomp_table())( + *fake_flat_tensor_args + ) + self.model = fw_module + self.example_args = fake_flat_tensor_args + return lambda x: example_inputs_ + + def get_kernels(self, fn, args) -> typing.List[CachingAutotuner]: + from torch._inductor.debug import DebugContext + from torch._inductor.graph import GraphLowering + from torch._inductor.virtualized import V + + cxt = TritonCodeGenTests.NoOpCompilerBackend() + torch._dynamo.optimize(backend=cxt.noop_backend)(fn)(*args) + graph = GraphLowering(cxt.model) + graph.num_static_inputs = 0 + kernels = [] + with V.set_graph_handler(graph), V.set_debug_handler(DebugContext()): + graph.run(*(cxt.example_args)) + mod = graph.compile_to_module() + i = 0 + while True: + attribute = f"kernel{i}" + if not hasattr(mod, attribute): + break + else: + kernels.append(getattr(mod, attribute)) + i = i + 1 + return kernels + + def test_divisibile_by_16_covers_numel_args(self): + def fn(a: torch.Tensor) -> torch.Tensor: + return torch.sum(a) + + kernels = self.get_kernels(fn, [torch.randn([256, 256], device="cuda")]) + self.assertTrue(len(kernels) == 2, "SUM should result in two kernels") + + # kernel0 reduces from 256 to (xnumel=8, rnumel=8192), which means it reduces 256 by 256 into an array of + # size 8 by accumulating 8192 elements at once note that rnumel is equal to 512 * 16, so rnumel which is + # at slot 3 should be in the divisible by 16 descriptor + arguments_that_are_divisible_by_16_in_kernel0 = ( + kernels[0].meta["configs"][0].divisible_by_16 + ) + self.assertEqual(arguments_that_are_divisible_by_16_in_kernel0, 
(0, 1, 3)) + + # kernel1 reduces from 8 elements to a single scalar. + arguments_that_are_divisible_by_16_in_kernel1 = ( + kernels[1].meta["configs"][0].divisible_by_16 + ) + self.assertEqual(arguments_that_are_divisible_by_16_in_kernel1, (0, 1)) + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 832a0e6c82b4c..0ece1a06c9fa0 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -991,15 +991,14 @@ def codegen_kernel(self, name=None): triton_meta = { "signature": dict(enumerate(map(signature_of, signature))), "device": V.graph.scheduler.current_device.index, - "configs": [config_of(signature)], "constants": {}, } for tree in self.range_trees: if tree.prefix != "r" or self.inside_reduction: - triton_meta["signature"][len(argdefs)] = signature_of( - SizeArg(f"{tree.prefix}numel", tree.numel) - ) + sizearg = SizeArg(f"{tree.prefix}numel", tree.numel) + signature.append(sizearg) + triton_meta["signature"][len(argdefs)] = signature_of(sizearg) argdefs.append(f"{tree.prefix}numel") # constexpr version causes issues, see # https://github.com/pytorch/torchdynamo/pull/1362 @@ -1007,6 +1006,7 @@ def codegen_kernel(self, name=None): # tree.numel # ) # argdefs.append(f"{tree.prefix}numel: tl.constexpr") + triton_meta["configs"] = [config_of(signature)] for tree in self.range_trees: if tree.prefix != "r" or self.inside_reduction: From 9c72769164a218f404231fb9552dff84216469f5 Mon Sep 17 00:00:00 2001 From: Tugsbayasgalan Manlaibaatar Date: Mon, 24 Oct 2022 15:44:46 -0700 Subject: [PATCH 0126/1922] Add named_buffers to torchdynamo nn_module (#87644) Fixes: https://github.com/pytorch/torchdynamo/issues/1738 cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87644 Approved by: https://github.com/jansel --- 
test/dynamo/test_repros.py | 19 +++++++++++++++++++ torch/_dynamo/variables/nn_module.py | 7 +++++++ 2 files changed, 26 insertions(+) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 52802f32ad1e8..bbb8ba527fc73 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -1736,6 +1736,25 @@ def forward(self, x): args = (torch.randn(3, 4),) self.assertTrue(same(mod(*args), opt_mod(*args))) + def test_named_buffers(self): + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("x", torch.ones(3)) + self.register_buffer("y", torch.ones(3)) + + def forward(self, inp): + res = 0 + for name, buffer in self.named_buffers(): + res += buffer.sum() + + return inp.cos() + res + + mod = Foo() + opt_mod = torch._dynamo.optimize("eager", nopython=True)(mod) + args = (torch.randn(3, 4),) + self.assertTrue(same(mod(*args), opt_mod(*args))) + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py index 87a94565e180a..6f7c2ff287373 100644 --- a/torch/_dynamo/variables/nn_module.py +++ b/torch/_dynamo/variables/nn_module.py @@ -323,6 +323,13 @@ def named_embed(name, obj): ): result.append(named_embed(name, param)) return ListIteratorVariable(result, mutable_local=MutableLocal(), **options) + elif name == "named_buffers": + result = [] + for name, buffer in module.named_buffers( + **get_kwargs("prefix", "recurse", "remove_duplicate") + ): + result.append(named_embed(name, buffer)) + return ListIteratorVariable(result, mutable_local=MutableLocal(), **options) elif name == "named_modules": result = [] for name, submod in module.named_modules( From cf08badfd700da919e89e5459dcaefda6640f2ae Mon Sep 17 00:00:00 2001 From: Valentin Andrei Date: Tue, 25 Oct 2022 17:03:23 +0000 Subject: [PATCH 0127/1922] [pytorch] Layer norm backward speed gain with warp shuffles (#87445) Test Plan: ``` Times below are 
Forward + Backward on A100 Size FP32. Gain. FP16. Gain 256, 256 101.30 9% 103.9 6% 512, 256 110.10 -4% 102.9 10% 1024, 256 104.30 7% 102.4 6% 2048, 256 107.60 4% 109.7 0% 4096, 256 116.70 8% 109.1 0% 6144, 256 106.10 7% 112.8 2% 8192, 256 106.10 1% 109.7 2% 256, 512 102.10 3% 108.5 1% 512, 512 101.50 40% 105.9 4% 1024, 512 109.70 20% 109.2 -1% 2048, 512 107.40 24% 107.2 1% 4096, 512 108.00 6% 110.6 -3% 6144, 512 103.90 13% 105.8 7% 8192, 512 138.70 14% 105.6 7% 256, 1024 106.20 1% 102.9 6% 512, 1024 104.50 4% 104.2 3% 1024, 1024 126.90 -15% 103.9 10% 2048, 1024 127.40 -15% 102.2 6% 4096, 1024 117.70 6% 102.8 21% 6144, 1024 165.30 11% 112.2 12% 8192, 1024 211.90 11% 144.8 13% 256, 1536 102.80 11% 103.1 6% 512, 1536 103.30 9% 102.9 18% 1024, 1536 111.00 -2% 117.2 7% 2048, 1536 102.30 12% 132.1 -4% 4096, 1536 165.50 5% 112.9 18% 6144, 1536 236.60 5% 145.7 12% 8192, 1536 307.80 5% 186.1 11% 256, 2048 110.60 -1% 103.8 7% 512, 2048 105.20 3% 105.6 1% 1024, 2048 106.70 3% 114.8 3% 2048, 2048 124.90 5% 109.7 0% 4096, 2048 231.40 4% 129.9 10% 6144, 2048 332.80 4% 182.5 11% 8192, 2048 434.60 4% 235.2 11% 256, 3072 111.60 8% 110.8 1% 512, 3072 106.80 1% 104.6 10% 1024, 3072 104.90 3% 109.9 4% 2048, 3072 193.80 0% 106.2 10% 4096, 3072 364.50 0% 187.8 5% 6144, 3072 538.30 0% 267 5% 8192, 3072 718.00 -1% 346.7 6% 256, 4096 103.60 4% 110.2 -1% 512, 4096 131.40 -11% 117 -7% 1024, 4096 135.80 1% 104.8 7% 2048, 4096 268.20 1% 149.4 10% 4096, 4096 520.70 1% 268.5 9% 6144, 4096 786.30 0% 389.8 9% 8192, 4096 1043.50 0% 509 10% ``` Used the following script from ngimel: ``` import torch from torch.utils.benchmark import Compare, Timer results = [] for dtype in (torch.float, torch.half): for fs in (256, 512, 1024, 1536, 2048, 3072, 4096): for bs in (256, 512, 1024, 2048, 4096, 6144, 8192): ln = torch.nn.LayerNorm((fs,), device="cuda", dtype=dtype) X = torch.randn(bs, fs, device="cuda", dtype=dtype, requires_grad=True) gO = torch.rand_like(X) stmtfwd = "ln(X)" stmtfwdbwd = "X.grad=None; 
ln.zero_grad(set_to_none=True); out = ln(X); out.backward(gO)" tfwd = Timer( stmt=stmtfwd, label="ln", sub_label=f"{bs:5}, {fs:5}", description=f"fwd, {dtype}", globals=globals(), ) tfwdbwd = Timer( stmt=stmtfwdbwd, label="ln", sub_label=f"{bs:5}, {fs:5}", description=f"fwdbwd, {dtype}", globals=globals(), ) for t in (tfwd, tfwdbwd): results.append(t.blocked_autorange()) print(fs, end="\r") c = Compare(results) c.print() ``` Differential Revision: D40567574 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87445 Approved by: https://github.com/ngimel --- .../src/ATen/native/cuda/layer_norm_kernel.cu | 242 ++++++++++++++---- 1 file changed, 188 insertions(+), 54 deletions(-) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index ae09f0aaad8f8..732545465d9c9 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -33,6 +33,7 @@ namespace { constexpr int kCUDANumThreads = 256; constexpr int kColwiseReduceTileSize = 32; +constexpr int kWarpSize = 32; constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh) @@ -555,8 +556,108 @@ __global__ void GammaBetaBackwardCUDAKernel1( } } +template +__global__ void GammaBetaBackwardCUDAKernel_32x32( + int64_t M, + int64_t N, + const T* dY, + const T* X, + const T_ACC* mean, + const T_ACC* rstd, + T* dg, + T* db) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T_ACC* s_data_typed = reinterpret_cast(&s_data1); + T_ACC* s_dg; + T_ACC* s_db; + T_ACC dg_sum = 0; + T_ACC db_sum = 0; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + + if (j < N) { + constexpr int unroll_factor = 8; + int laneId = threadIdx.x & 0x1f; + + T_ACC mean_reg, mean_reg_tmp; + T_ACC rstd_reg, rstd_reg_tmp; + T dY_reg; + T X_reg; + + 
// Main loop + int bcounter; + for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); + bcounter++) { + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor; + + if (laneId < unroll_factor) { + mean_reg_tmp = mean[offset + laneId]; + rstd_reg_tmp = rstd[offset + laneId]; + } +#if !defined(USE_ROCM) + // Volta and newer architectures allow lane divergence within a warp. + __syncwarp(); +#endif + + #pragma unroll + for (int ii = 0; ii < unroll_factor; ++ii) { + dY_reg = dY[(offset + ii) * N + j]; + X_reg = X[(offset + ii) * N + j]; + mean_reg = WARP_SHFL(mean_reg_tmp, ii, kWarpSize); + rstd_reg = WARP_SHFL(rstd_reg_tmp, ii, kWarpSize); + dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg; + db_sum += dY_reg; + } + } + + // Remainder loop + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor; + for (int ii = 0; ii < unroll_factor; ii++) { + if ((offset + ii) < M) { + mean_reg = mean[offset + ii]; + rstd_reg = rstd[offset + ii]; + dY_reg = dY[(offset + ii) * N + j]; + X_reg = X[(offset + ii) * N + j]; + dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg; + db_sum += dY_reg; + } + } + + // This kernel uses a block of (32 x 32) and gets called when M and N + // are divisible by 32. We can use warp shuffles for the final reduction + // step. This removes 4 shmem loads and stores with their + // corresponding __syncthreads() + + // This greatly reduces bank conflicts at the expense of a little + // extra shared memory.
It does not impact occupancy + int padded_bx = (1 + blockDim.x); + + s_dg = s_data_typed; + s_db = s_data_typed + (padded_bx * blockDim.y); + s_dg[threadIdx.y * padded_bx + threadIdx.x] = dg_sum; + s_db[threadIdx.y * padded_bx + threadIdx.x] = db_sum; + __syncthreads(); + + // Load transposed so that a warp holds an entire column + T_ACC reg_dg = s_dg[threadIdx.x * padded_bx + threadIdx.y]; + T_ACC reg_db = s_db[threadIdx.x * padded_bx + threadIdx.y]; + for (int delta = 16; delta >= 1; delta /= 2) { + reg_dg += WARP_SHFL_XOR(reg_dg, delta, kWarpSize); + reg_db += WARP_SHFL_XOR(reg_db, delta, kWarpSize); + } + + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (dg) { + dg[j] = reg_dg; + } + if (db) { + db[j] = reg_db; + } + } + } +} template __global__ void GammaBetaBackwardCUDAKernel( @@ -569,66 +670,75 @@ __global__ void GammaBetaBackwardCUDAKernel( T* dg, T* db) { alignas(sizeof(double)) extern __shared__ char s_data1[]; - T_ACC * s_data_typed = reinterpret_cast(&s_data1); + T_ACC* s_data_typed = reinterpret_cast(&s_data1); + T_ACC* s_dg; + T_ACC* s_db; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; - constexpr int unroll = 8; - T dYs[unroll]; - T Xs[unroll]; - T_ACC * means = s_data_typed; - T_ACC * rstds = s_data_typed + unroll * blockDim.y; + T_ACC dg_sum = 0; T_ACC db_sum = 0; + if (j < N) { + constexpr int unroll_factor = 8; + + T_ACC mean_reg; + T_ACC rstd_reg; + T dY_reg; + T X_reg; + + // Main Loop int bcounter; - for (bcounter = 0; bcounter < M/(blockDim.y * unroll); bcounter++){ - int offset = (bcounter * blockDim.y + threadIdx.y) * unroll; - #pragma unroll - for (int ii=0; ii=1; offset /= 2){ + + for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) { if (threadIdx.y < offset) { - s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; - s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] += - 
s_data_typed[blockDim.x * blockDim.y + (threadIdx.y + offset) * blockDim.x + threadIdx.x]; - } + s_dg[threadIdx.y * blockDim.x + threadIdx.x] += + s_dg[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; + s_db[threadIdx.y * blockDim.x + threadIdx.x] += + s_db[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; + } __syncthreads(); } + if (threadIdx.y == 0) { if (dg) { - dg[j] = s_data_typed[threadIdx.x]; + dg[j] = s_dg[threadIdx.x]; } if (db) { - db[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y]; + db[j] = s_db[threadIdx.x]; } } } @@ -763,7 +873,8 @@ void LayerNormBackwardKernelImplInternal( T* dgamma_data = dgamma->defined() ? dgamma->template data_ptr() : nullptr; T* dbeta_data = dbeta->defined() ? dbeta->template data_ptr() : nullptr; - if (M < 512) { + + if (M < 128) { // For small batch size, do colwise reduce directly. const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; GammaBetaBackwardSimpleCUDAKernel @@ -778,19 +889,42 @@ void LayerNormBackwardKernelImplInternal( dbeta_data); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - dim3 threads{16, 32}; - int blocks = (N + threads.x-1)/threads.x; - GammaBetaBackwardCUDAKernel - <<>>( - M, - N, - dY_data, - X_data, - mean_data, - rstd_data, - dgamma_data, - dbeta_data); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + if ((M % kWarpSize == 0) && (N % kWarpSize == 0)) { + // This implementation relies on warp primitives and requires that M and N divide + // exactly to warp size. + dim3 threads{kWarpSize, kWarpSize}; + int blocks = (N + threads.x - 1) / threads.x; + + // If M and N divide by 32, we can use warp shuffles for the final reduction. That requires + // transposing values in shared memory, so we apply a padding to reduce bank conflicts. 
+ size_t shmem_sz = 2 * sizeof(T_ACC) * (threads.x + 1) * threads.y; + GammaBetaBackwardCUDAKernel_32x32 + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + dim3 threads{16, 32}; + int blocks = (N + threads.x - 1) / threads.x; + size_t shmem_sz = 2 * sizeof(T_ACC) * threads.x * threads.y; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } } } } From b45e93f42090066e9d6ac9d6145aa3d36f173d6d Mon Sep 17 00:00:00 2001 From: Will Constable Date: Tue, 25 Oct 2022 02:35:41 +0000 Subject: [PATCH 0128/1922] Graph-break on FSDP in dynamo (#87420) Why we want to graph-break FSDP - FSDP has communication ops during forward and backward which we currently can't trace into the graph but also want to ensure are overlapped with compute - dynamo has issues tracing into or capturing a call to fsdp module without a break (see below) How we graph-break on FSDP - marking FSDP.forward code as skip means the code frames will graph-break; but in this case all of torch.* is listed in skipfiles.py anyway, so this is taken care of - disallowing the FSDP module prevents dynamo trying to record a 'call_module(FSDPmodule)' node into a graph, which happens earlier than the graphbreak that would be caused by skip, and causes additional issues: dynamo deepcopies modules before call-module handling, and FSDP module isn't trivially deep-copyable cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87420 Approved by: https://github.com/aazzolini --- torch/_dynamo/allowed_functions.py | 19 +++++++++++++++++++ torch/_dynamo/skipfiles.py | 16 ++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/torch/_dynamo/allowed_functions.py b/torch/_dynamo/allowed_functions.py index 
42a6580ac1c86..67daafc5adac7 100644 --- a/torch/_dynamo/allowed_functions.py +++ b/torch/_dynamo/allowed_functions.py @@ -18,6 +18,24 @@ from . import config from .utils import is_safe_constant +""" +A note on allowed functions: + +Dynamo consults this file to determine if a particular function/module +is allowed to appear as a node in its fx output. + +If a function is disallowed, it may either be traced-through, or skipped. + +Trace-through means dynamo will continue to trace the interior code for +the function/module rather than stopping at its boundary and recording it +as a node in the fx graph. Whether tracing through or allowing, the functionality +of the function/module is part of the dynamo graph. Caveat: if tracing through, +any interior operation could trigger its own graph-break. + +Skips are determined by (torch/_dynamo/skipfiles.py) - see "a note on +skipfiles" there. +""" + def make_function_id_set(lazy_initializer): """ @@ -130,6 +148,7 @@ def _is_allowed_module_prefix(obj): "torch._inductor.", "torch._C.inductor.", "torch.fx.", + "torch.distributed.fsdp.", ) allowed_modules_dot = tuple([x + "." for x in allowed_modules]) module = inspect.getmodule(obj) diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py index 2b6fbb3959c8d..ee2ad3f9395ff 100644 --- a/torch/_dynamo/skipfiles.py +++ b/torch/_dynamo/skipfiles.py @@ -49,6 +49,22 @@ from . import config +""" +A note on skipfiles: + +Dynamo consults this file to determine whether code should be compiled or skipped. + +A skip applies at the frame boundary, meaning dynamo either triggers a graph break +at the beginning of the frame or attempts to trace the whole frame. When skipping +a frame, recursively called frames are still traced by dynamo unless also skipped. + +Skipfiles (skipped at the file level instead of function level) still apply on a +frame-by-frame boundary as dynamo traces, but apply to all functions in that file. 
+ +@skip is a helper decorator that can be applied to your function to cause it to be +included here. +""" + def _strip_init_py(s): return re.sub(r"__init__.py$", "", s) From dc2b01a61c79bdcdc82297b4f08822690fd86251 Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Tue, 25 Oct 2022 17:34:29 +0000 Subject: [PATCH 0129/1922] Disable test_inductor_timm_shard (#87710) Summary: tests are flaky. Need more time for investigation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87710 Approved by: https://github.com/anijain2305, https://github.com/malfet --- .github/workflows/inductor.yml | 12 ++---------- .jenkins/pytorch/test.sh | 13 +++---------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index da27466b60e90..e6a79e2a738d8 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -22,16 +22,8 @@ jobs: cuda-arch-list: 8.6 test-matrix: | { include: [ - { config: "inductor", shard: 1, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 3, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 4, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 5, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 6, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 7, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 8, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 9, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 10, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", 
shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} linux-bionic-cuda11_6-py3_10-gcc7-inductor-test: diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index b263c1949c10f..a1381a5c75957 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -738,24 +738,17 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR install_filelock install_triton test_dynamo_shard 2 -elif [[ "${TEST_CONFIG}" == *inductor* && $SHARD_NUMBER -lt 9 && $NUM_TEST_SHARDS -gt 1 ]]; then +elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision install_filelock install_triton - install_timm - id=$((SHARD_NUMBER-1)) - test_inductor_timm_shard $id -elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 9 && $NUM_TEST_SHARDS -gt 1 ]]; then + test_inductor +elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision install_filelock install_triton install_huggingface test_inductor_huggingface_shard 0 -elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 10 && $NUM_TEST_SHARDS -gt 1 ]]; then - install_torchvision - install_filelock - install_triton - test_inductor elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then test_without_numpy install_torchvision From e6e2f7b3a944cf161d98711b0fbecb44ca7cc435 Mon Sep 17 00:00:00 2001 From: AllenTiTaiWang Date: Tue, 25 Oct 2022 15:55:31 +0000 Subject: [PATCH 0130/1922] [ONNX] Support quantized::conv1d_relu (#85997) According to #38248, quantized::conv1d_relu shares packing parameters with Conv2D (kspatialDim is also 2), and needs a different unpacking way. Therefore, a new `QuantizedParamsType=Conv1D` is used to differentiate the two, and has to extract 1D information from 2D packed parameters. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85997 Approved by: https://github.com/BowenBao --- test/onnx/test_pytorch_onnx_onnxruntime.py | 14 ++++++ .../passes/onnx/unpack_quantized_weights.cpp | 47 ++++++++++++++----- torch/onnx/symbolic_opset10.py | 26 ++++++++++ 3 files changed, 76 insertions(+), 11 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index e917e44ce21bd..bc70011b78871 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -11834,6 +11834,20 @@ def test_quantized_conv2d_relu(self): q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) self.run_test(model, q_input) + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_conv1d_relu(self): + model = torch.nn.intrinsic.quantized.ConvReLU1d(16, 33, 3, stride=2) + # Manually initialize model weight and bias to random numbers. + # By default all zeros. + q_weight = torch.quantize_per_tensor( + torch.randn(33, 16, 3), 0.5, 0, torch.qint8 + ) + bias = torch.arange(33).to(torch.float) - 16 + model.set_weight_bias(q_weight, bias) + input = torch.randn(3, 16, 32) + q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) + self.run_test(model, q_input) + @common_utils.parametrize( "function_or_module", [ diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp index f5a50e76fcae4..300e3452a8d17 100644 --- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp +++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp @@ -299,7 +299,10 @@ void ConvertQuantizedWeight( } } -enum class QuantizedParamsType { CONV, LINEAR }; +// CONV1D needs a different unpacking from CONV, since it's +// packed as CONV2D intentionally at the first place. 
+// See: https://github.com/pytorch/pytorch/pull/38248 +enum class QuantizedParamsType { CONV1D, CONV, LINEAR }; // This is called before the onnx pass. Using pattern matching we // find the relevant nodes and extract the packed_params. The packed_params are @@ -413,7 +416,8 @@ void unpackQuantizedWeightsHelper( groups = groups_int; transpose = transpose_int; } else if ( - params_type == QuantizedParamsType::CONV && + (params_type == QuantizedParamsType::CONV || + params_type == QuantizedParamsType::CONV1D) && ser_tup->elements()[0].isString()) { const auto& elements = ser_tup->elements(); auto version = elements[0].toStringRef(); @@ -426,25 +430,32 @@ void unpackQuantizedWeightsHelper( const int64_t kSpatialDim = conv_params_packed[0].item(); // skip kSpatialDim int64_t idx = 1; + // kSpatialDim is 2 even for Conv1D, because the torch op packs Conv1D as Conv2D, + // so we need a special unpack for Conv1D, whose packed params have Conv2D dims. + // See: https://github.com/pytorch/pytorch/pull/38248 for (const auto i : c10::irange(kSpatialDim)) { - (void)i; // Suppress unused variable warning - stride_int.emplace_back(conv_params_packed[idx].item()); + if (params_type != QuantizedParamsType::CONV1D || i != 0) { + stride_int.emplace_back(conv_params_packed[idx].item()); + } idx++; } for (const auto i : c10::irange(kSpatialDim)) { - (void)i; // Suppress unused variable warning - padding_int.emplace_back(conv_params_packed[idx].item()); + if (params_type != QuantizedParamsType::CONV1D || i != 0) { + padding_int.emplace_back(conv_params_packed[idx].item()); + } idx++; } for (const auto i : c10::irange(kSpatialDim)) { - (void)i; // Suppress unused variable warning - dilation_int.emplace_back(conv_params_packed[idx].item()); + if (params_type != QuantizedParamsType::CONV1D || i != 0) { + dilation_int.emplace_back(conv_params_packed[idx].item()); + } idx++; } for (const auto i : c10::irange(kSpatialDim)) { - (void)i; // Suppress unused variable warning - output_padding_int.emplace_back( -
conv_params_packed[idx].item()); + if (params_type != QuantizedParamsType::CONV1D || i != 0) { + output_padding_int.emplace_back( + conv_params_packed[idx].item()); + } idx++; } groups_int = conv_params_packed[idx].item(); @@ -461,6 +472,9 @@ void unpackQuantizedWeightsHelper( torch::List optional = elements[2].toList(); bias = optional.get(0).toOptional(); + if (params_type == QuantizedParamsType::CONV1D) { + unpacked_weight = unpacked_weight.squeeze_(2); + } stride = stride_int; padding = padding_int; dilation = dilation_int; @@ -638,6 +652,10 @@ void UnpackQuantizedWeights( graph(%input, %packed_weight, %w_scale, %w_zero_point): %r = quantized::linear(%input, %packed_weight, %w_scale, %w_zero_point) return (%r) )"; + std::string qconv1d_relu = R"( + graph(%input, %packed_params, %scale, %zero_point): + %r = quantized::conv1d_relu(%input, %packed_params, %scale, %zero_point) + return (%r) )"; std::string qconv2d = R"( graph(%input, %packed_params, %scale, %zero_point): %r = quantized::conv2d(%input, %packed_params, %scale, %zero_point) @@ -668,6 +686,13 @@ void UnpackQuantizedWeights( "quantized::conv2d_unpack", QuantizedParamsType::CONV, caffe2); + unpackQuantizedWeightsHelper( + graph, + paramsDict, + qconv1d_relu, + "quantized::conv1d_unpack", + QuantizedParamsType::CONV1D, + caffe2); unpackQuantizedWeightsHelper( graph, paramsDict, diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index f88e1fe797c8b..bc04db1f37f59 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -41,6 +41,7 @@ "quantized_add_relu", "quantized_add", "quantized_cat", + "quantized_conv1d_relu", "quantized_conv2d_relu", "quantized_conv2d", "quantized_group_norm", @@ -826,6 +827,31 @@ def quantized_instance_norm( return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) +@_onnx_symbolic("quantized::conv1d_relu") +@_beartype.beartype +def quantized_conv1d_relu( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, 
+ stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv1d(g, input, weight, bias, stride, padding, dilation, groups) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + @_onnx_symbolic("quantized::conv2d_relu") @_beartype.beartype def quantized_conv2d_relu( From 90f2a0675836804b5dff41ea1a1257e336c8d794 Mon Sep 17 00:00:00 2001 From: AllenTiTaiWang Date: Tue, 25 Oct 2022 15:52:17 +0000 Subject: [PATCH 0131/1922] [ONNX] replace AT_ASSERT with TORCH_INTERNAL_ASSERT take 2 (#86405) Address the AT_ASSERT in torch/csrc/jit/serialization (ONNX related). Pull Request resolved: https://github.com/pytorch/pytorch/pull/86405 Approved by: https://github.com/justinchuby, https://github.com/BowenBao --- torch/csrc/jit/passes/onnx.cpp | 18 ++++++------- .../pattern_conversion/pattern_conversion.cpp | 10 ++++--- torch/csrc/jit/serialization/export.cpp | 27 ++++++++++--------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index 98f3cb42aea0f..f5e948b2cacf3 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -59,13 +59,13 @@ void checkONNXCompatibility(const c10::FunctionSchema& schema) { if (type->kind() == TypeKind::OptionalType) { type = reinterpret_cast(type.get())->getElementType(); // recursive optional type is not supported - AT_ASSERT(type->kind() != TypeKind::OptionalType); + TORCH_INTERNAL_ASSERT(type->kind() != TypeKind::OptionalType); } if (type->kind() == TypeKind::ListType) { const auto& elem_type = reinterpret_cast(type.get())->getElementType(); if
(elem_type->isSubtypeOf(*TensorType::get())) { - AT_ASSERTM( + TORCH_INTERNAL_ASSERT( !has_tensor_list, "ONNX export supports at most one TensorList as input."); has_tensor_list = true; @@ -92,7 +92,7 @@ void preprocessCaffe2Ops(Block* block) { size_t origin_inputs_index = 0; for (const auto& arg : args) { auto type = arg.type(); - AT_ASSERT(origin_inputs_index < origin_inputs.size()); + TORCH_INTERNAL_ASSERT(origin_inputs_index < origin_inputs.size()); const auto& origin_input = origin_inputs[origin_inputs_index++]; if (type->kind() == TypeKind::OptionalType && origin_input->mustBeNone()) { @@ -104,24 +104,24 @@ void preprocessCaffe2Ops(Block* block) { type->kind() == TypeKind::BoolType || type->kind() == TypeKind::IntType) { const auto* constant_node = origin_input->node(); - AT_ASSERT(constant_node->kind() == prim::Constant); + TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant); it->i_(Symbol::attr(arg.name()), constant_node->i(attr::value)); } else if (type->kind() == TypeKind::FloatType) { const auto* constant_node = origin_input->node(); - AT_ASSERT(constant_node->kind() == prim::Constant); + TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant); it->f_(Symbol::attr(arg.name()), constant_node->f(attr::value)); } else if (type->kind() == TypeKind::StringType) { const auto* constant_node = origin_input->node(); - AT_ASSERT(constant_node->kind() == prim::Constant); + TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant); it->s_(Symbol::attr(arg.name()), constant_node->s(attr::value)); } else if (type->kind() == TypeKind::ListType) { const auto& list_node = origin_input->node(); const auto& elem_type = type->castRaw()->getElementType(); - AT_ASSERT( + TORCH_INTERNAL_ASSERT( list_node->kind() == prim::ListConstruct || list_node->kind() == prim::Constant); if (elem_type->isSubtypeOf(*TensorType::get())) { - AT_ASSERT(list_node->kind(), prim::ListConstruct); + TORCH_INTERNAL_ASSERT(list_node->kind(), prim::ListConstruct); const auto& 
tensor_list = origin_input->node()->inputs(); for (const auto& t : tensor_list) { it->addInput(t); @@ -131,7 +131,7 @@ void preprocessCaffe2Ops(Block* block) { if (list_node->kind() == prim::ListConstruct) { for (const auto* elem_input : list_node->inputs()) { const auto* constant_node = elem_input->node(); - AT_ASSERT(constant_node->kind() == prim::Constant); + TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant); values.push_back(constant_node->f(attr::value)); } } else { // is a constant list diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp index 2280ea6eb30bb..d93e34f87c6e9 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp @@ -146,8 +146,8 @@ std::unordered_map MergeSliceAndSelectToIndices( std::forward_as_tuple(index_tensor, aten::select)); dim_offset++; } else { - AT_ERROR( - "Unexpected node kind ", + TORCH_CHECK( + false, node->kind().toDisplayString(), " Expected aten::slice or aten::select."); } @@ -202,7 +202,8 @@ std::vector ReshapeToAdvancedIndexingFormat( if (((max_index_dim - min_index_dim + 1) != tensor_ind_count) && tensor_ind_count != 0) { - AT_ERROR( + TORCH_CHECK( + false, "Only consecutive 1-d tensor indices are supported in exporting aten::index_put to ONNX.", "Check https://pytorch.org/docs/stable/onnx.html#indexing for details"); } @@ -230,7 +231,8 @@ std::vector ReshapeToAdvancedIndexingFormat( break; } default: - AT_ERROR("Unexpected node kind ", index_i->second.orig_node_kind); + TORCH_CHECK( + false, "Unexpected node kind ", index_i->second.orig_node_kind); } if (ind_size != 1) { diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp index 2f178addda955..f5f5ab7c99088 100644 --- a/torch/csrc/jit/serialization/export.cpp +++ b/torch/csrc/jit/serialization/export.cpp @@ -729,7 +729,7 
@@ void GraphEncoder::EncodeBlock( bool add_node_names, bool use_external_data_format, const std::string& onnx_file_path) { - AT_ASSERT(graph_proto != nullptr); + TORCH_INTERNAL_ASSERT(graph_proto != nullptr); std::string block_name = "torch_jit"; if (num_blocks_) { block_name += std::to_string(num_blocks_); @@ -806,7 +806,7 @@ void GraphEncoder::AddInitializersIntoGraphProto( const std::map& initializers, bool use_external_data_format, const std::string& onnx_file_path) { - AT_ASSERT(block->inputs().size() >= initializers.size()); + TORCH_INTERNAL_ASSERT(block->inputs().size() >= initializers.size()); for (auto input : block->inputs()) { auto name_tensor_pair = initializers.find(input->debugName()); if (name_tensor_pair == initializers.end()) { @@ -888,7 +888,7 @@ void GraphEncoder::EncodeNode( node_proto->set_domain(domain); } if (operator_export_type_ == onnx_torch::OperatorExportTypes::ONNX) { - AT_ASSERT( + TORCH_INTERNAL_ASSERT( !node->kind().is_aten() && !node->kind().is_prim() && !node->kind().is_attr()); } @@ -923,7 +923,7 @@ void GraphEncoder::EncodeNode( node_proto, node, attr_name, use_external_data_format, onnx_file_path); } if (node->kind() == ::c10::onnx::Loop) { - AT_ASSERT(node->blocks().size() == 1); + TORCH_INTERNAL_ASSERT(node->blocks().size() == 1); auto body = node_proto->add_attribute(); body->set_name("body"); @@ -940,7 +940,7 @@ void GraphEncoder::EncodeNode( onnx_file_path); } if (node->kind() == ::c10::onnx::If) { - AT_ASSERT(node->blocks().size() == 2); + TORCH_INTERNAL_ASSERT(node->blocks().size() == 2); auto then_branch = node_proto->add_attribute(); then_branch->set_name("then_branch"); @@ -978,7 +978,7 @@ void GraphEncoder::AddAttribute( const std::string& ref_attr_name, const AttributeKind attr_kind) { auto attr = node_proto->add_attribute(); - AT_ASSERT(name.is_attr()); + TORCH_INTERNAL_ASSERT(name.is_attr()); attr->set_name(name.toUnqualString()); attr->set_ref_attr_name(ref_attr_name); 
attr->set_type(ATenAttributeKindToOnnxAttributeType(attr_kind, name)); @@ -1009,7 +1009,7 @@ void GraphEncoder::AddAttribute( }; auto attr = node_proto->add_attribute(); - AT_ASSERT(name.is_attr()); + TORCH_INTERNAL_ASSERT(name.is_attr()); attr->set_name(name.toUnqualString()); attr->set_type( ATenAttributeKindToOnnxAttributeType(node->kindOf(name), name)); @@ -1236,7 +1236,7 @@ void GraphEncoder::EncodeTensor( // or use_external_data_format should be true, not both at the same time. They // can both be false at the same time (for ONNX export for regular model // size). - AT_ASSERT( + TORCH_INTERNAL_ASSERT( !((defer_weight_export_ && external_ref) && use_external_data_format)); // Add a buffer to the raw_data_export_map for the caller to dump into an // external data store. If external_ref is not specified, we instead dump @@ -1244,18 +1244,19 @@ void GraphEncoder::EncodeTensor( if (defer_weight_export_ && external_ref) { // For now, we use the name of the tensor as the external lookup name to // avoid ONNX protobuf changes. 
- AT_ASSERT(external_ref.value() == tensor_proto->name()); - AT_ASSERT(raw_data_export_map_.count(external_ref.value()) == 0); + TORCH_INTERNAL_ASSERT(external_ref.value() == tensor_proto->name()); + TORCH_INTERNAL_ASSERT( + raw_data_export_map_.count(external_ref.value()) == 0); raw_data_export_map_[external_ref.value()] = t; tensor_proto->set_raw_data("__EXTERNAL"); } else { - AT_ASSERT(t.is_contiguous()); + TORCH_INTERNAL_ASSERT(t.is_contiguous()); size_t tensorSize = static_cast(c10::multiply_integers( std::begin(tensor.sizes()), std::end(tensor.sizes()))); if (use_external_data_format && tensorSize > ParamSizeThresholdForExternalStorage) { - AT_ASSERT(!onnx_file_path.empty()); - AT_ASSERT(tensor_proto->has_name()); + TORCH_INTERNAL_ASSERT(!onnx_file_path.empty()); + TORCH_INTERNAL_ASSERT(tensor_proto->has_name()); auto tensorName = GetExternalFileName(tensor_proto->name()); CreateExternalFile(t, tensorName, onnx_file_path); onnx::StringStringEntryProto* location = From 28a8d2889874c61053eaa77ba95d2b9ccc3cae7b Mon Sep 17 00:00:00 2001 From: Shen Li Date: Tue, 25 Oct 2022 15:00:39 +0000 Subject: [PATCH 0132/1922] Add prepend argument to nn.Module hooks (#87370) cc @ezyang @gchanan Pull Request resolved: https://github.com/pytorch/pytorch/pull/87370 Approved by: https://github.com/soulitzer --- test/nn/test_module_hooks.py | 196 ++++++++++++++++++++++ torch/distributed/nn/api/remote_module.py | 12 +- torch/nn/modules/module.py | 68 +++++++- 3 files changed, 270 insertions(+), 6 deletions(-) create mode 100644 test/nn/test_module_hooks.py diff --git a/test/nn/test_module_hooks.py b/test/nn/test_module_hooks.py new file mode 100644 index 0000000000000..5fe984c2bd6a7 --- /dev/null +++ b/test/nn/test_module_hooks.py @@ -0,0 +1,196 @@ +# Owner(s): ["module: nn"] +from torch.testing._internal.common_utils import ( + TestCase, + run_tests, + skipIfTorchDynamo, +) + +import torch +import torch.nn as nn + +from functools import partial +from typing import List, Tuple + + 
+class Net(nn.Module): + def __init__(self) -> None: + super().__init__() + self.seq1 = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)]) + self.seq2 = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.seq2(self.seq1(x)) + + +class ToyModel(nn.Module): + def __init__(self) -> None: + super().__init__() + self.net1 = Net() + self.net2 = Net() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.net2(self.net1(x)) + + +def forward_hook( + self: TestCase, + fired_hooks: List[int], + expected_module: nn.Module, + hook_id: int, + module: nn.Module, + inp: Tuple[torch.Tensor], + out: torch.Tensor, +) -> None: + fired_hooks.append(hook_id) + self.assertEqual(id(module), id(expected_module)) + self.assertEqual(len(inp), 1) + + +def forward_pre_hook( + self: TestCase, + fired_hooks: List[int], + expected_module: nn.Module, + hook_id: int, + module: nn.Module, + inp: Tuple[torch.Tensor], +) -> None: + fired_hooks.append(hook_id) + self.assertEqual(id(module), id(expected_module)) + self.assertEqual(len(inp), 1) + + +def full_backward_hook( + self: TestCase, + fired_hooks: List[int], + expected_module: nn.Module, + hook_id: int, + module: nn.Module, + grad_input: Tuple[torch.Tensor], + grad_output: Tuple[torch.Tensor], +) -> None: + fired_hooks.append(hook_id) + self.assertEqual(id(module), id(expected_module)) + self.assertEqual(len(grad_input), 1) + self.assertEqual(len(grad_output), 1) + + +def full_backward_pre_hook( + self: TestCase, + fired_hooks: List[int], + expected_module: nn.Module, + hook_id: int, + module: nn.Module, + grad_input: Tuple[torch.Tensor], +) -> None: + fired_hooks.append(hook_id) + self.assertEqual(id(module), id(expected_module)) + self.assertEqual(len(grad_input), 1) + + +class TestModuleHooks(TestCase): + + @skipIfTorchDynamo("Dynamo does not yet capture hooks") + def test_forward_hooks(self): + fired_hooks: List[int] = [] + model = ToyModel() + x = 
torch.randn(10, 10) + hook = partial(forward_hook, self, fired_hooks, model.net1.seq2) + model.net1.seq2.register_forward_hook(partial(hook, 0)) + model.net1.seq2.register_forward_hook(partial(hook, 1), prepend=True) + model.net1.seq2.register_forward_hook(partial(hook, 2)) + model.net1.seq2.register_forward_hook(partial(hook, 3)) + model.net1.seq2.register_forward_hook(partial(hook, 4), prepend=True) + expected = [4, 1, 0, 2, 3] + + self.assertEqual(fired_hooks, []) + out = model(x) + self.assertEqual(fired_hooks, expected) + out.sum().backward() + self.assertEqual(fired_hooks, expected) + model(x).sum().backward() + self.assertEqual(fired_hooks, expected + expected) + + @skipIfTorchDynamo("Dynamo does not yet capture hooks") + def test_forward_pre_hooks(self): + fired_hooks: List[int] = [] + model = ToyModel() + x = torch.randn(10, 10) + hook = partial(forward_pre_hook, self, fired_hooks, model.net2.seq1) + model.net2.seq1.register_forward_pre_hook(partial(hook, 0), prepend=True) + model.net2.seq1.register_forward_pre_hook(partial(hook, 1)) + model.net2.seq1.register_forward_pre_hook(partial(hook, 2)) + model.net2.seq1.register_forward_pre_hook(partial(hook, 3)) + model.net2.seq1.register_forward_pre_hook(partial(hook, 4), prepend=True) + expected = [4, 0, 1, 2, 3] + + self.assertEqual(fired_hooks, []) + out = model(x) + self.assertEqual(fired_hooks, expected) + out.sum().backward() + self.assertEqual(fired_hooks, expected) + model(x).sum().backward() + self.assertEqual(fired_hooks, expected + expected) + + @skipIfTorchDynamo("Dynamo does not yet capture hooks") + def test_full_backward_hooks(self): + fired_hooks: List[int] = [] + model = ToyModel() + x = torch.randn(10, 10) + hook = partial(full_backward_hook, self, fired_hooks, model.net1) + model.net1.register_full_backward_hook(partial(hook, 0)) + model.net1.register_full_backward_hook(partial(hook, 1)) + model.net1.register_full_backward_hook(partial(hook, 2)) + 
model.net1.register_full_backward_hook(partial(hook, 3), prepend=True) + model.net1.register_full_backward_hook(partial(hook, 4), prepend=True) + expected = [4, 3, 0, 1, 2] + + self.assertEqual(fired_hooks, []) + out = model(x) + self.assertEqual(fired_hooks, []) + out.sum().backward() + self.assertEqual(fired_hooks, expected) + model(x).sum().backward() + self.assertEqual(fired_hooks, expected + expected) + + @skipIfTorchDynamo("Dynamo does not yet capture hooks") + def test_full_backward_pre_hooks(self): + fired_hooks: List[int] = [] + model = ToyModel() + x = torch.randn(10, 10) + hook = partial(full_backward_pre_hook, self, fired_hooks, model.net1) + model.net1.register_full_backward_pre_hook(partial(hook, 0), prepend=True) + model.net1.register_full_backward_pre_hook(partial(hook, 1), prepend=True) + model.net1.register_full_backward_pre_hook(partial(hook, 2)) + model.net1.register_full_backward_pre_hook(partial(hook, 3)) + model.net1.register_full_backward_pre_hook(partial(hook, 4)) + expected = [1, 0, 2, 3, 4] + + self.assertEqual(fired_hooks, []) + out = model(x) + self.assertEqual(fired_hooks, []) + out.sum().backward() + self.assertEqual(fired_hooks, expected) + model(x).sum().backward() + self.assertEqual(fired_hooks, expected + expected) + + @skipIfTorchDynamo("Dynamo does not yet capture hooks") + def test_mixed_hooks(self): + fired_hooks: List[int] = [] + model = ToyModel() + x = torch.randn(10, 10) + model.register_forward_pre_hook(partial(forward_pre_hook, self, fired_hooks, model, 0)) + model.register_forward_hook(partial(forward_hook, self, fired_hooks, model, 1)) + model.register_full_backward_pre_hook(partial(full_backward_pre_hook, self, fired_hooks, model, 2)) + model.register_full_backward_hook(partial(full_backward_hook, self, fired_hooks, model, 3)) + + self.assertEqual(fired_hooks, []) + out = model(x) + self.assertEqual(fired_hooks, [0, 1]) + out.sum().backward() + self.assertEqual(fired_hooks, [0, 1, 2, 3]) + model(x).sum().backward() + 
self.assertEqual(fired_hooks, [0, 1, 2, 3, 0, 1, 2, 3]) + + +if __name__ == "__main__": + run_tests() diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py index 4d1f2fe707769..b7e81ad9d3e64 100644 --- a/torch/distributed/nn/api/remote_module.py +++ b/torch/distributed/nn/api/remote_module.py @@ -361,10 +361,18 @@ def register_backward_hook( # type: ignore[return] ) -> RemovableHandle: _raise_not_supported(self.register_backward_hook.__name__) - def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle: # type: ignore[return] + def register_forward_pre_hook( # type: ignore[return] + self, + hook: Callable[..., None], + prepend: bool = False, + ) -> RemovableHandle: _raise_not_supported(self.register_forward_pre_hook.__name__) - def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle: # type: ignore[return] + def register_forward_hook( # type: ignore[return] + self, + hook: Callable[..., None], + prepend: bool = False, + ) -> RemovableHandle: _raise_not_supported(self.register_forward_hook.__name__) def state_dict(self, *args, **kwargs): diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 4f4a850f00859..1ce6cc0742ab8 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -1118,7 +1118,9 @@ def convert(t): return self._apply(convert) def register_full_backward_pre_hook( - self, hook: Callable[['Module', _grad_t], Union[None, _grad_t]] + self, + hook: Callable[["Module", _grad_t], Union[None, _grad_t]], + prepend: bool = False, ) -> RemovableHandle: r"""Registers a backward pre-hook on the module. @@ -1141,6 +1143,17 @@ def register_full_backward_pre_hook( Modifying inputs inplace is not allowed when using backward hooks and will raise an error. + Args: + hook (Callable): The user-defined hook to be registered. 
+ prepend (bool): If true, the provided ``hook`` will be fired before + all existing ``backward_pre`` hooks on this + :class:`torch.nn.modules.Module`. Otherwise, the provided + ``hook`` will be fired after all existing ``backward_pre`` hooks + on this :class:`torch.nn.modules.Module`. Note that global + ``backward_pre`` hooks registered with + :func:`register_module_full_backward_pre_hook` will fire before + all hooks registered by this method. + Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling @@ -1149,6 +1162,8 @@ def register_full_backward_pre_hook( """ handle = hooks.RemovableHandle(self._backward_pre_hooks) self._backward_pre_hooks[handle.id] = hook + if prepend: + self._backward_pre_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] return handle def register_backward_hook( @@ -1176,7 +1191,9 @@ def register_backward_hook( return handle def register_full_backward_hook( - self, hook: Callable[['Module', _grad_t, _grad_t], Union[None, _grad_t]] + self, + hook: Callable[["Module", _grad_t, _grad_t], Union[None, _grad_t]], + prepend: bool = False, ) -> RemovableHandle: r"""Registers a backward hook on the module. @@ -1202,6 +1219,17 @@ def register_full_backward_hook( Modifying inputs or outputs inplace is not allowed when using backward hooks and will raise an error. + Args: + hook (Callable): The user-defined hook to be registered. + prepend (bool): If true, the provided ``hook`` will be fired before + all existing ``backward`` hooks on this + :class:`torch.nn.modules.Module`. Otherwise, the provided + ``hook`` will be fired after all existing ``backward`` hooks on + this :class:`torch.nn.modules.Module`. Note that global + ``backward`` hooks registered with + :func:`register_module_full_backward_hook` will fire before + all hooks registered by this method. 
+ Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling @@ -1216,6 +1244,8 @@ def register_full_backward_hook( handle = hooks.RemovableHandle(self._backward_hooks) self._backward_hooks[handle.id] = hook + if prepend: + self._backward_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] return handle def _get_backward_hooks(self): @@ -1287,7 +1317,9 @@ def _maybe_warn_non_full_backward_hook(self, inputs, result, grad_fn): "some grad_input. Please use register_full_backward_hook to get the documented " "behavior.") - def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle: + def register_forward_pre_hook( + self, hook: Callable[..., None], prepend: bool = False + ) -> RemovableHandle: r"""Registers a forward pre-hook on the module. The hook will be called every time before :func:`forward` is invoked. @@ -1301,6 +1333,17 @@ def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandl single modified value in the hook. We will wrap the value into a tuple if a single value is returned(unless that value is already a tuple). + Args: + hook (Callable): The user defined hook to be registered. + prepend (bool): If true, the provided ``hook`` will be fired before + all existing ``forward_pre`` hooks on this + :class:`torch.nn.modules.Module`. Otherwise, the provided + ``hook`` will be fired after all existing ``forward_pre`` hooks + on this :class:`torch.nn.modules.Module`. Note that global + ``forward_pre`` hooks registered with + :func:`register_module_forward_pre_hook` will fire before all + hooks registered by this method. 
+ Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling @@ -1308,9 +1351,13 @@ def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandl """ handle = hooks.RemovableHandle(self._forward_pre_hooks) self._forward_pre_hooks[handle.id] = hook + if prepend: + self._forward_pre_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] return handle - def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle: + def register_forward_hook( + self, hook: Callable[..., None], prepend: bool = False + ) -> RemovableHandle: r"""Registers a forward hook on the module. The hook will be called every time after :func:`forward` has computed an output. @@ -1324,6 +1371,17 @@ def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle: it will not have effect on forward since this is called after :func:`forward` is called. + Args: + hook (Callable): The user defined hook to be registered. + prepend (bool): If true, the provided ``hook`` will be fired before + all existing ``forward`` hooks on this + :class:`torch.nn.modules.Module`. Otherwise, the provided + ``hook`` will be fired after all existing ``forward`` hooks on + this :class:`torch.nn.modules.Module`. Note that global + ``forward`` hooks registered with + :func:`register_module_forward_hook` will fire before all hooks + registered by this method. 
+ Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling @@ -1331,6 +1389,8 @@ def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle: """ handle = hooks.RemovableHandle(self._forward_hooks) self._forward_hooks[handle.id] = hook + if prepend: + self._forward_hooks.move_to_end(handle.id, last=False) # type: ignore[attr-defined] return handle def _slow_forward(self, *input, **kwargs): From eac8ccae771f532852e32d9905a3927c230d18cd Mon Sep 17 00:00:00 2001 From: min-jean-cho Date: Tue, 25 Oct 2022 19:24:35 +0000 Subject: [PATCH 0133/1922] build: support DNNL_GRAPH_CPU_RUNTIME=TBB (#87512) Force set cmake `DNNL_GRAPH_CPU_RUNTIME` as `MKLDNN_CPU_RUNTIME` to overwrite [`set(DNNL_GRAPH_CPU_RUNTIME "OMP")`](https://github.com/oneapi-src/oneDNN/blob/d19d0f795c60695bd32f894c6f01771b2dfbe24d/cmake/options.cmake#L65-L67), enabling user-specified `MKLDNN_CPU_RUNTIME` values (`OMP` (default), `TBB`) for `DNNL_GRAPH_CPU_RUNTIME`. 
Fixes https://github.com/pytorch/pytorch/issues/87511 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87512 Approved by: https://github.com/jgong5, https://github.com/ashokei, https://github.com/malfet --- cmake/Modules/FindMKLDNN.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/Modules/FindMKLDNN.cmake b/cmake/Modules/FindMKLDNN.cmake index e2f427be67c89..30ac5401ddf32 100644 --- a/cmake/Modules/FindMKLDNN.cmake +++ b/cmake/Modules/FindMKLDNN.cmake @@ -76,6 +76,8 @@ IF(NOT MKLDNN_FOUND) SET(DNNL_BUILD_EXAMPLES FALSE CACHE BOOL "" FORCE) SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "" FORCE) + SET(DNNL_GRAPH_CPU_RUNTIME ${MKLDNN_CPU_RUNTIME} CACHE STRING "" FORCE) + IF(BUILD_ONEDNN_GRAPH) SET(DNNL_GRAPH_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) ENDIF(BUILD_ONEDNN_GRAPH) From f8e4d1e54d221dd439836a462e3620bfaea07e70 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 25 Oct 2022 19:38:41 +0000 Subject: [PATCH 0134/1922] Use setup_instance script to enable conda and load cuda libraries (#87296) Fixes the broken torchbench CI after the machine image update. 
RUN_TORCHBENCH: nvfuser Pull Request resolved: https://github.com/pytorch/pytorch/pull/87296 Approved by: https://github.com/davidberard98 --- .github/workflows/run_torchbench.yml | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml index 2d1013abafc02..b6c870fa7839d 100644 --- a/.github/workflows/run_torchbench.yml +++ b/.github/workflows/run_torchbench.yml @@ -1,4 +1,4 @@ -name: TorchBench CI (pytorch-linux-py3.7-cu102) +name: TorchBench CI (pytorch-linux-py3.8-cu116) on: pull_request: @@ -6,6 +6,7 @@ env: PYTHON_VERSION: "3.8" # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19 NUMPY_VERSION: "1.21.2" + SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh" PR_NUM: ${{ github.event.number }} PR_BODY: ${{ github.event.pull_request.body }} PR_BASE_SHA: ${{ github.event.pull_request.base.sha }} @@ -35,18 +36,19 @@ jobs: - name: Create conda environment and install deps run: | conda create -y -n pr-ci python="${PYTHON_VERSION}" - # shellcheck disable=SC1091 - . "${HOME}"/anaconda3/etc/profile.d/conda.sh + # shellcheck source=/dev/null + . "${SETUP_SCRIPT}" conda activate pr-ci # pin cmake version to 3.22 since 3.23 breaks pytorch build # see details at: https://github.com/pytorch/pytorch/issues/74985 conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \ setuptools cmake=3.22 cffi typing_extensions boto3 \ future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm psutil + pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html - name: Setup TorchBench branch run: | - # shellcheck disable=SC1091 - . "${HOME}"/anaconda3/etc/profile.d/conda.sh + # shellcheck source=/dev/null + . 
"${SETUP_SCRIPT}" conda activate pr-ci PR_BODY_FILE=/tmp/pr-body.txt echo "$PR_BODY" > ${PR_BODY_FILE} @@ -69,8 +71,8 @@ jobs: popd PR_BODY_FILE=/tmp/pr-body.txt echo "$PR_BODY" > ${PR_BODY_FILE} - # shellcheck disable=SC1091 - . "${HOME}"/anaconda3/etc/profile.d/conda.sh + # shellcheck source=/dev/null + . "${SETUP_SCRIPT}" conda activate pr-ci python3 pytorch/.github/scripts/run_torchbench.py \ --pr-body "$PR_BODY_FILE" \ @@ -82,7 +84,8 @@ jobs: --pr-head-sha "$PR_HEAD_SHA" - name: Upload result to S3 run: | - . "${HOME}"/anaconda3/etc/profile.d/conda.sh + # shellcheck source=/dev/null + . "${SETUP_SCRIPT}" conda activate pr-ci python3 pytorch/.github/scripts/run_torchbench.py \ upload-s3 \ From 7301114839d54f112e797eb6c8082b73a348da24 Mon Sep 17 00:00:00 2001 From: AllenTiTaiWang Date: Tue, 25 Oct 2022 16:31:45 +0000 Subject: [PATCH 0135/1922] [ONNX] Fix pad Circular Mode (#86984) In https://github.com/pytorch/pytorch/pull/73433, a ONNX test case is missed, and the result is incorrect when it is converted to ONNX. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86984 Approved by: https://github.com/BowenBao --- test/onnx/test_pytorch_onnx_onnxruntime.py | 21 +++++++++++++++++ torch/onnx/symbolic_opset11.py | 8 ++++++- torch/onnx/symbolic_opset9.py | 27 ++++++++++++++-------- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index bc70011b78871..4577dafdad56c 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -7437,6 +7437,27 @@ def forward(self, x, pad: List[int]): x = torch.randn(2, 2, 4, 4) self.run_test(Pad(), (x, pad)) + @skipIfUnsupportedMinOpsetVersion(11) + def test_pad_circular(self): + class PadModel(torch.nn.Module): + def forward(self, x): + out = torch.nn.functional.pad(x, (1, 2, 1, 2), mode="circular") + return out + + x = torch.randn(2, 3, 3, 4) + self.run_test(PadModel(), (x)) + + @skipIfUnsupportedMinOpsetVersion(11) + def test_pad_circular_negative(self): + # Test for different pad integer types + class PadModel(torch.nn.Module): + def forward(self, x): + out = torch.nn.functional.pad(x, (-1, -2), mode="circular") + return out + + x = torch.randn(2, 3, 6) + self.run_test(PadModel(), (x)) + @skipIfUnsupportedMaxOpsetVersion(10) @skipScriptTest() # TODO: the logic in symbolic_opset9 doesn't handle script def test_unsupported_pad(self): diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 2d3993c417014..c845d6dcc2e4d 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -821,7 +821,13 @@ def replication_pad(g: jit_utils.GraphContext, input, padding): @_onnx_symbolic("aten::pad") @_beartype.beartype -def pad(g: jit_utils.GraphContext, input, pad, mode, value): +def pad( + g: jit_utils.GraphContext, + input: _C.Value, + pad: _C.Value, + mode: _C.Value, + value: _C.Value, +): mode = symbolic_helper._parse_arg(mode, "s") if mode == 
"replicate": return replication_pad(g, input, pad) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 0daabb5e333d9..c071438169da3 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -4,6 +4,7 @@ release on 01/23/19 """ +import builtins import functools import math import sys @@ -1851,32 +1852,34 @@ def constant_pad_nd(g: jit_utils.GraphContext, input, padding, value): ) -@_onnx_symbolic("aten::_pad_circular") @_beartype.beartype -def _pad_circular(g: jit_utils.GraphContext, input, pad): +def _pad_circular(g: jit_utils.GraphContext, input: _C.Value, pad: _C.Value): padding = _convert_padding_node(pad) assert len(padding) % 2 == 0 ndim = len(padding) // 2 cur = input for idx in range(ndim): - pad_l = padding[-(2 * idx + 1)] - pad_r = padding[-(2 * idx + 2)] - + pad_r = padding[-(2 * idx + 1)] + pad_l = padding[-(2 * idx + 2)] + # get size for targeting the last idx, as Slice don't take start=[-1], end=[-1] + size = symbolic_helper._get_tensor_sizes(input) tensors = [] if pad_l > 0: left = symbolic_helper._slice_helper( - g, cur, axes=[2 + idx], starts=[-(pad_l + 1)], ends=[-1] + g, cur, axes=[2 + idx], starts=[-(pad_l)], ends=[size[2 + idx]] ) tensors.append(left) if pad_l < 0 or pad_r < 0: + start = builtins.max(0, -pad_l) + end = -(builtins.max(0, -pad_r)) middle = symbolic_helper._slice_helper( g, cur, axes=[2 + idx], - starts=[max(0, -pad_l)], - ends=[-(1 + max(0, -pad_r))], + starts=[start], + ends=[end], ) tensors.append(middle) else: @@ -1921,7 +1924,13 @@ def replication_pad(g: jit_utils.GraphContext, input, padding): @_onnx_symbolic("aten::pad") @_beartype.beartype -def pad(g: jit_utils.GraphContext, input, pad, mode, value): +def pad( + g: jit_utils.GraphContext, + input: _C.Value, + pad: _C.Value, + mode: _C.Value, + value: _C.Value, +): mode = symbolic_helper._parse_arg(mode, "s") if mode == "replicate": return replication_pad(g, input, pad) From 2caad3ecbfc831826241eadfdd8c9b0b97e7eb4e Mon Sep 
17 00:00:00 2001 From: "S.Cao-office" Date: Tue, 25 Oct 2022 19:51:42 +0000 Subject: [PATCH 0136/1922] Fixed minor typos in torch.flip and torch.rot90 (#87724) Fixes #87721 @malfet Pull Request resolved: https://github.com/pytorch/pytorch/pull/87724 Approved by: https://github.com/malfet --- torch/_torch_docs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index d84ed259b6d38..c7b8d796a497d 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -11155,7 +11155,7 @@ def merge_dicts(*dicts): r""" flip(input, dims) -> Tensor -Reverse the order of a n-D tensor along given axis in dims. +Reverse the order of an n-D tensor along given axis in dims. .. note:: `torch.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`, @@ -11312,7 +11312,7 @@ def merge_dicts(*dicts): r""" rot90(input, k=1, dims=[0,1]) -> Tensor -Rotate a n-D tensor by 90 degrees in the plane specified by dims axis. +Rotate an n-D tensor by 90 degrees in the plane specified by dims axis. Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0. Args: From f5004129a2672c8a617cc2aa7619d0376108c244 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Tue, 25 Oct 2022 19:58:23 +0000 Subject: [PATCH 0137/1922] [inductor] Revert channels-last support (#87588) We witnessed slow compilation times last week. Earlier, I thought it was due to parallel compilation. But, after git bisect, I found the source of the extra time to be my PR - https://github.com/pytorch/pytorch/pull/87049 For 1x1 kernels, the current striding check incorrectly classifies channels-first 1x1 convs as channels-last. I am not sure why this caused such a large jump in compilation time, or why it did not fail. There was no change in performance speedup.
cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu to identify what could be the source of this compilation time increase, so that we can manually check that part of the stack. With this change, `res2next50` compilation time went back to 96 seconds (it had risen to 900 seconds with my earlier PR) for a single thread. And parallel-compilation brings it down to ~30 seconds. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87588 Approved by: https://github.com/soumith, https://github.com/jansel, https://github.com/ngimel --- benchmarks/dynamo/common.py | 17 +++++++++++------ test/inductor/test_torchinductor.py | 1 + torch/_inductor/ir.py | 14 +++++++++++--- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index b1f8bbd993f3b..86e6bb62842f6 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -98,6 +98,8 @@ def set_model_name(name): "cait_m36_384", # Accuracy "ghostnet_100", # Accuracy "swin_base_patch4_window7_224", # Accuracy + # Trying to get CI working - https://github.com/pytorch/pytorch/pull/87588 + "visformer_small", # fails accuracy on CI but passes locally ] CI_SKIP_INDUCTOR_TRAINING = [ @@ -809,12 +811,15 @@ def setup_amp(self): self.autocast = torch.cuda.amp.autocast def init_optimizer(self, device, params): - param_list = list(params) - if device == "cuda" and len(param_list) != 0: - # capturable is only supported on cuda at the moment - self.optimizer = torch.optim.Adam(param_list, capturable=True) - else: - self.optimizer = None + self.optimizer = None + # TODO - Currently, optimizers are used incorrectly.
Fix optimizers with + # https://github.com/pytorch/pytorch/pull/87492 + # param_list = list(params) + # if device == "cuda" and len(param_list) != 0: + # # capturable is only supported on cuda at the moment + # self.optimizer = torch.optim.Adam(param_list, capturable=True) + # else: + # self.optimizer = None @property def args(self): diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index bec1ea197c078..c0139b3fcdf86 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -1475,6 +1475,7 @@ def fn(x, w, b): check_lowp=False, ) + @unittest.skipIf(HAS_CUDA, "only support cpu channels_last") def test_conv2d_channels_last(self): m = torch.nn.Sequential( torch.nn.Conv2d(3, 3, 1, 1), diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 889e30bb54449..867e26e56c5ef 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -3137,9 +3137,17 @@ def create( ): valid_cudnn = True - valid_device = x.get_device().type == "cpu" or ( - x.get_device().type == "cuda" and valid_cudnn - ) + # TODO - We cannot use strides to identify if a tensor is + # channels-last for 1x1 kernels. Incorrectly identifying the + # channels last configuration leads to a dramatic increase in + # compilation time. Unfortunately, this breaks the channels last + # support. + # valid_device = x.get_device().type == "cpu" or ( + # x.get_device().type == "cuda" and valid_cudnn + # ) + + valid_device = x.get_device().type == "cpu" + if ( valid_device and len(x.get_size()) == 4 From a6ab0090596f7e05d9f57e898a87d3849ee4fc65 Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Tue, 25 Oct 2022 04:46:42 +0000 Subject: [PATCH 0138/1922] Fix _refs for aten.zeros/ones/empty/randn (#87569) The refs for aten.zeros/ones/empty/randn don't support the .names overload.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87569 Approved by: https://github.com/ngimel --- torch/_meta_registrations.py | 2 -- torch/_refs/__init__.py | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index cb961ff898790..873f942da42ab 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1658,8 +1658,6 @@ def activate_meta(): "aten::clone", # causing infinite recursion "aten::_to_copy", # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite # noqa: B950 "aten::randn", # pin_memory parameter is not supported!, test_proxy_tensor.py -k test_make_fx_symbolic_exhaustive_randn_cpu_float32 # noqa: B950 - "aten::zeros.names", # TypeError: zeros() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu # noqa: B950 - "aten::empty.names", # TypeError: empty() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu # noqa: B950 "aten::add.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars # noqa: B950 "aten::sub.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars # noqa: B950 "aten::mul.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! 
inductor/test_torchinductor.py -k test_both_scalars # noqa: B950 diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index ccb44c6367a50..2e91ceeeb679d 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -3729,7 +3729,7 @@ def ravel(a: TensorLikeType) -> TensorLikeType: return reshape(a, (-1,)) -@register_decomposition(torch.ops.aten.empty) +@register_decomposition(torch.ops.aten.empty.memory_format) @out_wrapper() def empty( *shape, @@ -3822,7 +3822,7 @@ def new_empty_strided( ) -@register_decomposition(torch.ops.aten.zeros) +@register_decomposition(torch.ops.aten.zeros.default) @out_wrapper() def zeros( *size, @@ -3874,7 +3874,7 @@ def new_zeros( ) -@register_decomposition(torch.ops.aten.ones) +@register_decomposition(torch.ops.aten.ones.default) @out_wrapper() def ones( *size, @@ -4409,7 +4409,7 @@ def full_like( ones_like = partial(full_like, fill_value=True) # TODO: add pin_memory support -@register_decomposition(torch.ops.aten.randn) +@register_decomposition(torch.ops.aten.randn.default) @out_wrapper() def randn( *shape, From 0df79889c50bfaa60eb71b4a4a04353855333da3 Mon Sep 17 00:00:00 2001 From: HDCharles Date: Tue, 25 Oct 2022 09:58:57 -0700 Subject: [PATCH 0139/1922] [ao] Adding FAQ to docs (#87322) Summary: migrated from: https://discuss.pytorch.org/t/quantization-frequently-asked-questions/161251 Test Plan: circle CI tests Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/87322 Approved by: https://github.com/z-a-f --- docs/source/quantization.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index e00720570a1a3..55fa6b0c604d2 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -998,6 +998,25 @@ if ``dtype`` is ``torch.qint8``, make sure to set a custom ``quant_min`` to be ` you call the `torch.ao.quantization.get_default_qconfig(backend)` or 
`torch.ao.quantization.get_default_qat_qconfig(backend)` function to get the default ``qconfig`` for ``fbgemm`` or ``qnnpack`` backend +Frequently Asked Questions +-------------------------- + +1. How can I do quantized inference on GPU?: + + We don't have official GPU support yet, but this is an area of active development, you can find more information + `here `_ + +2. Where can I get ONNX support for my quantized model?: + + You can open an issue in `GitHub - onnx/onnx `_ when you encounter problems with ONNX, + or reach out to people in this list: `PyTorch Governance | Maintainers | ONNX exporter `_ + +3. How can I use quantization with LSTM's?: + + LSTM is supported through our custom module api in both eager mode and fx graph mode quantization. Examples can be found at + Eager Mode: `pytorch/test_quantized_op.py TestQuantizedOps.test_custom_module_lstm `_ + FX Graph Mode: `pytorch/test_quantize_fx.py TestQuantizeFx.test_static_lstm `_ + Common Errors --------------------------------------- From 9cd7b7cec2afa53370474d74eb8fcb5e66b849b6 Mon Sep 17 00:00:00 2001 From: Michael Voznesensky Date: Tue, 25 Oct 2022 21:15:40 +0000 Subject: [PATCH 0140/1922] [Dynamo] Symbolic shape guards (#87570) **Introduces symbolic shape guards into dynamo.** In this PR, we take the existing fake tensor infra and plumbing in dynamo and we start passing a shape_env around. This shape_env does not get plumbed down to middle layers / backend yet - it only collects expressions from frontend invocations at the moment. We then translate these expressions into guards at the point where we take other guards installed throughout dynamo - and add them to check_fn. 
Part 1 of https://docs.google.com/document/d/1QJ-M4zfMkD-fjHIqW089RptjLl9EgozZGCceUbvmgfY/edit# cc @jansel @lezcano @fdrocha @mlazos @soumith @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87570 Approved by: https://github.com/ezyang --- test/dynamo/test_dynamic_shapes.py | 153 +++++++++++++++++++++++++- test/dynamo/test_functions.py | 1 + test/dynamo/test_repros.py | 5 +- test/functorch/test_aotdispatch.py | 1 + test/test_proxy_tensor.py | 5 + torch/_dynamo/convert_frame.py | 2 +- torch/_dynamo/guards.py | 167 ++++++++++++++++++++++++++++- torch/_dynamo/output_graph.py | 5 + torch/_dynamo/symbolic_convert.py | 3 +- torch/_dynamo/utils.py | 46 +++++++- torch/_dynamo/variables/builtin.py | 18 +++- torch/_dynamo/variables/tensor.py | 22 ++-- torch/_dynamo/variables/torch.py | 18 ++++ torch/_subclasses/fake_tensor.py | 2 +- torch/_subclasses/meta_utils.py | 2 +- 15 files changed, 427 insertions(+), 23 deletions(-) diff --git a/test/dynamo/test_dynamic_shapes.py b/test/dynamo/test_dynamic_shapes.py index a2a94fce1e559..a32825d03aeaa 100644 --- a/test/dynamo/test_dynamic_shapes.py +++ b/test/dynamo/test_dynamic_shapes.py @@ -3,14 +3,26 @@ from torch._dynamo.testing import make_test_cls_with_patches try: - from . import test_functions, test_misc, test_modules, test_repros, test_unspec + from . 
import ( + test_export, + test_functions, + test_misc, + test_modules, + test_repros, + test_subgraphs, + test_unspec, + ) except ImportError: + import test_export import test_functions import test_misc import test_modules import test_repros + import test_subgraphs import test_unspec +import unittest + def make_dynamic_cls(cls): return make_test_cls_with_patches( @@ -23,6 +35,145 @@ def make_dynamic_cls(cls): DynamicShapesReproTests = make_dynamic_cls(test_repros.ReproTests) DynamicShapesNNModuleTests = make_dynamic_cls(test_modules.NNModuleTests) DynamicShapesUnspecTests = make_dynamic_cls(test_unspec.UnspecTests) +DynamicShapesExportTests = make_dynamic_cls(test_export.ExportTests) +DynamicShapesSubGraphTests = make_dynamic_cls(test_subgraphs.SubGraphTests) + + +# DynamicShapesFunctionTests +unittest.expectedFailure( + DynamicShapesFunctionTests.test_len_tensor_dynamic_shapes + # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer +) + +unittest.expectedFailure( + DynamicShapesFunctionTests.test_tensor_len_dynamic_shapes + # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer +) + + +# DynamicShapesReproTests +unittest.expectedFailure( + DynamicShapesReproTests.test_reformer_eval_dynamic_shapes + # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_reformer_train_dynamic_shapes + # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_issue175_dynamic_shapes + # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_do_paste_mask_dynamic_shapes + # aten.min.dim - couldn't find symbolic meta function/decomposition +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_convert_boxes_to_pooler_format_dynamic_shapes + # Could not infer dtype of 
torch._C.SymIntNode +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_ellipsis_dynamic_shapes + # Cannot call sizes() on tensor with symbolic sizes/strides +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_hf_t5_forward_dynamic_shapes + # Cannot call sizes() on tensor with symbolic sizes/strides +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_reformer_sorting_dynamic_shapes + # Unable to cast Python instance to C++ type +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_boxes_len_dynamic_shapes + # Unable to cast Python instance to C++ type +) + +unittest.expectedFailure( + DynamicShapesReproTests.test_guard_fail_tensor_bool_dynamic_shapes + # RuntimeError: aten.allclose.default - couldn't find symbolic meta function/decomposition +) + +# DynamicShapesMiscTests +unittest.expectedFailure( + DynamicShapesMiscTests.test_unsupported_fake_tensor_dynamic_shapes + # aten.quantize_per_tensor.default - couldn't find symbolic meta function/decomposition +) +unittest.expectedFailure( + DynamicShapesMiscTests.test_module_deepcopy_dynamic_shapes + # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition +) + +# DynamicShapesUnspecTests +unittest.expectedFailure( + DynamicShapesUnspecTests.test_unspec_float_precision_dynamic_shapes + # float() argument must be a string or a real number, not 'torch._C.SymIntNode' +) + + +# DynamicShapesNNModuleTests +unittest.expectedFailure( + DynamicShapesNNModuleTests.test_unsupportedmethod_dynamic_shapes + # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition +) + +unittest.expectedFailure( + DynamicShapesNNModuleTests.test_unsupportedmodule_dynamic_shapes + # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition +) + +unittest.expectedFailure( + DynamicShapesNNModuleTests.test_self_mutating1_dynamic_shapes + # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition +) + +unittest.expectedFailure( + 
DynamicShapesNNModuleTests.test_call_fn_with_non_const_inputs_safe_dynamic_shapes + # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition +) + + +# DynamicShapesExportTests +unittest.expectedFailure( + DynamicShapesExportTests.test_export_compare_optimize_with_make_fx_dynamic_shapes +) +unittest.expectedFailure( + DynamicShapesExportTests.test_export_with_constant_list_nonzero_dynamic_shapes +) +unittest.expectedFailure( + DynamicShapesExportTests.test_export_with_constant_list_nonzero_free_function_dynamic_shapes +) +unittest.expectedFailure( + DynamicShapesExportTests.test_export_with_constant_tuple_nonzero_dynamic_shapes +) +unittest.expectedFailure( + DynamicShapesExportTests.test_export_with_stack_trace_dynamic_shapes +) +unittest.expectedFailure( + DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_dynamic_shapes +) +unittest.expectedFailure( + DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_permute_dupe_and_bypass_dynamic_shapes +) +unittest.expectedFailure( + DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_permute_dynamic_shapes +) + + +# DynamicShapesSubGraphTests +unittest.expectedFailure( + DynamicShapesSubGraphTests.test_enumerate_not_break_graph_dynamic_shapes +) +unittest.expectedFailure(DynamicShapesSubGraphTests.test_restore_state_dynamic_shapes) + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index d18ef7e1173fe..d428a4369fc1e 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -6,6 +6,7 @@ import itertools import operator from typing import Any +from unittest.mock import patch import torch diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index bbb8ba527fc73..66fc19895dd62 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -872,8 +872,9 @@ def test_longformer_chunk(self): self.assertTrue(same(opt_fn(input1), correct1)) 
self.assertTrue(same(opt_fn(input2), correct2)) - self.assertEqual(cnt.frame_count, ifdyn(1, 2)) - self.assertEqual(cnt.op_count, ifdyn(19, 4)) + # Dyn recompiles are due to changes in hidden_state (Should we be guarding on this?) + self.assertEqual(cnt.frame_count, ifdyn(4, 2)) + self.assertEqual(cnt.op_count, ifdyn(76, 4)) def test_hf_t5_forward(self): input = torch.randn([1, 2048, 512]) diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index 57013636eeabf..d406f2eb53047 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -1174,6 +1174,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _): xfail('nn.functional.rrelu', ''), # aten.rrelu_with_noise.default - couldn't find symbolic meta function... xfail('nn.functional.smooth_l1_loss', ''), # could not find kernel xfail('nn.functional.unfold', ''), # Cannot call sizes() on tensor with symbolic sizes/strides + xfail('unfold', ''), # aten.squeeze_copy.dim - couldn't find symbolic meta function/decomposition xfail('nn.functional.upsample_bilinear', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('nn.functional.upsample_nearest', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('norm', ''), # Cannot call sizes() on tensor with symbolic sizes/strides diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index 3c2e818497a48..1e72d5a4bc277 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -1288,6 +1288,7 @@ def f(a, b, c, d, e): xfail('nn.functional.unfold', ''), # aten.im2col.default - couldn't find symbolic meta function/decomposition xfail('nn.functional.upsample_bilinear', ''), # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function/de... xfail('nn.functional.upsample_nearest', ''), # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco... 
+ xfail('nonzero', ''), # aten.nonzero.default - couldn't find symbolic meta function/decomposition xfail('norm', 'nuc'), # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition xfail('normal', ''), # aten.normal.Tensor_Tensor - couldn't find symbolic meta function/decomposition xfail('normal', 'number_mean'), # aten.normal.float_Tensor - couldn't find symbolic meta function/decomposition @@ -1305,6 +1306,7 @@ def f(a, b, c, d, e): xfail('qr', ''), # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition xfail('rad2deg', ''), # aten.rad2deg.default - couldn't find symbolic meta function/decomposition xfail('renorm', ''), # aten.renorm.default - couldn't find symbolic meta function/decomposition + xfail('repeat_interleave', ''), # Cannot call sizes() on tensor with symbolic sizes/strides xfail('reshape_as', ''), # aten.size.default - couldn't find symbolic meta function/decomposition xfail('resize_', ''), # aten.clone.default - couldn't find symbolic meta function/decomposition xfail('resize_as_', ''), # aten.clone.default - couldn't find symbolic meta function/decomposition @@ -1354,6 +1356,8 @@ def f(a, b, c, d, e): xfail('view_as', ''), # aten.size.default - couldn't find symbolic meta function/decomposition xfail('vsplit', ''), # aten.size.default - couldn't find symbolic meta function/decomposition xfail('unbind', ''), # aten.unbind.int - couldn't find symbolic meta function/decomposition + xfail('unique_consecutive', ''), # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition + xfail('unique', ''), # aten._unique2.default - couldn't find symbolic meta function/decomposition } symbolic_tensor_segfaults = { skip('nn.functional.batch_norm') # Segfault?? 
@@ -1454,6 +1458,7 @@ def f(a, b, c, d, e): xfail('true_divide', ''), # aten.div_.Tensor - couldn't find symbolic meta function/decomposition xfail('trunc', ''), # aten.trunc_.default - couldn't find symbolic meta function/decomposition xfail('uniform', ''), # aten.uniform_.default - couldn't find symbolic meta function/decomposition + xfail('unique', ''), # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition xfail('unsqueeze', ''), # aten.unsqueeze_.default - couldn't find symbolic meta function/decomposition xfail('xlogy', ''), # aten.xlogy_.Tensor - couldn't find symbolic meta function/decomposition } diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 46a23b330a0a4..206cffb7aeeda 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -417,7 +417,7 @@ def transform(instructions, code_options): assert output.guards is not None CleanupManager.instance[out_code] = output.cleanups - check_fn = CheckFunctionManager(output.guards, locals, globals) + check_fn = CheckFunctionManager(output, output.guards, locals, globals) guarded_code = GuardedCode(out_code, check_fn.check_fn) guard_str = "GUARDS:\n" diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 1f43ac667e579..9edd6f60560df 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -12,7 +12,10 @@ import numpy as np +import sympy + import torch +from torch.fx.experimental.symbolic_shapes import FloorDiv from . 
import config, convert_frame, mutation_guard from .eval_frame import set_guard_error_hook, set_guard_fail_hook @@ -176,6 +179,7 @@ def __init__( # Code is python expression strings generated for each guard self.code: List[str] = [] self.tensor_check_names = [] + self.tensor_check_ids = {} self.tensor_check_examples = [] self.guarded_code = guarded_code @@ -414,9 +418,13 @@ def TENSOR_MATCH(self, guard: Guard): self.ID_MATCH(guard) else: value = self.get(guard.name) - self.tensor_check_names.append(self.arg_ref(guard)) + tensor_name = self.arg_ref(guard) + self.tensor_check_names.append(tensor_name) self.tensor_check_examples.append(value) + # STOP - DO NOT USE id_ref FOR TENSORS - TENSOR INVALIDATION RULES DIFFER + self.tensor_check_ids[tensor_name] = id(value) + # Note: Guard code produced for tensor_match is a little different. # We accumulate tensor names, then do a single install of `___check_tensors`. # See _guards.cpp and TensorGuard for more information. @@ -469,6 +477,62 @@ class GuardedCode: check_fn: Callable +from sympy.printing.str import StrPrinter + + +@dataclasses.dataclass +class TensorReference(object): + """ + TensorReference objects are entirely optional. They are created to give us hints + into where the symbolic shape came from. 
+ + ref_id: The id of the tensor + kind: A string tracking where in the tensor this value came from ("size","stride", etc) + idx: An index in the structure + + NOTE - A symbolic shape coming from tensor at id 12345's shape dim 2, would be + TensorReference(ref_id=12345, kind="size", idx=2) + """ + + ref_id: Optional[int] = None + kind: Optional[str] = None + idx: Optional[int] = None + # Note - this is untyped because of TypeError: '_SpecialForm' object does not support item assignment + # But it is a Optional[Union["sympy.Expr", int]] + expr: Optional[object] = None # Populated after association + + def __hash__(self): + return hash((self.ref_id, self.kind, self.idx)) + + +class DynamoGuardPrinter(StrPrinter): + @staticmethod + def tensor_ref_as_str(tensor_ref, id_to_name_map): + if tensor_ref.kind in ("size", "stride"): + return f"{id_to_name_map[tensor_ref.ref_id]}.{tensor_ref.kind}()[{tensor_ref.idx}]" + return f"{id_to_name_map[tensor_ref.ref_id]}.{tensor_ref.kind}()" + + def __init__(self, expr_to_tensor_ref, id_to_name_map): + super().__init__() + self.expr_to_tensor_ref = expr_to_tensor_ref + self.id_to_name_map = id_to_name_map + + def _print_Symbol(self, expr) -> str: + assert isinstance(expr, sympy.core.symbol.Symbol) + if expr == 0: + return "0" + if expr == 1: + return "1" + assert expr in self.expr_to_tensor_ref, f"Unknown expression {expr}" + refs = self.expr_to_tensor_ref[expr] + if len(refs) == 0: + return super()._print_Symbol(expr) + tensor_ref = next( + iter(refs) + ) # Any is fine here, because we install equality guards later + return DynamoGuardPrinter.tensor_ref_as_str(tensor_ref, self.id_to_name_map) + + # NB: Naively, you'd expect this to only be a function that produces # the callable that consistutes the guard. 
However, there is some # delicate handling for invalidating this check function when the @@ -482,6 +546,7 @@ class GuardedCode: class CheckFunctionManager: def __init__( self, + output_graph=None, guards: Optional[Set[Guard]] = None, f_locals: Optional[Dict] = None, f_globals: Optional[Dict] = None, @@ -489,6 +554,7 @@ def __init__( self.valid = True self._weakrefs = [] self._seen_ids = set() + self.output_graph = output_graph # Note: right overrides left def combine_scopes(left, right): @@ -511,6 +577,82 @@ def combine_scopes(left, right): self.check_fn = self.compile_check_fn(local_builder, global_builder) self._seen_ids.clear() + """ + This is a complex bit of logic. The outline here is brief. For a line by line breakdown, see + the code comments below. + + The role of this function is to take the current state of symbolic shape guards, tensor ids in the + CURRENT dynamo frame, and tensor names (dynamo's frame agnostic tensor reference mechanism, see TensorCheck and + guards.cpp for more info) - and produce executable python expressions for addition to our guarded code components + that make their way into check_fn. + + We DO NOT create guards based on ids. The IDs act as a lookup for the following mapping: + + dynamo: tensor_name <> tensor_id + shape_env: tensor_id <> shape_expr + + This allows us to then create a tensor_name <> shape_expr association for the current frames guards. + """ + + def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids): + # Pre join output + finished_expressions = [] + + # A mapping of tensor_ids to tensor names + id_to_name_map = {} + + # We should not have a shape env, or guards if we are not in config.dynamic shapes + # But check it anyway. + if not config.dynamic_shapes: + return None + + expr_to_tensor_ref = {} + guard_printer = DynamoGuardPrinter(expr_to_tensor_ref, id_to_name_map) + + # tensor_check_names is the primary tensor association mechanism in dynamo. 
+ # All other guards installations are driven off of it, so these ones will too. + for name in tensor_check_names: + tensor_id = tensor_check_ids[name] + id_to_name_map[tensor_id] = name + + if tensor_id in self.output_graph.tensor_id_to_sym_shape_ref: + # If we made it here, this tensor_id is relevant to dynamo guard installation + # AND was found in the shape_env + tensor_ref_set = self.output_graph.tensor_id_to_sym_shape_ref[tensor_id] + for tensor_ref in tensor_ref_set: + obj_expr = tensor_ref.expr + if obj_expr not in expr_to_tensor_ref: + expr_to_tensor_ref[obj_expr] = {} + expr_to_tensor_ref[obj_expr][tensor_ref] = "" + finished_expressions.append(f"isinstance({name}, torch.Tensor)") + + guard_expression = self.output_graph.shape_env.get_guard_expr() + expr_as_str = guard_printer.doprint(guard_expression) + # We may get into a state where symbolic shape keys (all should be found in replacements) + # Have not been removed from the expression. This is a serious enough error state that we need to assert. + for key in self.output_graph.shape_env.var_to_val.keys(): + assert str(key) not in expr_as_str, f"Unknown shape symbol {key}. " + finished_expressions.append(expr_as_str) + + for expr in expr_to_tensor_ref.keys(): + tensor_refs = expr_to_tensor_ref[expr].keys() + equality_candidates = [ + DynamoGuardPrinter.tensor_ref_as_str(x, id_to_name_map) + for x in tensor_refs + ] + + if len(equality_candidates) > 1: + equality_expr = " == ".join(equality_candidates) + # breakpoint() + finished_expressions.append(equality_expr) + + # Redundant with code_parts, but allows us to wrap it with parens nicely. 
+ if len(finished_expressions) == 0: + return None + + expression = " and ".join(finished_expressions) + return f"({expression})" + def compile_check_fn(self, local_builder, global_builder): assert not (set(local_builder.argnames) & set(global_builder.argnames)) # see parallel handling of ".0" / "___implicit0" in _eval_frame.c @@ -530,9 +672,20 @@ def compile_check_fn(self, local_builder, global_builder): tensor_check_names = ( local_builder.tensor_check_names + global_builder.tensor_check_names ) + + tensor_check_ids = local_builder.tensor_check_ids.copy() + tensor_check_ids.update(global_builder.tensor_check_ids) + check_tensors_fn = None check_tensors_verbose_fn = None if tensor_check_names: + symbolic_shape_expression = self._parse_symbolic_shape_expressions( + tensor_check_names, tensor_check_ids + ) + if symbolic_shape_expression: + code_parts.append(symbolic_shape_expression) + verbose_code_parts.append(symbolic_shape_expression) + tensor_check_examples = ( local_builder.tensor_check_examples + global_builder.tensor_check_examples @@ -548,14 +701,23 @@ def compile_check_fn(self, local_builder, global_builder): ) verbose_code_parts.append(f"___check_tensors_verbose({verbose_args})") - code = " and ".join(unique(code_parts)) + def direct_equality(a, b): + return a == b + def direct_negation(a, b): + return not direct_equality(a, b) + + code = " and ".join(unique(code_parts)) closure_vars = collections.OrderedDict( [ ("___guarded_code", self), ("___check_tensors", check_tensors_fn), ("___check_tensors_verbose", check_tensors_verbose_fn), ("tensor_check_names", tensor_check_names), + ("Eq", direct_equality), + ("Ne", direct_negation), + ("Mod", sympy.Mod), + ("FloorDiv", FloorDiv), ] ) closure_vars.update(CLOSURE_VARS) @@ -567,6 +729,7 @@ def ___make_guard_fn({','.join(closure_vars.keys())}): print("GUARDS", code) set_guard_fail_hook(guard_fail_hook) out = dict() + # print("RUNNING PY CODE", py_code) exec(py_code, global_builder.scope, out) guard_fn = 
out["___make_guard_fn"](*closure_vars.values()) guard_fn.closure_vars = closure_vars diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index f87b07996d73b..c23d4f6dd9934 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -10,6 +10,7 @@ import torch.nn from torch import fx +from torch.fx.experimental.symbolic_shapes import ShapeEnv from . import config, logging as torchdynamo_logging, variables from .bytecode_transformation import create_instruction, Instruction, unique_id @@ -104,6 +105,8 @@ def __init__( self.random_values_var = None self.initial_random_state = () self.unspec_variable_map = {} + self.shape_env = ShapeEnv() if config.dynamic_shapes else None + self.tensor_id_to_sym_shape_ref = {} @property def output(self): @@ -394,8 +397,10 @@ def compile_and_call_fx_graph(self, tx, rv, root): gm.recompile() gm.compile_subgraph_reason = self.compile_subgraph_reason name = unique_id("__compiled_fn") + compiled_fn = self.call_user_compiler(gm) compiled_fn = disable(compiled_fn) + counters["stats"]["unique_graphs"] += 1 self.install_global(name, compiled_fn) diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index 0b5cfae69363c..4031a976f52d6 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -1340,7 +1340,8 @@ def __init__( if fake_tensors_available: with torch._subclasses.FakeTensorMode( - throw_on_data_dependent_ops=True + throw_on_data_dependent_ops=True, + shape_env=output.shape_env, ) as fake_mode: pass self._fake_mode = fake_mode diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index aa64de0eeef3b..1bc646be45435 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -25,6 +25,7 @@ from typing import Any, Dict import numpy as np +import sympy import torch from torch import fx @@ -666,6 +667,43 @@ def rename_implicit(v): UnsupportedFakeTensorException, ) + def make_fake_tensor(e, fake_mode, tx=None): + 
fake_tensor = fake_mode.from_tensor( + e, static_shapes=config.dynamic_shapes is False + ) + if tx is not None: + from torch._dynamo.guards import TensorReference + + def _record(tensor_ref): + if tensor_ref.ref_id not in tx.output.tensor_id_to_sym_shape_ref: + tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id] = set() + tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id].add(tensor_ref) + + def _extract(symbol): + if isinstance(symbol, int): + return None + sym_expr = symbol.get_pyobj().expr + if not isinstance(sym_expr, sympy.Symbol): + return None + return sym_expr + + def _record_ref(e, index, symbol, kind): + sym_expr = _extract(symbol) + if sym_expr: + tensor_ref = TensorReference(id(e), kind, index, sym_expr) + _record(tensor_ref) + + for index, symbol in enumerate(fake_tensor.size()): + _record_ref(e, index, symbol, "size") + + for index, symbol in enumerate(fake_tensor.stride()): + _record_ref(e, index, symbol, "stride") + + offset = fake_tensor.storage_offset() + _record_ref(e, None, offset, "storage_offset") + + return fake_tensor + def wrap_fake_exception(fn): try: return fn() @@ -678,7 +716,13 @@ def wrap_fake_exception(fn): def wrap_to_fake_tensor(e, fake_mode): if type(e) in (torch.Tensor, torch.nn.Parameter): - return wrap_fake_exception(lambda: fake_mode.from_tensor(e)) + return wrap_fake_exception(lambda: make_fake_tensor(e, fake_mode)) + else: + return e + + def wrap_to_fake_tensor_and_record(e, tx): + if type(e) in (torch.Tensor, torch.nn.Parameter): + return wrap_fake_exception(lambda: make_fake_tensor(e, tx.fake_mode, tx)) else: return e diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index 53fdb95aca8bb..cc64e009d094c 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -359,11 +359,23 @@ def _call_min_max(self, tx, a, b): a, b = b, a assert isinstance(a, variables.TensorVariable) - # 1. result of an item call is a scalar convert to a tensor - # 2. 
dynamic shape should be resolved to tensor - if isinstance(a, (FakeItemVariable, DynamicShapeVariable)): + # result of an item call is a scalar convert to a tensor + if isinstance(a, FakeItemVariable): a = variables.TorchVariable(torch.tensor).call_function(tx, [a], {}) + # Dynamic input does not get resolved, rather, gets stored as call_function + if isinstance(a, DynamicShapeVariable): + return variables.TensorVariable.create( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + self.fn, + *proxy_args_kwargs([a, b], {}), + current_tx=tx, + ), + **VariableTracker.propagate(self, [a, b]), + ) + # convert min/max to torch ops if b.is_python_constant(): kwargs = {"min": b} if (self.fn is max) else {"max": b} diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index a8db819cb272d..864d2c4ca3e0f 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -17,7 +17,7 @@ DataDependentOutputException, DynamicOutputShapeException, ) - from ..utils import deepcopy_to_fake_tensor, wrap_to_fake_tensor + from ..utils import deepcopy_to_fake_tensor, wrap_to_fake_tensor_and_record import torch.utils._python_dispatch as py_dispatch from torch.fx.immutable_collections import immutable_list @@ -98,7 +98,7 @@ def _get_fake_value(node, tx): Run the computation represented by `node` using fake tensors and return the result. 
""" op = node.op - fake_wrapper = functools.partial(wrap_to_fake_tensor, fake_mode=tx.fake_mode) + fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx) from ..utils import wrap_fake_exception def visit(n: torch.fx.Node): @@ -206,7 +206,7 @@ def create(cls, tx, proxy, example_value=None, **options): proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value) if use_fake_tensors: fake_wrapper = functools.partial( - wrap_to_fake_tensor, fake_mode=tx.fake_mode + wrap_to_fake_tensor_and_record, tx=tx ) example_value = fake_wrapper(example_value) @@ -241,14 +241,14 @@ def create(cls, tx, proxy, example_value=None, **options): return TorchVariable(proxy.node.target) elif istype(example_value, (int, bool, float)) and config.dynamic_shapes: proxy.node.meta["example_value"] = example_value - return DynamicShapeVariable(proxy, type(example_value), **options) + return DynamicShapeVariable(proxy, example_value, **options) elif istype(example_value, torch.Size) and config.dynamic_shapes: proxy.node.meta["example_value"] = example_value sizes = [] for i, v in enumerate(example_value): proxy_i = proxy[i] proxy_i.node.meta["example_value"] = v - sizes.append(DynamicShapeVariable(proxy_i, int)) + sizes.append(DynamicShapeVariable(proxy_i, v)) return SizeVariable(sizes, proxy, **options) elif istype(example_value, int) and proxy.node.target in ( torch.seed, @@ -258,7 +258,7 @@ def create(cls, tx, proxy, example_value=None, **options): getattr(torch.distributed, "get_world_size", _missing), ): proxy.node.meta["example_value"] = example_value - return DynamicShapeVariable(proxy, type(example_value), **options) + return DynamicShapeVariable(proxy, example_value, **options) elif istype(example_value, torch.Size) and all( [isinstance(x, int) for x in example_value] ): @@ -337,6 +337,9 @@ def create(cls, tx, proxy, example_value=None, **options): from . 
import UserDefinedObjectVariable return UserDefinedObjectVariable(example_value) + elif isinstance(example_value, torch.SymIntNode): + proxy.node.meta["example_value"] = example_value + return cls(proxy, **options) else: raise AssertionError( "torch.* op returned non-Tensor " @@ -474,7 +477,6 @@ def call_method( kwargs = dict(kwargs) options = VariableTracker.propagate(self, args, kwargs.values()) - if name == "stride" and self.stride is not None: constant_result = ConstantVariable(self.stride, **options) elif name == "size" and self.size is not None: @@ -578,12 +580,12 @@ class DynamicShapeVariable(TensorVariable): Represents a symbolic size, e.g., as returned by tensor.size(0) """ - def __init__(self, proxy, dyn_shape_cls, **kwargs): + def __init__(self, proxy, dyn_shape, **kwargs): super(DynamicShapeVariable, self).__init__(proxy, **kwargs) - self.dyn_shape_cls = dyn_shape_cls + self.dyn_shape = dyn_shape def python_type(self): - return self.dyn_shape_cls + return type(self.dyn_shape) def unpack_var_sequence(self, tx): super(DynamicShapeVariable, self).unpack_var_sequence(tx) diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index 1ecfbe1a70b2c..e0c88b2cf059a 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -344,6 +344,24 @@ def get_state_from_generator(): example_value=example_value, **options, ) + elif ( + self.value == torch.numel + and len(args) == 1 + and isinstance(args[0], TensorVariable) + and len(kwargs) == 0 + ): + # TODO(voz): This is rewritten as a call_method because + # torch.numel(x) w/ sym shapes raises a RuntimeError and x.numel() does not + return TensorVariable.create( + tx=tx, + proxy=tx.output.create_proxy( + "call_method", + "numel", + *proxy_args_kwargs(args, kwargs), + current_tx=tx, + ), + **options, + ) else: # Handle sth like torch.LongTensor(list(np.int64, np.int64, ...)), # as FX symbolic trace doesn't support numpy int/float as base types. 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 2f2f07f3db378..652c24c9a521d 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -234,7 +234,7 @@ def from_real_tensor(self, fake_mode, t, make_constant=False, shape_env=None): warnings.filterwarnings("ignore", "The .grad attribute of a Tensor") grad_not_none = t.grad is not None if grad_not_none: - out.grad = self.from_real_tensor(fake_mode, t.grad) + out.grad = self.from_real_tensor(fake_mode, t.grad, shape_env=shape_env) self.set_tensor_memo(t, out) return out diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py index 80723f1246339..3e1040d037f0d 100644 --- a/torch/_subclasses/meta_utils.py +++ b/torch/_subclasses/meta_utils.py @@ -146,7 +146,7 @@ def meta_tensor(self, t, shape_env=None): def sym(x): if make_symbolic: - return shape_env.create_symbol(x) + return shape_env.create_symintnode(shape_env.create_symbol(x)) else: return x From 0c1734aeb99f8f1f664c3e57ea7c3261bd81146e Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Tue, 25 Oct 2022 04:46:42 +0000 Subject: [PATCH 0141/1922] Fix stride for prims.where (#87563) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87563 Approved by: https://github.com/ngimel, https://github.com/mruberry --- torch/_meta_registrations.py | 6 +----- torch/_prims/__init__.py | 2 +- torch/_refs/__init__.py | 23 +++++++++++++++++------ 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index 873f942da42ab..0af6813ce4a00 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1663,12 +1663,8 @@ def activate_meta(): "aten::mul.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars # noqa: B950 "aten::div.Tensor", # ValueError: Receive two Number inputs to an elementwise binary operation! 
test_fake_tensor.py -k test_scalar_inputs # noqa: B950 "aten::div.Tensor_mode", # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_div8_cpu # noqa: B950 - "aten::diag_embed", # Stride mismatch! test_ops.py -k test_fake_autocast_diag_embed_cuda_float32 # noqa: B950 - "aten::copy_", # Exception not raiseed, test_torch.py -k test_storage_meta_errors_cpu_int64 # noqa: B950 + "aten::copy_", # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64 # noqa: B950 "aten::constant_pad_nd", # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32 # noqa: B950 - "aten::masked_fill.Scalar", # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_nanquantile_cuda_float32 # noqa: B950 - "aten::tril", # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_ormqr_cuda_float32 # noqa: B950 - "aten::triu", # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_lu_solve_cuda_float32 # noqa: B950 "aten::rot90", # requires_grad mismatch! 
test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32 # noqa: B950 }: pass diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py index eae38612a2237..3248009ee66e5 100644 --- a/torch/_prims/__init__.py +++ b/torch/_prims/__init__.py @@ -335,7 +335,7 @@ def _elementwise_meta( args_ = list(args) if args_with_fixed_dtypes is not None: - args_.extend(args_with_fixed_dtypes) + args_ = list(args_with_fixed_dtypes) + args_ utils.check_same_device(*args_, allow_cpu_scalar_tensors=True) utils.check_same_shape(*args_, allow_cpu_scalar_tensors=True) diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 2e91ceeeb679d..44b75bb92df48 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -3594,7 +3594,10 @@ def diag_embed( cond = a_range == b_range.unsqueeze(-1) cond_shape = [last_dim if i in (dim1, dim2) else 1 for i in range(len(t.shape))] cond = cond.reshape(cond_shape) - return utils.mask_tensor(cond, t) + + # aten.diag_embed always returns a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return utils.mask_tensor(cond, t).contiguous() # CompositeImplicitAutograd - don't register decomp @@ -4517,10 +4520,14 @@ def masked_fill(a: TensorLikeType, mask: TensorLikeType, value: TensorOrNumberLi # Since `where` allows type-promotion, # cast value to correct type before passing to `where` if isinstance(value, Number): - return torch.where(mask, python_type(value), a) + r = torch.where(mask, python_type(value), a) + else: + assert isinstance(value, TensorLike) + r = torch.where(mask, prims.to_dtype(value, a.dtype), a) - assert isinstance(value, TensorLike) - return torch.where(mask, prims.to_dtype(value, a.dtype), a) + # aten.masked_fill always returns a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return r.contiguous() # CompositeImplicitAutograd - don't register decomp @@ -4622,7 +4629,9 @@ def triu(a: TensorLikeType, diagonal: int = 0) -> 
TensorLikeType: - torch.arange(h, device=a.device).unsqueeze(-1) ) >= diagonal - return utils.mask_tensor(mask, a) + # aten.triu always returns a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return utils.mask_tensor(mask, a).contiguous() @register_decomposition(torch.ops.aten.tril) @@ -4637,7 +4646,9 @@ def tril(a: TensorLikeType, diagonal: int = 0) -> TensorLikeType: - torch.arange(h, device=a.device).unsqueeze(-1) ) <= diagonal - return utils.mask_tensor(mask, a) + # aten.tril always returns a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return utils.mask_tensor(mask, a).contiguous() # This is based on get_tril_size in aten/src/ATen/native/TensorFactories.h From d30196d447ff15140a8d03f1ff054be068e90917 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 25 Oct 2022 21:49:59 +0000 Subject: [PATCH 0142/1922] Fix typos under functorch directory (#87663) This PR fixes typos in `.md` and `.rst` files under functorch directory Pull Request resolved: https://github.com/pytorch/pytorch/pull/87663 Approved by: https://github.com/kit1980 --- functorch/dim/README.md | 22 +++++++++++----------- functorch/docs/source/batch_norm.rst | 2 +- functorch/docs/source/ux_limitations.rst | 2 +- functorch/examples/maml_omniglot/README.md | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/functorch/dim/README.md b/functorch/dim/README.md index 750c8847c8502..5ed7bbd3d5284 100644 --- a/functorch/dim/README.md +++ b/functorch/dim/README.md @@ -7,7 +7,7 @@ _An implementation of [named tensors](https://namedtensor.github.io) with the fu The tensor input to a resnet might have the shape [8, 3, 224, 224] but informally we think of those dimensions as 'batch', 'channel', 'width', and 'height'. 
Eventhough 'width' and 'height' have the same _size_ we still think of them as separate dimensions, and if we have two _different_ images, we think of both as sharing the _same_ 'channel' dimension. -Named tensors gives these dimensions names. [PyTorch's current implementation](https://pytorch.org/docs/stable/named_tensor.html) uses strings to name dimensions. Instead, this library introduces a Python object, a `Dim`, to represent the concept. By expanding the semantics of tensors with dim objects, in addition to naming dimensions, we can get behavior equivalent to batching transforms (xmap, vmap), einops-style rearragement, and loop-style tensor indexing. +Named tensors gives these dimensions names. [PyTorch's current implementation](https://pytorch.org/docs/stable/named_tensor.html) uses strings to name dimensions. Instead, this library introduces a Python object, a `Dim`, to represent the concept. By expanding the semantics of tensors with dim objects, in addition to naming dimensions, we can get behavior equivalent to batching transforms (xmap, vmap), einops-style rearrangement, and loop-style tensor indexing. A preview: @@ -85,11 +85,11 @@ from torchdim import dims batch, channel, width, height = dims(4) ``` -The existing implemention of [Named Tensors](https://pytorch.org/docs/stable/named_tensor.html) in PyTorch, or [JAX's xmap](https://jax.readthedocs.io/en/latest/notebooks/xmap_tutorial.html) use strings to name dimensions. We call these dimensions _first class_ because they are Python objects. +The existing implementation of [Named Tensors](https://pytorch.org/docs/stable/named_tensor.html) in PyTorch, or [JAX's xmap](https://jax.readthedocs.io/en/latest/notebooks/xmap_tutorial.html) use strings to name dimensions. We call these dimensions _first class_ because they are Python objects. In addition to the normal _positional_ dimensions in a tensor, tensors can also have a separate set of first-class dimensions. 
-You can create tensors with first-class dimensions by indexing the normal positional dimensions of a tensor with a dimension object. The `ndim` property continues to list the number of positional dimesions, while the new `dims` property lists all the bound first-class dimensions. +You can create tensors with first-class dimensions by indexing the normal positional dimensions of a tensor with a dimension object. The `ndim` property continues to list the number of positional dimensions, while the new `dims` property lists all the bound first-class dimensions. ```py input = torch.rand(2, 3, 224, 224) @@ -101,7 +101,7 @@ print(input_fc.dims) # first class dimensions > (batch, channel, width, height) -# since we converted all the positional dimesions +# since we converted all the positional dimensions # first class `input_fc` has 0 positional dimensions now. print(input_fc.ndim) > 0 @@ -266,7 +266,7 @@ print(i <= j) > with dims=(i, j) sizes=(4, 4) ``` -Because of the intentional similarity to loop-level code, using dimsions as tensors makes complicated indexing arithmetic easier to read. +Because of the intentional similarity to loop-level code, using dimensions as tensors makes complicated indexing arithmetic easier to read. Here is code that lookups up features in an embedding table given a sequence of ids: @@ -296,7 +296,7 @@ Unbinding Dims ------------- The `order` method converts first-class dimensions in a tensor back to normal positional dimensions by specifying an order for those dimensions.[^4] -By specifiying a different order from how things were originally bound, it is easy to do transpositions. +By specifying a different order from how things were originally bound, it is easy to do transpositions. 
```py i, j = dims(2) @@ -305,7 +305,7 @@ A_T = A[i, j].order(j, i) assert torch.allclose(A.T, A_T) ``` -Indexing acts left-to-right, and `order` also places the new dimensions back on the left, so it possible to work on tensors that have mixed positonal and first-class dimensions: +Indexing acts left-to-right, and `order` also places the new dimensions back on the left, so it possible to work on tensors that have mixed positional and first-class dimensions: ```py B = torch.rand(3, 4, 5) @@ -313,7 +313,7 @@ B_T = B[i, j].order(j, i) assert torch.allclose(B.permute(1, 0, 2), B_T) ``` -[^4] `order` is actually just a synonym for the already-existing `permute` method, which takes a list a dimension specifiers and puts the tensor in that order because rule #2 says that first-class dims can be passed as arguments to functions that previousely took only integers as dimensions. However, the name `permute` is confusing in this context since it implies dim objects have an original order, so we prefer to use `order` when writing code. +[^4] `order` is actually just a synonym for the already-existing `permute` method, which takes a list a dimension specifiers and puts the tensor in that order because rule #2 says that first-class dims can be passed as arguments to functions that previously took only integers as dimensions. However, the name `permute` is confusing in this context since it implies dim objects have an original order, so we prefer to use `order` when writing code. Flattening and Splitting Dims ----------------------------- @@ -412,7 +412,7 @@ Named tensors with first-class dimensions can accomplish the same goal, but usin Automatically batching Code (`vmap`, `xmap`) ----------------------------- -The implicit batching of Rule #1 means it is easy to created batched versions of existing PyTorch code. Simply bind a dim to the dimensions that should act as a batch, and then pass the tensor to the unbatched function. 
Since the unbatched function does not know about the dim, the dim will be implicictly batched over: +The implicit batching of Rule #1 means it is easy to created batched versions of existing PyTorch code. Simply bind a dim to the dimensions that should act as a batch, and then pass the tensor to the unbatched function. Since the unbatched function does not know about the dim, the dim will be implicitly batched over: ```py batch_size, feature_size = 3, 5 @@ -501,7 +501,7 @@ def multiheadattention(q, k, v, num_attention_heads, dropout_prob, use_positiona Indexing -------- -Rule #3 enables indexing because dimensions act as loop indices when used as a tensor. This allows for a lot of powerful behavior. The simplest might be using the dimensions to compute masks, such as extracing the upper triangular part of a matrix: +Rule #3 enables indexing because dimensions act as loop indices when used as a tensor. This allows for a lot of powerful behavior. The simplest might be using the dimensions to compute masks, such as extracting the upper triangular part of a matrix: ```py from torch import where @@ -745,7 +745,7 @@ The semantics and surface syntax of dimension objects resembles the kind of code These compilers and language have syntax and semantics that resemble the loop-level analogy similar to first-class dimensions. However, as compilers or statically typed languages, they require some binding code to go from running deep learning framework code in Python to using the compiled language. This often at least requires refactoring the compiled parts into their own functions, and may require defining a gradient function. Similar to graph mode frameworks, this adds friction to using and debugging the code. -Dimension objects are just an extension of the existing PyTorch tensors and eager sematics, so there is no friction switching between normal Python code and code that uses them. 
However, since loops over the dimensions are defined implicitly, they can still execute in Python with good performance compared to explicit loops. Furthermore, with dimension objects, a tensors containing dimensions can compute through code that is oblivous to the dimension such as batching examples. There is no need to separate code into 'compiled' vs 'eager'. +Dimension objects are just an extension of the existing PyTorch tensors and eager semantics, so there is no friction switching between normal Python code and code that uses them. However, since loops over the dimensions are defined implicitly, they can still execute in Python with good performance compared to explicit loops. Furthermore, with dimension objects, a tensors containing dimensions can compute through code that is oblivious to the dimension such as batching examples. There is no need to separate code into 'compiled' vs 'eager'. In this way, first-class dims are a way of adapting the nicer syntax of these array compilers and languages to eager numpy-style libraries. diff --git a/functorch/docs/source/batch_norm.rst b/functorch/docs/source/batch_norm.rst index 09eb6001b5b66..8ccd4ee587d35 100644 --- a/functorch/docs/source/batch_norm.rst +++ b/functorch/docs/source/batch_norm.rst @@ -11,7 +11,7 @@ we end up with this error How to fix ---------- All of these options assume that you don't need running stats. If you're using a module this means -that it's assumed you won't use batch norm in evalution mode. If you have a use case that involves +that it's assumed you won't use batch norm in evaluation mode. 
If you have a use case that involves running batch norm with vmap in evaluation mode, please file an issue Option 1: Change the BatchNorm diff --git a/functorch/docs/source/ux_limitations.rst b/functorch/docs/source/ux_limitations.rst index e0090047752e0..4fee30e432881 100644 --- a/functorch/docs/source/ux_limitations.rst +++ b/functorch/docs/source/ux_limitations.rst @@ -290,5 +290,5 @@ Under "same" randomness, elements in a batch produce same random values. For ins .. note:: Finally, our randomness differs from JAX because we aren't using a stateless PRNG, in part because PyTorch doesn't have full support for a stateless PRNG. Instead, we've introduced a flag system to allow for the - most common forms of randmoness that we see. If your use case does not fit these forms of randomness, please + most common forms of randomness that we see. If your use case does not fit these forms of randomness, please file an issue. diff --git a/functorch/examples/maml_omniglot/README.md b/functorch/examples/maml_omniglot/README.md index dfb6077814bfe..afc3f55023d47 100644 --- a/functorch/examples/maml_omniglot/README.md +++ b/functorch/examples/maml_omniglot/README.md @@ -1,6 +1,6 @@ # Omniglot MAML examples -In this directory we've provided some examples of traning omniglot that reproduce the experiments from [the original MAML paper](https://arxiv.org/abs/1703.03400). +In this directory we've provided some examples of training omniglot that reproduce the experiments from [the original MAML paper](https://arxiv.org/abs/1703.03400). They can be run via `python {filename}`. From 88c46f7ee17705b1d379ac1fac0718c5a1b221de Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Tue, 25 Oct 2022 21:53:11 +0000 Subject: [PATCH 0143/1922] Intercept aten._reshape_alias for nvFuser (#87072) This would help forming larger fusion groups. 
If this won't end up executed by nvFuser then eager mode implementation would call into `.reshape`: https://github.com/pytorch/pytorch/blob/37e9e89afbc3554258545a026fab4cd9e1a4b85d/torch/_prims/nvfuser_prims.py#L552-L553 cc @kevinstephano @jjsjann123 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87072 Approved by: https://github.com/ngimel --- torch/_prims/context.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/torch/_prims/context.py b/torch/_prims/context.py index fea3f17a5009b..2bcee069d146c 100644 --- a/torch/_prims/context.py +++ b/torch/_prims/context.py @@ -405,6 +405,12 @@ def __torch_function__( warn("view has ignored kwargs!") return torch.ops.nvprims.view(a, shape) + if orig_func == torch.ops.aten._reshape_alias.default: + a, shape, stride = args + if len(kwargs) > 0: + warn("view has ignored kwargs!") + return torch.ops.nvprims.view(a, shape) + if self._is_native_batch_norm(orig_func): return torch.ops.nvprims.native_batch_norm(*args, **kwargs) From a7de7c7d98c364b75921020efb7928e660dc057e Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Tue, 25 Oct 2022 21:55:27 +0000 Subject: [PATCH 0144/1922] Fix debug dir bugs and minifier output directories (#87682) Fixes https://github.com/pytorch/torchdynamo/issues/1758, https://github.com/pytorch/torchdynamo/issues/1752 - minifier_launcher.py now dumps checkpoints to the `checkpoints/` directory under the current working directory when run - a single debug directory is created per script invocation, asserts failing with no directory will no longer occur - torchinductor debug tracing will correctly dump to the debug directory now since no prior setup is needed, (the directory was incorrectly only initialized during dynamo tracing) cc @jansel @lezcano @fdrocha @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87682 Approved by: https://github.com/ezyang --- test/dynamo/test_debug_dir.py | 96 ----------------------------------- test/dynamo/test_minifier.py | 2 - 
torch/_dynamo/debug_utils.py | 20 ++------ torch/_dynamo/eval_frame.py | 4 -- torch/_dynamo/utils.py | 36 +++---------- 5 files changed, 12 insertions(+), 146 deletions(-) delete mode 100644 test/dynamo/test_debug_dir.py diff --git a/test/dynamo/test_debug_dir.py b/test/dynamo/test_debug_dir.py deleted file mode 100644 index 5827ff40ea781..0000000000000 --- a/test/dynamo/test_debug_dir.py +++ /dev/null @@ -1,96 +0,0 @@ -# Owner(s): ["module: dynamo"] -import shutil -import unittest - -import torch -import torch._dynamo.test_case -import torch._dynamo.testing -from torch._dynamo.utils import DebugDir, get_debug_dir - - -class DebugDirTests(torch._dynamo.test_case.TestCase): - @classmethod - def setUpClass(cls): - super().setUpClass() - cls._exit_stack.enter_context( - unittest.mock.patch.object( - torch._dynamo.config, - "debug_dir_root", - "/tmp/torch._dynamo_debug_dirs/", - ) - ) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True) - cls._exit_stack.close() - - def setUp(self): - super().setUp() - torch._dynamo.utils.debug_dir = DebugDir() - - def tearDown(self): - torch._dynamo.utils.debug_dir = DebugDir() - super().tearDown() - - def _setup(self): - debug_dir = torch._dynamo.utils.debug_dir - debug_dir.setup() - self.assertIsNotNone(debug_dir.debug_path) - self.assertEqual(debug_dir.num_setup_calls, 1) - return debug_dir - - def test_setup(self): - self._setup() - - def test_clear(self): - debug_dir = self._setup() - debug_dir.clear() - self.assertIsNone(debug_dir.debug_path) - self.assertEqual(debug_dir.num_setup_calls, 0) - - def test_multi_setup_single_clear(self): - debug_dir = self._setup() - prev = get_debug_dir() - - debug_dir.setup() - self.assertEqual(prev, get_debug_dir()) - self.assertEqual(debug_dir.num_setup_calls, 2) - - debug_dir.clear() - self.assertEqual(prev, get_debug_dir()) - self.assertEqual(debug_dir.num_setup_calls, 1) - - def test_multi_setup_multi_clear(self): - debug_dir = 
self._setup() - prev = get_debug_dir() - - debug_dir.setup() - self.assertEqual(prev, get_debug_dir()) - self.assertEqual(debug_dir.num_setup_calls, 2) - - debug_dir.clear() - self.assertEqual(prev, get_debug_dir()) - self.assertEqual(debug_dir.num_setup_calls, 1) - - debug_dir.clear() - self.assertIsNone(debug_dir.debug_path) - self.assertEqual(debug_dir.num_setup_calls, 0) - - def test_single_setup_single_clear(self): - debug_dir = self._setup() - debug_dir.clear() - self.assertIsNone(debug_dir.debug_path) - self.assertEqual(debug_dir.num_setup_calls, 0) - - def test_multi_get(self): - self._setup() - prev = get_debug_dir() - next = get_debug_dir() - self.assertEqual(prev, next) - - -if __name__ == "__main__": - from torch._dynamo.test_case import run_tests - - run_tests() diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py index a282485285797..0cec7d202a9d4 100644 --- a/test/dynamo/test_minifier.py +++ b/test/dynamo/test_minifier.py @@ -43,10 +43,8 @@ def tearDownClass(cls): def setUp(self): super().setUp() - torch._dynamo.utils.debug_dir.setup() def tearDown(self): - torch._dynamo.utils.debug_dir.clear() super().tearDown() def test_after_dynamo(self): diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py index ea5671a81d02f..0ece930d1d13b 100644 --- a/torch/_dynamo/debug_utils.py +++ b/torch/_dynamo/debug_utils.py @@ -240,7 +240,7 @@ def save_graph_repro(fd, gm, args, compiler_name): def isolate_fails(fx_g, args, compiler_name: str, env=None): if env is None: env = {} - subdir = f"{minifier_dir()}/isolate" + subdir = os.path.join(os.getcwd(), "isolate") if not os.path.exists(subdir): os.makedirs(subdir, exist_ok=True) file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py") @@ -600,10 +600,11 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False): """ Saves the repro to a repro.py file """ - subdir = os.path.join(minifier_dir()) + curdir = os.getcwd() + subdir = os.path.join(os.getcwd(), 
"checkpoints") if not os.path.exists(subdir): os.makedirs(subdir, exist_ok=True) - file_name = os.path.join(subdir, f"{len(gm.graph.nodes)}.py") + file_name = os.path.join(subdir, f"minified_{len(gm.graph.nodes)}_nodes.py") log.warning(f"Writing checkpoint with {len(gm.graph.nodes)} nodes to {file_name}") model_str = NNModuleToString.convert(gm) @@ -613,19 +614,10 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False): model_str, args, compiler_name, check_accuracy ) ) - latest_repro = os.path.join(subdir, "repro.py") + latest_repro = os.path.join(curdir, "repro.py") log.warning(f"Copying {file_name} to {latest_repro} for convenience") shutil.copyfile(file_name, latest_repro) - local_path = os.path.join(config.base_dir, "repro.py") - try: - shutil.copyfile(file_name, local_path) - log.warning( - f"Copying minified repro from {file_name} to {local_path} for convenience" - ) - except OSError: - log.warning("No write permissions for {local_path}") - # TODO - Commented because we are assuming that nn.Modules can be safely repr'd # If that does not work, we might have to bring this code back. So, keeping it @@ -748,8 +740,6 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name): from {config.dynamo_import}.optimizations.backends import BACKENDS from {config.dynamo_import}.testing import rand_strided -{config.dynamo_import}.config.repro_dir = \"{minifier_dir()}\" - args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]} args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args] diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index 9895da4ad9bba..d86653f9973cc 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -103,14 +103,12 @@ def __enter__(self): "Please refer to https://github.com/pytorch/torchdynamo#usage-example " "to use torchdynamo.optimize(...) as an annotation/decorator. 
" ) - utils.debug_dir.setup() self.on_enter() self.prior = set_eval_frame(self.callback) self.backend_ctx = self.extra_ctx_ctor() self.backend_ctx.__enter__() def __exit__(self, exc_type, exc_val, exc_tb): - utils.debug_dir.clear() set_eval_frame(self.prior) self.prior = unset self.backend_ctx.__exit__(exc_type, exc_val, exc_tb) @@ -152,14 +150,12 @@ def __call__(self, *args, **kwargs): @functools.wraps(fn) def _fn(*args, **kwargs): on_enter() - utils.debug_dir.setup() prior = set_eval_frame(callback) backend_ctx = backend_ctx_ctor() backend_ctx.__enter__() try: return fn(*args, **kwargs) finally: - utils.debug_dir.clear() set_eval_frame(prior) backend_ctx.__exit__(None, None, None) diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 1bc646be45435..ef2c1c38ea8ba 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -975,35 +975,13 @@ def recompile_reasons(code): return rpt -class DebugDir: - def __init__(self): - self.num_setup_calls = 0 - self.debug_path = None - - def setup(self): - assert self.num_setup_calls >= 0 - if self.num_setup_calls == 0: - debug_root = config.debug_dir_root - dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f") - self.debug_path = os.path.join(debug_root, dir_name) - - self.num_setup_calls += 1 - - def clear(self): - assert self.num_setup_calls >= 0 - if self.num_setup_calls == 1: - self.debug_path = None - - self.num_setup_calls -= 1 - assert self.num_setup_calls >= 0 - - def get(self): - assert self.debug_path is not None - return self.debug_path - - -debug_dir = DebugDir() +# return same dir unless user changes config between calls +@functools.lru_cache(None) +def _get_debug_dir(root_dir): + dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f") + return os.path.join(root_dir, dir_name) def get_debug_dir(): - return debug_dir.get() + debug_root = config.debug_dir_root + return _get_debug_dir(debug_root) From fbecce248287595329a221a6a7f5b8e4601dc410 Mon Sep 17 
00:00:00 2001 From: maxren Date: Mon, 24 Oct 2022 15:24:57 -0700 Subject: [PATCH 0145/1922] [xnnpack][lite-int][graph-build] graph passes and op checking (#87128) Beginning of building the xnnpack graph from the torchscript IR. We first massage the torchscript graph using a few graph passes that perform things such as unused self argument removal and constant propagation. This also performs tracing for us so that the model does not have to be prepped by tracing before being lowered by us. The other check we perform is through the torchscript IR to identify any nodes that are not lowerable/supported, and throwing an error to spit out the specific nodes that are not lowerable. Differential Revision: [D39838338](https://our.internmc.facebook.com/intern/diff/D39838338/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39838338/)! Pull Request resolved: https://github.com/pytorch/pytorch/pull/87128 Approved by: https://github.com/salilsdesai --- test/jit/xnnpack/test_xnnpack_delegate.py | 34 +++++++++ .../xnnpack/xnnpack_backend_preprocess.cpp | 5 ++ .../xnnpack/xnnpack_graph_builder.cpp | 71 +++++++++++++++++++ .../backends/xnnpack/xnnpack_graph_builder.h | 50 +++++++++++++ 4 files changed, 160 insertions(+) create mode 100644 torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp create mode 100644 torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py index 8c759cb01ccf6..118a30dbe2cac 100644 --- a/test/jit/xnnpack/test_xnnpack_delegate.py +++ b/test/jit/xnnpack/test_xnnpack_delegate.py @@ -67,3 +67,37 @@ def forward(self, x): } ) lowered(torch.zeros(1)) + + def test_xnnpack_unsupported(self): + class AddSpliceModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + z = x + y[:, :, 1, :] + return z + + 
sample_inputs = (torch.rand(1, 512, 512, 3), torch.rand(1, 512, 512, 3)) + sample_output = torch.zeros(1, 512, 512, 3) + + error_msg = ( + "the module contains the following unsupported ops:\n" + "aten::select\n" + "aten::slice\n" + ) + + add_module = torch.jit.script(AddSpliceModule()) + with self.assertRaisesRegex( + RuntimeError, + error_msg, + ): + _ = torch._C._jit_to_backend( + "xnnpack", + add_module, + { + "forward": { + "inputs" : [sample_inputs[0], sample_inputs[1]], + "outputs": [sample_output] + } + } + ) diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp index 6d739f4097444..536e1cb8e773d 100644 --- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp +++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp @@ -6,6 +6,7 @@ #include #include +#include namespace torch { namespace jit { @@ -83,6 +84,10 @@ c10::IValue preprocess( example_inputs.emplace_back(inp.toTensor()); } + // inp above has been confirmed to be either Tensor or TensorList + XNNGraph graph_builder; + graph_builder.buildXNNGraph(graph, example_inputs); + compiled.insert("Answer", at::empty({1}, c10::ScalarType::Float)); return compiled; diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp new file mode 100644 index 0000000000000..438e681b508b6 --- /dev/null +++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp @@ -0,0 +1,71 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +#include +#include +#include + +// graph passes +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace xnnpack { +namespace delegate { + +std::shared_ptr XNNGraph::optimizeAndTraceGraph( + std::shared_ptr graph, + std::vector& example_inputs) { + graph = tensorexpr::removeUnusedSelfArgument(graph); + OptimizeFrozenGraph(graph, true); + RemoveListMutation(graph); + RemoveTensorMutation(graph); + LowerAllTuples(graph); + ConstantPropagation(graph); + graph = TraceGraph(graph, example_inputs); + + return graph; +} + +void XNNGraph::buildXNNGraph( + std::shared_ptr& graph, + std::vector example_inputs) { + graph = optimizeAndTraceGraph(graph, example_inputs); + checkOpsToDelegate(graph); +} + +void XNNGraph::checkOpsToDelegate(std::shared_ptr& graph) { + std::unordered_set unsupported_ops; + DepthFirstGraphNodeIterator it(graph); + Node* node = nullptr; + while ((node = it.next()) != nullptr) { + switch (node->kind()) { + case prim::Constant: + case aten::add: { + break; + } + default: { + unsupported_ops.insert(node->kind().toDisplayString()); + } + } + } + std::stringstream error; + for (auto itr = unsupported_ops.begin(); itr != unsupported_ops.end(); + itr++) { + error << *itr << std::endl; + ; + } + TORCH_CHECK( + unsupported_ops.empty(), + "the module contains the following unsupported ops:\n" + error.str()); +} + +} // namespace delegate +} // namespace xnnpack +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h new file mode 100644 index 0000000000000..e9593376dc798 --- /dev/null +++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h @@ -0,0 +1,50 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace xnnpack { +namespace delegate { + +class XNNGraph { + private: + // xnn_subgraph + xnn_subgraph_t _subgraph_ptr; + + // Graph passes for optimizing and tracing torchscript graph + // Essentially massaging the graph into a digestible format for + // xnnpack graph lowering. + std::shared_ptr optimizeAndTraceGraph( + std::shared_ptr graph, + std::vector& example_inputs); + + // Makes a pass through the graph and throws if any ops are unsupported + void checkOpsToDelegate(std::shared_ptr& graph); + + public: + XNNGraph() : _subgraph_ptr(nullptr) { + xnn_status status = xnn_initialize(/*allocator =*/nullptr); + TORCH_CHECK(xnn_status_success == status, "Failed to initialize xnnpack"); + } + + ~XNNGraph() { + xnn_deinitialize(); + if (_subgraph_ptr != nullptr) { + xnn_delete_subgraph(_subgraph_ptr); + } + } + + void buildXNNGraph( + std::shared_ptr& graph, + std::vector example_inputs); +}; + +} // namespace delegate +} // namespace xnnpack +} // namespace jit +} // namespace torch From ee88c037cbf1e029c97fc453b61a1321ed2f3616 Mon Sep 17 00:00:00 2001 From: Horace He Date: Tue, 25 Oct 2022 18:49:25 +0000 Subject: [PATCH 0146/1922] Added gm.print_readable to torchinductor_trace output (#87717) cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87717 Approved by: https://github.com/ngimel --- torch/_inductor/debug.py | 5 ++++- torch/fx/graph_module.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/torch/_inductor/debug.py b/torch/_inductor/debug.py index f7fbfe218be39..67e75d1a73294 100644 --- a/torch/_inductor/debug.py +++ b/torch/_inductor/debug.py @@ -306,9 +306,12 @@ def __init__(self, handler): self.handler = handler def fx_graph(self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]): - with self.fopen("fx_graph.py") as fd: + with 
self.fopen("fx_graph_runnable.py") as fd: dynamo_debug_utils.save_graph_repro(fd, gm, inputs, "inductor") + with self.fopen("fx_graph_readable.py") as fd: + fd.write(gm.print_readable(print_output=False)) + def ir_pre_fusion(self, nodes: SchedulerNodeList): self._write_ir("ir_pre_fusion.txt", nodes) diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index e2f9e1ed343e4..bc07952cf6fe6 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -712,7 +712,7 @@ def __copy__(self): return GraphModule(self, self.graph) @compatibility(is_backward_compatible=False) - def print_readable(self): + def print_readable(self, print_output=True): """ Return the Python code generated for current GraphModule and its children GraphModules """ @@ -729,7 +729,10 @@ def print_readable(self): submodule_code = "\n".join(submodule_code_list) submodule_code = _addindent(submodule_code, 4) - print(module_code + submodule_code) + output = module_code + submodule_code + if print_output: + print(module_code + submodule_code) + return output def __str__(self) -> str: orig_str = super().__str__() From 5f9918463c541fb32d06ab32deb5402cd367c589 Mon Sep 17 00:00:00 2001 From: William Wen Date: Tue, 25 Oct 2022 22:47:54 +0000 Subject: [PATCH 0147/1922] Fix CODE level usage in dynamo config.py (#87522) Fixes https://github.com/pytorch/torchdynamo/issues/1718. Tested by changing `log_level = logging.WARNING` in config.py to `log_level = logging.CODE` and running a test script that doesn't touch `log_level`. 
cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87522 Approved by: https://github.com/mlazos --- torch/_dynamo/config.py | 5 ++++- torch/_dynamo/convert_frame.py | 8 ++++---- torch/_dynamo/logging.py | 4 ++-- torch/_dynamo/output_graph.py | 2 +- torch/_inductor/codegen/triton.py | 3 +-- torch/_inductor/graph.py | 4 ++-- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index 701036789ffcb..f24eeeae76882 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -6,6 +6,9 @@ import torch +# needed so that CODE is registered as a level in logging +from . import logging as torchdynamo_logging # noqa: F401 + try: import torch._prims import torch._refs @@ -17,7 +20,7 @@ # log level (levels print what it says + all levels listed below it) # logging.DEBUG print full traces <-- lowest level + print tracing of every instruction -# torchdynamo.logging.CODE print compiled functions + graphs +# logging.CODE print compiled functions + graphs (NOTE: can only be used after importing torch._dynamo.logging) # logging.INFO print the steps that dynamo is running # logging.WARN print warnings (including graph breaks) # logging.ERROR print exceptions (and what user code was being processed when it occurred) diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 206cffb7aeeda..0ebf3b93ce727 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -11,7 +11,7 @@ import torch from torch.fx.graph_module import _forward_from_src as original_forward_from_src -from . import config, exc, logging as torchdynamo_logging +from . 
import config, exc from .allowed_functions import is_allowed from .bytecode_analysis import remove_dead_code, remove_pointless_jumps from .bytecode_transformation import is_generator, transform_code_object @@ -395,7 +395,7 @@ def transform(instructions, code_options): output_codes.add(out_code) log.log( - torchdynamo_logging.CODE, + logging.CODE, format_bytecode( "ORIGINAL BYTECODE", code.co_name, @@ -405,7 +405,7 @@ def transform(instructions, code_options): ), ) log.log( - torchdynamo_logging.CODE, + logging.CODE, format_bytecode( "MODIFIED BYTECODE", code.co_name, @@ -423,7 +423,7 @@ def transform(instructions, code_options): guard_str = "GUARDS:\n" guard_str += "\n".join([f" - {str(guard)}" for guard in sorted(output.guards)]) - log.log(torchdynamo_logging.CODE, guard_str) + log.log(logging.CODE, guard_str) if guard_export_fn is not None: guard_export_fn(output.guards) diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py index 0705e77a7c7d5..95ee727f1ddf1 100644 --- a/torch/_dynamo/logging.py +++ b/torch/_dynamo/logging.py @@ -3,8 +3,8 @@ import os # logging level for dynamo generated graphs/bytecode/guards -CODE = 15 -logging.addLevelName(CODE, "CODE") +logging.CODE = 15 +logging.addLevelName(logging.CODE, "CODE") # Return all loggers that torchdynamo/torchinductor is responsible for diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py index c23d4f6dd9934..861798b78e81b 100644 --- a/torch/_dynamo/output_graph.py +++ b/torch/_dynamo/output_graph.py @@ -408,7 +408,7 @@ def compile_and_call_fx_graph(self, tx, rv, root): # the call to tabulate can cause a lot of memory to be allocated if config.log_level <= logging.INFO: log.log( - torchdynamo_logging.CODE, + logging.CODE, f"TRACED GRAPH\n {name} {gm.forward.__code__.co_filename} {format_graph_tabular(gm.graph)}\n", ) except ImportError: diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 0ece1a06c9fa0..5ccf1a7191f29 100644 --- 
a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -15,7 +15,6 @@ from .. import config, ir, scheduler from ..ir import ReductionHint from ..utils import ( - dynamo_logging, free_symbol_startswith, instance_descriptor, sympy_product, @@ -1226,7 +1225,7 @@ def end_current_reduction_loop(): f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}" ) - log.log(dynamo_logging.CODE, "schedule: %s", node_schedule) + log.log(logging.CODE, "schedule: %s", node_schedule) return self.codegen_node_schedule(node_schedule, numel, rnumel) @staticmethod diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 8a971020ac047..3e274be506157 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -21,7 +21,7 @@ from .ir import Constant, FixedLayout, InputBuffer, TensorBox from .lowering import lowerings, make_fallback, needs_realized_inputs from .sizevars import SizeVarAllocator -from .utils import dynamo_logging, dynamo_utils +from .utils import dynamo_utils from .virtualized import V log = logging.getLogger(__name__) @@ -339,7 +339,7 @@ def compile_to_module(self): for name, value in self.constants.items(): setattr(mod, name, value) - log.log(dynamo_logging.CODE, "Output code: %s", mod.__file__) + log.log(logging.CODE, "Output code: %s", mod.__file__) V.debug.output_code(mod.__file__) V.debug.rename(os.path.splitext(mod.__file__)[0] + ".debug") return mod From 9137d6b14740cfd0af483da0d9fc3268d71f27ca Mon Sep 17 00:00:00 2001 From: Minh Nguyen Date: Tue, 25 Oct 2022 22:52:52 +0000 Subject: [PATCH 0148/1922] aten cpu and xnnpack to be compatible with arvr mode build (#87125) Summary: When building 3d photo sdk generator package in arvr/mode/mac and arvr/mode/mac-arm modes, we got several issues with aten cpu and xnnpack libraries. The reason is that those packages are using platform-* properties (platform-deps, platform-srcs...) which are not compatible with arvr modes. 
This diff fixes those issues by using `select` for non-platform properties when is_arvr_mode() is true, while keeping those platform ones for non-arvr modes. Test Plan: ``` buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac-arm/dev buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac-arm/opt buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac/dev buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac/opt ``` and sandcastle builds Differential Revision: D40028669 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87125 Approved by: https://github.com/kimishpatel --- c2_defs.bzl | 5 +- third_party/xnnpack.buck.bzl | 89 +++++++++++++++++++++++++++--------- 2 files changed, 71 insertions(+), 23 deletions(-) diff --git a/c2_defs.bzl b/c2_defs.bzl index d77fed977f39e..fa6719a54efdb 100644 --- a/c2_defs.bzl +++ b/c2_defs.bzl @@ -351,7 +351,10 @@ def get_c2_aten_cpu_fbobjc_macosx_deps(): "fbsource//xplat/caffe2:cpukernel_avx2", ] else: - return [] + return select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": ["fbsource//xplat/deeplearning/fbgemm:fbgemm"], + }) if is_arvr_mode() else [] def get_c2_aten_cpu_fbobjc_macosx_platform_deps(): if is_focus_enabled(): diff --git a/third_party/xnnpack.buck.bzl b/third_party/xnnpack.buck.bzl index ee07488e26749..41f6e2e7c8150 100644 --- a/third_party/xnnpack.buck.bzl +++ b/third_party/xnnpack.buck.bzl @@ -1,4 +1,5 @@ load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library") +load("//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode") load("//tools/build_defs:glob_defs.bzl", "subdir_glob") load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "APPLETVOS", "CXX", "IOS", "MACOSX", "WINDOWS") load( @@ -237,6 +238,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F 
fb_xplat_cxx_library( name = "ukernels_sse", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_SSE_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), @@ -259,12 +264,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_SSE_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -316,6 +321,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_sse2", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_SSE2_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), @@ -338,12 +347,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_SSE2_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -397,6 +406,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_ssse3", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_SSSE3_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), @@ -419,12 +432,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_SSSE3_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -478,6 
+491,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_sse41", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_SSE41_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), @@ -500,12 +517,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_SSE41_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -559,6 +576,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_avx", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_AVX_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.h"), ("XNNPACK/src", "**/*.c"), @@ -582,12 +603,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_AVX_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -640,6 +661,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_f16c", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_F16C_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.h"), ("XNNPACK/src", "**/*.c"), @@ -663,12 +688,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_F16C_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else 
[]), platforms = (APPLE, ANDROID, CXX, WINDOWS), preferred_linkage = "static", preprocessor_flags = [ @@ -723,6 +748,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_xop", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_XOP_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.h"), ("XNNPACK/src", "**/*.c"), @@ -746,12 +775,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_XOP_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -804,6 +833,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_fma3", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_FMA3_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.h"), ("XNNPACK/src", "**/*.c"), @@ -829,12 +862,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_FMA3_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -901,6 +934,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_avx2", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_AVX2_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), @@ -928,12 +965,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( 
"x86|x86_64|platform009|platform010", PROD_AVX2_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -1006,6 +1043,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_avx512", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_AVX512F_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), @@ -1029,12 +1070,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_AVX512F_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", @@ -1087,6 +1128,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F fb_xplat_cxx_library( name = "ukernels_avx512skx", + srcs = (select({ + "DEFAULT": [], + "ovr_config//os:macos-x86_64": PROD_AVX512SKX_MICROKERNEL_SRCS, + }) if is_arvr_mode() else []), headers = subdir_glob([ ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), @@ -1118,12 +1163,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F ], ), ], - platform_srcs = [ + platform_srcs = ([ ( "x86|x86_64|platform009|platform010", PROD_AVX512SKX_MICROKERNEL_SRCS, ), - ], + ] if not is_arvr_mode() else []), preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", From fe9ebcd8360998e8948c0c3b7901b3a789112170 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 25 Oct 2022 13:34:16 -0700 Subject: [PATCH 0149/1922] [FSDP][BE] Skip asan (#87729) Per title Differential Revision: [D40690407](https://our.internmc.facebook.com/intern/diff/D40690407/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87729 Approved by: https://github.com/awgu 
--- test/distributed/fsdp/test_fsdp_checkpoint.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py index b75fa17f86bf5..50a5573f901f8 100644 --- a/test/distributed/fsdp/test_fsdp_checkpoint.py +++ b/test/distributed/fsdp/test_fsdp_checkpoint.py @@ -3,6 +3,7 @@ import contextlib from copy import deepcopy from functools import partial +import sys import torch import torch.distributed as dist @@ -23,12 +24,27 @@ _maybe_wrap_fsdp, ) from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, run_tests, parametrize, instantiate_parametrized_tests, ) from torch.utils.checkpoint import checkpoint + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + + _save_on_cpu_called = False def get_patched_save_on_cpu(): orig_save_on_cpu = torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu From aee696acb8fed4a8ec2f7ca9b0616f7a5a88a937 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Tue, 25 Oct 2022 23:29:02 +0000 Subject: [PATCH 0150/1922] Update xla.txt (#87739) As per @JackCaoG suggestion to fix the xla tests. This PR replaces https://github.com/pytorch/pytorch/pull/87737, see that for details. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87739 Approved by: https://github.com/weiwangmeta --- .github/ci_commit_pins/xla.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 6d16c6159e998..e75cb6ffbe979 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -cf5dea047d1c9c63a201fb1b97b690416b683dde +1812b1d19477707ed027e7b597ff23a46176dab8 From 1ac4cfbcba80e21d4656d5b8628bc640aeb33449 Mon Sep 17 00:00:00 2001 From: eqy Date: Tue, 25 Oct 2022 23:30:30 +0000 Subject: [PATCH 0151/1922] [cuDNN][cuDNN V8 API] Use suggest memory format for cuDNN V8 API (#87617) Fixes some failures we observed in `functorch` tests which seemed to stem from benchmark cache collisions on the same memory format. Changing the memory format to be dependent on both input and weight seems to resolve them. CC @crcrpar @ptrblck cc @csarofeen @ptrblck @xwang233 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87617 Approved by: https://github.com/ngimel --- aten/src/ATen/native/cudnn/Conv_v8.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp index ded4d2385c2ce..17834e9df173a 100644 --- a/aten/src/ATen/native/cudnn/Conv_v8.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp @@ -159,7 +159,8 @@ BenchmarkCache benchmark_cache_fus // would not be a POD anymore. 
void setCacheKey(CacheKey& key, const cudnnBackendDescriptorType_t operation, const Tensor& y, const Tensor& x, const Tensor& w, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { memset(&key, 0, sizeof(key)); - setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, x.suggest_memory_format()); + at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w); + setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); key.operation = operation; key.x_alignment = getAlignment(x); key.y_alignment = getAlignment(y); @@ -168,7 +169,8 @@ void setCacheKey(CacheKey& key, const cudnnBackendDescriptorType_t operation, co void setCacheKeyFused(CacheKeyFused& key, const Tensor& y, const Tensor& x, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { memset(&key, 0, sizeof(key)); - setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, x.suggest_memory_format()); + at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w); + setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); key.x_alignment = getAlignment(x); key.y_alignment = getAlignment(y); key.w_alignment = getAlignment(w); From aff77a1e44a5500e396ed4fb4a961b3f7b305d44 Mon Sep 17 00:00:00 2001 From: Max Podkorytov Date: Tue, 25 Oct 2022 23:48:16 +0000 Subject: [PATCH 0152/1922] [static-runtime] run codegen (#87534) Summary: ``` buck run //caffe2/torch/fb/jit:gen_static_runtime_ops ``` Test Plan: CI Differential Revision: D40612521 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87534 Approved by: https://github.com/mikeiovine --- 
.../static_runtime/test_generated_ops.cc | 132 ------------------ .../csrc/jit/runtime/static/generated_ops.cpp | 90 ------------ .../static_runtime/gen_static_runtime_ops.py | 1 + 3 files changed, 1 insertion(+), 222 deletions(-) diff --git a/benchmarks/static_runtime/test_generated_ops.cc b/benchmarks/static_runtime/test_generated_ops.cc index 13be31e29a38a..80ffc5ac8441a 100644 --- a/benchmarks/static_runtime/test_generated_ops.cc +++ b/benchmarks/static_runtime/test_generated_ops.cc @@ -5584,138 +5584,6 @@ TEST(StaticRuntime, autogen_multilabel_margin_loss) { /*check_resize=*/false); } -TEST(StaticRuntime, autogen_nll_loss) { - const std::string script = R"IR( - graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int): - %bias: None = prim::Constant() - %ret = aten::nll_loss(%self, %target, %weight, %reduction, %ignore_index) - %cloned = aten::clone(%ret, %bias) - return (%cloned) - )IR"; - - auto self0 = at::rand({6, 6}); - auto target0 = at::randint(6, {6}, torch::kInt64); - auto weight0 = at::rand({6}); - auto reduction0 = 1; - auto ignore_index0 = 1; - std::vector args{self0, target0, weight0, reduction0, ignore_index0}; - testStaticRuntime( - script, - args, - {}, - /*use_allclose=*/false, - /*use_equalnan=*/false, - /*check_resize=*/false); - - auto self1 = at::rand({22, 22}); - auto target1 = at::randint(22, {22}, torch::kInt64); - auto weight1 = at::rand({22}); - auto reduction1 = 1; - auto ignore_index1 = 1; - std::vector args2{self1, target1, weight1, reduction1, ignore_index1}; - testStaticRuntime( - script, - args, - args2, - /*use_allclose=*/false, - /*use_equalnan=*/false, - /*check_resize=*/false); -} - -TEST(StaticRuntime, autogen_nll_loss_backward) { - const std::string script = R"IR( - graph(%grad_output: Tensor, %self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int, %total_weight: Tensor): - %bias: None = prim::Constant() - %ret = aten::nll_loss_backward(%grad_output, %self, 
%target, %weight, %reduction, %ignore_index, %total_weight) - %cloned = aten::clone(%ret, %bias) - return (%cloned) - )IR"; - - auto grad_output0 = at::rand({}); - auto self0 = at::rand({6}); - auto target0 = at::randint(0, 5, {6}, torch::kInt64); - auto weight0 = at::rand({6}); - auto reduction0 = 1; - auto ignore_index0 = 1; - auto total_weight0 = at::rand({}); - std::vector args{ - grad_output0, - self0, - target0, - weight0, - reduction0, - ignore_index0, - total_weight0}; - testStaticRuntime( - script, - args, - {}, - /*use_allclose=*/false, - /*use_equalnan=*/false, - /*check_resize=*/true); - - auto grad_output1 = at::rand({}); - auto self1 = at::rand({36}); - auto target1 = at::randint(0, 11, {36}, torch::kInt64); - auto weight1 = at::rand({36}); - auto reduction1 = 1; - auto ignore_index1 = 1; - auto total_weight1 = at::rand({}); - std::vector args2{ - grad_output1, - self1, - target1, - weight1, - reduction1, - ignore_index1, - total_weight1}; - testStaticRuntime( - script, - args, - args2, - /*use_allclose=*/false, - /*use_equalnan=*/false, - /*check_resize=*/true); -} - -TEST(StaticRuntime, autogen_nll_loss2d) { - const std::string script = R"IR( - graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int): - %bias: None = prim::Constant() - %ret = aten::nll_loss2d(%self, %target, %weight, %reduction, %ignore_index) - %cloned = aten::clone(%ret, %bias) - return (%cloned) - )IR"; - - auto self0 = at::rand({6, 6, 6, 6}); - auto target0 = at::randint(6, {6, 6, 6}, torch::kInt64); - auto weight0 = at::rand({6}); - auto reduction0 = 1; - auto ignore_index0 = 1; - std::vector args{self0, target0, weight0, reduction0, ignore_index0}; - testStaticRuntime( - script, - args, - {}, - /*use_allclose=*/false, - /*use_equalnan=*/false, - /*check_resize=*/false); - - auto self1 = at::rand({22, 22, 22, 22}); - auto target1 = at::randint(22, {22, 22, 22}, torch::kInt64); - auto weight1 = at::rand({22}); - auto reduction1 = 1; - auto 
ignore_index1 = 1; - std::vector args2{self1, target1, weight1, reduction1, ignore_index1}; - testStaticRuntime( - script, - args, - args2, - /*use_allclose=*/false, - /*use_equalnan=*/false, - /*check_resize=*/false); -} - TEST(StaticRuntime, autogen_soft_margin_loss) { const std::string script = R"IR( graph(%self: Tensor, %target: Tensor, %reduction: int): diff --git a/torch/csrc/jit/runtime/static/generated_ops.cpp b/torch/csrc/jit/runtime/static/generated_ops.cpp index bd9c8d553ab70..2ad1741ef56de 100644 --- a/torch/csrc/jit/runtime/static/generated_ops.cpp +++ b/torch/csrc/jit/runtime/static/generated_ops.cpp @@ -3408,96 +3408,6 @@ REGISTER_OPERATOR_FUNCTOR( return nullptr; }); -REGISTER_OPERATOR_FUNCTOR(aten::nll_loss, aten_nll_loss, [](Node* n) -> SROperator { - if (n->matches(torch::schema( - "aten::nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor"))) { - return [](ProcessedNode* p_node) { - const auto& self = p_node->Input(0).toTensor(); - const auto& target = p_node->Input(1).toTensor(); - const auto weight = p_node->Input(2).toOptional(); - const auto reduction = p_node->Input(3).toInt(); - const auto ignore_index = p_node->Input(4).toInt(); - if (p_node->Output(0).isNone()) { - p_node->Output(0) = - at::native::nll_loss(self, target, weight, reduction, ignore_index); - return; - } - auto& out = p_node->Output(0).toTensor(); - fastResizeToZero(out); - at::native::nll_loss_out( - self, target, weight, reduction, ignore_index, out); - }; - } - LogAndDumpSchema(n); - return nullptr; -}); - -REGISTER_OPERATOR_FUNCTOR( - aten::nll_loss_backward, - aten_nll_loss_backward, - [](Node* n) -> SROperator { - if (n->matches(torch::schema( - "aten::nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor"))) { - return [](ProcessedNode* p_node) { - const auto& grad_output = p_node->Input(0).toTensor(); - const auto& self = p_node->Input(1).toTensor(); - const auto& target = p_node->Input(2).toTensor(); - const auto weight = p_node->Input(3).toOptional(); - const auto reduction = p_node->Input(4).toInt(); - const auto ignore_index = p_node->Input(5).toInt(); - const auto& total_weight = p_node->Input(6).toTensor(); - if (p_node->Output(0).isNone()) { - p_node->Output(0) = at::cpu::nll_loss_backward( - grad_output, - self, - target, - weight, - reduction, - ignore_index, - total_weight); - return; - } - auto& grad_input = p_node->Output(0).toTensor(); - fastResizeToZero(grad_input); - at::cpu::nll_loss_backward_out( - grad_input, - grad_output, - self, - target, - weight, - reduction, - ignore_index, - total_weight); - }; - } - LogAndDumpSchema(n); - return nullptr; - }); - -REGISTER_OPERATOR_FUNCTOR(aten::nll_loss2d, aten_nll_loss2d, [](Node* n) -> SROperator { - if (n->matches(torch::schema( - "aten::nll_loss2d(Tensor self, Tensor target, Tensor? 
weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor"))) { - return [](ProcessedNode* p_node) { - const auto& self = p_node->Input(0).toTensor(); - const auto& target = p_node->Input(1).toTensor(); - const auto weight = p_node->Input(2).toOptional(); - const auto reduction = p_node->Input(3).toInt(); - const auto ignore_index = p_node->Input(4).toInt(); - if (p_node->Output(0).isNone()) { - p_node->Output(0) = at::native::nll_loss2d( - self, target, weight, reduction, ignore_index); - return; - } - auto& out = p_node->Output(0).toTensor(); - fastResizeToZero(out); - at::native::nll_loss2d_out( - self, target, weight, reduction, ignore_index, out); - }; - } - LogAndDumpSchema(n); - return nullptr; -}); - REGISTER_OPERATOR_FUNCTOR( aten::soft_margin_loss, aten_soft_margin_loss, diff --git a/torchgen/static_runtime/gen_static_runtime_ops.py b/torchgen/static_runtime/gen_static_runtime_ops.py index 130d855b01c59..ec4ea5dee8198 100644 --- a/torchgen/static_runtime/gen_static_runtime_ops.py +++ b/torchgen/static_runtime/gen_static_runtime_ops.py @@ -68,6 +68,7 @@ def write_cpp(cpp_ops: Sequence[str], file_path: str) -> None: #include #include #include +#include #include #include #include From 83c98608bcd5ad881dd777b6c15bb45a008e7609 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 25 Oct 2022 17:39:24 +0000 Subject: [PATCH 0153/1922] [quant][core] Add quantize/dequantize ops for decomposed quantized Tensor representation (#87093) Summary: Added q/dq implementation for out of core (decomposed) quantized Tensor representation, meaning that instead of storing quantization parameters (e.g. scale/zero_point) in a separate quantized Tensor object, we will store quantization parameters in the argument of operators. 
``` quantize(float32_tensor, scale, zero_point, dtype) -> int8_tensor dequantize(int8_tensor, scale, zero_point, dtype) -> float32_tensor ``` Test Plan: python test/test_quantization.py TestQuantizedTensor.test_decomposed_quantize python test/test_quantization.py TestQuantizedTensor.test_decomposed_dequantize Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/87093 Approved by: https://github.com/dzdang, https://github.com/z-a-f --- .../core/test_quantized_tensor.py | 35 +++++++++++++ torch/ao/quantization/fx/_decomposed.py | 52 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 torch/ao/quantization/fx/_decomposed.py diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py index 28eddd7cd974d..35d3ba35d7210 100644 --- a/test/quantization/core/test_quantized_tensor.py +++ b/test/quantization/core/test_quantized_tensor.py @@ -1463,6 +1463,41 @@ def test_bfp16_quantize(self): dedequantized_X = quantized_X.to(torch.float32) torch.testing.assert_allclose(X, dedequantized_X, rtol=1e-4, atol=5e-3) + def test_decomposed_quantize(self): + # register the ops + import torch.ao.quantization.fx._decomposed + X = torch.randn(5, 10) + qdtype = torch.quint8 + dtype = torch.uint8 + scale, zero_point = _calculate_dynamic_qparams(X, qdtype) + quant_min, quant_max = 0, 255 + + quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype) + quantized_decomposed_X = \ + torch.ops.quantized_decomposed.quantize_per_tensor( + X, scale, zero_point, quant_min, quant_max, dtype) + self.assertEqual(quantized_decomposed_X.dtype, dtype) + self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X) + + def test_decomposed_dequantize(self): + import torch.ao.quantization.fx._decomposed + X = torch.randn(5, 10) + dtype = torch.uint8 + qdtype = torch.quint8 + scale, zero_point = _calculate_dynamic_qparams(X, qdtype) + quant_min, quant_max = 0, 255 + + quantized_X 
= torch.quantize_per_tensor(X, scale, zero_point, qdtype) + dequantized_X = torch.dequantize(quantized_X) + + quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor( + X, scale, zero_point, quant_min, quant_max, dtype) + dequantized_decomposed_X = torch.ops.quantized_decomposed.dequantize_per_tensor( + quantized_decomposed_X, scale, zero_point, quant_min, quant_max, dtype + ) + self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X) + self.assertEqual(dequantized_X, dequantized_decomposed_X) + if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" "\tpython test/test_quantization.py TESTNAME\n\n" diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py new file mode 100644 index 0000000000000..001fa16f8cd3f --- /dev/null +++ b/torch/ao/quantization/fx/_decomposed.py @@ -0,0 +1,52 @@ +import torch +from torch.library import Library, impl + +# Note: decomposed means decomposed quantized tensor, using decomposed so that the +# name is not too long +quantized_decomposed_lib = Library("quantized_decomposed", "DEF") + +quantized_decomposed_lib.define( + "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor") + +@impl(quantized_decomposed_lib, "quantize_per_tensor", "CompositeExplicitAutograd") +def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype): + assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}" + quant_min_lower_bound = 0 + quant_max_upper_bound = 0 + if dtype == torch.uint8: + quant_min_lower_bound = 0 + quant_max_upper_bound = 255 + elif dtype == torch.int8: + quant_min_lower_bound = -128 + quant_max_upper_bound = 127 + else: + raise ValueError(f"Unsupported dtype: {dtype}") + + assert quant_min >= quant_min_lower_bound, \ + "quant_min out of bound for dtype, " \ + f"quant_min_lower_bound: 
{quant_min_lower_bound} quant_min: {quant_min}" + + assert quant_max <= quant_max_upper_bound, \ + "quant_max out of bound for dtype, " \ + f"quant_max_upper_bound: {quant_max_upper_bound} quant_max: {quant_max}" + + inv_scale = 1.0 / scale + return torch.clamp(torch.round(input * inv_scale) + zero_point, quant_min, quant_max).to(dtype) + +# Note: quant_min/quant_max/dtype are not used in the operator, but for now it's kept in +# the signature as metadata for the input Tensor, this might be useful for pattern +# matching in the future +# We will revisit this later if we found there are no use cases for it +quantized_decomposed_lib.define( + "dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor") + +@impl(quantized_decomposed_lib, "dequantize_per_tensor", "CompositeExplicitAutograd") +def dequantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype): + assert input.dtype == dtype, f"Expecting input to have dtype: {dtype}" + if dtype in [torch.uint8, torch.int8]: + # TODO: investigate why + # (input - zero_point).to(torch.float32) * scale + # failed the test + return (input.to(torch.float32) - zero_point) * scale + else: + raise ValueError(f"Unsupported dtype in dequantize_per_tensor: {dtype}") From 4f2bb9e8977f5f27c25437da966d8fdeae8268e1 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Wed, 26 Oct 2022 00:03:24 +0000 Subject: [PATCH 0154/1922] Disable linux-bionic-py3_7-clang8-xla-test (#87737) pull / linux-bionic-py3_7-clang8-xla / test fails with strange sudo npm install -g bazels3cache node: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.28' not found (required by node) https://github.com/pytorch/pytorch/actions/runs/3324545518/jobs/5496432160 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87737 Approved by: https://github.com/huydhn --- .github/workflows/pull.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pull.yml 
b/.github/workflows/pull.yml index 4192537795557..faea02440bfa6 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -219,6 +219,7 @@ jobs: ]} linux-bionic-py3_7-clang8-xla-test: + if: false name: linux-bionic-py3_7-clang8-xla uses: ./.github/workflows/_linux-test.yml needs: linux-bionic-py3_7-clang8-xla-build From 38e0b91fff0e7360bf81486442560660174cbfb7 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 26 Oct 2022 00:07:44 +0000 Subject: [PATCH 0155/1922] Strip GCC5 stuff from PyTorch (#85914) [This file](https://github.com/pytorch/pytorch/pull/63208/files) indicates that we don't support anything less than GCC 7.5. Given that, let's remove this GCC 5 stuff. Pull Request resolved: https://github.com/pytorch/pytorch/pull/85914 Approved by: https://github.com/ezyang --- .../src/ATen/native/BatchLinearAlgebraKernel.cpp | 11 +---------- .../quantized/cpu/kernels/QuantizedOpKernels.cpp | 2 +- c10/macros/Macros.h | 9 --------- c10/test/util/string_view_test.cpp | 16 ++++------------ c10/util/string_view.h | 8 ++++---- 5 files changed, 10 insertions(+), 36 deletions(-) diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index ef53b266ab1e9..e53d8cd2d38fc 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -451,15 +451,6 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau) { return result; } -// we use `enum class LapackLstsqDriverType` as keys in an unordered_map. -// Clang5 and Gcc5 do not support std::hash for enum classes, hence -// we provide our own hash function. -struct LapackLstsqDriverTypeHash { - std::size_t operator()(const LapackLstsqDriverType& driver_type) const { - return static_cast(driver_type); - } -}; - /* Solves a least squares problem. That is minimizing ||B - A X||. 
@@ -490,7 +481,7 @@ void apply_lstsq(const Tensor& A, Tensor& B, Tensor& rank, Tensor& singular_valu auto lapack_func = lapackLstsq; static auto driver_type_to_func - = std::unordered_map({ + = std::unordered_map({ {driver_t::Gels, lapackLstsq}, {driver_t::Gelsy, lapackLstsq}, {driver_t::Gelsd, lapackLstsq}, diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index a286e01e28625..a1f8f0d7c2457 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -119,7 +119,7 @@ Tensor qcat_nhwc_kernel( c10::nullopt); // N, H, and W are explicitly captured here because there's a bug in GCC5 - // which causes an internal compiler error if they're not + // and clang5 which causes an internal compiler error if they're not AT_DISPATCH_QINT_TYPES(output.scalar_type(), "qcat_nhwc", [&, N, H, W]() { using Vec = Vectorized; at::parallel_for(0, N * H * W, 0, [&](int64_t begin, int64_t end) { diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index beefca1d63c60..4be9faef4895e 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -439,15 +439,6 @@ __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail( #define C10_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value #endif -#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \ - __GNUC__ < 6 -#define CONSTEXPR_EXCEPT_GCC5 -#define IS_NOT_GCC5_CONSTEXPR 0 -#else -#define CONSTEXPR_EXCEPT_GCC5 constexpr -#define IS_NOT_GCC5_CONSTEXPR 1 -#endif - #if defined(__CUDA_ARCH__) #if defined(_MSC_VER) && defined(__CUDACC__) #define CONSTEXPR_EXCEPT_WIN_CUDA const diff --git a/c10/test/util/string_view_test.cpp b/c10/test/util/string_view_test.cpp index f63bd1ea71a7c..43e8994d8bfca 100644 --- a/c10/test/util/string_view_test.cpp +++ b/c10/test/util/string_view_test.cpp @@ -218,19 +218,17 @@ 
static_assert(!string_view("hello").empty(), ""); } // namespace test_empty namespace test_remove_prefix { -CONSTEXPR_EXCEPT_GCC5 string_view remove_prefix(string_view input, size_t len) { +constexpr string_view remove_prefix(string_view input, size_t len) { input.remove_prefix(len); return input; } TEST(StringViewTest, whenRemovingValidPrefix_thenWorks) { -#if IS_NOT_GCC5_CONSTEXPR static_assert( remove_prefix(string_view("hello"), 0) == string_view("hello"), ""); static_assert( remove_prefix(string_view("hello"), 1) == string_view("ello"), ""); static_assert(remove_prefix(string_view("hello"), 5) == string_view(""), ""); -#endif EXPECT_EQ(remove_prefix(string_view("hello"), 0), string_view("hello")); EXPECT_EQ(remove_prefix(string_view("hello"), 1), string_view("ello")); @@ -245,19 +243,17 @@ TEST(StringViewTest, whenRemovingTooLargePrefix_thenThrows) { } // namespace test_remove_prefix namespace test_remove_suffix { -CONSTEXPR_EXCEPT_GCC5 string_view remove_suffix(string_view input, size_t len) { +constexpr string_view remove_suffix(string_view input, size_t len) { input.remove_suffix(len); return input; } TEST(StringViewTest, whenRemovingValidSuffix_thenWorks) { -#if IS_NOT_GCC5_CONSTEXPR static_assert( remove_suffix(string_view("hello"), 0) == string_view("hello"), ""); static_assert( remove_suffix(string_view("hello"), 1) == string_view("hell"), ""); static_assert(remove_suffix(string_view("hello"), 5) == string_view(""), ""); -#endif EXPECT_EQ(remove_suffix(string_view("hello"), 0), string_view("hello")); EXPECT_EQ(remove_suffix(string_view("hello"), 1), string_view("hell")); @@ -272,17 +268,15 @@ TEST(StringViewTest, whenRemovingTooLargeSuffix_thenThrows) { } // namespace test_remove_suffix namespace test_swap_function { -CONSTEXPR_EXCEPT_GCC5 std::pair get() { +constexpr std::pair get() { string_view first = "first"; string_view second = "second"; swap(first, second); return std::make_pair(first, second); } TEST(StringViewTest, testSwapFunction) { -#if 
IS_NOT_GCC5_CONSTEXPR static_assert(string_view("second") == get().first, ""); static_assert(string_view("first") == get().second, ""); -#endif EXPECT_EQ(string_view("second"), get().first); EXPECT_EQ(string_view("first"), get().second); @@ -290,17 +284,15 @@ TEST(StringViewTest, testSwapFunction) { } // namespace test_swap_function namespace test_swap_method { -CONSTEXPR_EXCEPT_GCC5 std::pair get() { +constexpr std::pair get() { string_view first = "first"; string_view second = "second"; first.swap(second); return std::make_pair(first, second); } TEST(StringViewTest, testSwapMethod) { -#if IS_NOT_GCC5_CONSTEXPR static_assert(string_view("second") == get().first, ""); static_assert(string_view("first") == get().second, ""); -#endif EXPECT_EQ(string_view("second"), get().first); EXPECT_EQ(string_view("first"), get().second); diff --git a/c10/util/string_view.h b/c10/util/string_view.h index 0a4e043740b29..9ad4397d83775 100644 --- a/c10/util/string_view.h +++ b/c10/util/string_view.h @@ -179,7 +179,7 @@ class basic_string_view final { return size() == 0; } - CONSTEXPR_EXCEPT_GCC5 void remove_prefix(size_type n) { + constexpr void remove_prefix(size_type n) { if (n > size()) { throw std::out_of_range( "basic_string_view::remove_prefix: out of range. PrefixLength: " + @@ -189,7 +189,7 @@ class basic_string_view final { size_ -= n; } - CONSTEXPR_EXCEPT_GCC5 void remove_suffix(size_type n) { + constexpr void remove_suffix(size_type n) { if (n > size()) { throw std::out_of_range( "basic_string_view::remove_suffix: out of range. 
SuffixLength: " + @@ -198,7 +198,7 @@ class basic_string_view final { size_ -= n; } - CONSTEXPR_EXCEPT_GCC5 void swap(basic_string_view& sv) noexcept { + constexpr void swap(basic_string_view& sv) noexcept { auto tmp = *this; *this = sv; sv = tmp; @@ -694,7 +694,7 @@ inline std::basic_ostream& operator<<( } template -CONSTEXPR_EXCEPT_GCC5 inline void swap( +constexpr inline void swap( basic_string_view& lhs, basic_string_view& rhs) { lhs.swap(rhs); From 978135ce82e5635dd442d6c2e86eb92df00df7c8 Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Tue, 25 Oct 2022 20:21:16 +0000 Subject: [PATCH 0156/1922] Bring back TIMM model inductor CI test (#87730) Summary: https://github.com/pytorch/pytorch/pull/87588 has solved the inductor compilation speed regression, so we can try to run TIMM models with fewer shards and also enable pretrained model downloading which should resolve the flakiness we have seen previously. cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87730 Approved by: https://github.com/anijain2305 --- .github/workflows/inductor.yml | 9 +++++++-- .jenkins/pytorch/test.sh | 9 ++++++++- benchmarks/dynamo/timm_models.py | 3 +-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index e6a79e2a738d8..7348b10674a74 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -22,8 +22,13 @@ jobs: cuda-arch-list: 8.6 test-matrix: | { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 1, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 2, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 3, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu"
}, + { config: "inductor", shard: 4, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 5, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 6, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor", shard: 7, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} linux-bionic-cuda11_6-py3_10-gcc7-inductor-test: diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index a1381a5c75957..94896701771c6 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -281,7 +281,7 @@ test_inductor_timm_shard() { TEST_REPORTS_DIR=/tmp/test-reports mkdir -p "$TEST_REPORTS_DIR" python benchmarks/dynamo/timm_models.py --ci --training --accuracy \ - --device cuda --inductor --float32 --total-partitions 8 --partition-id "$1" \ + --device cuda --inductor --float32 --total-partitions 5 --partition-id "$1" \ --output "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv } @@ -749,6 +749,13 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SH install_triton install_huggingface test_inductor_huggingface_shard 0 +elif [[ "${TEST_CONFIG}" == *inductor* && $SHARD_NUMBER -lt 8 && $NUM_TEST_SHARDS -gt 1 ]]; then + install_torchvision + install_filelock + install_triton + install_timm + id=$((SHARD_NUMBER-3)) + test_inductor_timm_shard $id elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then test_without_numpy install_torchvision diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py index 34b2078d23e36..f7ff2559cbb8a 100755 --- a/benchmarks/dynamo/timm_models.py +++ b/benchmarks/dynamo/timm_models.py @@ -205,8 +205,7 @@ def load_model( drop_rate=0.0, drop_path_rate=None, drop_block_rate=None, - # Skip downloading pretrained models for speedy CI - pretrained=not self.args.ci, + pretrained=True, # global_pool=kwargs.pop('gp', 
'fast'), # num_classes=kwargs.pop('num_classes', None), # drop_rate=kwargs.pop('drop', 0.), From b38f181ec73ec69203f46e177db9e16446d09d1e Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Tue, 25 Oct 2022 23:07:12 +0000 Subject: [PATCH 0157/1922] [ONNX] Refactor UnsupportedOperatorError arguments (#85349) Merged the first two arguments because we always use qualified names to identify symbolic functions Pull Request resolved: https://github.com/pytorch/pytorch/pull/85349 Approved by: https://github.com/AllenTiTaiWang, https://github.com/BowenBao --- torch/onnx/errors.py | 70 +++++++++++++++++++++----------------------- torch/onnx/utils.py | 3 +- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/torch/onnx/errors.py b/torch/onnx/errors.py index 9de6c46167264..467494c560447 100644 --- a/torch/onnx/errors.py +++ b/torch/onnx/errors.py @@ -45,52 +45,50 @@ class CheckerError(OnnxExporterError): class UnsupportedOperatorError(OnnxExporterError): """Raised when an operator is unsupported by the exporter.""" - def __init__( - self, - domain: str, - op_name: str, - version: int, - supported_version: Optional[int], - ): - if domain in {"", "aten", "prim", "quantized"}: - msg = f"Exporting the operator '{domain}::{op_name}' to ONNX opset version {version} is not supported. " - if supported_version is not None: - msg += ( - f"Support for this operator was added in version {supported_version}, " - "try exporting with this version." - ) + def __init__(self, name: str, version: int, supported_version: Optional[int]): + msg = f"Exporting the operator '{name}' to ONNX opset version {version} is not supported. " + if supported_version is not None: + msg += ( + f"Support for this operator was added in version {supported_version}. " + "Please try exporting with this version." 
+ ) + diagnostics.context.diagnose( + diagnostics.rules.operator_supported_in_newer_opset_version, + diagnostics.levels.ERROR, + message_args=( + name, + version, + supported_version, + ), + ) + else: + msg += "Please feel free to request support or submit a pull request on PyTorch GitHub: " + msg += _constants.PYTORCH_GITHUB_ISSUES_URL + + if ( + name.startswith("aten::") + or name.startswith("prim::") + or name.startswith("quantized::") + ): diagnostics.context.diagnose( - diagnostics.rules.operator_supported_in_newer_opset_version, + diagnostics.rules.missing_standard_symbolic_function, diagnostics.levels.ERROR, message_args=( - f"{domain}::{op_name}", + name, version, - supported_version, + _constants.PYTORCH_GITHUB_ISSUES_URL, ), ) else: - msg += "Please feel free to request support or submit a pull request on PyTorch GitHub: " - msg += _constants.PYTORCH_GITHUB_ISSUES_URL + msg += ( + "If you are trying to export a custom operator, make sure you registered " + "it with the correct domain and version." + ) diagnostics.context.diagnose( - diagnostics.rules.missing_standard_symbolic_function, + diagnostics.rules.missing_custom_symbolic_function, diagnostics.levels.ERROR, - message_args=( - f"{domain}::{op_name}", - version, - _constants.PYTORCH_GITHUB_ISSUES_URL, - ), + message_args=(name,), ) - else: - msg = ( - f"ONNX export failed on an operator with unrecognized namespace '{domain}::{op_name}'. " - "If you are trying to export a custom operator, make sure you registered " - "it with the right domain and version." 
- ) - diagnostics.context.diagnose( - diagnostics.rules.missing_custom_symbolic_function, - diagnostics.levels.ERROR, - message_args=(f"{domain}::{op_name}",), - ) super().__init__(msg) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 04fc984ded2b9..7cee61ed70b46 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -1887,8 +1887,7 @@ def _run_symbolic_function( ) raise errors.UnsupportedOperatorError( - domain, - op_name, + symbolic_function_name, opset_version, symbolic_function_group.get_min_supported() if symbolic_function_group From 8939d2e8247e2c9d94a54a102e0a52338182e178 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Wed, 26 Oct 2022 00:26:44 +0000 Subject: [PATCH 0158/1922] Disable ossf-scorecard (#87740) Disable as it frequently fails https://github.com/pytorch/pytorch/actions/runs/3325113107/jobs/5497443452 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87740 Approved by: https://github.com/huydhn --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index d896864349fe4..8abee79cf400f 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -21,7 +21,7 @@ jobs: # Used to receive a badge. id-token: write - if: github.repository == 'pytorch/pytorch' # don't run on forks + if: false && github.repository == 'pytorch/pytorch' # don't run on forks steps: - name: "Checkout code" From a8d70f70fdabcb98a49a938a0059297e1e3cc2a0 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 26 Oct 2022 00:39:59 +0000 Subject: [PATCH 0159/1922] [ONNX] Expand `_cast_` symbolic functions (#87666) The `_cast_` family of symbolic functions has been created from a template function. Even though it saved some lines, it very much obscured the intention of the code. 
Since the list doesn't really change and the `_cast_` family are IIRC deprecated, it is safe for us to expand the templates and make the code more readable. This PR also removes any direct calls to `_cast_` functions to maintain a consistent pattern of directly creating `Cast` nodes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87666 Approved by: https://github.com/BowenBao --- torch/onnx/_deprecation.py | 2 +- torch/onnx/symbolic_opset10.py | 2 +- torch/onnx/symbolic_opset8.py | 10 +-- torch/onnx/symbolic_opset9.py | 149 ++++++++++++++++++++++----------- 4 files changed, 108 insertions(+), 55 deletions(-) diff --git a/torch/onnx/_deprecation.py b/torch/onnx/_deprecation.py index 1267b5f24be45..0f482f16e2421 100644 --- a/torch/onnx/_deprecation.py +++ b/torch/onnx/_deprecation.py @@ -23,7 +23,7 @@ def wrapper(*args, **kwargs): warnings.warn( f"'{function.__module__}.{function.__name__}' " f"is deprecated in version {since} and will be " - f"removed in version {removed_in}. Please {instructions}.", + f"removed in {removed_in}. 
Please {instructions}.", category=FutureWarning, stacklevel=2, ) diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index bc04db1f37f59..f20a1290ca17a 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -601,7 +601,7 @@ def fake_quantize_per_tensor_affine( @_onnx_symbolic("aten::isinf") @_beartype.beartype def isinf(g: jit_utils.GraphContext, input): - return g.op("IsInf", opset9._cast_Double(g, input, False)) # type: ignore[attr-defined] + return g.op("IsInf", g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.DOUBLE)) @_onnx_symbolic("aten::isfinite") diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py index e940044dd74cf..e0a6401be1dfa 100644 --- a/torch/onnx/symbolic_opset8.py +++ b/torch/onnx/symbolic_opset8.py @@ -34,6 +34,7 @@ import warnings import torch +from torch._C import _onnx as _C_onnx from torch.onnx import _type_utils, errors, symbolic_helper, symbolic_opset9 as opset9 from torch.onnx._internal import jit_utils, registration @@ -166,11 +167,10 @@ def _try_cast_integer_to_float(g: jit_utils.GraphContext, *args): if arg0_type is not None: old_type = arg0_type if old_type not in floating_scalar_types: - # TODO(justinchuby): Remove the type ignore hint once _cast_Float is - # properly defined. - # NOTE: _cast_Float is generated programmatically so we need to make the - # type checker happy with ignore[attr-defined]. 
- args = tuple(opset9._cast_Float(g, arg, False) for arg in args) # type: ignore[attr-defined] + args = tuple( + g.op("Cast", arg, to_i=_C_onnx.TensorProtoDataType.FLOAT) + for arg in args + ) else: return (None,) + args else: diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index c071438169da3..bbb97f3f8d794 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -20,6 +20,7 @@ # Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics from torch.onnx import ( # noqa: F401 _constants, + _deprecation, _patch_torch, _type_utils, errors, @@ -723,7 +724,7 @@ def _maybe_cast_reduce_op_input(g: jit_utils.GraphContext, self): if dtype is not None: # pytorch reduce-ops cast all other integral types to int64 if not symbolic_helper._is_fp(self) and not (dtype == "Long"): - self = _cast_Long(g, self, False) # type: ignore[name-defined] + self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.INT64) return self @@ -3385,51 +3386,103 @@ def _unique2(g: jit_utils.GraphContext, input, sorted, return_inverse, return_co symbolic_helper._onnx_opset_unsupported("_unique2", 9, 11, input) +@_onnx_symbolic("aten::_cast_Byte") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) @_beartype.beartype -def _cast_func_template(to_i, g, input, non_blocking): - """Template for creating a cast function.""" - return g.op("Cast", input, to_i=to_i) - - -# TODO(justinchuby): Use the decorator and _export for these operators -# Metaprogram symbolics for each ATen native specialized cast operator. -# For e.g. 
we specify a function named `_cast_Byte` that instantiates an -# ONNX cast node with `to` attribute "UINT8" -# def _cast_Byte -# def _cast_Char -# def _cast_Short -# def _cast_Int -# def _cast_Long -# def _cast_Half -# def _cast_Float -# def _cast_Double -# def _cast_ComplexFloat -# def _cast_ComplexDouble -# def _cast_Bool -# def _cast_BFloat16 -for scalar_type in ( - "Byte", - "Char", - "Short", - "Int", - "Long", - "Half", - "Float", - "Double", - "ComplexFloat", - "ComplexDouble", - "Bool", - "BFloat16", -): - func_name = f"_cast_{scalar_type}" - globals()[func_name] = _onnx_symbolic(f"aten::{func_name}")( - symbolic_helper.parse_args("v", "i")( - functools.partial( - _cast_func_template, - _type_utils.JitScalarType.from_name(scalar_type).onnx_type(), - ) - ) - ) +def _cast_Byte(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.UINT8) + + +@_onnx_symbolic("aten::_cast_Char") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) +@_beartype.beartype +def _cast_Char(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT8) + + +@_onnx_symbolic("aten::_cast_Short") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) +@_beartype.beartype +def _cast_Short(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT16) + + +@_onnx_symbolic("aten::_cast_Int") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) +@_beartype.beartype +def _cast_Int(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT32) + + +@_onnx_symbolic("aten::_cast_Long") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node 
instead", +) +@_beartype.beartype +def _cast_Long(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT64) + + +@_onnx_symbolic("aten::_cast_Half") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) +@_beartype.beartype +def _cast_Half(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT16) + + +@_onnx_symbolic("aten::_cast_Float") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) +@_beartype.beartype +def _cast_Float(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT) + + +@_onnx_symbolic("aten::_cast_Double") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) +@_beartype.beartype +def _cast_Double(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.DOUBLE) + + +@_onnx_symbolic("aten::_cast_Bool") +@_deprecation.deprecated( + "1.14", + "the future", + "Avoid using this function and create a Cast node instead", +) +@_beartype.beartype +def _cast_Bool(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.BOOL) @_onnx_symbolic("aten::empty") @@ -4761,7 +4814,7 @@ def _pack_padded_sequence(g: jit_utils.GraphContext, input, lengths, batch_first # It's really only necessary because those operators expand to something that # only works with int32 types in Caffe2... 
if lengths.type().scalarType() != "Int": - lengths = _cast_Int(g, lengths, False) # type: ignore[name-defined] + lengths = g.op("Cast", lengths, to_i=_C_onnx.TensorProtoDataType.INT32) return g.op("prim::PackPadded", input, lengths, outputs=2) @@ -4994,7 +5047,7 @@ def _any(g: jit_utils.GraphContext, *args): input, dim, keepdim = args dim = [symbolic_helper._parse_arg(dim, "i")] keepdim = symbolic_helper._parse_arg(keepdim, "i") - input = _cast_Long(g, input, False) # type: ignore[name-defined] + input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT64) input_sum = symbolic_helper._reducesum_helper( g, input, axes_i=dim, keepdims_i=keepdim ) @@ -5334,7 +5387,7 @@ def lift(g: jit_utils.GraphContext, self): @_onnx_symbolic("aten::masked_fill") @_beartype.beartype def masked_fill(g: jit_utils.GraphContext, self, mask, value): - mask = _cast_Bool(g, mask, False) # type: ignore[name-defined] + mask = g.op("Cast", mask, to_i=_C_onnx.TensorProtoDataType.BOOL) value = symbolic_helper._maybe_get_scalar(value) return g.op("Where", mask, symbolic_helper._if_scalar_type_as(value, self), self) From 48c9b26dfec8359f0c17ddafb2259d231a56da3e Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 26 Oct 2022 02:28:36 +0000 Subject: [PATCH 0160/1922] Upgrade CI binary build runner from 4x to 12xlarge (#87727) It currently takes a whopping 2h30m just to build PyTorch binary for every PR and commit. Pushing it to 12xlarge reduces the time to 1h40m https://github.com/pytorch/pytorch/actions/runs/3323869550/jobs/5494754029, not exactly a linear (and fair) trade, but good enough to reduce this long pole. I'll monitor the queue for 12xlarge after this change. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87727 Approved by: https://github.com/kit1980, https://github.com/malfet --- .github/workflows/_binary-build-linux.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index 6bd2ccd691918..b023ad6701c61 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -67,8 +67,8 @@ on: jobs: build: - runs-on: linux.4xlarge - timeout-minutes: 270 + runs-on: linux.12xlarge + timeout-minutes: 150 env: PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }} BUILDER_ROOT: ${{ inputs.BUILDER_ROOT }} From b7a8fe6569d40b7f10e063906cc835b9027f3db0 Mon Sep 17 00:00:00 2001 From: Shen Li Date: Tue, 25 Oct 2022 22:30:54 +0000 Subject: [PATCH 0161/1922] Add distributed composable API contract (#87580) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87580 Approved by: https://github.com/yhcharles --- test/distributed/_composable/test_contract.py | 98 +++++++++++++++++ torch/distributed/_composable/__init__.py | 102 ++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 test/distributed/_composable/test_contract.py create mode 100644 torch/distributed/_composable/__init__.py diff --git a/test/distributed/_composable/test_contract.py b/test/distributed/_composable/test_contract.py new file mode 100644 index 0000000000000..206f9196b7b3b --- /dev/null +++ b/test/distributed/_composable/test_contract.py @@ -0,0 +1,98 @@ +# Owner(s): ["oncall: distributed"] + +from torch.testing._internal.common_utils import ( + TestCase, + run_tests, + skipIfTorchDynamo, +) + +import torch +import torch.nn as nn +from torch.distributed._composable import contract + +from copy import deepcopy +from typing import Tuple + + +class ToyModel(nn.Module): + def __init__(self): + super().__init__() + self.seq1 = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)]) + self.seq2 = 
nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)]) + self.p = nn.Parameter(torch.randn(10, 10), requires_grad=True) + self.b = torch.zeros(1) # buffer + + def forward(self, x, y): + with torch.no_grad(): + self.b += x.sum() + y.sum() + + return self.p + self.seq1(x) + self.seq2(y) + + +class TestContract(TestCase): + @skipIfTorchDynamo("Dynamo does not yet capture module hooks") + def test_add_hooks(self): + def forward_pre_hook( + module: nn.Module, inp: Tuple[torch.Tensor] + ) -> Tuple[torch.Tensor]: + return inp + + def forward_hook( + module: nn.Module, inp: Tuple[torch.Tensor], out: torch.Tensor + ) -> torch.Tensor: + return out + + def backward_pre_hook( + module: nn.Module, grad_output: torch.Tensor + ) -> torch.Tensor: + return grad_output + + def backward_hook( + module: nn.Module, + grad_input: Tuple[torch.Tensor], + grad_output: torch.Tensor, + ) -> Tuple[torch.Tensor]: + return grad_input + + @contract + def noop_api(module: nn.Module) -> nn.Module: + module.register_forward_pre_hook(forward_pre_hook) + module.register_forward_hook(forward_hook) + module.register_full_backward_pre_hook(backward_pre_hook) + module.register_full_backward_hook(backward_hook) + return module + + model = ToyModel() + model_with_hooks = deepcopy(model) + noop_api(model.seq1) + noop_api(model.seq2) + + x, y = torch.randn(10, 10), torch.randn(10, 10) + model(x, y).sum().backward() + model_with_hooks(x, y).sum().backward() + + for p1, p2 in zip(model.parameters(), model_with_hooks.parameters()): + self.assertEqual(p1, p2) + + @skipIfTorchDynamo("Dynamo does not yet capture module hooks") + def test_modify_fqn(self): + class ModelWrapper(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, x): + return self.module(x) + + @contract + def wrap_module(module: nn.Module) -> nn.Module: + return ModelWrapper(module) + + model = ToyModel() + + with self.assertRaisesRegex(RuntimeError, "cannot modify FQNs"): + 
wrap_module(model.seq1) + + +if __name__ == "__main__": + run_tests() diff --git a/torch/distributed/_composable/__init__.py b/torch/distributed/_composable/__init__.py new file mode 100644 index 0000000000000..90533a13cdf56 --- /dev/null +++ b/torch/distributed/_composable/__init__.py @@ -0,0 +1,102 @@ +import torch +import torch.nn as nn + +from collections import OrderedDict +from typing import List, Optional + + +def contract(func): + r""" + Decorate a function as a composable distributed API, where the first + argument of the function must be an :class:`nn.Module` instance. The + decorator verifies that the wrapped function does not modify parameter, + buffer or sub-module fully-qualified names (FQN). + + Example:: + >>> import torch.nn as nn + >>> + >>> class MyModel(nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.l1 = nn.Linear(10, 10) + >>> self.l2 = nn.Linear(10, 10) + >>> + >>> def forward(self, x): + >>> return self.l2(self.l1(x)) + >>> + >>> @contract + >>> def my_noop_feature(module: nn.Module) -> nn.Module: + >>> return module + >>> + >>> model = MyModel() + >>> my_noop_feature(model.l1) + >>> my_noop_feature(model.l2) + >>> model(torch.randn(2, 10)).sum().backward() + """ + + def wrapper(module: nn.Module, *args, **kwargs) -> Optional[nn.Module]: + orig_named_params = OrderedDict(module.named_parameters()) + orig_named_buffers = OrderedDict( + module.named_buffers(remove_duplicate=False) + ) + orig_named_modules = OrderedDict( + module.named_modules(remove_duplicate=False) + ) + + updated = func(module, *args, **kwargs) + + if updated is None: + updated = module + + new_named_params = OrderedDict(updated.named_parameters()) + new_named_buffers = OrderedDict( + updated.named_buffers(remove_duplicate=False) + ) + new_named_modules = OrderedDict( + updated.named_modules(remove_duplicate=False) + ) + + assert isinstance(updated, nn.Module), ( + "Output of composable distributed APIs must be either None or " + f"nn.Module, but 
got {type(updated)}" + ) + + def check_fqn(orig_fqns: List[str], new_fqns: List[str]): + if orig_fqns == new_fqns: + return + + orig_fqn_set, new_fqn_set = set(orig_fqns), set(new_fqns) + orig_only = orig_fqn_set - new_fqn_set + new_only = new_fqn_set - orig_fqn_set + if len(orig_only) or len(new_only): + raise RuntimeError( + "Composable distributed API implementations cannot modify " + "FQNs.\n" + f"Only in original FQNs: {orig_only},\n" + f"Only in new FQNs: {new_only}" + ) + else: + raise RuntimeError( + "Composable distributed API implementations cannot modify " + "the order of FQNs.\n" + f"Original FQNs: {orig_only}\n" + f"New FQNs: {new_only}" + ) + + check_fqn(list(orig_named_params.keys()), list(new_named_params.keys())) + check_fqn( + list(orig_named_buffers.keys()), list(new_named_buffers.keys()) + ) + check_fqn( + list(orig_named_modules.keys()), list(new_named_modules.keys()) + ) + + # TODO: a stricter verification should also reject changing module + # types and monkey-patching forward() method implementations. + + # TODO: verify that installed distributed paradigms are compatible with + # each other. 
+ + return updated + + return wrapper From 89414df39e64d2a01376feb332381b31aa1b087b Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 26 Oct 2022 03:30:45 +0000 Subject: [PATCH 0162/1922] [MPS] Use `bandPartWithTensor:numLowerTensor:...` (#87752) To make it uniform with the rest of usage of this op throughout MPS codebase Pull Request resolved: https://github.com/pytorch/pytorch/pull/87752 Approved by: https://github.com/kulinseth --- aten/src/ATen/native/mps/operations/Distributions.mm | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm index d26b25e8c352d..a1a41d11e5b50 100644 --- a/aten/src/ATen/native/mps/operations/Distributions.mm +++ b/aten/src/ATen/native/mps/operations/Distributions.mm @@ -405,9 +405,14 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional Date: Wed, 26 Oct 2022 03:31:54 +0000 Subject: [PATCH 0163/1922] [BE] Delete `TH_DISALLOW_COPY_AND_ASSIGN` (#87743) Replace it with `AT_DISALLOW_COPY_AND_ASSIGN` and delete the header that contained this define Pull Request resolved: https://github.com/pytorch/pytorch/pull/87743 Approved by: https://github.com/atalman, https://github.com/ngimel --- torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h | 1 - torch/csrc/jit/codegen/fuser/cpu/temp_file.h | 4 ++-- torch/csrc/jit/codegen/fuser/fused_kernel.h | 4 ++-- torch/csrc/jit/ir/ir.h | 10 +++++----- torch/csrc/jit/serialization/pickler.h | 4 ++-- torch/csrc/jit/serialization/unpickler.h | 2 +- torch/csrc/utils/disallow_copy.h | 5 ----- 7 files changed, 12 insertions(+), 18 deletions(-) delete mode 100644 torch/csrc/utils/disallow_copy.h diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h index ce5d6ee2c5546..2e6d59596323d 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h @@ -3,7 +3,6 
@@ #include #include #include -#include #include #include diff --git a/torch/csrc/jit/codegen/fuser/cpu/temp_file.h b/torch/csrc/jit/codegen/fuser/cpu/temp_file.h index 080d76bde2225..9fb53bc962c5b 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/temp_file.h +++ b/torch/csrc/jit/codegen/fuser/cpu/temp_file.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include #include -#include #ifdef _WIN32 #include @@ -61,7 +61,7 @@ int wmkstemps(wchar_t* tmpl, int suffix_len) { #endif struct TempFile { - TH_DISALLOW_COPY_AND_ASSIGN(TempFile); + AT_DISALLOW_COPY_AND_ASSIGN(TempFile); TempFile(const std::string& t, int suffix) { #ifdef _MSC_VER diff --git a/torch/csrc/jit/codegen/fuser/fused_kernel.h b/torch/csrc/jit/codegen/fuser/fused_kernel.h index 3d34082ff771b..29ab3e7ed51c0 100644 --- a/torch/csrc/jit/codegen/fuser/fused_kernel.h +++ b/torch/csrc/jit/codegen/fuser/fused_kernel.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include #include -#include #include #include @@ -14,7 +14,7 @@ namespace jit { namespace fuser { struct FusedKernel { - TH_DISALLOW_COPY_AND_ASSIGN(FusedKernel); + AT_DISALLOW_COPY_AND_ASSIGN(FusedKernel); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) FusedKernel( diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index fe9e340fbe02d..67f878e9f7065 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -7,10 +7,10 @@ #include #include -#include #include #include +#include #include #include #include @@ -177,7 +177,7 @@ struct Wrap { }; struct Value { - TH_DISALLOW_COPY_AND_ASSIGN(Value); + AT_DISALLOW_COPY_AND_ASSIGN(Value); Value(Node* node_, size_t offset_); private: @@ -310,7 +310,7 @@ struct Value { }; struct TORCH_API Node { - TH_DISALLOW_COPY_AND_ASSIGN(Node); + AT_DISALLOW_COPY_AND_ASSIGN(Node); friend struct Graph; friend struct Block; friend struct Value; @@ -1015,7 +1015,7 @@ struct Block { friend struct Node; friend struct Graph; - TH_DISALLOW_COPY_AND_ASSIGN(Block); + AT_DISALLOW_COPY_AND_ASSIGN(Block); 
TORCH_API Block(Graph* graph_, Node* node_); at::ArrayRef inputs() { @@ -1164,7 +1164,7 @@ struct Block { }; struct Graph { - TH_DISALLOW_COPY_AND_ASSIGN(Graph); + AT_DISALLOW_COPY_AND_ASSIGN(Graph); friend struct Node; friend struct Value; friend struct Block; diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 399d7c232de13..e6ba2d281ada0 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -5,11 +5,11 @@ #include #include +#include #include #include #include #include -#include namespace torch { namespace jit { @@ -118,7 +118,7 @@ void setTypeTags(bool state); bool getTypeTags(); class TORCH_API Pickler { - TH_DISALLOW_COPY_AND_ASSIGN(Pickler); + AT_DISALLOW_COPY_AND_ASSIGN(Pickler); public: Pickler(std::function writer) diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h index c57aa2556d73c..5411d421a0c57 100644 --- a/torch/csrc/jit/serialization/unpickler.h +++ b/torch/csrc/jit/serialization/unpickler.h @@ -23,7 +23,7 @@ class DeserializationStorageContext; // deleted at some point, the Pickler doesn't produce it and it's only around to // support models saved before 1.1 class TORCH_API Unpickler { - TH_DISALLOW_COPY_AND_ASSIGN(Unpickler); + AT_DISALLOW_COPY_AND_ASSIGN(Unpickler); using TypeParserT = c10::TypePtr (*)(const std::string&); diff --git a/torch/csrc/utils/disallow_copy.h b/torch/csrc/utils/disallow_copy.h deleted file mode 100644 index 5960421d3a4ee..0000000000000 --- a/torch/csrc/utils/disallow_copy.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#include - -#define TH_DISALLOW_COPY_AND_ASSIGN AT_DISALLOW_COPY_AND_ASSIGN From 2206e4274effe14961c648ed7021fb3537527868 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Tue, 25 Oct 2022 22:59:57 +0000 Subject: [PATCH 0164/1922] [FSDP][BE] Split state_dict related hooks to a separate file to reduce development conflicts (#87421) This PR does following two things 
to improve the code quality. 1. Split state_dict related hooks to a separate file to reduce development conflicts. 2. Remove unused APIs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87421 Approved by: https://github.com/rohan-varma --- torch/distributed/fsdp/_state_dict_utils.py | 418 +++++++++++++++ .../fsdp/fully_sharded_data_parallel.py | 500 +----------------- 2 files changed, 430 insertions(+), 488 deletions(-) create mode 100644 torch/distributed/fsdp/_state_dict_utils.py diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py new file mode 100644 index 0000000000000..33fa0d441107b --- /dev/null +++ b/torch/distributed/fsdp/_state_dict_utils.py @@ -0,0 +1,418 @@ +import functools +import math +import warnings +from typing import Any, cast, Dict + +import torch +import torch.distributed as dist +import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper +# Import the entire FSDP file to avoid circular imports +import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP +import torch.nn as nn +import torch.nn.functional as F + +from torch.distributed._shard.sharded_tensor import ( + Shard, + ShardedTensor, + init_from_local_shards, +) +from torch.distributed.utils import ( + _replace_by_prefix, +) + +from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform +from .flat_param import ( + FlatParamHandle, +) + + +def _full_post_state_dict_hook( + module, + state_dict: Dict[str, Any], + prefix: str, +) -> Dict[str, Any]: + """ + Hook that runs after model.state_dict() is called before returning result to + user. For FSDP, we may have to clone the tensors in state_dict as params go + back to sharded version after _summon_full_params ends, and also remove + the ``FSDP_WRAPPED_MODULE`` prefix. 
+ """ + _replace_by_prefix(state_dict, prefix + f"{FSDP.FSDP_PREFIX}", prefix) + module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS]) + # Return early for trivial cases + if not state_dict or not module._has_params: + return state_dict + + # If a rank has already exited the `summon_full_params()` context here + # (e.g. when `rank0_only=True` and `rank != 0`), then the rank only + # needed to participate in the all-gather and does not need to save the + # state dict. For `use_orig_params=False`, we can check this via + # `FlatParameter` registration. + # TODO: For `use_orig_params=True`, we check for the reshard upon + # exiting `summon_full_params()` via the parameter shape. However, for + # `NO_SHARD`, we cannot tell from the shape, so we do not return early. + if ( + ( + not module._use_orig_params + and FSDP.FLAT_PARAM in module.module._parameters + ) + or ( + module._use_orig_params + and module._handles + and module._handles[0].uses_sharded_strategy + and module._handles[0].is_sharded(module._handles[0].flat_param) + ) + ): + return state_dict + + offload_to_cpu = module._state_dict_config.offload_to_cpu + cpu_device = torch.device("cpu") + + # Loop only the parameters saved in this instance's wrapped module to + # avoid processing buffers. + for fqn, param_name, module_name in module._param_fqns: + fqn = f"{prefix}{fqn}" + clean_key = fqn + clean_prefix = FSDP.clean_tensor_name(prefix) + # Strip prefix out of key if needed as buffer names and param names + # do not have prefix considered as they are not computed in `state_dict` + # call. + if clean_key.startswith(clean_prefix): + clean_key = clean_key[len(clean_prefix):] + + # Clone non-ignored parameters before exiting the + # `_summon_full_params()` context + assert fqn in state_dict, ( + f"FSDP assumes {fqn} is in the state_dict but the state_dict " + f"only has {state_dict.keys()}. prefix={prefix}, " + f"module_name={module_name} param_name={param_name} rank={module.rank}." 
+ ) + if clean_key not in module._ignored_param_names and \ + not getattr(state_dict[fqn], "_has_been_cloned", False): + try: + state_dict[fqn] = state_dict[fqn].clone().detach() + state_dict[fqn]._has_been_cloned = True # type: ignore[attr-defined] + except BaseException as e: + warnings.warn( + f"Failed to clone() tensor with name {fqn} on rank {module.rank}. " + "This may mean that this state_dict entry could point to invalid " + "memory regions after returning from state_dict() call if this " + "parameter is managed by FSDP. Please check clone " + f"implementation of {fqn}. Error: {str(e)}" + ) + + # Offload the buffer to CPU if needed -- we do not do this in + # `_summon_full_params()` since without care, that would free + # the original buffer's GPU memory and require reallocating + # that memory later; this only affects the state dict's buffer + # variable and leaves the original buffer's GPU memory intact + if offload_to_cpu: + for clean_key in module._buffer_names: + # This is a hack to support activation checkpoint. + clean_key = clean_key.replace( + f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", "" + ) + fqn = f"{prefix}{clean_key}" + if fqn not in state_dict: + # A buffer can be registered as non-persistent. + continue + if state_dict[fqn].device != cpu_device: + state_dict[fqn] = state_dict[fqn].to(cpu_device) + return state_dict + + +def _full_pre_load_state_dict_hook( + module, + state_dict: Dict[str, Any], + prefix: str, +) -> None: + # We do not expect to be calling pre-hooks twice without post-hook + # call in between. + assert getattr(module, '_full_param_ctx', None) is None + # Note that it needs writeback=True to persist. + module._full_param_ctx = module._summon_full_params( + recurse=False, writeback=True + ) + module._full_param_ctx.__enter__() + _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}") + + +def _full_post_load_state_dict_hook(module, *args, **kwargs) -> None: + # We should exit summon_full_params context. 
+ module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS]) + assert getattr(module, '_full_param_ctx', None) is not None + module._full_param_ctx.__exit__(None, None, None) + module._full_param_ctx = None + + +def _local_post_state_dict_hook( + module, + state_dict: Dict[str, Any], + prefix: str, +) -> Dict[str, Any]: + """ + This hook creates a ShardedTensor from the local flat_param and replaces + the state_dict[f"{prefix}{FLAT_PARAM}"] with the ShardedTensor. No copy + will happen. The underlying storage is the same. + """ + _replace_by_prefix(state_dict, f"{prefix}{FSDP.FSDP_PREFIX}", prefix) + if not module._has_params: + return state_dict + + # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor + # value as the flat_param but it is a pure Tensor because + # nn.Module.state_dict() will detach the parameter. Therefore, we need + # to get flat_param to get the metadata. + assert module._handles, "Should have returned early" + flat_param = module._handles[0].flat_param + # Construct a ShardedTensor from the flat_param. 
+ full_numel = flat_param._unpadded_unsharded_size.numel() # type: ignore[attr-defined] + shard_offset = flat_param.numel() * module.rank + valid_data_size = flat_param.numel() - flat_param._shard_numel_padded + if valid_data_size > 0 and flat_param._shard_numel_padded > 0: + flat_param = flat_param.narrow(0, 0, valid_data_size) + local_shards = [ + Shard.from_tensor_and_offsets(flat_param, [shard_offset], module.rank) + ] + sharded_tensor = init_from_local_shards( + local_shards, full_numel, process_group=module.process_group + ) # type: ignore[assignment] + if module._state_dict_config.offload_to_cpu: + sharded_tensor = sharded_tensor.cpu() + state_dict[f"{prefix}{FSDP.FLAT_PARAM}"] = sharded_tensor + return state_dict + + +def _local_post_load_state_dict_hook(module, *args, **kwargs) -> None: + pass + + +def _local_pre_load_state_dict_hook( + module, state_dict: Dict[str, Any], prefix: str, +) -> None: + """ + This hook finds the local flat_param for this FSDP module from the + state_dict. The flat_param should be a ShardedTensor. This hook converts + the ShardedTensor to a tensor. No copy happens unless padding is required. + """ + _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP.FSDP_PREFIX}") + fqn = f"{prefix}{FSDP.FSDP_PREFIX}{FSDP.FLAT_PARAM}" + if fqn not in state_dict: + assert not module._has_params, ( + "No `FlatParameter` in `state_dict` for this FSDP instance " + "but it has parameters" + ) + return + load_tensor = state_dict[fqn] + assert isinstance( + load_tensor, ShardedTensor + ), "Tensors in local_state_dict should be ShardedTensor." + + # Convert the ShardedTensor to a Tensor. + shards = load_tensor.local_shards() + assert len(shards), "load_local_state_dict assume one shard per ShardedTensor." + load_tensor = shards[0].tensor + + # Get the metadata of the flat_param to decide whether to pad the loaded + # tensor. 
+ flat_param = module._handles[0].flat_param + assert flat_param is not None + if flat_param._shard_numel_padded not in (0, flat_param.numel()): + assert load_tensor.numel() < flat_param.numel(), ( + f"Local shard size = {flat_param.numel()} and the tensor in " + f"the state_dict is {load_tensor.numel()}." + ) + load_tensor = F.pad(load_tensor, [0, flat_param._shard_numel_padded]) + state_dict[fqn] = load_tensor + + +def _sharded_post_state_dict_hook( + module, + state_dict: Dict[str, Any], + prefix: str, +) -> Dict[str, Any]: + """ + The hook replaces the unflattened, unsharded parameter in the state_dict + with a unflattened, sharded parameter (a ShardedTensor). + """ + _replace_by_prefix(state_dict, f"{prefix}{FSDP.FSDP_PREFIX}", prefix) + if not module._has_params: + return state_dict + + assert module.training_state != FSDP.TrainingState_.SUMMON_FULL_PARAMS, ( + "Inside _sharded_post_state_dict_hook, the training_state must " + "not be SUMMON_FULL_PARAMS." + ) + with module._summon_full_params(recurse=False, writeback=False): + for fqn, _, _ in module._param_fqns: + # Create a ShardedTensor for the unflattened, non-sharded parameter. + param = functools.reduce(getattr, fqn.split("."), module.module) + sharded_tensor = _ext_chunk_tensor( + tensor=param, + rank=module.rank, + world_size=module.world_size, + num_devices_per_node=torch.cuda.device_count(), + pg=module.process_group + ) + if module._state_dict_config.offload_to_cpu: + sharded_tensor = sharded_tensor.cpu() + state_dict[f"{prefix}{fqn}"] = sharded_tensor + # For `use_orig_params=True`, the `FlatParameter` is not registered, so + # there is no entry in the state dict for it to pop. 
+ if not module._use_orig_params: + state_dict.pop(f"{prefix}{FSDP.FLAT_PARAM}") + return state_dict + + +def _sharded_post_load_state_dict_hook(module, *args, **kwargs) -> None: + if module._use_orig_params: + module._register_orig_params() + + +def _sharded_pre_load_state_dict_hook( + module, state_dict: Dict[str, Any], prefix: str, +) -> None: + """ + The hook combines the unflattened, sharded parameters (ShardedTensor) to + a new FlatParameter and shards the new FlatParameter to the local chunk. + """ + _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}") + if not module._has_params: + return + + if not module._handles[0].uses_sharded_strategy: + raise RuntimeError( + "load_sharded_state_dict can only be called when parameters " + "are flatten and sharded." + ) + + nonsharded_tensors = [] + shared_fqns = [fqn for fqn, _, _ in module._shared_param_fqns] + for fqn, _, _ in module._param_fqns: + full_fqn = f"{prefix}{FSDP.FSDP_PREFIX}{fqn}" + param = state_dict.pop(full_fqn) + if fqn in shared_fqns: + continue + # All-gather the param (ShardedTensor) + param, shards = _ext_pre_load_state_dict_transform(param) + assert len(shards) < 2, ( + "Expects 0 or 1 shard per rank " + f"but got {len(shards)} shards on rank {module.rank}." 
+ ) + param_numel = param.size().numel() + dim_0_size = param.size()[0] + chunk_size = ( + math.ceil(dim_0_size / module.world_size) * param_numel // dim_0_size + ) + if len(shards) == 1: + local_tensor = shards[0].tensor.flatten() + if not local_tensor.is_cuda: + local_tensor = local_tensor.cuda() + num_padding = chunk_size - local_tensor.numel() + if num_padding > 0: + local_tensor = F.pad(local_tensor, [0, num_padding]) + else: + local_tensor = torch.zeros(chunk_size, dtype=param.dtype).cuda() + tensor = torch.empty( + chunk_size * module.world_size, dtype=local_tensor.dtype + ).cuda() + dist.all_gather_into_tensor(tensor, local_tensor, group=module.process_group) + tensor = tensor.narrow(0, 0, param_numel).reshape(param.size()) + nonsharded_tensors.append(tensor) + + # Create a new flat_param from the loaded, non-sharded tensors. + flat_param = module._handles[0].flat_param + loaded_flat_param = FlatParamHandle.flatten_params( + nonsharded_tensors, requires_grad=False + ) + + # Get the chunk from the loaded flat_param for the local rank. + loaded_flat_tensor, num_to_pad = FlatParamHandle._get_shard( + loaded_flat_param, module.rank, module.world_size, + ) + loaded_flat_tensor.to(flat_param.device) + assert flat_param.numel() == loaded_flat_tensor.numel(), ( + f"The loaded local chunk has different numel({loaded_flat_tensor.numel()}) " + f"from the local chunk {flat_param.numel()}." + ) + assert flat_param._shard_numel_padded == num_to_pad, ( + f"The loaded local chunk has different padding({num_to_pad}) " + f"from the local chunk {flat_param._shard_numel_padded}." + ) + state_dict[f"{prefix}{FSDP.FSDP_PREFIX}{FSDP.FLAT_PARAM}"] = loaded_flat_tensor + if module._use_orig_params: + module._deregister_orig_params() + + +@torch.no_grad() +def _post_state_dict_hook( + module: nn.Module, + state_dict: Dict[str, Any], + prefix: str, + *args: Any, +) -> Dict[str, Any]: + """ + _post_state_dict_hook() is called after the state_dict() of this + FSDP module is executed. 
``module._state_dict_type`` is used to decide + what postprocessing will be done. + """ + _post_state_dict_hook_fn = { + FSDP.StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook, + FSDP.StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook, + FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook, + } + fsdp_module = cast(FSDP.FullyShardedDataParallel, module) + processed_state_dict = _post_state_dict_hook_fn[fsdp_module._state_dict_type]( + fsdp_module, state_dict, prefix + ) + # Restore buffers, which currently are in their full precision type, + # back to their mixed precision type. This is because buffers are cast + # during lazy_init() and stay at their mixed precision type before/after + # forward/backward. As a result state_dict() should maintain this. + if ( + fsdp_module._is_root + and fsdp_module._mixed_precision_enabled_for_buffers() + ): + fsdp_module._cast_buffers(recurse=True) + return processed_state_dict + + +@torch.no_grad() +def _pre_load_state_dict_hook( + module: nn.Module, + state_dict: Dict[str, Any], + prefix: str, + *args: Any, +) -> None: + """ + ``_pre_load_state_dict_hook`` is called before ``module._load_from_state_dict()`` + is called. ``module._state_dict_type`` is used to decide what preprocessing + will be done. + """ + _pre_load_state_dict_hook_fn = { + FSDP.StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook, + FSDP.StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook, + FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook, + } + # Code that is common for all state_dict impls + fsdp_module = cast(FSDP.FullyShardedDataParallel, module) + if torch.cuda.is_available(): + torch.cuda.synchronize() + # Dispatch into state_dict specific implementation of pre-hook. 
+ _pre_load_state_dict_hook_fn[fsdp_module._state_dict_type]( + fsdp_module, state_dict, prefix + ) + + +@torch.no_grad() +def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None: + _post_load_state_dict_hook_fn = { + FSDP.StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook, + FSDP.StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook, + FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook, + } + # Code that is common for all state_dict impls + fsdp_module = cast(FSDP.FullyShardedDataParallel, module) + # Dispatch into state_dict type specific implementation of post-hook for + # loading state_dict. + _post_load_state_dict_hook_fn[fsdp_module._state_dict_type](fsdp_module) diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index e241c26d1e1f1..5fb2e5cdf0f6b 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -18,8 +18,6 @@ Iterable, Iterator, List, - Mapping, - NamedTuple, Optional, Set, Tuple, @@ -34,11 +32,6 @@ import torch.nn.functional as F from torch.autograd import Variable from torch.distributed import ProcessGroup -from torch.distributed._shard.sharded_tensor import ( - Shard, - ShardedTensor, - init_from_local_shards, -) from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( _CHECKPOINT_PREFIX, ) @@ -48,7 +41,6 @@ ) from torch.distributed.distributed_c10d import _get_default_group from torch.distributed.utils import ( - _replace_by_prefix, _sync_params_and_buffers, _to_kwargs, ) @@ -64,7 +56,11 @@ _process_pos_dim_tensor_state, _rekey_sharded_optim_state_dict, ) -from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform +from ._state_dict_utils import ( + _post_state_dict_hook, + _pre_load_state_dict_hook, + _post_load_state_dict_hook, +) from ._utils import ( _apply_to_modules, _apply_to_tensors, @@ -601,7 
+597,7 @@ def _check_order(self, handles_key: _HandlesKey, is_training: bool) -> None: ( rank, world_indices[ - rank * num_valid_indices : (rank + 1) * num_valid_indices + rank * num_valid_indices: (rank + 1) * num_valid_indices ], ) for rank in range(self.world_size) @@ -1138,28 +1134,11 @@ def __init__( # implemented using post-save and pre-load hooks self._state_dict_type = StateDictType.FULL_STATE_DICT self._state_dict_config = FullStateDictConfig() - self._register_state_dict_hook(self._post_state_dict_hook) - self._post_state_dict_hook_fn = { - StateDictType.FULL_STATE_DICT: self._full_post_state_dict_hook, - StateDictType.LOCAL_STATE_DICT: self._local_post_state_dict_hook, - StateDictType.SHARDED_STATE_DICT: self._sharded_post_state_dict_hook, - } + self._register_state_dict_hook(_post_state_dict_hook) self._register_load_state_dict_pre_hook( - self._pre_load_state_dict_hook, with_module=True - ) - self._pre_load_state_dict_hook_fn = { - StateDictType.FULL_STATE_DICT: self._full_pre_load_state_dict_hook, - StateDictType.LOCAL_STATE_DICT: self._local_pre_load_state_dict_hook, - StateDictType.SHARDED_STATE_DICT: self._sharded_pre_load_state_dict_hook, - } - self.register_load_state_dict_post_hook( - self._post_load_state_dict_hook + _pre_load_state_dict_hook, with_module=True ) - self._post_load_state_dict_hook_fn = { - StateDictType.FULL_STATE_DICT: self._full_post_load_state_dict_hook, - StateDictType.LOCAL_STATE_DICT: self._local_post_load_state_dict_hook, - StateDictType.SHARDED_STATE_DICT: self._sharded_post_load_state_dict_hook, - } + self.register_load_state_dict_post_hook(_post_load_state_dict_hook) def _get_ignored_modules( self, @@ -2171,7 +2150,9 @@ def set_state_dict_type( prev_state_dict_config = submodule._state_dict_config if prev_state_dict_type != submodule._state_dict_type: raise RuntimeError("All FSDP module should the same state_dict_type.") - if type(prev_state_dict_config) != type(submodule._state_dict_config): + if not isinstance( + 
submodule._state_dict_config, type(prev_state_dict_config) + ): raise RuntimeError( "All FSDP modules should have the same type of state_dict_config." ) @@ -2268,200 +2249,6 @@ def _shared_param_fqns(self) -> Iterator[Tuple[str, str, str]]: fqn = f"{module_name}{param_name}" yield fqn, param_name, module_name - def _full_post_state_dict_hook( - self, - state_dict: Dict[str, Any], - prefix: str, - ) -> Dict[str, Any]: - """ - Hook that runs after model.state_dict() is called before returning result to - user. For FSDP, we may have to clone the tensors in state_dict as params go - back to sharded version after _summon_full_params ends, and also remove - the ``FSDP_WRAPPED_MODULE`` prefix. - """ - _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix) - self._assert_state([TrainingState_.SUMMON_FULL_PARAMS]) - # Return early for trivial cases - if not state_dict or not self._has_params: - return state_dict - - # If a rank has already exited the `summon_full_params()` context here - # (e.g. when `rank0_only=True` and `rank != 0`), then the rank only - # needed to participate in the all-gather and does not need to save the - # state dict. For `use_orig_params=False`, we can check this via - # `FlatParameter` registration. - # TODO: For `use_orig_params=True`, we check for the reshard upon - # exiting `summon_full_params()` via the parameter shape. However, for - # `NO_SHARD`, we cannot tell from the shape, so we do not return early. - if ( - ( - not self._use_orig_params - and FLAT_PARAM in self.module._parameters - ) - or ( - self._use_orig_params - and self._handles - and self._handles[0].uses_sharded_strategy - and self._handles[0].is_sharded(self._handles[0].flat_param) - ) - ): - return state_dict - - offload_to_cpu = self._state_dict_config.offload_to_cpu - cpu_device = torch.device("cpu") - - # Loop only the parameters saved in this instance's wrapped module to - # avoid processing buffers. 
- for fqn, param_name, module_name in self._param_fqns: - fqn = f"{prefix}{fqn}" - clean_key = fqn - clean_prefix = clean_tensor_name(prefix) - # Strip prefix out of key if needed as buffer names and param names - # do not have prefix considered as they are not computed in `state_dict` - # call. - if clean_key.startswith(clean_prefix): - clean_key = clean_key[len(clean_prefix):] - - # Clone non-ignored parameters before exiting the - # `_summon_full_params()` context - assert fqn in state_dict, ( - f"FSDP assumes {fqn} is in the state_dict but the state_dict " - f"only has {state_dict.keys()}. prefix={prefix}, " - f"module_name={module_name} param_name={param_name} rank={self.rank}." - ) - if clean_key not in self._ignored_param_names and \ - not getattr(state_dict[fqn], "_has_been_cloned", False): - try: - state_dict[fqn] = state_dict[fqn].clone().detach() - state_dict[fqn]._has_been_cloned = True # type: ignore[attr-defined] - except BaseException as e: - warnings.warn( - f"Failed to clone() tensor with name {fqn} on rank {self.rank}. " - "This may mean that this state_dict entry could point to invalid memory " - "regions after returning from state_dict() call if this " - "parameter is managed by FSDP. Please check clone " - f"implementation of {fqn}. Error: {str(e)}" - ) - - # Offload the buffer to CPU if needed -- we do not do this in - # `_summon_full_params()` since without care, that would free - # the original buffer's GPU memory and require reallocating - # that memory later; this only affects the state dict's buffer - # variable and leaves the original buffer's GPU memory intact - if offload_to_cpu: - for clean_key in self._buffer_names: - # This is a hack to support activation checkpoint. - clean_key = clean_key.replace( - f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", "" - ) - fqn = f"{prefix}{clean_key}" - if fqn not in state_dict: - # A buffer can be registered as non-persistent. 
- continue - if state_dict[fqn].device != cpu_device: - state_dict[fqn] = state_dict[fqn].to(cpu_device) - return state_dict - - def _local_post_state_dict_hook( - self, - state_dict: Dict[str, Any], - prefix: str, - ) -> Dict[str, Any]: - """ - This hook create a ShardedTensor from the local flat_param and replace - the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy - will happen. The underlying storage is the same. - """ - _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix) - if not self._has_params: - return state_dict - - # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor - # value as the flat_param but it is a pure Tensor because - # nn.Module.state_dict() will detach the parameter. Therefore, we need - # to get flat_param to get the metadata. - assert self._handles, "Should have returned early" - flat_param = self._handles[0].flat_param - # Construct a ShardedTensor from the flat_param. - full_numel = flat_param._unpadded_unsharded_size.numel() # type: ignore[attr-defined] - shard_offset = flat_param.numel() * self.rank - valid_data_size = flat_param.numel() - flat_param._shard_numel_padded - if valid_data_size > 0 and flat_param._shard_numel_padded > 0: - flat_param = flat_param.narrow(0, 0, valid_data_size) - local_shards = [ - Shard.from_tensor_and_offsets(flat_param, [shard_offset], self.rank) - ] - sharded_tensor = init_from_local_shards( - local_shards, full_numel, process_group=self.process_group - ) # type: ignore[assignment] - if self._state_dict_config.offload_to_cpu: - sharded_tensor = sharded_tensor.cpu() - state_dict[f"{prefix}{FLAT_PARAM}"] = sharded_tensor - return state_dict - - @torch.no_grad() - def _sharded_post_state_dict_hook( - self, - state_dict: Dict[str, Any], - prefix: str, - ) -> Dict[str, Any]: - """ - The hook replaces the unflattened, unsharded parameter in the state_dict - with a unflattened, sharded parameter (a ShardedTensor). 
- """ - _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix) - if not self._has_params: - return state_dict - - assert self.training_state != TrainingState_.SUMMON_FULL_PARAMS, ( - "Inside _sharded_post_state_dict_hook, the training_state must " - "not be SUMMON_FULL_PARAMS." - ) - with self._summon_full_params(recurse=False, writeback=False): - for fqn, _, _ in self._param_fqns: - # Create a ShardedTensor for the unflattened, non-sharded parameter. - param = functools.reduce(getattr, fqn.split("."), self.module) - sharded_tensor = _ext_chunk_tensor( - tensor=param, - rank=self.rank, - world_size=self.world_size, - num_devices_per_node=torch.cuda.device_count(), - pg=self.process_group - ) - if self._state_dict_config.offload_to_cpu: - sharded_tensor = sharded_tensor.cpu() - state_dict[f"{prefix}{fqn}"] = sharded_tensor - # For `use_orig_params=True`, the `FlatParameter` is not registered, so - # there is no entry in the state dict for it to pop. - if not self._use_orig_params: - state_dict.pop(f"{prefix}{FLAT_PARAM}") - return state_dict - - @staticmethod - def _post_state_dict_hook( - module: nn.Module, - state_dict: Dict[str, Any], - prefix: str, - *args: Any, - ) -> Dict[str, Any]: - """ - _post_state_dict_hook() is called after the state_dict() of this - FSDP module is executed. ``self._state_dict_type`` is used to decide - what postprocessing will be done. - """ - self = cast(FullyShardedDataParallel, module) - processed_state_dict = self._post_state_dict_hook_fn[self._state_dict_type](state_dict, prefix) - # Restore buffers, which currently are in their full precision type, - # back to their mixed precision type. This is because buffers are cast - # during lazy_init() and stay at their mixed precision type before/after - # forward/backward. As a result state_dict() should maintain this. 
- if ( - self._is_root - and self._mixed_precision_enabled_for_buffers() - ): - self._cast_buffers(recurse=True) - return processed_state_dict - def state_dict(self, *args, **kwargs): """ This is the entry point of all three FSDP ``state_dict`` APIs: full, @@ -2560,268 +2347,6 @@ def state_dict(self, *args, **kwargs): else: raise ValueError(f"Unknown StateDictType {self._state_dict_type}.") - def _local_state_dict(self, *args: Any, **kwargs: Any) -> Any: - """ - Returns the local state of the module. Parameters are flattened and - sharded, so the resulting state_dict can only be loaded after the module - has been wrapped with FSDP. - """ - with self.state_dict_type(self, StateDictType.LOCAL_STATE_DICT): - return self.state_dict(*args, **kwargs) - - def _full_post_load_state_dict_hook(self, *args, **kwargs) -> None: - # We should exit summon_full_params context. - self._assert_state([TrainingState_.SUMMON_FULL_PARAMS]) - assert getattr(self, '_full_param_ctx', None) is not None - self._full_param_ctx.__exit__(None, None, None) - self._full_param_ctx = None - - def _sharded_state_dict(self, *args: Any, **kwargs: Any) -> Any: - """ - Returns the sharded states of the module. Parameters are unflattened and - sharded, so the resulting state_dict can be used with any parallelism - (e.g., DPP, model parallelism, and single trainer) after a valid - resharding. - """ - with self.state_dict_type(StateDictType.SHARDED_STATE_DICT): - return self.state_dict(self, *args, **kwargs) - - def _full_pre_load_state_dict_hook( - self, - state_dict: Dict[str, Any], - prefix: str, - ) -> None: - # We do not expect to be calling pre-hooks twice without post-hook - # call in between. - assert getattr(self, '_full_param_ctx', None) is None - # Note that it needs writeback=True to persist. 
- self._full_param_ctx = self._summon_full_params( - recurse=False, writeback=True - ) - self._full_param_ctx.__enter__() - _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}") - - def _local_post_load_state_dict_hook(self, *args, **kwargs) -> None: - pass - - def _local_pre_load_state_dict_hook( - self, - state_dict: Dict[str, Any], - prefix: str, - ) -> None: - """ - This hook finds the local flat_param for this FSDP module from the - state_dict. The flat_param should be a ShardedTensor. This hook converts - the ShardedTensor to a tensor. No copy happen unless padding is required. - """ - _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}") - fqn = f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}" - if fqn not in state_dict: - assert not self._has_params, ( - "No `FlatParameter` in `state_dict` for this FSDP instance but it has parameters" - ) - return - load_tensor = state_dict[fqn] - assert isinstance( - load_tensor, ShardedTensor - ), "Tensors in local_state_dict should be ShardedTensor." - - # Convert the ShardedTensor to a Tensor. - shards = load_tensor.local_shards() - assert len(shards), "load_local_state_dict assume one shard per ShardedTensor." - load_tensor = cast(torch.Tensor, shards[0].tensor) - - # Get the metada of the flat_param to decide whether to pad the loaded - # tensor. - flat_param = self._handles[0].flat_param - assert flat_param is not None - if flat_param._shard_numel_padded not in (0, flat_param.numel()): - assert load_tensor.numel() < flat_param.numel(), ( - f"Local shard size = {flat_param.numel()} and the tensor in " - f"the state_dict is {load_tensor.numel()}." 
- ) - load_tensor = F.pad(load_tensor, [0, flat_param._shard_numel_padded]) - state_dict[fqn] = load_tensor - - def _sharded_post_load_state_dict_hook(self, *args, **kwargs) -> None: - if self._use_orig_params: - self._register_orig_params() - - def _sharded_pre_load_state_dict_hook( - self, - state_dict: Dict[str, Any], - prefix: str, - ) -> None: - """ - The hook combines the unflattened, sharded parameters (ShardedTensor) to - a new FlatParameter and shards the new FlatParameter to the local chunk. - """ - _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}") - if not self._has_params: - return - - if not self._handles[0].uses_sharded_strategy: - raise RuntimeError( - "load_sharded_state_dict can only be called when parameters " - "are flatten and sharded." - ) - - nonsharded_tensors = [] - # TODO: Reduce the communication by using only one - # `all_gather_into_tensor()` to gather all the parameters in this - # layer. This can be achieved by concatenating all the local shards and - # then appending the padding. 
- # https://github.com/pytorch/pytorch/issues/77461 - shared_fqns = [fqn for fqn, _, _ in self._shared_param_fqns] - for fqn, _, _ in self._param_fqns: - full_fqn = f"{prefix}{FSDP_PREFIX}{fqn}" - param = state_dict.pop(full_fqn) - if fqn in shared_fqns: - continue - # All-gather the param (ShardedTensor) - param, shards = _ext_pre_load_state_dict_transform(param) - assert len(shards) < 2, ( - f"Expects 0 or 1 shard per rank but got {len(shards)} shards on rank {self.rank}" - ) - param_numel = param.size().numel() - dim_0_size = param.size()[0] - chunk_size = ( - math.ceil(dim_0_size / self.world_size) * param_numel // dim_0_size - ) - if len(shards) == 1: - local_tensor = cast(torch.Tensor, shards[0].tensor).flatten() - if not local_tensor.is_cuda: - local_tensor = local_tensor.cuda() - num_padding = chunk_size - local_tensor.numel() - if num_padding > 0: - local_tensor = F.pad(local_tensor, [0, num_padding]) - else: - local_tensor = torch.zeros(chunk_size, dtype=param.dtype).cuda() - tensor = torch.empty( - chunk_size * self.world_size, dtype=local_tensor.dtype - ).cuda() - dist.all_gather_into_tensor(tensor, local_tensor, group=self.process_group) - tensor = tensor.narrow(0, 0, param_numel).reshape(param.size()) - nonsharded_tensors.append(tensor) - - # Create a new flat_param from the loaded, non-sharded tensors. - flat_param = self._handles[0].flat_param - loaded_flat_param = FlatParamHandle.flatten_params(nonsharded_tensors, requires_grad=False) - - # Get the chunk from the loaded flat_param for the local rank. - loaded_flat_param, num_to_pad = FlatParamHandle._get_shard( - loaded_flat_param, self.rank, self.world_size, - ) - loaded_flat_param.to(flat_param.device) - assert flat_param.numel() == loaded_flat_param.numel(), ( - f"The loaded local chunk has different numel({loaded_flat_param.numel()}) " - f"from the local chunk {flat_param.numel()}." 
- ) - assert flat_param._shard_numel_padded == num_to_pad, ( - f"The loaded local chunk has different padding({num_to_pad}) " - f"from the local chunk {flat_param._shard_numel_padded}." - ) - state_dict[f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"] = loaded_flat_param - if self._use_orig_params: - self._deregister_orig_params() - - @staticmethod - def _pre_load_state_dict_hook( - module: nn.Module, - state_dict: Dict[str, Any], - prefix: str, - *args: Any, - ) -> None: - """ - ``_pre_state_dict_hook` is called before ``self._load_from_state_dict()`` - is called. ``self._state_dict_type`` is used to decide what preprocessing - will be done. - """ - # Code that is common for all state_dict impls - self = cast(FullyShardedDataParallel, module) - if torch.cuda.is_available(): - torch.cuda.synchronize() - # Dispatch into state_dict specific implementation of pre-hook. - self._pre_load_state_dict_hook_fn[self._state_dict_type](state_dict, prefix) - - @staticmethod - def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None: - # Code that is common for all state_dict impls - self = cast(FullyShardedDataParallel, module) - # Dispatch into state_dict type specific implementation of post-hook for - # loading state_dict. - self._post_load_state_dict_hook_fn[self._state_dict_type]() - - def load_state_dict( - self, - state_dict: Mapping[str, Any], - *args, - **kwargs, - ) -> NamedTuple: - """ - The entry point of all three FSDP ``load_state_dict`` APIs. By default, - calling ``load_state_dict`` on an FSDP module will result in FSDP - attempting to load a "full" state_dict, i.e. a state_dict consisting of - full, unsharded, unflattened original module parameters. This requires - FSDP to load the full parameter context on each rank which could result - in GPU OOM. As a result, :func:`state_dict_type` API is available to - configure between ``load_state_dict`` implementations. 
User can thus use - ``with self.state_dict_type(self, StateDictType.LOCAL_STATE_DICT)`` context - manager to load a local state dict checkpoint that will restore only - local shards of the module. Currently, the only supported - implementations are ``StateDictType.LOCAL_STATE_DICT`` and - ``StateDictType.FULL_STATE_DICT`` (default). Please see :func:`state_dict` - for documentation around creating an FSDP checkpoint. - - Example:: - - >>> # xdoctest: +SKIP("undefined variables") - >>> import torch - >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - >>> from torch.distributed.fsdp import StateDictType - >>> torch.cuda.set_device(device_id) - >>> my_module = nn.Linear(...) - >>> sharded_module = FSDP(my_module) - >>> checkpoint = torch.load(PATH) - >>> full_state_dict = checkpoint['full_state_dict'] - >>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT): - >>> sharded_module.load_state_dict(full_state_dict) - >>> full_dict.keys() - >>> odict_keys(['weight', 'bias']) - >>> # using local state dict - >>> local_state_dict = checkpoint['local_state_dict'] - >>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT): - >>> sharded_module.load_state_dict(local_state_dict) - >>> local_dict.keys() - >>> odict_keys(['flat_param', 'inner.flat_param']) - - .. warning:: This needs to be called on all ranks since it uses - collective communications. - """ - return super().load_state_dict(state_dict, *args) - - def _load_local_state_dict( - self, - state_dict: Mapping[str, Any], - *args, - ) -> NamedTuple: - """ - Load states from a flattened, sharded state dictionary. - """ - with self.state_dict_type(self, StateDictType.LOCAL_STATE_DICT): - return self.load_state_dict(state_dict, *args) - - def _load_sharded_state_dict( - self, - state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], - strict: bool = True, - ) -> NamedTuple: - """ - Load states from a unflattened, sharded state dictionary. 
- """ - with self.state_dict_type(StateDictType.SHARDED_STATE_DICT): - return self.load_state_dict(state_dict, strict) - def forward(self, *args: Any, **kwargs: Any) -> Any: """ Runs the forward pass for the wrapped module, inserting FSDP-specific @@ -4540,7 +4065,6 @@ def register_comm_hook(self, state: object, hook: callable): submodule._communication_hook_state = state submodule._communication_hook = hook - def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None: auto_wrap_policy = kwargs["auto_wrap_policy"] module = kwargs["module"] From e03181f55ac0098a14d3b581cbf0235423bdaf5b Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 25 Oct 2022 19:47:30 +0000 Subject: [PATCH 0165/1922] [inductor] Fix finalization issues when using multiprocessing (#87725) If python was launched with 'spawn' it will not use the standard shutdown methods that concurrent.futures requires. So we register a shutdown with the method it does uses. Without this, shutdown hangs since the workers will not exit. cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87725 Approved by: https://github.com/wconstab --- torch/_inductor/codecache.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 1d83633019cb8..1c97c26a7870e 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -9,6 +9,7 @@ import shutil import signal import subprocess +import sys import sysconfig import tempfile import types @@ -304,9 +305,15 @@ def run(): # we rely on 'fork' because we cannot control whether users # have an `if __name__ == '__main__'` in their main process. 
fork_context = multiprocessing.get_context("fork") - return ProcessPoolExecutor( + pool = ProcessPoolExecutor( config.compile_threads, mp_context=fork_context, initializer=init ) + # when this pool is created in a subprocess object, the normal exit handler + # doesn't run, and we need to register our own handler. + # exitpriority has to be high, because another one of the finalizers will + # kill the worker thread that sends the shutdown message to the workers... + multiprocessing.util.Finalize(None, pool.shutdown, exitpriority=sys.maxsize) + return pool @classmethod def warm_pool(cls): From 40cc28559a5def4b81151daf991796d7e98488e2 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Wed, 26 Oct 2022 05:09:39 +0000 Subject: [PATCH 0166/1922] [vision hash update] update the pinned vision hash (#87744) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml). Update the pinned vision hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87744 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/vision.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 88e283fa46ec9..d4dee5af2936d 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -0d7807d59520289b2065b4db4a138b7fba2f61fd +edb3a8069a0b86231f14e84ac9f26fd7c7bffb5f From 10fea5102d1908ec71a3f9d8efeb9d4d0cc0c86a Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 26 Oct 2022 05:40:25 +0000 Subject: [PATCH 0167/1922] [Inductor] update triton commit pin (#87732) Fixes https://github.com/pytorch/torchdynamo/issues/1746 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87732 Approved by: https://github.com/ngimel --- .github/ci_commit_pins/triton.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/triton.txt 
b/.github/ci_commit_pins/triton.txt index 58d82813d6e13..26387597d0911 100644 --- a/.github/ci_commit_pins/triton.txt +++ b/.github/ci_commit_pins/triton.txt @@ -1 +1 @@ -db3aa1d1fb2bb536752a71d9e0f03cf6a86ddf65 +5ca1ed01016530056c4507661c24d6c21efc983d From 889ebaebdff71e72b489fbd89adf3ce14a0af6d1 Mon Sep 17 00:00:00 2001 From: Charlie Yan Date: Wed, 26 Oct 2022 00:32:13 +0000 Subject: [PATCH 0168/1922] Format distributed.py (#87667) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87667 Approved by: https://github.com/zhaojuanmao --- torch/nn/parallel/distributed.py | 385 ++++++++++++++++++------------- 1 file changed, 230 insertions(+), 155 deletions(-) diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 2e271f7bb081f..23625d9d20014 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -39,10 +39,11 @@ from ._replicated_tensor_ddp_utils import _ddp_with_replicated_tensor_enabled from .scatter_gather import gather, is_namedtuple, scatter_kwargs # noqa: F401 -__all__ = ['DistributedDataParallel'] +__all__ = ["DistributedDataParallel"] logger = logging.getLogger(__name__) + def _tree_flatten_with_rref(output): output_is_rref = RPC_AVAILABLE and isinstance(output, RRef) if output_is_rref: @@ -142,12 +143,14 @@ class _BufferCommHookLocation(Enum): PRE_FORWARD = auto() POST_FORWARD = auto() + @dataclass class _BufferCommHook: buffer_comm_hook: Callable buffer_comm_hook_state: Any buffer_comm_hook_location: _BufferCommHookLocation + # Add a DDPSink to run various functions when backwards starts, such as # queueing call back of out-most backward/graph task, # this helps call back is fired after all gradients' calculation @@ -161,9 +164,7 @@ def forward(ctx, reducer, state_dict, *inputs): ctx.reducer = reducer ctx.state_dict = state_dict ret = tuple( - inp.clone() - if isinstance(inp, torch.Tensor) - else inp + inp.clone() if isinstance(inp, torch.Tensor) else inp for inp in inputs ) 
return ret @@ -173,8 +174,13 @@ def backward(ctx, *grad_outputs): state_dict = ctx.state_dict # Enqueue delay allreduce for static graph training on the first # iteration. - if ctx.state_dict['static_graph'] and ctx.state_dict['num_iterations'] == 1: - Variable._execution_engine.queue_callback(ctx.reducer._delay_all_reduce) + if ( + ctx.state_dict["static_graph"] + and ctx.state_dict["num_iterations"] == 1 + ): + Variable._execution_engine.queue_callback( + ctx.reducer._delay_all_reduce + ) return (None, None, *grad_outputs) @@ -209,7 +215,9 @@ def main_hook(self): ddp._check_and_sync_module_buffers() # Check if need to sync in the backward pass - work = ddp._check_global_requires_backward_grad_sync(is_joined_rank=True) + work = ddp._check_global_requires_backward_grad_sync( + is_joined_rank=True + ) work.wait() should_sync_backwards = work.result()[0].item() != 0 # Forward parameter sync is disabled in the next iteration if we @@ -237,6 +245,7 @@ def post_hook(self, is_last_joiner: bool): """ self.ddp._sync_final_model(is_last_joiner) + class DistributedDataParallel(Module, Joinable): r"""Implements distributed data parallelism that is based on ``torch.distributed`` package at the module level. @@ -556,10 +565,13 @@ def __init__( if device_ids is not None and len(device_ids) > 1: self._log_and_throw( - ValueError, "device_ids can only be None or contain a single element." 
+ ValueError, + "device_ids can only be None or contain a single element.", ) - self.is_multi_device_module = len({p.device for p in module.parameters()}) > 1 + self.is_multi_device_module = ( + len({p.device for p in module.parameters()}) > 1 + ) distinct_device_types = {p.device.type for p in module.parameters()} if len(distinct_device_types) != 1: self._log_and_throw( @@ -619,7 +631,9 @@ def __init__( else: self.parameters_to_ignore = [] - self._use_replicated_tensor_module = _ddp_with_replicated_tensor_enabled() + self._use_replicated_tensor_module = ( + _ddp_with_replicated_tensor_enabled() + ) self._build_replicated_tensor_module() if check_reduction: @@ -662,10 +676,15 @@ def __init__( params_and_buffers_to_ignore=self.parameters_to_ignore, ) # In debug mode, build a mapping of parameter index -> parameter. - param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters) + param_to_name_mapping = self._build_debug_param_to_name_mapping( + parameters + ) # Builds reducer. self._ddp_init_helper( - parameters, expect_sparse_gradient, param_to_name_mapping, static_graph + parameters, + expect_sparse_gradient, + param_to_name_mapping, + static_graph, ) self._has_rebuilt_buckets = False @@ -678,7 +697,10 @@ def _build_replicated_tensor_module(self): # registering '_replicated_tensor_module' as a submodule by directly # adding to self.__dict__. 
from ._replicated_tensor_ddp_interop import _replicate_module - self.__dict__['_replicated_tensor_module'] = _replicate_module(self.module, self.process_group) + + self.__dict__["_replicated_tensor_module"] = _replicate_module( + self.module, self.process_group + ) def _log_and_throw(self, err_type, err_msg): if self.logger is not None: @@ -686,8 +708,11 @@ def _log_and_throw(self, err_type, err_msg): raise err_type(err_msg) def _ddp_init_helper( - self, parameters, expect_sparse_gradient, param_to_name_mapping, - static_graph + self, + parameters, + expect_sparse_gradient, + param_to_name_mapping, + static_graph, ): """ Initialization helper function that does the following: @@ -720,8 +745,14 @@ def _ddp_init_helper( if static_graph is True or self.find_unused_parameters is False: bucket_size_limits = [sys.maxsize] else: - bucket_size_limits = [dist._DEFAULT_FIRST_BUCKET_BYTES, self.bucket_bytes_cap] - bucket_indices, per_bucket_size_limits = dist._compute_bucket_assignment_by_size( + bucket_size_limits = [ + dist._DEFAULT_FIRST_BUCKET_BYTES, + self.bucket_bytes_cap, + ] + ( + bucket_indices, + per_bucket_size_limits, + ) = dist._compute_bucket_assignment_by_size( parameters, bucket_size_limits, expect_sparse_gradient, @@ -747,7 +778,7 @@ def _ddp_init_helper( param_to_name_mapping, # User can set dist._DEFAULT_FIRST_BUCKET_BYTES to tune DDP first # bucket. - dist._DEFAULT_FIRST_BUCKET_BYTES + dist._DEFAULT_FIRST_BUCKET_BYTES, ) self.logger = dist.Logger(self.reducer) @@ -793,10 +824,15 @@ def __setstate__(self, state): self.__dict__.setdefault("require_backward_grad_sync", True) parameters, expect_sparse_gradient = self._build_params_for_reducer() # In debug mode, build a mapping of parameter index -> parameter. - param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters) + param_to_name_mapping = self._build_debug_param_to_name_mapping( + parameters + ) # Builds reducer. 
self._ddp_init_helper( - parameters, expect_sparse_gradient, param_to_name_mapping, self.static_graph + parameters, + expect_sparse_gradient, + param_to_name_mapping, + self.static_graph, ) if self.static_graph: self.reducer._set_static_graph() @@ -815,7 +851,8 @@ def _build_params_for_reducer(self): # parameters through _former_parameters. for param_name, param in module.named_parameters(recurse=False) if param.requires_grad - and f"{module_name}.{param_name}" not in self.parameters_to_ignore + and f"{module_name}.{param_name}" + not in self.parameters_to_ignore ] ] @@ -824,7 +861,8 @@ def _build_params_for_reducer(self): modules_and_parameters = [ # "p not in memo" is the deduplication check. # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed. - (m, p) for m, p in modules_and_parameters + (m, p) + for m, p in modules_and_parameters if p not in memo and not memo.add(p) ] @@ -841,7 +879,10 @@ def produces_sparse_gradient(module): # Build list of booleans indicating whether or not to expect sparse # gradients for the corresponding parameters. - expect_sparse_gradient = list(produces_sparse_gradient(module) for module, _ in modules_and_parameters) + expect_sparse_gradient = list( + produces_sparse_gradient(module) + for module, _ in modules_and_parameters + ) self._assign_modules_buffers() @@ -862,19 +903,21 @@ def _assign_modules_buffers(self): if buffer_name not in self.parameters_to_ignore ] self.modules_buffers = [ - buffer - for (buffer, buffer_name) in named_module_buffers + buffer for (buffer, buffer_name) in named_module_buffers ] # Dict[str, tensor] representing module buffers not ignored by DDP. 
self.named_module_buffers = { - buffer_name: buffer for (buffer, buffer_name) in named_module_buffers + buffer_name: buffer + for (buffer, buffer_name) in named_module_buffers } def _build_debug_param_to_name_mapping(self, parameters): if dist.get_debug_level() == dist.DebugLevel.OFF: return {} - param_to_param_index = {parameters[i]: i for i in range(len(parameters))} + param_to_param_index = { + parameters[i]: i for i in range(len(parameters)) + } param_set = set(parameters) param_index_to_param_fqn = {} for module_name, module in self.module.named_modules(): @@ -987,14 +1030,18 @@ def _inside_ddp_forward(self): DistributedDataParallel._active_ddp_module = None def _run_ddp_forward(self, *inputs, **kwargs): - module_to_run = self._replicated_tensor_module if self._use_replicated_tensor_module else self.module + module_to_run = ( + self._replicated_tensor_module + if self._use_replicated_tensor_module + else self.module + ) if self.device_ids: inputs, kwargs = _to_kwargs( inputs, kwargs, self.device_ids[0], - self.use_side_stream_for_tensor_copies + self.use_side_stream_for_tensor_copies, ) with self._inside_ddp_forward(): return module_to_run(*inputs[0], **kwargs[0]) @@ -1003,7 +1050,9 @@ def _run_ddp_forward(self, *inputs, **kwargs): return module_to_run(*inputs, **kwargs) def forward(self, *inputs, **kwargs): - with torch.autograd.profiler.record_function("DistributedDataParallel.forward"): + with torch.autograd.profiler.record_function( + "DistributedDataParallel.forward" + ): if torch.is_grad_enabled() and self.require_backward_grad_sync: self.logger.set_runtime_stats_and_log() self.num_iterations += 1 @@ -1024,18 +1073,22 @@ def forward(self, *inputs, **kwargs): # during forward computation. # This should be called only once during whole training period. if torch.is_grad_enabled() and self.reducer._rebuild_buckets(): - logger.info("Reducer buckets have been rebuilt in this iteration.") + logger.info( + "Reducer buckets have been rebuilt in this iteration." 
+ ) self._has_rebuilt_buckets = True # sync params according to location (before/after forward) user # specified as part of hook, if hook was specified. - buffer_hook_registered = hasattr(self, 'buffer_hook') + buffer_hook_registered = hasattr(self, "buffer_hook") if self._check_sync_bufs_pre_fwd(): self._sync_buffers() if self._join_config.enable: # Notify joined ranks whether they should sync in backwards pass or not. - self._check_global_requires_backward_grad_sync(is_joined_rank=False) + self._check_global_requires_backward_grad_sync( + is_joined_rank=False + ) output = self._run_ddp_forward(*inputs, **kwargs) @@ -1053,7 +1106,9 @@ def forward(self, *inputs, **kwargs): # unused parameters. Only if `find_unused_parameters` is set. if self.find_unused_parameters and not self.static_graph: # Do not need to populate this for static graph. - self.reducer.prepare_for_backward(list(_find_tensors(output))) + self.reducer.prepare_for_backward( + list(_find_tensors(output)) + ) else: self.reducer.prepare_for_backward([]) else: @@ -1065,13 +1120,15 @@ def forward(self, *inputs, **kwargs): self.static_graph and self.num_iterations == 1 ): state_dict = { - 'static_graph': self.static_graph, - 'num_iterations': self.num_iterations, + "static_graph": self.static_graph, + "num_iterations": self.num_iterations, } - output_tensor_list, treespec, output_is_rref = _tree_flatten_with_rref( - output - ) + ( + output_tensor_list, + treespec, + output_is_rref, + ) = _tree_flatten_with_rref(output) output_placeholders = [None for _ in range(len(output_tensor_list))] # Do not touch tensors that have no grad_fn, which can cause issues # such as https://github.com/pytorch/pytorch/issues/60733 @@ -1134,7 +1191,9 @@ def _check_global_requires_backward_grad_sync(self, is_joined_rank): # the models have buffers that should be synchronized in the forward pass. 
def _check_and_sync_module_buffers(self): if self._check_sync_bufs_pre_fwd(): - authoritative_rank = self._find_common_rank(self._distributed_rank, False) + authoritative_rank = self._find_common_rank( + self._distributed_rank, False + ) self._sync_module_buffers(authoritative_rank) # When running in join model, agrees upon a common rank and broadcast model @@ -1151,7 +1210,7 @@ def _sync_final_model(self, is_last_joiner): process_group=self.process_group, broadcast_bucket_size=self.broadcast_bucket_size, src=self._authoritative_rank, - params_and_buffers_to_ignore=self.parameters_to_ignore + params_and_buffers_to_ignore=self.parameters_to_ignore, ) # Schedule comm ops to match those scheduled in the reducer's backward @@ -1315,7 +1374,9 @@ def join_hook( cases for possibly better results. Default is ``True``. """ - divide_by_initial_world_size = kwargs.get("divide_by_initial_world_size", True) + divide_by_initial_world_size = kwargs.get( + "divide_by_initial_world_size", True + ) return _DDPJoinHook( self, divide_by_initial_world_size=divide_by_initial_world_size ) @@ -1332,49 +1393,49 @@ def _register_buffer_comm_hook( self, state, hook: callable, - comm_hook_location=_BufferCommHookLocation.POST_FORWARD + comm_hook_location=_BufferCommHookLocation.POST_FORWARD, ): r""" - Allows custom registration of hooks that define how buffer are - synchronized across ranks. The hook takes in an optional state - and is passed in a Dict[str, Tensor] corresponding to buffer names - and the buffers, and can run arbitrary reductions on buffers as - opposed to DDP's default broadcast from rank 0. This is useful for - example if a counter needs to be summed or averaged across ranks - every iteration. + Allows custom registration of hooks that define how buffer are + synchronized across ranks. 
The hook takes in an optional state + and is passed in a Dict[str, Tensor] corresponding to buffer names + and the buffers, and can run arbitrary reductions on buffers as + opposed to DDP's default broadcast from rank 0. This is useful for + example if a counter needs to be summed or averaged across ranks + every iteration. - Args: - state (Any): Optional state that is passed to the hook. - hook (Callable): Callable with the following signature: - ``hook(state: object, buffers: Dict[str, torch.Tensor]) - -> Optional[List[torch.futures.Future[torch.Tensor]]]`` - comm_hook_location (_BufferCommHookLocation): Enum value indicating - where to run the hook. - _BufferCommHookLocation.PRE_FORWARD means that the - hook will run _before_ the forward pass, and - _BufferCommHookLocation.POST_FORWARD means that the - hook will run _after_ the forward pass. - - hook (Callable): Callable with the following signature: - ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``: + Args: + state (Any): Optional state that is passed to the hook. + hook (Callable): Callable with the following signature: + ``hook(state: object, buffers: Dict[str, torch.Tensor]) + -> Optional[List[torch.futures.Future[torch.Tensor]]]`` + comm_hook_location (_BufferCommHookLocation): Enum value indicating + where to run the hook. + _BufferCommHookLocation.PRE_FORWARD means that the + hook will run _before_ the forward pass, and + _BufferCommHookLocation.POST_FORWARD means that the + hook will run _after_ the forward pass. - NOTE: To maximize performance, users can return a - List[torch.futures.Future] from their hook, and DDP will - install and await these hooks appropriately at the end of - the backward pass. This will ensure all buffers are - synchronized by the end of the backward pass. If this - setting is used, it is recommended to pass - comm_hook_location=_BufferCommHookLocation.POST_FORWARD, - which will trigger the hook after the forward pass. 
- If _BufferCommHookLocation.PRE_FORWARD is used, users must - ensure appropriate synchronization when manipulating GPU - buffers in the forward pass. - """ + hook (Callable): Callable with the following signature: + ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``: + + NOTE: To maximize performance, users can return a + List[torch.futures.Future] from their hook, and DDP will + install and await these hooks appropriately at the end of + the backward pass. This will ensure all buffers are + synchronized by the end of the backward pass. If this + setting is used, it is recommended to pass + comm_hook_location=_BufferCommHookLocation.POST_FORWARD, + which will trigger the hook after the forward pass. + If _BufferCommHookLocation.PRE_FORWARD is used, users must + ensure appropriate synchronization when manipulating GPU + buffers in the forward pass. + """ assert callable(hook) self.buffer_hook = _BufferCommHook( buffer_comm_hook=hook, buffer_comm_hook_state=state, - buffer_comm_hook_location=comm_hook_location + buffer_comm_hook_location=comm_hook_location, ) def register_comm_hook(self, state: object, hook: callable): @@ -1486,69 +1547,75 @@ def _register_builtin_comm_hook(self, comm_hook_type): self.logger._set_comm_hook_name(str(comm_hook_type)) dist._register_builtin_comm_hook(self.reducer, comm_hook_type) - def _register_fused_optim(self, optim: Type, *args, optim_params=None, **kwargs): + def _register_fused_optim( + self, optim: Type, *args, optim_params=None, **kwargs + ): r""" - Registers an optimizer with DDP such that the optimization for a - parameter will run immediately when that parameter's gradient is - finished with reduction, instead of waiting for all parameters' - gradients to finish reduction. This can result in a training speedup - depending on your workload since the optimizer can run while gradient - reduction for other parameters are still ongoing. 
In addition, this has - the potential to reduce peak memory consumption during training, as it - only needs to load the per-parameter optimizer states of a single - parameter at a time, instead of loading all per-parameter optimizer - states at once. + Registers an optimizer with DDP such that the optimization for a + parameter will run immediately when that parameter's gradient is + finished with reduction, instead of waiting for all parameters' + gradients to finish reduction. This can result in a training speedup + depending on your workload since the optimizer can run while gradient + reduction for other parameters are still ongoing. In addition, this has + the potential to reduce peak memory consumption during training, as it + only needs to load the per-parameter optimizer states of a single + parameter at a time, instead of loading all per-parameter optimizer + states at once. - Args: - optim_cls (Type): a ``torch.optim.Optimizer`` class to be registered - as a fused optimizer. - *args (Sequence[Any]): Arguments to forward to `optim_cls`. - optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters - to optimize, similar to `params` argument of traditional `torch.optim` - Optimizers. If this is omitted, all DDP model parameters will be - optimized. - **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim_cls`. - - .. warning :: - _register_fused_optim should only be called once on a DDP instance, - and registering multiple fused optimizers for the same DDP model - is not currently supported. Please ping - https://github.com/pytorch/pytorch/issues/71595 if this is necessary - for your use case. - - .. warning :: - _register_fused_optim and register_comm_hook currently do not - compose together, meaning that custom DDP communication hooks are - not supported with overlapped optimizers. Please ping - https://github.com/pytorch/pytorch/issues/71595 if this is necessary - for your use case. - - .. 
warning :: - Gradient accumulation and DDP `no_sync` are currently not supported - with overlapped optimizer. Please ping - https://github.com/pytorch/pytorch/issues/71595 if this is necessary - for your use case. + Args: + optim_cls (Type): a ``torch.optim.Optimizer`` class to be registered + as a fused optimizer. + *args (Sequence[Any]): Arguments to forward to `optim_cls`. + optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters + to optimize, similar to `params` argument of traditional `torch.optim` + Optimizers. If this is omitted, all DDP model parameters will be + optimized. + **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim_cls`. - Example:: + .. warning :: + _register_fused_optim should only be called once on a DDP instance, + and registering multiple fused optimizers for the same DDP model + is not currently supported. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. - >>> # xdoctest: +SKIP("No rendezvous handler") - >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...') - >>> net = torch.nn.parallel.DistributedDataParallel(model, pg) - >>> lr = 1e-2 - >>> betas = (0.9, 0.99) - >>> eps = 1e-6 - >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps) - >>> # Example with subset of parameters - >>> params_to_opt = [list(net.parameters())[0]] - >>> net._register_fused_optim( - ... torch.optim.Adam, lr, optim_params=params_to_opt, betas=betas, eps=eps - ... ) + .. warning :: + _register_fused_optim and register_comm_hook currently do not + compose together, meaning that custom DDP communication hooks are + not supported with overlapped optimizers. Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. + + .. warning :: + Gradient accumulation and DDP `no_sync` are currently not supported + with overlapped optimizer. 
Please ping + https://github.com/pytorch/pytorch/issues/71595 if this is necessary + for your use case. + + Example:: + + >>> # xdoctest: +SKIP("No rendezvous handler") + >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...') + >>> net = torch.nn.parallel.DistributedDataParallel(model, pg) + >>> lr = 1e-2 + >>> betas = (0.9, 0.99) + >>> eps = 1e-6 + >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps) + >>> # Example with subset of parameters + >>> params_to_opt = [list(net.parameters())[0]] + >>> net._register_fused_optim( + ... torch.optim.Adam, lr, optim_params=params_to_opt, betas=betas, eps=eps + ... ) """ # Note: importing in function, otherwise this will cause a circular # import as optimizer_overlap module needs to import DistributedDataParallel. - from torch.distributed.algorithms._optimizer_overlap import _as_overlapped_optim + from torch.distributed.algorithms._optimizer_overlap import ( + _as_overlapped_optim, + ) - overlapped_optim = _as_overlapped_optim(optim, optim_params, *args, **kwargs) + overlapped_optim = _as_overlapped_optim( + optim, optim_params, *args, **kwargs + ) try: overlapped_optim.register_ddp(self) except NotImplementedError: @@ -1565,16 +1632,16 @@ def _distributed_broadcast_coalesced( def _check_sync_bufs_post_fwd(self): return ( - self.will_sync_module_buffers() and - hasattr(self, 'buffer_hook') and - self.buffer_hook.buffer_comm_hook_location == - _BufferCommHookLocation.POST_FORWARD + self.will_sync_module_buffers() + and hasattr(self, "buffer_hook") + and self.buffer_hook.buffer_comm_hook_location + == _BufferCommHookLocation.POST_FORWARD ) def _check_sync_bufs_pre_fwd(self): return self.will_sync_module_buffers() and ( - not hasattr(self, 'buffer_hook') or - self.buffer_hook.buffer_comm_hook_location + not hasattr(self, "buffer_hook") + or self.buffer_hook.buffer_comm_hook_location == _BufferCommHookLocation.PRE_FORWARD ) @@ -1621,8 +1688,10 @@ def _sync_buffers(self): 
self._sync_module_buffers(authoritative_rank) def _sync_module_buffers(self, authoritative_rank): - if not hasattr(self, 'buffer_hook'): - self._default_broadcast_coalesced(authoritative_rank=authoritative_rank) + if not hasattr(self, "buffer_hook"): + self._default_broadcast_coalesced( + authoritative_rank=authoritative_rank + ) else: hook = self.buffer_hook.buffer_comm_hook state = self.buffer_hook.buffer_comm_hook_state @@ -1644,9 +1713,7 @@ def _default_broadcast_coalesced( bucket_size = self.broadcast_bucket_size self._distributed_broadcast_coalesced( - bufs, - bucket_size, - authoritative_rank + bufs, bucket_size, authoritative_rank ) def _passing_sync_batchnorm_handle(self, module): @@ -1654,12 +1721,15 @@ def _passing_sync_batchnorm_handle(self, module): if isinstance(layer, torch.nn.modules.SyncBatchNorm): if self.device_type == "cpu": self._log_and_throw( - ValueError, "SyncBatchNorm layers only work with GPU modules" + ValueError, + "SyncBatchNorm layers only work with GPU modules", ) def _check_comm_hook(self, hook): if not callable(hook): - self._log_and_throw(TypeError, "Communication hook must be callable.") + self._log_and_throw( + TypeError, "Communication hook must be callable." 
+ ) sig = inspect.signature(hook) if ( @@ -1680,18 +1750,23 @@ def _check_comm_hook(self, hook): "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].", ) - if ( - hook.__name__ in ["bf16_compress_hook", "bf16_compress_wrapper_hook"] - and - ( - (torch.version.cuda is None and torch.version.hip is None) - or (torch.version.cuda is not None and int(torch.version.cuda.split('.')[0]) < 11) - or not dist.is_available() - or not dist.is_nccl_available() - or torch.cuda.nccl.version() < (2, 10) + if hook.__name__ in [ + "bf16_compress_hook", + "bf16_compress_wrapper_hook", + ] and ( + (torch.version.cuda is None and torch.version.hip is None) + or ( + torch.version.cuda is not None + and int(torch.version.cuda.split(".")[0]) < 11 ) + or not dist.is_available() + or not dist.is_nccl_available() + or torch.cuda.nccl.version() < (2, 10) ): - self._log_and_throw(TypeError, "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+.") + self._log_and_throw( + TypeError, + "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+.", + ) @property def _distributed_rank(self): From 862bf333f913b4f8d229a9a68b4b0cb7afcec771 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Wed, 26 Oct 2022 06:33:43 +0000 Subject: [PATCH 0169/1922] Couple fixes for argmax/argmin (#87758) Removes a wrong assert, makes min number of warps = 2 (1 for some reason generates invalid code, https://github.com/openai/triton/issues/802). 
Hopefully fixes https://github.com/pytorch/torchdynamo/issues/1743, cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @mreso Pull Request resolved: https://github.com/pytorch/pytorch/pull/87758 Approved by: https://github.com/Chillee, https://github.com/soumith --- test/inductor/test_torchinductor.py | 11 +++++++++++ torch/_inductor/ir.py | 1 - torch/_inductor/triton_ops/autotune.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index c0139b3fcdf86..a675fc476672b 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -721,6 +721,17 @@ def fn(a): self.common(fn, (torch.full((4,), float("-inf")),)) + def test_reduction4(self): + if self.device == "cpu": + raise unittest.SkipTest("Non-deterministic CPU results") + + def fn(a): + return (a.argmax(-1), a.argmin(-1)) + + inputs = (torch.ones(128), torch.ones(4, 4, 1)) + for i in inputs: + self.common(fn, (i,)) + @patch.object(config, "dynamic_shapes", False) def test_unroll_small_reduction(self): def fn(x): diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 867e26e56c5ef..7554dc905e23f 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -688,7 +688,6 @@ def create( if reduction_type in ("argmin", "argmax"): def fn(index): - assert len(index) <= 1 return 0 else: diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py index 5d53b3522a25c..59ee762c7500a 100644 --- a/torch/_inductor/triton_ops/autotune.py +++ b/torch/_inductor/triton_ops/autotune.py @@ -343,7 +343,7 @@ def triton_config_reduction(size_hints, x, r, num_stages=2) -> Config: r *= 2 cfg = {"XBLOCK": x, "RBLOCK": r} - num_warps = next_power_of_2(min(max(conditional_product(x, r) // 128, 1), 8)) + num_warps = next_power_of_2(min(max(conditional_product(x, r) // 128, 2), 8)) return Config(cfg, num_warps=num_warps, num_stages=num_stages) 
From 55e7e0ff21539de56413f923888d9d2e1fa73ddb Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Wed, 26 Oct 2022 14:18:46 +0000 Subject: [PATCH 0170/1922] Remove getitem special handling in the partitioner (#87073) This special handling of getitem unnecessarily splits fusions at functions with tuple outputs. Example script: ```py import torch from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch._prims.nvfuser_executor import NvfuserPrimOperatorSupport from torch.fx.experimental.proxy_tensor import make_fx def func(x): xx = torch.ops.nvprims.add(x, 1) var, mean = torch.ops.nvprims.var_mean(x, correction=0) var_cos = torch.ops.nvprims.cos(var) mean_sin = torch.ops.nvprims.sin(mean) return torch.ops.nvprims.add(var_cos, mean_sin) a = torch.randn(5, 3, 3, device="cuda") gm = make_fx(func)(a) gm.graph.print_tabular() supported_ops = NvfuserPrimOperatorSupport() partitioner = CapabilityBasedPartitioner( gm, supported_ops, allows_single_node_partition=False ) partitions = partitioner.propose_partitions() print(partitions) partitioned_graph = partitioner.fuse_partitions(partitions) partitioned_graph.graph.print_tabular() ``` Output on master: ```py opcode name target args kwargs ------------- --------- --------------------------- ---------------- ----------------- placeholder x_1 x_1 () {} call_function add nvprims.add.default (x_1, 1) {} call_function var_mean nvprims.var_mean.main (x_1, [0, 1, 2]) {'correction': 0} call_function getitem (var_mean, 0) {} call_function getitem_1 (var_mean, 1) {} call_function cos nvprims.cos.default (getitem,) {} call_function sin nvprims.sin.default (getitem_1,) {} call_function add_1 nvprims.add.default (cos, sin) {} output output output (add_1,) {} [{cos, sin, add_1}, {var_mean, add, getitem, getitem_1}] opcode name target args kwargs ------------- --------- --------------------------- ---------------------- -------- placeholder x_1 x_1 () {} call_module fused_1 fused_1 (x_1,) {} call_function getitem_2 
(fused_1, 0) {} call_function getitem_3 (fused_1, 1) {} call_module fused_0 fused_0 (getitem_2, getitem_3) {} output output output (fused_0,) {} ``` Output with this PR: ``` [{var_mean, add_1, cos, sin, add, getitem_1, getitem}] opcode name target args kwargs ----------- ------- -------- ---------- -------- placeholder x_1 x_1 () {} call_module fused_0 fused_0 (x_1,) {} output output output (fused_0,) {} ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87073 Approved by: https://github.com/jjsjann123, https://github.com/SherlockNoMad --- test/test_fx_passes.py | 9 ++++++++- test/test_prims.py | 23 +++++++++++++++++++++++ torch/_prims/nvfuser_executor.py | 12 ++++++------ torch/fx/passes/infra/partitioner.py | 20 +++++++++----------- 4 files changed, 46 insertions(+), 18 deletions(-) diff --git a/test/test_fx_passes.py b/test/test_fx_passes.py index 0aa721f34a167..aa04fbac26187 100644 --- a/test/test_fx_passes.py +++ b/test/test_fx_passes.py @@ -182,10 +182,16 @@ def forward13(a, b, c): c1 = a1 + c return b1 + c1 + @staticmethod + def forward14(a, b, c): + a0, a1 = torch.ops.aten.std_mean(a) + out = a0 + 1.0 + return out + # A mock OperatorSupport class, where only operator.add is supported class MockOperatorSupport(OperatorSupport): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - return node.op == "call_function" and node.target in {operator.add, operator.getitem} + return node.op == "call_function" and node.target in {operator.add, operator.getitem, torch.ops.aten.std_mean} @instantiate_parametrized_tests @@ -215,6 +221,7 @@ class TestFXGraphPasses(JitTestCase): # 5 getitem special case (TestPartitionFunctions.forward13, [["add_2", "add_1", "add"]]), + (TestPartitionFunctions.forward14, [["add", "std_mean", "getitem", "getitem_1"]]), ]) def test_partitioner(self, fn, expected_partition): traced = symbolic_trace(fn) diff --git a/test/test_prims.py b/test/test_prims.py index 674a032796044..f1b8f897528b9 100644 --- 
a/test/test_prims.py +++ b/test/test_prims.py @@ -215,6 +215,29 @@ def func(a): ) self.assertFalse(include_any_nvprims_sin) + def test_partitioner_tuple_output(self, device): + # This test verifies that the partitioner doesn't segment on nodes with + # tuple outputs. + from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner + from torch._prims.nvfuser_executor import NvfuserPrimOperatorSupport + + a = make_tensor(5, 3, 3, device=device, dtype=torch.float32) + + def func(x): + xx = torch.ops.nvprims.add(x, 1) + var, mean = torch.ops.nvprims.var_mean(x, correction=0) + var_cos = torch.ops.nvprims.cos(var) + mean_sin = torch.ops.nvprims.sin(mean) + return torch.ops.nvprims.add(var_cos, mean_sin) + + gm = make_fx(func)(a) + supported_ops = NvfuserPrimOperatorSupport() + partitioner = CapabilityBasedPartitioner( + gm, supported_ops, allows_single_node_partition=False + ) + partitions = partitioner.propose_partitions() + self.assertEqual(len(partitions), 1) + @onlyCUDA @skipCUDAIfRocm def test_nvfuser_empty_fusion(self, device): diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py index e7d3df238bb50..01e566d97874c 100644 --- a/torch/_prims/nvfuser_executor.py +++ b/torch/_prims/nvfuser_executor.py @@ -1,3 +1,4 @@ +import operator from copy import deepcopy from dataclasses import dataclass from functools import lru_cache @@ -89,7 +90,7 @@ def make_nvfuser_fusion(gm: GraphModule, *nv_args_templates): # Everything in the graph must support nvfuser for node in gm.graph.nodes: - if node.op == "call_function" and "getitem" in node.name: + if node.op == "call_function" and node.target == operator.getitem: continue if ( node.op == "call_function" @@ -152,7 +153,7 @@ def run_node(self, node): def call_function(self, target, args, kwargs): # This handles tuple unpacking - if "getitem" in str(target): + if target == operator.getitem: assert isinstance(args[0], tuple) return target(*args, **kwargs) args = tuple(map(_to_nvfuser_constant, 
args)) @@ -237,10 +238,9 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: ) is not None ) - return ( - node.op == "call_function" - and getattr(node.target, "impl_nvfuser", None) is not None - or "getitem" in node.name # getitem is a special case + return node.op == "call_function" and ( + getattr(node.target, "impl_nvfuser", None) is not None + or node.target == operator.getitem ) diff --git a/torch/fx/passes/infra/partitioner.py b/torch/fx/passes/infra/partitioner.py index bc2af4c78cb7a..d582f98ecb764 100644 --- a/torch/fx/passes/infra/partitioner.py +++ b/torch/fx/passes/infra/partitioner.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Set, Iterable +from typing import Dict, List, Set, Iterable, Optional from torch.fx.passes.utils.fuser_utils import fuse_by_partitions @@ -44,12 +44,6 @@ def __init__(self, def __is_node_supported(self, node: Node) -> bool: return ( self.operator_support.is_node_supported(dict(self.graph_module.named_modules()), node) - and - # reject 'getitem' node since they are special cased in partitioning. 
- ( - node.op != "call_function" or - _get_qualified_name(node.target) != "_operator.getitem" # type: ignore[arg-type] - ) ) def propose_partitions(self) -> List[Partition]: @@ -110,13 +104,17 @@ def dfs_find_cycle(node): return True - def merge_single_node(node: Node, id: int): - assert node not in assignment + def merge_single_node(node: Node, id: Optional[int]): + if node in assignment: + partitions_by_id[assignment[node]].remove_node(node) - assignment[node] = id - if id not in partitions_by_id: + if id is None: + assignment.pop(node) + elif id not in partitions_by_id: + assignment[node] = id partitions_by_id[id] = Partition(id=id, nodes=[node]) else: + assignment[node] = id partitions_by_id[id].add_node(node) logger.debug("Proposing partitions...") From ffd6d882b1f180970e374686332390cf38297f13 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Wed, 26 Oct 2022 14:34:29 +0000 Subject: [PATCH 0171/1922] [Static Runtime] Make canEnableStaticRuntime examine sub-blocks (#87396) Summary: Someone was running into problems where 1) Static Runtime enablement would fail 2) We would try to fall back to the JIT interpreter *after trying to create `StaticModule`* 3) The fallback fails because Static Runtime mangled the graph. We don't want to prevent Static Runtime from mutating its input due to memory concerns. The intent of `canEnableStaticRuntime` is to catch issues in the module before Static Runtime messes with it. With this diff, `StaticModule` instantiation can be avoided by querying `canEnableStaticRuntime` and the issue is fixed. 
Test Plan: New unit test Differential Revision: D40564452 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87396 Approved by: https://github.com/tenpercent --- .../static_runtime/test_static_module.cc | 12 +++++++ torch/csrc/jit/runtime/static/impl.cpp | 33 ++++++++++++++----- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/benchmarks/static_runtime/test_static_module.cc b/benchmarks/static_runtime/test_static_module.cc index 70d1d1d306939..1574cda3ee24a 100644 --- a/benchmarks/static_runtime/test_static_module.cc +++ b/benchmarks/static_runtime/test_static_module.cc @@ -354,6 +354,18 @@ TEST(StaticRuntime, CanEnableStaticRuntime) { EXPECT_TRUE(testCanEnableStaticRuntime(is_not_script_none)); } +TEST(StaticRuntime, CanEnableStaticRuntimeSubBlocks) { + const auto src = R"JIT( + def forward(self, a: Tensor, b: Tensor, cond: bool): + if cond: + # aten::__is__ on tensors is blocked + return a is b + return False + )JIT"; + + EXPECT_FALSE(testCanEnableStaticRuntime(src)); +} + TEST(StaticRuntime, NestedOutput) { // dict of tuple of list const auto nested_output_script_0 = R"JIT( diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 897f3b5eee644..bef31efb50d17 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -56,9 +56,9 @@ namespace jit { namespace { -bool allArgsAreTensors(Node* node) { +bool allArgsAreTensors(const Node* node) { const auto& inputs = node->inputs(); - return std::all_of(inputs.begin(), inputs.end(), [](Value* value) { + return std::all_of(inputs.begin(), inputs.end(), [](const Value* value) { return value->type()->kind() == TypeKind::TensorType; }); } @@ -69,7 +69,7 @@ bool allArgsAreTensors(Node* node) { // These are rarely-used ops. Disallowing them typically eliminates // corner cases in graph optimizations, allowing for more aggressive // optimizations and better performance. 
-bool isUnsupportedOp(Node* node) { +bool isUnsupportedOp(const Node* node) { auto kind = node->kind(); if (kind != aten::__is__ && kind != aten::__isnot__) { return false; @@ -87,12 +87,21 @@ bool isUnsupportedOp(Node* node) { return allArgsAreTensors(node); } -// graph must be frozen or canEnableStaticRuntime would return false -// if there's any prim::CallMethod op left in the graph -bool canEnableStaticRuntime(const std::shared_ptr& graph) { - // check for sub-blocks +namespace { + +bool canEnableStaticRuntimeImpl(const Block* block) { + if (block == nullptr) { + return false; + } + bool can_support = true; - for (auto* node : graph->block()->nodes()) { + for (auto* node : block->nodes()) { + for (auto* subblock : node->blocks()) { + // The ordering prevents && from short circuiting, which we want - + // it's useful to see *all* the unsupported ops. + can_support = canEnableStaticRuntimeImpl(subblock) && can_support; + } + const auto kind = node->kind(); if (kind == prim::Constant) { continue; @@ -107,6 +116,14 @@ bool canEnableStaticRuntime(const std::shared_ptr& graph) { return can_support; } +} // namespace + +// Graph must be frozen. canEnableStaticRuntime will return false +// if there's any prim::CallMethod ops left in the graph. +bool canEnableStaticRuntime(const std::shared_ptr& graph) { + return canEnableStaticRuntimeImpl(graph->block()); +} + namespace { auto sr_metadata_registerer = torch::class_( From ecde979c3f0baefc07e391575ee47a0c04cd8d60 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Wed, 26 Oct 2022 14:40:29 +0000 Subject: [PATCH 0172/1922] [ROCm] Use -rpath-link to fix libtinfo conflict (#83552) Fixes issue building PyTorch for ROCm5.3 and above on Ubuntu20.04 because libtinfo6 from conda conflicts with the one from the distro causing symbol not found errors. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83552 Approved by: https://github.com/malfet --- cmake/Dependencies.cmake | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 47f5be14ed9a6..05153a0f75d5b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1270,6 +1270,21 @@ endif() # ---[ HIP if(USE_ROCM) + # This prevents linking in the libtinfo from /opt/conda/lib which conflicts with ROCm libtinfo. + # Currently only active for Ubuntu 20.04 and greater versions. + if(UNIX) + file(STRINGS /etc/os-release OS_RELEASE) + string(REGEX REPLACE "NAME=\"([A-Za-z]+).*" "\\1" OS_DISTRO ${OS_RELEASE}) + string(REGEX REPLACE ".*VERSION_ID=\"([0-9\.]+).*" "\\1" OS_VERSION ${OS_RELEASE}) + if(OS_DISTRO STREQUAL "Ubuntu" AND OS_VERSION VERSION_GREATER_EQUAL "20.04") + find_library(LIBTINFO_LOC tinfo NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH) + if(LIBTINFO_LOC) + get_filename_component(LIBTINFO_LOC_PARENT ${LIBTINFO_LOC} DIRECTORY) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,${LIBTINFO_LOC_PARENT}") + endif() + endif() + endif() + include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake) if(PYTORCH_FOUND_HIP) message(INFO "Compiling with HIP for AMD.") From b980eaf8ac6847a7a91ff1b85e2efe56ba1722e3 Mon Sep 17 00:00:00 2001 From: Antoni Viros i Martin Date: Wed, 26 Oct 2022 14:48:27 +0000 Subject: [PATCH 0173/1922] Implement copy_, fill_, and ones_like for Nested Tensors backends (#87728) Summary: This diff implements copy_ in order to allow pinned memory transfers for nested tensors, as well as fill_ and ones_like, to test whether nested tensors can be created with other factory functions. Test Plan: Pass all CI and sandcastle jobs. 
Reviewed By: mikekgfb Differential Revision: D40689594 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87728 Approved by: https://github.com/cpuhrsch --- aten/src/ATen/native/native_functions.yaml | 4 + .../native/nested/NestedTensorFactories.cpp | 24 +++++- .../cuda/NestedTensorTransformerFunctions.cpp | 2 +- test/test_nestedtensor.py | 76 ++++++++++++++++++- 4 files changed, 100 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c1c2b363cb99b..69951d7b2fabf 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1539,6 +1539,7 @@ SparseCPU, SparseCUDA: copy_sparse_wrapper_ CompositeExplicitAutograd: copy_ SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_ + NestedTensorCPU, NestedTensorCUDA: copy_nested_ autogen: copy.out - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor @@ -2404,6 +2405,7 @@ QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_ + NestedTensorCPU, NestedTensorCUDA: fill_nested_ autogen: fill.Scalar_out - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) 
@@ -2414,6 +2416,7 @@ MPS: fill_tensor_mps_ QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ + NestedTensorCPU, NestedTensorCUDA: fill_nested_ autogen: fill.Tensor_out - func: floor(Tensor self) -> Tensor @@ -3863,6 +3866,7 @@ # NB: Although this composite mutates on the inside, it is # non-differentiable so NonFunctional doesn't apply CompositeExplicitAutograd: ones_like + NestedTensorCPU, NestedTensorCUDA: ones_like autogen: ones_like.out - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp index 998a62eb136d1..01e72649bd3ff 100644 --- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp +++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp @@ -106,9 +106,31 @@ Tensor _to_copy_nested( Tensor r; r = at::empty_like(self, dtype, layout, device, pin_out, memory_format); get_nested_tensor_impl(r)->get_buffer().copy_( - get_nested_tensor_impl(self)->get_buffer()); + get_nested_tensor_impl(self)->get_buffer(), non_blocking); return r; } +Tensor& copy_nested_(Tensor& self, const Tensor& src, bool non_blocking) { + const auto* nt_self = get_nested_tensor_impl(self); + const auto* nt_src = get_nested_tensor_impl(src); + TORCH_CHECK( + at::equal(nt_self->get_nested_size_tensor(), nt_src->get_nested_size_tensor()), + "copy_ only supports tensors that are the same size for Nested implementations"); + nt_self->get_buffer().copy_(nt_src->get_buffer(), non_blocking); + return self; +} + +Tensor& fill_nested_(Tensor& self, const Scalar& value) { + const auto& self_buf = get_nested_tensor_impl(self)->get_buffer(); + self_buf.fill_(value); + return self; +} + +Tensor& fill_nested_(Tensor& self, const Tensor& value) { + const auto& self_buf = get_nested_tensor_impl(self)->get_buffer(); + self_buf.fill_(value); + return self; +} + } // namespace native } // namespace at diff --git 
a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index 4028c8d5c3e4b..307fc20721d60 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -53,7 +53,7 @@ Tensor nested_from_padded_cuda( TORCH_CHECK( (padded.dim() == 4 && do_transform_0213) || (padded.dim() == 3 && !do_transform_0213), - "padded tensor size error"); + "padded tensor size error: ", padded.dim()); Tensor target_offsets = NestedTensor_batch_offsets_from_size_tensor(sizes, 0); Tensor padded_sizes_tensor = at::tensor(padded.sizes()); diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py index 7eb7dead38d3d..f51db59958696 100644 --- a/test/test_nestedtensor.py +++ b/test/test_nestedtensor.py @@ -1,18 +1,26 @@ # Owner(s): ["module: nestedtensor"] +import unittest + import torch import torch.nn -import unittest from torch.testing._internal.common_device_type import ( dtypes, dtypesIfCUDA, instantiate_device_type_tests, - skipMeta, + onlyCPU, onlyCUDA, - onlyCPU + skipMeta, ) from torch.testing._internal.common_dtype import floating_types_and_half -from torch.testing._internal.common_utils import TestCase, IS_FBCODE, run_tests, freeze_rng_state, parametrize, gradcheck +from torch.testing._internal.common_utils import ( + freeze_rng_state, + gradcheck, + IS_FBCODE, + parametrize, + run_tests, + TestCase, +) # Tests are ported from pytorch/nestedtensor. # This makes porting as_nested_tensor easier in the future. 
@@ -365,6 +373,66 @@ def test_data_ptr(getter): self.assertIs(torch.int32, nt2.to(dtype=torch.int32).dtype) self.assertEqual(nt2.device, nt2.to(dtype=torch.int32).device) + def test_copy_(self): + ntensors = 4 + nt = random_nt(torch.device('cpu'), torch.float32, ntensors, (4, 4)) + nt_copy = torch.empty_like(nt) + nt_copy.copy_(nt) + + for (nt_ub, nt_copy_ub) in zip(nt.unbind(), nt_copy): + self.assertEqual(nt_ub, nt_copy_ub) + + nt_error = torch.nested.nested_tensor([torch.tensor([0, 0])]) + self.assertRaisesRegex( + RuntimeError, + "copy_ only supports tensors that are the same size for Nested implementations", + lambda: nt_error.copy_(nt) + ) + + if torch.cuda.is_available(): + nt = random_nt(torch.device('cuda'), torch.float32, ntensors, (4, 4)) + nt_copy = torch.empty_like(nt, device=torch.device('cpu')) + nt_copy.copy_(nt, non_blocking=True) + torch.cuda.current_stream(torch.cuda.current_device()).synchronize() + for (nt_ub, nt_copy_ub) in zip(nt.unbind(), nt_copy): + self.assertEqual(nt_ub, nt_copy_ub) + + nt_copy = torch.empty_like(nt, device=torch.device('cpu')) + nt_copy.copy_(nt, non_blocking=False) + for (nt_ub, nt_copy_ub) in zip(nt.unbind(), nt_copy): + self.assertEqual(nt_ub, nt_copy_ub) + + def test_fill_(self): + ntensors = 4 + nt = random_nt(torch.device('cpu'), torch.float32, ntensors, (4, 4)) + nt.fill_(10.) + for nt_ub in nt.unbind(): + t = torch.empty_like(nt_ub) + t.fill_(10.) + self.assertEqual(nt_ub, t) + + fill_tensor = torch.tensor([11.]) + self.assertRaisesRegex( + RuntimeError, + "fill_ only supports 0-dimension value tensor", + lambda: nt.fill_(fill_tensor) + ) + + nt.fill_(fill_tensor[0]) + for nt_ub in nt.unbind(): + t = torch.empty_like(nt_ub) + t.fill_(11.) 
+ self.assertEqual(nt_ub, t) + + def test_ones_like(self): + ntensors = 4 + nt = random_nt(torch.device('cpu'), torch.float32, ntensors, (4, 4)) + ones_nt = torch.ones_like(nt) + + for nt_ub in ones_nt.unbind(): + t = torch.ones_like(nt_ub) + self.assertEqual(nt_ub, t) + class TestNestedTensorDeviceType(TestCase): From b53172b61675f02a14eb6276a97a152cdda8212b Mon Sep 17 00:00:00 2001 From: Pruthvi Madugundu Date: Wed, 26 Oct 2022 15:34:38 +0000 Subject: [PATCH 0174/1922] [ROCm] Move ROCm CI build to python 3.8 version (#86677) The ROCm CI build currently uses Python 3.7; this change upgrades it to Python 3.8. Pull Request resolved: https://github.com/pytorch/pytorch/pull/86677 Approved by: https://github.com/malfet --- .circleci/docker/build.sh | 8 +++---- .github/workflows/docker-builds.yml | 4 ++-- .github/workflows/periodic.yml | 36 ++++++++++++++--------------- .github/workflows/pull.yml | 8 +++---- .github/workflows/trunk.yml | 18 +++++++-------- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 7633f1eacac09..b38456badc271 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -259,8 +259,8 @@ case "$image" in VISION=yes CONDA_CMAKE=yes ;; - pytorch-linux-focal-rocm5.1-py3.7) - ANACONDA_PYTHON_VERSION=3.7 + pytorch-linux-focal-rocm5.1-py3.8) + ANACONDA_PYTHON_VERSION=3.8 GCC_VERSION=9 PROTOBUF=yes DB=yes @@ -268,8 +268,8 @@ case "$image" in ROCM_VERSION=5.1.1 CONDA_CMAKE=yes ;; - pytorch-linux-focal-rocm5.2-py3.7) - ANACONDA_PYTHON_VERSION=3.7 + pytorch-linux-focal-rocm5.2-py3.8) + ANACONDA_PYTHON_VERSION=3.8 GCC_VERSION=9 PROTOBUF=yes DB=yes diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 62699dde2243d..572d8146ebe51 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -38,8 +38,8 @@ jobs: - docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 - docker-image-name: 
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - docker-image-name: pytorch-linux-bionic-py3.7-clang9 - - docker-image-name: pytorch-linux-focal-rocm5.1-py3.7 - - docker-image-name: pytorch-linux-focal-rocm5.2-py3.7 + - docker-image-name: pytorch-linux-focal-rocm5.1-py3.8 + - docker-image-name: pytorch-linux-focal-rocm5.2-py3.8 - docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12 - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index e0b69e6b6d91e..58e379e0b5fd2 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -34,20 +34,20 @@ jobs: docker-image: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.docker-image }} test-matrix: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }} - linux-focal-rocm5_2-py3_7-slow-build: - name: linux-focal-rocm5.2-py3.7-slow + linux-focal-rocm5_2-py3_8-slow-build: + name: linux-focal-rocm5.2-py3.8-slow uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.2-py3.7 - docker-image-name: pytorch-linux-focal-rocm5.2-py3.7 + build-environment: linux-focal-rocm5.2-py3.8 + docker-image-name: pytorch-linux-focal-rocm5.2-py3.8 - linux-focal-rocm5_2-py3_7-slow-test: - name: linux-focal-rocm5.2-py3.7-slow + linux-focal-rocm5_2-py3_8-slow-test: + name: linux-focal-rocm5.2-py3.8-slow uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_2-py3_7-slow-build + needs: linux-focal-rocm5_2-py3_8-slow-build with: - build-environment: linux-focal-rocm5.2-py3.7 - docker-image: ${{ needs.linux-focal-rocm5_2-py3_7-slow-build.outputs.docker-image }} + build-environment: linux-focal-rocm5.2-py3.8 + docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-slow-build.outputs.docker-image }} test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 1, 
runner: "linux.rocm.gpu" }, @@ -56,20 +56,20 @@ jobs: AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} - linux-focal-rocm5_2-py3_7-distributed-build: - name: linux-focal-rocm5.2-py3.7-distributed + linux-focal-rocm5_2-py3_8-distributed-build: + name: linux-focal-rocm5.2-py3.8-distributed uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.2-py3.7 - docker-image-name: pytorch-linux-focal-rocm5.2-py3.7 + build-environment: linux-focal-rocm5.2-py3.8 + docker-image-name: pytorch-linux-focal-rocm5.2-py3.8 - linux-focal-rocm5_2-py3_7-distributed-test: - name: linux-focal-rocm5.2-py3.7-distributed + linux-focal-rocm5_2-py3_8-distributed-test: + name: linux-focal-rocm5.2-py3.8-distributed uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_2-py3_7-distributed-build + needs: linux-focal-rocm5_2-py3_8-distributed-build with: - build-environment: linux-focal-rocm5.2-py3.7 - docker-image: ${{ needs.linux-focal-rocm5_2-py3_7-distributed-build.outputs.docker-image }} + build-environment: linux-focal-rocm5.2-py3.8 + docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-distributed-build.outputs.docker-image }} test-matrix: | { include: [ { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index faea02440bfa6..cc25bfc1326d1 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -298,12 +298,12 @@ jobs: docker-image-name: pytorch-linux-focal-py3.7-gcc7 build-generates-artifacts: false - linux-focal-rocm5_2-py3_7-build: + linux-focal-rocm5_2-py3_8-build: # don't run build twice on master if: github.event_name == 'pull_request' - name: linux-focal-rocm5.2-py3.7 + name: linux-focal-rocm5.2-py3.8 uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.2-py3.7 - 
docker-image-name: pytorch-linux-focal-rocm5.2-py3.7 + build-environment: linux-focal-rocm5.2-py3.8 + docker-image-name: pytorch-linux-focal-rocm5.2-py3.8 sync-tag: rocm-build diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index af348a84556c9..29dc9f3c44d3f 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -284,21 +284,21 @@ jobs: cuda-version: "11.6" test-matrix: ${{ needs.win-vs2019-cuda11_6-py3-build.outputs.test-matrix }} - linux-focal-rocm5_2-py3_7-build: - name: linux-focal-rocm5.2-py3.7 + linux-focal-rocm5_2-py3_8-build: + name: linux-focal-rocm5.2-py3.8 uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.2-py3.7 - docker-image-name: pytorch-linux-focal-rocm5.2-py3.7 + build-environment: linux-focal-rocm5.2-py3.8 + docker-image-name: pytorch-linux-focal-rocm5.2-py3.8 sync-tag: rocm-build - linux-focal-rocm5_2-py3_7-test: - name: linux-focal-rocm5.2-py3.7 + linux-focal-rocm5_2-py3_8-test: + name: linux-focal-rocm5.2-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_2-py3_7-build + needs: linux-focal-rocm5_2-py3_8-build with: - build-environment: linux-focal-rocm5.2-py3.7 - docker-image: ${{ needs.linux-focal-rocm5_2-py3_7-build.outputs.docker-image }} + build-environment: linux-focal-rocm5.2-py3.8 + docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-build.outputs.docker-image }} test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, From 342f5f9e82c616f5cc3c83d08a280f6901fdac9f Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Tue, 25 Oct 2022 22:59:58 +0000 Subject: [PATCH 0175/1922] [FSDP][BE] Improve the assert message of sharded load_state_dict (#87486) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87486 Approved by: https://github.com/awgu --- torch/distributed/fsdp/_state_dict_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py index 33fa0d441107b..ed4b8f226c123 100644 --- a/torch/distributed/fsdp/_state_dict_utils.py +++ b/torch/distributed/fsdp/_state_dict_utils.py @@ -21,6 +21,7 @@ ) from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform +from ._fsdp_extensions import _extensions as _user_extensions from .flat_param import ( FlatParamHandle, ) @@ -288,6 +289,7 @@ def _sharded_pre_load_state_dict_hook( nonsharded_tensors = [] shared_fqns = [fqn for fqn, _, _ in module._shared_param_fqns] + loaded_shapes = [] for fqn, _, _ in module._param_fqns: full_fqn = f"{prefix}{FSDP.FSDP_PREFIX}{fqn}" param = state_dict.pop(full_fqn) @@ -295,6 +297,7 @@ def _sharded_pre_load_state_dict_hook( continue # All-gather the param (ShardedTensor) param, shards = _ext_pre_load_state_dict_transform(param) + loaded_shapes.append(param.size()) assert len(shards) < 2, ( "Expects 0 or 1 shard per rank " f"but got {len(shards)} shards on rank {module.rank}." @@ -331,6 +334,11 @@ def _sharded_pre_load_state_dict_hook( loaded_flat_param, module.rank, module.world_size, ) loaded_flat_tensor.to(flat_param.device) + assert all(s1 == s2 for s1, s2 in zip(loaded_shapes, flat_param._shapes)), ( + f"The original shapes in FSDP are {flat_param._shapes}. " + f"The loaded shapes are {loaded_shapes}. " + f"FSDP extension is {'NOT' if _user_extensions is None else ''} None." + ) assert flat_param.numel() == loaded_flat_tensor.numel(), ( f"The loaded local chunk has different numel({loaded_flat_tensor.numel()}) " f"from the local chunk {flat_param.numel()}." From 8448407ea789263a08a027597d753f1d0b6a6288 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 25 Oct 2022 06:58:11 -0700 Subject: [PATCH 0176/1922] Add test that `import torch` doesn't modify global logging state (#87629) Fixes https://github.com/pytorch/pytorch/issues/87626 Also adds the same test for `import functorch`. 
Users have complained at us when we do modify the global logging state, which has happened in the past. Test Plan: - tested locally; I added `logging.basicConfig` to `torch/__init__.py` and checked that the test got triggered Pull Request resolved: https://github.com/pytorch/pytorch/pull/87629 Approved by: https://github.com/albanD --- test/test_testing.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/test_testing.py b/test/test_testing.py index 3ad6ff06c771e..ccb2471e71e7b 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -1818,6 +1818,27 @@ def test_no_warning_on_import(self) -> None: cwd=os.path.dirname(os.path.realpath(__file__)),).decode("utf-8") self.assertEquals(out, "") + @unittest.skipIf(IS_WINDOWS, "importing torch+CUDA on CPU results in warning") + @parametrize('path', ['torch', 'functorch']) + def test_no_mutate_global_logging_on_import(self, path) -> None: + # Calling logging.basicConfig, among other things, modifies the global + # logging state. It is not OK to modify the global logging state on + # `import torch` (or other submodules we own) because users do not expect it. 
+ expected = 'abcdefghijklmnopqrstuvwxyz' + commands = [ + 'import logging', + f'import {path}', + '_logger = logging.getLogger("torch_test_testing")', + 'logging.root.addHandler(logging.StreamHandler())', + 'logging.root.setLevel(logging.INFO)', + f'_logger.info("{expected}")' + ] + out = subprocess.check_output( + [sys.executable, "-W", "all", "-c", "; ".join(commands)], + stderr=subprocess.STDOUT, + ).decode("utf-8") + self.assertEqual(out.strip(), expected) + class TestOpInfos(TestCase): def test_sample_input(self) -> None: a, b, c, d, e = [object() for _ in range(5)] @@ -1913,6 +1934,7 @@ def test_opinfo_error_generators(self, device, op): instantiate_device_type_tests(TestOpInfoSampleFunctions, globals()) +instantiate_parametrized_tests(TestImports) if __name__ == '__main__': From 7dcfbdcec11d5a2b98bf6ca35e8aedf256b33a63 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 26 Oct 2022 16:20:46 +0000 Subject: [PATCH 0177/1922] optim utils all_gather_into_tensor (#87769) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/87769 Approved by: https://github.com/awgu --- torch/distributed/fsdp/_optim_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py index 7200e6f207a5f..a5e1ab64278e5 100644 --- a/torch/distributed/fsdp/_optim_utils.py +++ b/torch/distributed/fsdp/_optim_utils.py @@ -193,7 +193,7 @@ def _communicate_optim_state( # has the same shape as the sharded flattened parameter buffer_size = flat_param._full_param_padded.size() # type: ignore[attr-defined] tensor_buffer = value.new_zeros(*buffer_size) - dist._all_gather_base(tensor_buffer, value, group=group) + dist.all_gather_into_tensor(tensor_buffer, value, group=group) torch.cuda.synchronize() if to_save: unpadded_numel = flat_param._unpadded_unsharded_size.numel() # type: ignore[attr-defined] From 4a93e2aa2536cee4ff1e66dd9fc4dcb5905f7c01 Mon Sep 17 00:00:00 2001 From: 
PyTorch MergeBot Date: Wed, 26 Oct 2022 16:43:13 +0000 Subject: [PATCH 0178/1922] Revert "[ROCm] Use -rpath-link to fix libtinfo conflict (#83552)" This reverts commit a10446c4d826ae5505fa129ea9800d3924b25364. Reverted https://github.com/pytorch/pytorch/pull/83552 on behalf of https://github.com/kit1980 due to Broke ios/macos builds https://github.com/pytorch/pytorch/actions/runs/3329991911/jobs/5507911292 --- cmake/Dependencies.cmake | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 05153a0f75d5b..47f5be14ed9a6 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1270,21 +1270,6 @@ endif() # ---[ HIP if(USE_ROCM) - # This prevents linking in the libtinfo from /opt/conda/lib which conflicts with ROCm libtinfo. - # Currently only active for Ubuntu 20.04 and greater versions. - if(UNIX) - file(STRINGS /etc/os-release OS_RELEASE) - string(REGEX REPLACE "NAME=\"([A-Za-z]+).*" "\\1" OS_DISTRO ${OS_RELEASE}) - string(REGEX REPLACE ".*VERSION_ID=\"([0-9\.]+).*" "\\1" OS_VERSION ${OS_RELEASE}) - if(OS_DISTRO STREQUAL "Ubuntu" AND OS_VERSION VERSION_GREATER_EQUAL "20.04") - find_library(LIBTINFO_LOC tinfo NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH) - if(LIBTINFO_LOC) - get_filename_component(LIBTINFO_LOC_PARENT ${LIBTINFO_LOC} DIRECTORY) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,${LIBTINFO_LOC_PARENT}") - endif() - endif() - endif() - include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake) if(PYTORCH_FOUND_HIP) message(INFO "Compiling with HIP for AMD.") From b0c8e63a4d8459ec9cce9e7da6938b4a6120fbe1 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Wed, 26 Oct 2022 17:00:02 +0000 Subject: [PATCH 0179/1922] Enable nvprims.transpose fusions for nvFuser (#86967) This PR allows transposes to be fused with other operations. If a fusion group is formed only from operations that just manipulate metadata in PyTorch (transpose, view, etc.) 
then this group is not sent to nvFuser. On top of that, if we have converted to `nvprims` but then decided to not form a fusion group, we modify the graph to use the `prim.impl_aten` attribute instead of calling `prim(*args, **kwargs)`, which has a higher overhead. cc @kevinstephano @jjsjann123 Pull Request resolved: https://github.com/pytorch/pytorch/pull/86967 Approved by: https://github.com/jjsjann123, https://github.com/SherlockNoMad --- test/test_prims.py | 26 +++++++++++++++++++++ torch/_prims/__init__.py | 1 + torch/_prims/context.py | 4 ---- torch/_prims/nvfuser_executor.py | 35 ++++++++++++++++++++++++++-- torch/_prims/nvfuser_prims.py | 8 ++++++- torch/fx/passes/infra/partitioner.py | 18 +++++++++++--- 6 files changed, 82 insertions(+), 10 deletions(-) diff --git a/test/test_prims.py b/test/test_prims.py index f1b8f897528b9..6223a34e0a3a9 100644 --- a/test/test_prims.py +++ b/test/test_prims.py @@ -875,6 +875,32 @@ def func7(a): out = execute(gm, a, executor="strictly_nvfuser") self.assertEqual(out, func(a)) + @onlyCUDA + @skipCUDAIfRocm + @dtypes(torch.float16, torch.float32) + def test_nvprims_view_partitioner(self, device, dtype): + # This test verifies that views that are not fused with other ops are + # correctly overridden to call aten implementation. 
+ from torch.fx.experimental.proxy_tensor import make_fx + from torch._prims.context import TorchRefsNvfuserCapabilityMode + from torch._prims.nvfuser_executor import maybe_partition_graph + + make_arg = partial(make_tensor, device=device, dtype=dtype) + a = make_arg((4, 5)) + b = make_arg((5, 4)) + + def func(a, b): + aa = a.view(b.shape) + aa = aa.view(a.shape) + return aa.digamma() + + with TorchRefsNvfuserCapabilityMode(): + gm = make_fx(func)(a, b) + gm, _ = maybe_partition_graph(gm, False, False) + + out = gm(a, b) + self.assertEqual(out, func(a, b)) + @onlyCUDA @skipCUDAIfRocm @dtypes(torch.float32, torch.float16) diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py index 3248009ee66e5..b54019ef031c9 100644 --- a/torch/_prims/__init__.py +++ b/torch/_prims/__init__.py @@ -306,6 +306,7 @@ def _backend_select_impl(*args, **kwargs): p.schema = schema p.prim_impl = _prim_impl p.prim_meta_impl = meta + p.impl_aten = impl_aten return _prim diff --git a/torch/_prims/context.py b/torch/_prims/context.py index 2bcee069d146c..203d73fd948eb 100644 --- a/torch/_prims/context.py +++ b/torch/_prims/context.py @@ -254,10 +254,6 @@ def _is_func_unsupported_nvfuser( class TorchRefsNvfuserCapabilityMode(TorchRefsMode): def __init__(self, *, skip_ops=()): aten_ops_to_skip = ( - "aten.transpose.int", - "aten.t.default", - "aten.unsqueeze.default", - "aten.permute.default", "aten._log_softmax.default", "aten._log_softmax_backward_data.default", "aten.expand.default", diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py index 01e566d97874c..227e1847265bb 100644 --- a/torch/_prims/nvfuser_executor.py +++ b/torch/_prims/nvfuser_executor.py @@ -30,7 +30,7 @@ DEFAULT_NVFUSER_PYTHON_CONFIG = MappingProxyType( { "use_python_fusion_cache": True, - "allow_single_op_fusion": True, + "allow_single_op_fusion": False, } ) @@ -268,6 +268,23 @@ def __call__(self, *args): ) +# A set of operators that are supported by nvFuser +# but should not form a 
fusion group solely on their own +_non_compute_ops = [ + "torch.ops." + str(getattr(torch.ops.nvprims, prim).default) + for prim in dir(torch.ops.nvprims) + if isinstance(getattr(torch.ops.nvprims, prim), torch._ops.OpOverloadPacket) + and getattr(torch.ops.nvprims, prim).return_type + == torch._prims_common.RETURN_TYPE.VIEW +] + +_allowed_single_node_partition_ops = [ + "torch.ops.nvprims.native_batch_norm.default", + "torch.ops.nvprims.var_mean.default", + "torch.ops.nvprims.var_mean.main", +] + + def _remove_empty_like_fill(gm: GraphModule): # Remove empty_like + fill nodes that prevent lowering to nvprims # This is a workaround for nonoptimal traces of C++ code `(1 - tensor)` @@ -325,7 +342,11 @@ def maybe_partition_graph( # CapabilityBasedPartitioner modifies the graph in-place so we need to make a copy of the graph gm = deepcopy(gm) partitioner = CapabilityBasedPartitioner( - gm, supported_ops, allows_single_node_partition=allow_single_op_fusion + gm, + supported_ops, + allows_single_node_partition=allow_single_op_fusion, + non_compute_ops=_non_compute_ops, + allowed_single_node_partition_ops=_allowed_single_node_partition_ops, ) partitions = partitioner.propose_partitions() if len(partitions) == 0: @@ -350,6 +371,16 @@ def maybe_partition_graph( NvfuserGraphModule(nvfuser_submodule, use_python_fusion_cache), ) + # Go through the graph and replace all the nodes that were converted to + # nvprims but won't be sent to nvFuser with a call to PyTorch's eager + # mode. This is necessary because torch.ops.* have higher overhead than + # calling the eager mode directly. 
+ for node in partitioned_graph.graph.nodes: + if node.op == "call_function" and str(node.target).startswith("nvprims."): + if getattr(node.target, "impl_aten", None) is not None: + node.target = node.target.impl_aten + partitioned_graph.graph.eliminate_dead_code() + partitioned_graph.recompile() return partitioned_graph, any_unsupported else: return gm, any_unsupported diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py index d4132b356473a..f37a21459e0cd 100644 --- a/torch/_prims/nvfuser_prims.py +++ b/torch/_prims/nvfuser_prims.py @@ -538,6 +538,10 @@ def _var_mean_autograd( p.return_type = torch._prims_common.RETURN_TYPE.NEW # type: ignore[attr-defined] +def _nvprims_view_impl_aten(a, original_shape, new_shape): + return a.reshape(new_shape) + + def register_view(): """This function is used to register the view function in torch.ops.view module.""" # View is implemented as a decomposition into prims.split_dim, @@ -568,7 +572,8 @@ def _view_no_original_shape_overload_impl(a, shape): for p in (prim_packet, prim): p.__doc__ = "Creates a tensor with the specified shape containing a copy of the data in a." 
p.impl_nvfuser = _nvfuser_impls["view"] - p.return_type = torch._prims_common.RETURN_TYPE.NEW # type: ignore[attr-defined] + p.return_type = torch._prims_common.RETURN_TYPE.VIEW # type: ignore[attr-defined] + p.impl_aten = _nvprims_view_impl_aten def register_nvprims(): @@ -594,3 +599,4 @@ def register_nvprims(): p.__doc__ = main_prim.__doc__ p.impl_nvfuser = _nvfuser_impls[name] p.return_type = main_prim.return_type # type: ignore[attr-defined] + p.impl_aten = main_prim.impl_aten diff --git a/torch/fx/passes/infra/partitioner.py b/torch/fx/passes/infra/partitioner.py index d582f98ecb764..5f5a808b85121 100644 --- a/torch/fx/passes/infra/partitioner.py +++ b/torch/fx/passes/infra/partitioner.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Set, Iterable, Optional +from typing import Dict, List, Set, Iterable, Sequence, Optional from torch.fx.passes.utils.fuser_utils import fuse_by_partitions @@ -35,11 +35,19 @@ class CapabilityBasedPartitioner: def __init__(self, graph_module: GraphModule, operator_support: OperatorSupportBase, - allows_single_node_partition: bool = False + allows_single_node_partition: bool = False, + non_compute_ops: Optional[Sequence[str]] = None, + allowed_single_node_partition_ops: Optional[Sequence[str]] = None, ) -> None: self.graph_module = graph_module self.operator_support = operator_support self.allows_single_node_partition = allows_single_node_partition + self.non_compute_ops = non_compute_ops if non_compute_ops is not None else [] + self.allowed_single_node_partition_ops = ( + allowed_single_node_partition_ops + if allowed_single_node_partition_ops is not None + else [] + ) def __is_node_supported(self, node: Node) -> bool: return ( @@ -169,7 +177,8 @@ def merge_single_node(node: Node, id: Optional[int]): # filter out single node partitions if not self.allows_single_node_partition: logger.debug("Filtering out single node partitions...") - non_compute_ops = {"torch.ops.aten.view", "_operator.getitem"} + default_non_compute_ops = 
{"torch.ops.aten.view", "_operator.getitem"} + non_compute_ops = default_non_compute_ops.union(set(self.non_compute_ops)) partitions_to_remove: List[int] = [] for id, partition in partitions_by_id.items(): compute_node_count = 0 @@ -177,6 +186,9 @@ def merge_single_node(node: Node, id: Optional[int]): if node.op == "call_function" and \ _get_qualified_name(node.target) not in non_compute_ops: # type: ignore[arg-type] compute_node_count += 1 + if node.op == "call_function" and \ + _get_qualified_name(node.target) in self.allowed_single_node_partition_ops: + compute_node_count += 1 if compute_node_count <= 1: partitions_to_remove.append(id) for id in partitions_to_remove: From d0586995d9d0e3b6e33a025d1ff90c706cd29a67 Mon Sep 17 00:00:00 2001 From: albanD Date: Wed, 26 Oct 2022 10:26:44 -0400 Subject: [PATCH 0180/1922] Many symintifications (#87604) Adds expand_inplace conv conv_double_backward convolution adaptive_avg_pool2d_symint _embedding_bag_backward_symint cudnn_grid_sampler cuda 32 bit indexing nll_loss / nll_loss_2d tensor split pooling same mode cudnn_is_acceptable storage nbytes Pull Request resolved: https://github.com/pytorch/pytorch/pull/87604 Approved by: https://github.com/ezyang --- aten/src/ATen/ExpandUtils.h | 5 +- aten/src/ATen/core/TensorBase.h | 10 ++++ .../ATen/functorch/BatchRulesConvolution.cpp | 56 +++++++++---------- .../functorch/BatchRulesDecompositions.cpp | 2 +- .../ATen/native/AdaptiveAveragePooling.cpp | 6 +- aten/src/ATen/native/Convolution.cpp | 20 +++---- aten/src/ATen/native/EmbeddingBag.cpp | 2 +- aten/src/ATen/native/GridSamplerUtils.h | 2 +- aten/src/ATen/native/IndexingUtils.cpp | 12 ++-- aten/src/ATen/native/LossNLL.cpp | 2 +- aten/src/ATen/native/LossNLL2d.cpp | 2 +- aten/src/ATen/native/NonSymbolicBC.h | 1 + aten/src/ATen/native/Pool.h | 18 ++++-- aten/src/ATen/native/TensorProperties.cpp | 2 +- aten/src/ATen/native/TensorShape.cpp | 28 +++++++--- aten/src/ATen/native/group_norm.cpp | 24 ++++---- 
aten/src/ATen/native/native_functions.yaml | 54 +++++++++--------- test/functorch/test_aotdispatch.py | 3 - test/test_proxy_tensor.py | 4 -- tools/autograd/derivatives.yaml | 44 +++++++-------- tools/jit/gen_unboxing.py | 4 +- torch/csrc/StorageMethods.cpp | 2 +- torch/csrc/autograd/FunctionsManual.cpp | 16 +++--- torch/csrc/autograd/FunctionsManual.h | 8 +-- torch/storage.py | 4 +- torchgen/api/cpp.py | 8 ++- torchgen/api/native.py | 2 +- torchgen/api/python.py | 6 +- torchgen/gen.py | 4 +- 29 files changed, 197 insertions(+), 154 deletions(-) diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 779894645b8ec..786cbf132cd77 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -94,10 +94,11 @@ inline void check_defined( inline c10::MaybeOwned expand_inplace( const Tensor& tensor, const Tensor& to_expand) { - if (tensor.sizes().equals(to_expand.sizes())) { + if (tensor.sym_sizes().equals(to_expand.sym_sizes())) { return c10::MaybeOwned::borrowed(to_expand); } - return c10::MaybeOwned::owned(to_expand.expand(tensor.sizes())); + return c10::MaybeOwned::owned( + to_expand.expand_symint(tensor.sym_sizes())); } inline c10::MaybeOwned expand_inplace( diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 08a14f2e09580..0ecd4456033b0 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -955,11 +955,21 @@ c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); } template > IntArrayRef sizes(const TensorBase& t) { return t.sizes(); } +template > +c10::SymInt size(const TensorBase& t, int64_t dim) { return t.sym_size(dim); } +template > +int64_t size(const TensorBase& t, int64_t dim) { return t.size(dim); } + template > c10::SymIntArrayRef strides(const TensorBase& t) { return t.sym_strides(); } template > IntArrayRef strides(const TensorBase& t) { return t.strides(); } +template > +c10::SymInt numel(const TensorBase& t) { return t.sym_numel(); } 
+template > +int64_t numel(const TensorBase& t) { return t.numel(); } + } // namespace symint } // namespace at diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index 0640af3a1b533..79523ed1fb6d9 100644 --- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -17,7 +17,7 @@ namespace at { namespace functorch { // we do not support batch_group_count (which is needed for convolution backwards). // Instead, there's a convolution_backward op that needs a batching rule. std::tuple> -convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tensor& rhs, optional rhs_bdim, const optional& bias, optional bias_bdim, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) { +convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tensor& rhs, optional rhs_bdim, const optional& bias, optional bias_bdim, IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, int64_t groups) { DimVector lhs_spec(stride.size() + 2); std::iota(lhs_spec.begin(), lhs_spec.end(), 0); DimVector rhs_spec = lhs_spec; @@ -42,13 +42,13 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens std::tuple> result; if (lhs_bdim && !rhs_bdim) { auto new_x = reshape_dim_into(*lhs_bdim, lhs_spec[0], lhs); - auto out = at::convolution(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); + auto out = at::convolution_symint(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); out = reshape_dim_outof(out_spec[0], lhs.sizes()[*lhs_bdim], out); result = std::make_tuple(out, out_spec[0]); } else if (!lhs_bdim && rhs_bdim) { if (groups == 1) { auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[0], rhs); - auto out = at::convolution(lhs, new_w, 
unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); + auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); out = reshape_dim_outof(out_spec[1], rhs.size(*rhs_bdim), out); result = std::make_tuple(out, out_spec[1]); } else { @@ -62,7 +62,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // BIOHW -> I(BO)HW auto new_w = reshape_dim_into(*rhs_bdim, 1, rhs); // NIHW, I(BO)HW -> N(GBO)HW - auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); + auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); // N(GBO)HW -> NG(BO)HW out = reshape_dim_outof(1, groups, out); // NG(BO)HW -> NGBOHW @@ -84,7 +84,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // G(BO)IHW -> (GBO)IHW new_w = reshape_dim_into(0, 0, new_w); // N(GI)HW, (GBO)IHW -> N(GBO)HW - auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); + auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); // N(GBO)HW -> NG(BO)HW out = reshape_dim_outof(1, groups, out); // NG(BO)HW -> NGBOHW @@ -99,11 +99,11 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens groups *= lhs.sizes()[*lhs_bdim]; auto dim_with_groups = transposed ? 
1 : 0; auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[dim_with_groups], rhs); - auto out = at::convolution(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); + auto out = at::convolution_symint(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); out = reshape_dim_outof(out_spec[1], lhs.sizes()[*lhs_bdim], out); result = std::make_tuple(out, out_spec[1]); } else { - result = std::make_tuple(at::convolution(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt); + result = std::make_tuple(at::convolution_symint(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt); } if (separate_bias) { auto A = std::get<0>(result); @@ -244,8 +244,8 @@ convolution_backward_input_batch_rule( const Tensor& grad_output, optional grad_output_bdim, const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, - IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, - IntArrayRef output_padding, int64_t groups) { + IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, int64_t groups) { const std::array mask = {true, false, false}; if (grad_output_bdim && weight_bdim) { // regular: BNO, BOI -> N(BO), (BO)I -> N(BI) @@ -254,7 +254,7 @@ convolution_backward_input_batch_rule( const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output); const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight); auto dummy_input = make_dummy(input, input_bdim, 1, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output_, dummy_input, weight_, nullopt, stride, padding, dilation, transposed, output_padding, groups * batch_size, mask); const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result)); @@ -265,7 +265,7 
@@ convolution_backward_input_batch_rule( const auto batch_size = grad_output.size(*grad_output_bdim); const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 0, grad_output); auto dummy_input = make_dummy(input, input_bdim, 0, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output_, dummy_input, weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); const auto grad_input = reshape_dim_outof(0, batch_size, std::get<0>(result)); @@ -278,7 +278,7 @@ convolution_backward_input_batch_rule( const auto in_ch_dim = transposed ? 0 : 1; const auto weight_ = reshape_dim_into(*weight_bdim, in_ch_dim, weight); auto dummy_input = make_dummy(input, input_bdim, 1, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, dummy_input, weight_, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result)); @@ -289,7 +289,7 @@ convolution_backward_input_batch_rule( // N(GO), B(GO)I -> N(GO), (GO)(BI) -> N(GBI) const auto weight_ = reshape_dim_into(*weight_bdim, 1, weight); auto dummy_input = make_dummy(input, input_bdim, 1, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, dummy_input, weight_, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); grad_input = std::get<0>(result); // N(GBI) @@ -300,7 +300,7 @@ convolution_backward_input_batch_rule( weight_ = weight_.transpose(0, 1); // GBIO weight_ = weight_.flatten(0, 2); // (GBI)O const auto dummy_input = make_dummy(input, input_bdim, 1, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, dummy_input, weight_, nullopt, stride, padding, dilation, transposed, output_padding, groups, 
mask); grad_input = std::get<0>(result); // N(GBI) @@ -314,7 +314,7 @@ convolution_backward_input_batch_rule( } else { TORCH_INTERNAL_ASSERT(input_bdim); const auto dummy_input = make_dummy(input, input_bdim, 0, 1); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, dummy_input, weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); return std::make_tuple(std::get<0>(result), nullopt); @@ -325,8 +325,8 @@ convolution_backward_weight_batch_rule( const Tensor& grad_output, optional grad_output_bdim, const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, - IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, - IntArrayRef output_padding, int64_t groups) { + IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, int64_t groups) { const std::array mask = {false, true, false}; if (grad_output_bdim && input_bdim) { // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed) @@ -334,7 +334,7 @@ convolution_backward_weight_batch_rule( const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output); const auto input_ = reshape_dim_into(*input_bdim, 1, input); const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output_, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups * batch_size, mask); auto grad_weight = std::get<1>(result); @@ -348,7 +348,7 @@ convolution_backward_weight_batch_rule( const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output); const auto out_ch_dim = transposed ? 
1 : 0; const auto dummy_weight = make_dummy(weight, weight_bdim, out_ch_dim, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); @@ -362,7 +362,7 @@ convolution_backward_weight_batch_rule( if (!transposed) { // BN(GO), N(GI) -> N(GBO), N(GI) -> (GBO)I const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); @@ -373,7 +373,7 @@ convolution_backward_weight_batch_rule( } else { // BN(GO), N(GI) -> N(GBO), N(GI) -> (GI)(BO) const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); @@ -389,7 +389,7 @@ convolution_backward_weight_batch_rule( const auto input_ = reshape_dim_into(*input_bdim, 1, input); const auto in_ch_dim = transposed ? 
0 : 1; const auto dummy_weight = make_dummy(weight, weight_bdim, in_ch_dim, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); @@ -403,7 +403,7 @@ convolution_backward_weight_batch_rule( if (!transposed) { // regular: N(GO), BN(GI) -> N(GO), N(GBI) -> (GO)(BI) const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); @@ -412,7 +412,7 @@ convolution_backward_weight_batch_rule( } else { // transposed: N(GO), BN(GI) -> N(GO), N(GBI) -> (GBI)O const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); @@ -425,7 +425,7 @@ convolution_backward_weight_batch_rule( } else { TORCH_INTERNAL_ASSERT(weight_bdim); const auto dummy_weight = make_dummy(weight, weight_bdim, 0, 1); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); return std::make_tuple(std::get<1>(result), nullopt); @@ -436,8 +436,8 @@ convolution_backward_weight_batch_rule( std::tuple convolution_backward_plumbing( const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_, const c10::OptionalArrayRef bias_sizes_opt, - IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool 
transposed, - IntArrayRef output_padding, int64_t groups, std::array output_mask) { + IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, int64_t groups, std::array output_mask) { const auto maybe_layer = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_layer.has_value()); int64_t cur_level = maybe_layer->layerId(); @@ -487,7 +487,7 @@ std::tuple convolution_backward_plumbing( const auto batch_size = weight.size(*weight_bdim); input = reshape_dim_into(*input_bdim, 1, input); weight = reshape_dim_into(*weight_bdim, 0, weight); - const auto result = at::convolution_backward( + const auto result = at::convolution_backward_symint( grad_output, input, weight, nullopt, stride, padding, dilation, transposed, output_padding, batch_size * groups, output_mask); // N(BI), (BO)I -> NBI, BOI diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index f1108bac25a0a..24a1c4ab507a0 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -242,7 +242,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { OP_DECOMPOSE2(where, ScalarSelf); OP_DECOMPOSE(orgqr); OP_DECOMPOSE2(unflatten, int); - OP_DECOMPOSE(_convolution_double_backward); + m.impl("_convolution_double_backward", native::_convolution_double_backward); OP_DECOMPOSE(conv_transpose1d); OP_DECOMPOSE2(conv_transpose2d, input); OP_DECOMPOSE2(conv_transpose3d, input); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 40b05d74053ca..b612ef009b651 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -130,9 +130,9 @@ namespace { Tensor out = input.mean({-1, -2}, /* keepdim = */ true); if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) { // assert ndim == 4, since ndim = 
3 doesn't give channels_last - const int n = input.size(0); - const int c = input.size(1); - out.as_strided_({n, c, 1, 1}, {c, 1, c, c}); + const auto n = input.sym_size(0); + const auto c = input.sym_size(1); + out.as_strided__symint({n, c, 1, 1}, {c, 1, c, c}); } return out; } else { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 4d68f23c0734f..64f6d141b9299 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -910,8 +910,8 @@ static Tensor convolution_same( auto k = weight.dim(); TORCH_CHECK(k > 2, "weight should have at least three dimensions"); auto dim = static_cast(k - 2); - auto weight_sizes = weight.sizes(); - auto input_sizes = input.sizes(); + auto weight_sizes = weight.sym_sizes(); + auto input_sizes = input.sym_sizes(); TORCH_CHECK(k == input.dim(), "Expected ", k, "-dimensional input for ", k, "-dimensional weight", weight_sizes, ", but got ", @@ -926,7 +926,7 @@ static Tensor convolution_same( } // Calculate the correct padding - DimVector padding_l, padding_r; + SymDimVector padding_l, padding_r; bool symmetric_padding = true; for (auto i: c10::irange(dim)) { auto s = stride.size() == 1 ? 
stride[0] : stride[i]; @@ -942,14 +942,14 @@ static Tensor convolution_same( if (symmetric_padding) { // All backends handle symmetric padding natively - DimVector output_padding(static_cast(dim)); - return at::convolution(input, weight, bias, stride, padding_l, dilation, + SymDimVector output_padding(static_cast(dim)); + return at::convolution_symint(input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } TORCH_WARN_ONCE("Using padding='same' with even kernel lengths and odd dilation may" " require a zero-padded copy of the input be created"); - SmallVector pad_nd(static_cast(2 * dim)); + SmallVector pad_nd(static_cast(2 * dim)); for (auto i: c10::irange(dim)) { // Apply padding by the difference, leaving only a symmetric padding auto delta_pad = padding_r[i] - padding_l[i]; @@ -961,10 +961,10 @@ static Tensor convolution_same( padding_l[i] = padding_r[i]; } } - auto padded_input = at::constant_pad_nd(input, pad_nd, 0); - DimVector output_padding(static_cast(dim)); - return at::convolution(padded_input, weight, bias, stride, padding_l, - dilation, false, output_padding, groups); + auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0); + SymDimVector output_padding(static_cast(dim)); + return at::convolution_symint(padded_input, weight, bias, stride, padding_l, + dilation, false, output_padding, groups); } Tensor _convolution_mode( diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 7d4a89d6b40f7..21404947b3dbb 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -1307,7 +1307,7 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_ checkContiguous("embedding_bag", offsets_arg); Tensor offset2bag_; - if (indices.numel() != 0 && offset2bag.numel() == 0) { + if (indices.sym_numel() != 0 && offset2bag.sym_numel() == 0) { offset2bag_ = offsets.new_zeros( {indices.size(0) + 1}, offsets.options()); // offset2bag = 
[0 0 0 0 0] diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h index 0b6f29de8c427..7c22fedfe94e2 100644 --- a/aten/src/ATen/native/GridSamplerUtils.h +++ b/aten/src/ATen/native/GridSamplerUtils.h @@ -101,7 +101,7 @@ bool cond_cudnn_grid_sampler( at::native::canUse32BitIndexMath(input) && at::native::canUse32BitIndexMath(grid) && input.dim() == 4 && - input.size(1) <= 1024); + input.sym_size(1) <= 1024); } } // anonymous namespace diff --git a/aten/src/ATen/native/IndexingUtils.cpp b/aten/src/ATen/native/IndexingUtils.cpp index c5f5ff6fbcc07..2dba1972ce574 100644 --- a/aten/src/ATen/native/IndexingUtils.cpp +++ b/aten/src/ATen/native/IndexingUtils.cpp @@ -4,7 +4,7 @@ namespace at { namespace native { bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) { - int64_t elements = t.numel(); + auto elements = t.sym_numel(); if (elements >= max_elem) { return false; } @@ -12,16 +12,16 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) { return max_elem > 0; } - int64_t offset = 0; - int64_t linearId = elements - 1; + c10::SymInt offset = 0; + auto linearId = elements - 1; // NOTE: Assumes all strides are positive, which is true for now // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) for (int i = t.dim() - 1; i >= 0; --i) { - int64_t curDimIndex = linearId % t.size(i); - int64_t curDimOffset = curDimIndex * t.stride(i); + auto curDimIndex = linearId % t.sym_size(i); + auto curDimOffset = curDimIndex * t.sym_stride(i); offset += curDimOffset; - linearId /= t.size(i); + linearId /= t.sym_size(i); } if (offset >= max_elem) { diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 8e5864b68728d..28fc60508ab10 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -656,7 +656,7 @@ Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::optional< c10::MaybeOwned weight_maybe_owned = 
at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - return std::get<0>(at::nll_loss_forward(self, target, weight, reduction, ignore_index)); + return std::get<0>(at::nll_loss_forward_symint(self, target, weight, reduction, ignore_index)); } Tensor nll_loss_nd_symint( diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index ab7c084eb80df..aee22ce3edeb5 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -498,7 +498,7 @@ Tensor nll_loss2d(const Tensor & self, const Tensor & target, const c10::optiona c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - return std::get<0>(at::nll_loss2d_forward(self, target, weight, reduction, ignore_index)); + return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, ignore_index)); } } // namespace native diff --git a/aten/src/ATen/native/NonSymbolicBC.h b/aten/src/ATen/native/NonSymbolicBC.h index e7d31ae3fa020..f57c868f345f1 100644 --- a/aten/src/ATen/native/NonSymbolicBC.h +++ b/aten/src/ATen/native/NonSymbolicBC.h @@ -22,4 +22,5 @@ TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, con TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim); TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes); TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index); +TORCH_API std::vector tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim); }} diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index cf5b45b365d05..0ff4490086b7e 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -67,17 +67,18 @@ static inline T pooling_output_shape( 
inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode); } -inline std::pair pooling_same_mode_padding_lr( - int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) { +template +std::pair _pooling_same_mode_padding_lr( + T inputSize, T kernelSize, int64_t stride, int64_t dilation) { // NOTE: with strides, the output shape is ceil(inputSize/stride) - auto total_padding = dilation * (kernelSize - 1); + auto total_padding = T(dilation) * (kernelSize - 1); // Prefer symmetric padding if possible if (stride > 2 && (total_padding % 2 == 1)) { // The floor in the output size calculation gives us a little wiggle room auto wiggle_room = inputSize % stride - 1; if (wiggle_room > 0) { - --total_padding; + total_padding = total_padding - 1; } } @@ -85,6 +86,15 @@ inline std::pair pooling_same_mode_padding_lr( return {left, total_padding - left}; } +inline std::pair pooling_same_mode_padding_lr( + int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) { + return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation); +} + +inline std::pair pooling_same_mode_padding_lr( + c10::SymInt inputSize, c10::SymInt kernelSize, int64_t stride, int64_t dilation) { + return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation); +} // AveragePool2d/DilatedMaxPool2d (forward) static inline void diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 6a703cbe07f90..e37dbf56cc81a 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -69,7 +69,7 @@ bool cudnn_is_acceptable(const TensorBase& self) { // tensors. Maybe some cuDNN functions actually support empty tensors, but // native/THNN kernels shouldn't be much slower because the output is also // likely empty. 
- if (self.numel() == 0) return false; + if (self.sym_numel() == 0) return false; // NB: In the old Python code, there was also a test to see if the // cuDNN library was actually dynamically linked or not. I'm not // sure if we can actually test this. diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index d25113577b2d5..2051cda371b97 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -917,9 +917,12 @@ std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { } } -std::vector tensor_split(const Tensor& self, int64_t sections, int64_t dim) { +std::vector tensor_split_sections_symint(const Tensor& self, c10::SymInt sym_sections, int64_t dim) { TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims"); int64_t dim_ = maybe_wrap_dim(dim, self.dim()); + // NB: intentional, sections specifies number of output tensors, which + // cannot be polymorphic + int64_t sections = sym_sections.guard_int(__FILE__, __LINE__); TORCH_CHECK(sections > 0, "number of sections must be larger than 0, got ", sections); const auto dim_size = self.sym_size(dim_); std::vector splits(sections); @@ -934,21 +937,30 @@ std::vector tensor_split(const Tensor& self, int64_t sections, int64_t d return splits; } -std::vector tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) { +template +std::vector _tensor_split_indices(const Tensor& self, ArrayRef indices, int64_t dim) { TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims"); int64_t dim_ = maybe_wrap_dim(dim, self.dim()); int64_t num_indices = indices.size(); std::vector splits(num_indices + 1); - int64_t start_idx = 0; + T start_idx(0); for (const auto split_idx : c10::irange(num_indices)) { - int64_t end_idx = indices[split_idx]; - splits[split_idx] = at::slice(self, dim_, start_idx, end_idx); + 
auto end_idx = indices[split_idx]; + splits[split_idx] = at::symint::slice(self, dim_, start_idx, end_idx); start_idx = end_idx; } - splits[num_indices] = at::slice(self, dim_, start_idx, self.size(dim_)); + splits[num_indices] = at::symint::slice(self, dim_, start_idx, at::symint::size(self, dim_)); return splits; } +std::vector tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) { + return _tensor_split_indices(self, indices, dim); +} + +std::vector tensor_split_indices_symint(const Tensor& self, SymIntArrayRef indices, int64_t dim) { + return _tensor_split_indices(self, indices, dim); +} + std::vector tensor_split(const Tensor& self, const Tensor& tensor_indices_or_sections, int64_t dim) { TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims"); auto split_device = tensor_indices_or_sections.device(); @@ -1174,8 +1186,8 @@ Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef return result; } -const Tensor &as_strided_(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional storage_offset_) { - auto storage_offset = storage_offset_.value_or(self.storage_offset()); +const Tensor &as_strided__symint(const Tensor& self, SymIntArrayRef size, SymIntArrayRef stride, optional storage_offset_) { + auto storage_offset = storage_offset_.value_or(self.sym_storage_offset()); setStrided(self, size, stride, storage_offset); return self; } diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 5b38b02702828..c12d8d2142ff9 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -23,13 +23,15 @@ #include namespace at { + namespace native { +template void check_group_norm_inputs( const Tensor& input, const Tensor& weight, const Tensor& bias, - int64_t C, + T C, int64_t num_groups) { TORCH_CHECK( num_groups > 0, @@ -43,14 +45,14 @@ void check_group_norm_inputs( "num_groups=", 
num_groups); TORCH_CHECK( - !weight.defined() || (weight.dim() == 1 && weight.numel() == C), + !weight.defined() || (weight.dim() == 1 && at::symint::numel(weight) == C), "Expected weight to be a vector of size equal to the number of ", "channels in input, but got weight of shape ", weight.sizes(), " and input of shape ", input.sizes()); TORCH_CHECK( - !bias.defined() || (bias.dim() == 1 && bias.numel() == C), + !bias.defined() || (bias.dim() == 1 && at::symint::numel(bias) == C), "Expected bias to be a vector of size equal to the number of ", "channels in input, but got bias of shape ", weight.sizes(), @@ -171,13 +173,13 @@ Tensor group_norm( const Tensor& weight = *weight_maybe_owned; const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); }); - const int64_t N = input.size(0); - const int64_t C = input.size(1); + const auto N = input.sym_size(0); + const auto C = input.sym_size(1); check_group_norm_inputs(input, weight, bias, C, num_groups); - const auto input_shape = input.sizes(); - const int64_t HxW = - c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend()); + const auto input_shape = input.sym_sizes(); + const auto HxW = + c10::multiply_integers(input_shape.slice(2)); const Tensor kEmpty; auto memory_format = input.suggest_memory_format(); @@ -185,10 +187,10 @@ Tensor group_norm( input.contiguous(memory_format) : input.contiguous(); const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty; const auto& beta = bias.defined() ? 
bias.contiguous() : kEmpty; - TORCH_CHECK(!gamma.defined() || gamma.numel() == C); - TORCH_CHECK(!beta.defined() || beta.numel() == C); + TORCH_CHECK(!gamma.defined() || gamma.sym_numel() == C); + TORCH_CHECK(!beta.defined() || beta.sym_numel() == C); return std::get<0>( - at::native_group_norm(X, gamma, beta, N, C, HxW, num_groups, eps)); + at::native_group_norm_symint(X, gamma, beta, N, C, HxW, num_groups, eps)); } DEFINE_DISPATCH(GroupNormKernel); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 69951d7b2fabf..2922e2be825b0 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -815,7 +815,7 @@ device_guard: False tags: inplace_view dispatch: - CompositeExplicitAutogradNonFunctional: as_strided_ + CompositeExplicitAutogradNonFunctional: as_strided__symint - func: asin(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -1294,11 +1294,15 @@ CompositeImplicitAutograd: chunk NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor -- func: tensor_split.sections(Tensor(a -> *) self, int sections, int dim=0) -> Tensor(a)[] +- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[] variants: function, method + dispatch: + CompositeImplicitAutograd: tensor_split_sections_symint -- func: tensor_split.indices(Tensor(a -> *) self, int[] indices, int dim=0) -> Tensor(a)[] +- func: tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[] variants: function, method + dispatch: + CompositeImplicitAutograd: tensor_split_indices_symint - func: tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[] variants: function, method @@ -1465,13 +1469,13 @@ variants: method manual_cpp_binding: True -- func: convolution(Tensor input, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor +- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor dispatch: CompositeExplicitAutograd: convolution autogen: convolution.out tags: canonical -- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CompositeExplicitAutograd, CUDA: convolution_backward autogen: convolution_backward.out @@ -1487,7 +1491,7 @@ CompositeExplicitAutograd: convolution_backward_overrideable autogen: convolution_backward_overrideable.out -- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor dispatch: CompositeExplicitAutograd: _convolution autogen: _convolution.out @@ -1496,7 +1500,7 @@ - func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor -- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor @@ -3561,7 +3565,7 @@ MPS: mps_convolution_backward autogen: mps_convolution_backward.out -- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor dispatch: CompositeExplicitAutograd: mkldnn_convolution autogen: mkldnn_convolution.out @@ -3576,17 +3580,17 @@ CUDA: miopen_batch_norm_backward autogen: miopen_batch_norm_backward.out -- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution autogen: miopen_convolution.out -- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? 
bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution_transpose autogen: miopen_convolution_transpose.out -- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_depthwise_convolution autogen: miopen_depthwise_convolution.out @@ -3840,7 +3844,7 @@ - func: _nnpack_available() -> bool -- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor +- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor variants: function dispatch: CompositeExplicitAutograd: _nnpack_spatial_convolution @@ -11470,24 +11474,24 @@ # these are the same thing, but we give them different prefixes to # make the operational distinction clear. -- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: slow_conv_transpose2d_structured_cpu CUDA: slow_conv_transpose2d_structured_cuda -- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor python_module: nn structured_delegate: slow_conv_transpose2d.out -- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv_transpose3d_out_cpu CUDA: slow_conv_transpose3d_out_cuda -- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_transpose3d_cpu @@ -11524,47 +11528,47 @@ CUDA: slow_conv2d_backward_cuda autogen: _slow_conv2d_backward.output_mask_out -- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) +- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: nn dispatch: CUDA: conv_depthwise2d_cuda_out -- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor +- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise2d_cuda -- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor +- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise3d_cuda autogen: conv_depthwise3d.out -- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0) -> Tensor +- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor python_module: nn -- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, *, Tensor(a!) output) -> Tensor(a!) +- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv3d_forward_out_cpu -- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor +- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? 
bias, int[3] stride, SymInt[3] padding) -> Tensor python_module: nn dispatch: CPU: slow_conv3d_forward_cpu -- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda autogen: slow_conv_dilated2d.out -- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated3d_cpu diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index d406f2eb53047..15e0e6a43c3b8 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -1128,8 +1128,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _): skip('nn.functional.batch_norm', ''), # '0 is not tracked with proxy for Tensor +- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" result: convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups) # TorchScript serializes calls to _convolution so this entry is present until that is changed to use convolution. 
# Note that the benchmark, deterministic, cudnn_enabled, and allow_tf32 flags are queried from the global context # by convolution_backward instead of being passed along from the forward pass. -- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" result: _convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32) -- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) +- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? 
bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) result0: std::get<0>(convolution_backward_symint(grad_output_p, input_p, weight_t, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) + std::get<0>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) result1: std::get<1>(convolution_backward_symint(grad_output_p, input_t, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) + std::get<1>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) result2: convolution_backward_jvp_grad_bias(grad_output_t, result2) @@ -2229,10 +2229,10 @@ - name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) -- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor +- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? 
bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple()" -- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor +- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple()" - name: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor @@ -2241,20 +2241,20 @@ - name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask) -- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor +- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor self, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple()" -- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor +- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple()" -- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor +- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, /*dilation=*/ {{1, 1, 1}}, false, /*output_padding=*/ {{0, 0, 0}}, 1, grad_input_mask) : std::tuple()" -- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor - self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" +- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor + self, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" -- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor - self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" +- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor + self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" - name: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor self: im2col(grad, kernel_size, dilation, padding, stride) @@ -2608,9 +2608,9 @@ # nnpack -- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor +- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here. - input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" + input, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" #LSTM MPS - name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor) @@ -2641,14 +2641,14 @@ # miopen -- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple()" -- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" +- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" -- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" +- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor + self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple()" @@ -2667,8 +2667,8 @@ dropout_state: non_differentiable # mkldnn -- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor - self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" +- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor + self, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor self, weight, bias: mkldnn_linear_backward(self, grad, weight, grad_input_mask) diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py index ebeaa21bc7be9..79c594a9afa07 100644 --- a/tools/jit/gen_unboxing.py +++ b/tools/jit/gen_unboxing.py @@ -116,7 +116,9 @@ def __call__(self, f: NativeFunction) -> str: # from wrapping/unwrapping TensorOptios. # However, we would look to include default args for schema parsing. # Default args only show up in the nonfaithful C++ API, - arg_default = cpp.default_expr(arg.argument.default, arg.argument.type) + arg_default = cpp.default_expr( + arg.argument.default, arg.argument.type, symint=False + ) if arg_default.startswith("{"): arg_cpp = f"c10::IntArrayRef({arg_default})" else: diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp index 2b74c8a2fd290..29f0f67ce6ecb 100644 --- a/torch/csrc/StorageMethods.cpp +++ b/torch/csrc/StorageMethods.cpp @@ -41,7 +41,7 @@ static PyObject* THPStorage_nbytes(PyObject* _self, PyObject* noargs) { HANDLE_TH_ERRORS auto self = (THPStorage*)_self; - return THPUtils_packUInt64(self->cdata->nbytes()); + return py::cast(self->cdata->sym_nbytes()).release().ptr(); END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 86b893bb014e6..3358d96569598 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1098,15 +1098,15 @@ Tensor convolution_jvp( const Tensor& bias_p, const Tensor& bias_t, IntArrayRef stride, - IntArrayRef padding, + at::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - IntArrayRef output_padding, + at::SymIntArrayRef output_padding, int64_t 
groups) { auto bias_t_opt = bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; return ( - at::convolution( + at::convolution_symint( input_t, weight_p, c10::nullopt, @@ -1116,7 +1116,7 @@ Tensor convolution_jvp( transposed, output_padding, groups) + - at::convolution( + at::convolution_symint( input_p, weight_t, bias_t_opt, @@ -1136,10 +1136,10 @@ Tensor _convolution_jvp( const Tensor& bias_p, const Tensor& bias_t, IntArrayRef stride, - IntArrayRef padding, + at::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - IntArrayRef output_padding, + at::SymIntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, @@ -1148,7 +1148,7 @@ Tensor _convolution_jvp( auto bias_t_opt = bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; return ( - at::_convolution( + at::_convolution_symint( input_t, weight_p, c10::nullopt, @@ -1162,7 +1162,7 @@ Tensor _convolution_jvp( deterministic, cudnn_enabled, allow_tf32) + - at::_convolution( + at::_convolution_symint( input_p, weight_t, bias_t_opt, diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 04416c2b49e08..4da8aa074a534 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -937,10 +937,10 @@ Tensor convolution_jvp( const Tensor& bias_p, const Tensor& bias_t, IntArrayRef stride, - IntArrayRef padding, + at::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - IntArrayRef output_padding, + at::SymIntArrayRef output_padding, int64_t groups); Tensor _convolution_jvp( @@ -951,10 +951,10 @@ Tensor _convolution_jvp( const Tensor& bias_p, const Tensor& bias_t, IntArrayRef stride, - IntArrayRef padding, + at::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - IntArrayRef output_padding, + at::SymIntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, diff --git a/torch/storage.py b/torch/storage.py index 8e35973405b1b..6bfbab3733bc4 100644 --- 
a/torch/storage.py +++ b/torch/storage.py @@ -646,7 +646,9 @@ def device(self): return self._storage.device def size(self): - return len(self) + # NB: don't indirect through __len__, as that requires + # an int to be returned + return self.nbytes() // self.element_size() def pickle_storage_type(self): try: diff --git a/torchgen/api/cpp.py b/torchgen/api/cpp.py index c3b12d0336df0..4b00b5367b824 100644 --- a/torchgen/api/cpp.py +++ b/torchgen/api/cpp.py @@ -314,7 +314,7 @@ def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequenc } # Convert a JIT default into C++ expression representing the default -def default_expr(d: str, t: Type) -> str: +def default_expr(d: str, t: Type, *, symint: bool) -> str: if d == "None" and str(t) == "Tensor?": return "{}" if isinstance(t, BaseType) and t.name is BaseTy.str: @@ -342,11 +342,13 @@ def default_expr(d: str, t: Type) -> str: if d == "None": return "c10::nullopt" - return default_expr(d, t.elem) + return default_expr(d, t.elem, symint=symint) if isinstance(t, ListType): if d.startswith("[") and d.endswith("]"): return "{" + d[1:-1] + "}" + elif symint and d.isdigit() and str(t.elem) == "SymInt": + return f"c10::SymInt({d})" elif t.size is None: # NOTE: Sized lists can have scalar defaults raise ValueError(f"Expected a list default '[...]' but found: '{d}'") @@ -386,7 +388,7 @@ def sub_argument( binds = a.name default: Optional[str] = None if a.name not in cpp_no_default_args and a.default is not None: - default = default_expr(a.default, a.type) + default = default_expr(a.default, a.type, symint=symint) return [ Binding( nctype=argument_type(a, binds=binds, symint=symint), diff --git a/torchgen/api/native.py b/torchgen/api/native.py index b197a2a02983a..7f8b3eb3af2e7 100644 --- a/torchgen/api/native.py +++ b/torchgen/api/native.py @@ -95,7 +95,7 @@ def argument( if isinstance(a, Argument): default: Optional[str] = None if should_default and a.default is not None: - default = cpp.default_expr(a.default, 
a.type) + default = cpp.default_expr(a.default, a.type, symint=symint) return [ Binding( nctype=argument_type(a, binds=a.name, symint=symint), diff --git a/torchgen/api/python.py b/torchgen/api/python.py index 96c006b303eaa..728ee4c18c0a6 100644 --- a/torchgen/api/python.py +++ b/torchgen/api/python.py @@ -719,7 +719,9 @@ def argument(a: Argument) -> PythonArgument: name=a.name, type=a.type, # TODO: directly translate a.default to python default - default=str(pythonify_default(cpp.default_expr(a.default, a.type))) + default=str( + pythonify_default(cpp.default_expr(a.default, a.type, symint=False)) + ) if a.default is not None else None, default_init=None, @@ -804,7 +806,7 @@ def topt_default_init(name: str) -> Optional[str]: a = getattr(topt_args, name) if a.default is None or a.default == "None": return None - return cpp.default_expr(a.default, a.type) + return cpp.default_expr(a.default, a.type, symint=False) tensor_options_args.append( PythonArgument( diff --git a/torchgen/gen.py b/torchgen/gen.py index e53734969afda..79970c94610dd 100644 --- a/torchgen/gen.py +++ b/torchgen/gen.py @@ -1151,7 +1151,9 @@ def compute_argument_yaml( "type": cpp.argument_type(a, binds="__placeholder__", symint=False).cpp_type(), } if a.default is not None: - arg["default"] = pythonify_default(cpp.default_expr(a.default, a.type)) + arg["default"] = pythonify_default( + cpp.default_expr(a.default, a.type, symint=False) + ) if a.name in kwarg_only_set: arg["kwarg_only"] = True if a.name in out_arg_set: From 84b2111191d95ffbe185dfc6efd508e0e3f4f962 Mon Sep 17 00:00:00 2001 From: albanD Date: Wed, 26 Oct 2022 10:26:44 -0400 Subject: [PATCH 0181/1922] Remove custom Ceil in favor of sympy.ceiling (#87294) [Alban]: the other changes that used to be in this PR (neg and fix for true div) are moved to other places where they already exist. Namely neg is already in master and true div will be in the next PR on the stack where all other functions are fixed at the same time. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87294 Approved by: https://github.com/ezyang --- torch/fx/experimental/symbolic_shapes.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index 29a49b50ba29b..a7030abbcfc41 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -207,26 +207,6 @@ def eval(cls, base, divisor): sympy.simplify(base / gcd), sympy.simplify(divisor / gcd) ) - class Ceil(sympy.Function): - """ - sympy doesn't have its own ceil(), so rolling one here. - We maintain this so that we can simplify a sympy.Rational into a sympy.Float. - sympy.Float isn't supported. - """ - nargs = (1,) - - @classmethod - def eval(cls, a): - if isinstance(a, sympy.Integer): - return a - elif isinstance(a, sympy.core.symbol.Symbol) and a.is_scalar: - # TODO: do we need to simplify expr's first? (e.g. if we have 3/3), is is_scalar() true? 
- return a - elif isinstance(a, sympy.Rational): - return a.floor() + 1 - else: - raise NotImplementedError("math.ceil() not supported for type: " + str(type(a))) - # Methods that have a `__foo__` as well as `__rfoo__` reflectable_magic_methods = { 'add': lambda a, b: a + b, @@ -245,7 +225,7 @@ def eval(cls, a): 'lt': lambda a, b: sympy.Lt(a, b), 'le': lambda a, b: sympy.Le(a, b), 'ge': lambda a, b: sympy.Ge(a, b), - 'ceil': lambda a: Ceil(a), + 'ceil': lambda a: sympy.ceiling(a), 'neg': lambda a: -a, 'min': lambda a, b: sympy.Min(a, b), 'max': lambda a, b: sympy.Max(a, b), From 03eca170b6b07c755cb6a6fa549cb888df8fd93c Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Wed, 26 Oct 2022 17:43:35 +0000 Subject: [PATCH 0182/1922] Install py for pytest-sugar (#87803) linux-focal-py3.7-clang10-onnx / test is failing, the issue is https://github.com/Teemu/pytest-sugar/issues/241 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87803 Approved by: https://github.com/seemethere, https://github.com/huydhn --- .jenkins/caffe2/test.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 0204907ee865d..2b6f7ec6b246a 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -149,6 +149,9 @@ export DNNL_MAX_CPU_ISA=AVX2 # Should still run even in the absence of SHARD_NUMBER if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then + # TODO(sdym@meta.com) remove this when the linked issue resolved.
+ # py is temporary until https://github.com/Teemu/pytest-sugar/issues/241 is fixed + pip install --user py==1.11.0 pip install --user pytest-sugar # NB: Warnings are disabled because they make it harder to see what # the actual erroring test is From 823fff44b14ac11d915bda0fbcc52a2e6e58779c Mon Sep 17 00:00:00 2001 From: arnaudstiegler Date: Wed, 26 Oct 2022 17:45:46 +0000 Subject: [PATCH 0183/1922] Adding expm1 to MPS (#87147) Fixes #86744 - Implementing the new `expm1_out_mps` function in `aten/src/ATen/native/mps/operations/UnaryOps.mm` - Adding it to `aten/src/ATen/native/native_functions.yaml` - Adding it to existing `test.test_mps.TestNLLLoss.test_unary_ops` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87147 Approved by: https://github.com/kulinseth --- aten/src/ATen/native/mps/operations/UnaryOps.mm | 14 ++++++++++++++ aten/src/ATen/native/native_functions.yaml | 1 + test/test_mps.py | 1 + 3 files changed, 16 insertions(+) diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index 2763eff39f6a6..6b33e31341c8d 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -249,5 +249,19 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una }); } +TORCH_IMPL_FUNC(expm1_out_mps) (const Tensor& self, const Tensor& output) { + mps::unary_op(self, output, "expm1_out_mps", + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:inputTensor.dataType]; + MPSGraphTensor* ePowTensor = [mpsGraph exponentWithTensor:inputTensor + name:nil]; + return [mpsGraph subtractionWithPrimaryTensor:ePowTensor + secondaryTensor:oneTensor + name: nil]; + }); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 
2922e2be825b0..0954d1c662707 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2334,6 +2334,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: expm1_out + MPS: expm1_out_mps SparseCPU, SparseCUDA: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out diff --git a/test/test_mps.py b/test/test_mps.py index 98df393c3e955..2b30ab926035b 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -4901,6 +4901,7 @@ def helper(shape, op): helper((2, 8, 4, 5), torch.exp) helper((2, 8, 3, 5), torch.exp2) + helper((2, 8, 3, 5), torch.expm1) helper((2, 8, 3, 5), torch.log) helper((2, 8, 3, 5), torch.cos) From b6ae8dad725c207410172e7fb377470e91806a24 Mon Sep 17 00:00:00 2001 From: Ethan Pronovost Date: Wed, 26 Oct 2022 18:50:48 +0000 Subject: [PATCH 0184/1922] Add type annotations to distribution.py (#87577) As title. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87577 Approved by: https://github.com/kit1980 --- torch/distributions/distribution.py | 50 ++++++++++++++++------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index 66bd158bd87b6..4159f34d7748a 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -2,7 +2,8 @@ import warnings from torch.distributions import constraints from torch.distributions.utils import lazy_property -from typing import Dict, Optional, Any +from torch.types import _size +from typing import Dict, Optional, Any, Tuple __all__ = ['Distribution'] @@ -16,7 +17,7 @@ class Distribution(object): _validate_args = __debug__ @staticmethod - def set_default_validate_args(value): + def set_default_validate_args(value: bool) -> None: """ Sets whether validation is enabled or disabled. 
@@ -32,7 +33,12 @@ def set_default_validate_args(value): raise ValueError Distribution._validate_args = value - def __init__(self, batch_shape=torch.Size(), event_shape=torch.Size(), validate_args=None): + def __init__( + self, + batch_shape: torch.Size = torch.Size(), + event_shape: torch.Size = torch.Size(), + validate_args: Optional[bool] = None, + ): self._batch_shape = batch_shape self._event_shape = event_shape if validate_args is not None: @@ -62,7 +68,7 @@ def __init__(self, batch_shape=torch.Size(), event_shape=torch.Size(), validate_ ) super(Distribution, self).__init__() - def expand(self, batch_shape, _instance=None): + def expand(self, batch_shape: torch.Size, _instance=None): """ Returns a new distribution instance (or populates an existing instance provided by a derived class) with batch dimensions expanded to @@ -84,14 +90,14 @@ def expand(self, batch_shape, _instance=None): raise NotImplementedError @property - def batch_shape(self): + def batch_shape(self) -> torch.Size: """ Returns the shape over which parameters are batched. """ return self._batch_shape @property - def event_shape(self): + def event_shape(self) -> torch.Size: """ Returns the shape of a single sample (without batching). """ @@ -116,34 +122,34 @@ def support(self) -> Optional[Any]: raise NotImplementedError @property - def mean(self): + def mean(self) -> torch.Tensor: """ Returns the mean of the distribution. """ raise NotImplementedError @property - def mode(self): + def mode(self) -> torch.Tensor: """ Returns the mode of the distribution. """ raise NotImplementedError(f"{self.__class__} does not implement mode") @property - def variance(self): + def variance(self) -> torch.Tensor: """ Returns the variance of the distribution. """ raise NotImplementedError @property - def stddev(self): + def stddev(self) -> torch.Tensor: """ Returns the standard deviation of the distribution. 
""" return self.variance.sqrt() - def sample(self, sample_shape=torch.Size()): + def sample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor: """ Generates a sample_shape shaped sample or sample_shape shaped batch of samples if the distribution parameters are batched. @@ -151,7 +157,7 @@ def sample(self, sample_shape=torch.Size()): with torch.no_grad(): return self.rsample(sample_shape) - def rsample(self, sample_shape=torch.Size()): + def rsample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor: """ Generates a sample_shape shaped reparameterized sample or sample_shape shaped batch of reparameterized samples if the distribution parameters @@ -159,7 +165,7 @@ def rsample(self, sample_shape=torch.Size()): """ raise NotImplementedError - def sample_n(self, n): + def sample_n(self, n: int) -> torch.Tensor: """ Generates n samples or n batches of samples if the distribution parameters are batched. @@ -167,7 +173,7 @@ def sample_n(self, n): warnings.warn('sample_n will be deprecated. Use .sample((n,)) instead', UserWarning) return self.sample(torch.Size((n,))) - def log_prob(self, value): + def log_prob(self, value: torch.Tensor) -> torch.Tensor: """ Returns the log of the probability density/mass function evaluated at `value`. @@ -177,7 +183,7 @@ def log_prob(self, value): """ raise NotImplementedError - def cdf(self, value): + def cdf(self, value: torch.Tensor) -> torch.Tensor: """ Returns the cumulative density/mass function evaluated at `value`. @@ -187,7 +193,7 @@ def cdf(self, value): """ raise NotImplementedError - def icdf(self, value): + def icdf(self, value: torch.Tensor) -> torch.Tensor: """ Returns the inverse cumulative density/mass function evaluated at `value`. @@ -197,7 +203,7 @@ def icdf(self, value): """ raise NotImplementedError - def enumerate_support(self, expand=True): + def enumerate_support(self, expand: bool = True) -> torch.Tensor: """ Returns tensor containing all values supported by a discrete distribution. 
The result will enumerate over dimension 0, so the shape @@ -221,7 +227,7 @@ def enumerate_support(self, expand=True): """ raise NotImplementedError - def entropy(self): + def entropy(self) -> torch.Tensor: """ Returns entropy of distribution, batched over batch_shape. @@ -230,7 +236,7 @@ def entropy(self): """ raise NotImplementedError - def perplexity(self): + def perplexity(self) -> torch.Tensor: """ Returns perplexity of distribution, batched over batch_shape. @@ -239,7 +245,7 @@ def perplexity(self): """ return torch.exp(self.entropy()) - def _extended_shape(self, sample_shape=torch.Size()): + def _extended_shape(self, sample_shape: _size = torch.Size()) -> Tuple[int, ...]: """ Returns the size of the sample returned by the distribution, given a `sample_shape`. Note, that the batch and event shapes of a distribution @@ -253,7 +259,7 @@ def _extended_shape(self, sample_shape=torch.Size()): sample_shape = torch.Size(sample_shape) return sample_shape + self._batch_shape + self._event_shape - def _validate_sample(self, value): + def _validate_sample(self, value: torch.Tensor) -> None: """ Argument validation for distribution methods such as `log_prob`, `cdf` and `icdf`. The rightmost dimensions of a value to be @@ -306,7 +312,7 @@ def _get_checked_instance(self, cls, _instance=None): format(self.__class__.__name__, cls.__name__)) return self.__new__(type(self)) if _instance is None else _instance - def __repr__(self): + def __repr__(self) -> str: param_names = [k for k, _ in self.arg_constraints.items() if k in self.__dict__] args_string = ', '.join(['{}: {}'.format(p, self.__dict__[p] if self.__dict__[p].numel() == 1 From 9f0ad2d7beb0f789fda2ffb664758bd45da11565 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Wed, 26 Oct 2022 18:51:36 +0000 Subject: [PATCH 0185/1922] Revert "[pytorch] Layer norm backward speed gain with warp shuffles (#87445)" This reverts commit b6f28334bc3276a56d79dea6cb7ed99411556348. 
Reverted https://github.com/pytorch/pytorch/pull/87445 on behalf of https://github.com/weiwangmeta due to breaking internal builds due to MS compiler --- .../src/ATen/native/cuda/layer_norm_kernel.cu | 242 ++++-------------- 1 file changed, 54 insertions(+), 188 deletions(-) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 732545465d9c9..ae09f0aaad8f8 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -33,7 +33,6 @@ namespace { constexpr int kCUDANumThreads = 256; constexpr int kColwiseReduceTileSize = 32; -constexpr int kWarpSize = 32; constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh) @@ -556,108 +555,8 @@ __global__ void GammaBetaBackwardCUDAKernel1( } } -template -__global__ void GammaBetaBackwardCUDAKernel_32x32( - int64_t M, - int64_t N, - const T* dY, - const T* X, - const T_ACC* mean, - const T_ACC* rstd, - T* dg, - T* db) { - alignas(sizeof(double)) extern __shared__ char s_data1[]; - T_ACC* s_data_typed = reinterpret_cast(&s_data1); - T_ACC* s_dg; - T_ACC* s_db; - T_ACC dg_sum = 0; - T_ACC db_sum = 0; - const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; - - if (j < N) { - constexpr int unroll_factor = 8; - int laneId = threadIdx.x & 0x1f; - - T_ACC mean_reg, mean_reg_tmp; - T_ACC rstd_reg, rstd_reg_tmp; - T dY_reg; - T X_reg; - - // Main loop - int bcounter; - for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); - bcounter++) { - int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor; - - if (laneId < unroll_factor) { - mean_reg_tmp = mean[offset + laneId]; - rstd_reg_tmp = rstd[offset + laneId]; - } -#if !defined(USE_ROCM) - // Volta and newer architectures allow lane divergence within a warp. 
- __syncwarp(); -#endif - - #pragma unroll - for (int ii = 0; ii < unroll_factor; ++ii) { - dY_reg = dY[(offset + ii) * N + j]; - X_reg = X[(offset + ii) * N + j]; - mean_reg = WARP_SHFL(mean_reg_tmp, ii, kWarpSize); - rstd_reg = WARP_SHFL(rstd_reg_tmp, ii, kWarpSize); - dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg; - db_sum += dY_reg; - } - } - - // Remainder loop - int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor; - for (int ii = 0; ii < unroll_factor; ii++) { - if ((offset + ii) < M) { - mean_reg = mean[offset + ii]; - rstd_reg = rstd[offset + ii]; - dY_reg = dY[(offset + ii) * N + j]; - X_reg = X[(offset + ii) * N + j]; - dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg; - db_sum += dY_reg; - } - } - - // This kernel uses a block of (32 x 32) and gets called when M; N - // divide by 32. We can use warp shuffles for the final reduction - // step. This removes 4 shmem loads and stores with their - // corresponding __syncthreads() - - // This greatly reduces bank conflicts at the expense of a little - // extra shared memory. 
It does not impact occupancy - int padded_bx = (1 + blockDim.x); - - s_dg = s_data_typed; - s_db = s_data_typed + (padded_bx * blockDim.y); - s_dg[threadIdx.y * padded_bx + threadIdx.x] = dg_sum; - s_db[threadIdx.y * padded_bx + threadIdx.x] = db_sum; - __syncthreads(); - - // Load transposed so that a warp holds an entire column - T_ACC reg_dg = s_dg[threadIdx.x * padded_bx + threadIdx.y]; - T_ACC reg_db = s_db[threadIdx.x * padded_bx + threadIdx.y]; - for (int delta = 16; delta >= 1; delta /= 2) { - reg_dg += WARP_SHFL_XOR(reg_dg, delta, kWarpSize); - reg_db += WARP_SHFL_XOR(reg_db, delta, kWarpSize); - } - - if (threadIdx.x == 0) { - const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; - if (dg) { - dg[j] = reg_dg; - } - if (db) { - db[j] = reg_db; - } - } - } -} template __global__ void GammaBetaBackwardCUDAKernel( @@ -670,75 +569,66 @@ __global__ void GammaBetaBackwardCUDAKernel( T* dg, T* db) { alignas(sizeof(double)) extern __shared__ char s_data1[]; - T_ACC* s_data_typed = reinterpret_cast(&s_data1); - T_ACC* s_dg; - T_ACC* s_db; - + T_ACC * s_data_typed = reinterpret_cast(&s_data1); const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; - + constexpr int unroll = 8; + T dYs[unroll]; + T Xs[unroll]; + T_ACC * means = s_data_typed; + T_ACC * rstds = s_data_typed + unroll * blockDim.y; T_ACC dg_sum = 0; T_ACC db_sum = 0; - if (j < N) { - constexpr int unroll_factor = 8; - - T_ACC mean_reg; - T_ACC rstd_reg; - T dY_reg; - T X_reg; - - // Main Loop int bcounter; - for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); bcounter++){ - int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor; + for (bcounter = 0; bcounter < M/(blockDim.y * unroll); bcounter++){ + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll; + #pragma unroll + for (int ii=0; ii= 1; offset /= 2) { + for (int offset = blockDim.y/2; offset >=1; offset /= 2){ if (threadIdx.y < offset) { - s_dg[threadIdx.y * blockDim.x + threadIdx.x] += - s_dg[(threadIdx.y + 
offset) * blockDim.x + threadIdx.x]; - s_db[threadIdx.y * blockDim.x + threadIdx.x] += - s_db[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; - } + s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; + s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] += + s_data_typed[blockDim.x * blockDim.y + (threadIdx.y + offset) * blockDim.x + threadIdx.x]; + } __syncthreads(); } - if (threadIdx.y == 0) { if (dg) { - dg[j] = s_dg[threadIdx.x]; + dg[j] = s_data_typed[threadIdx.x]; } if (db) { - db[j] = s_db[threadIdx.x]; + db[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y]; } } } @@ -873,8 +763,7 @@ void LayerNormBackwardKernelImplInternal( T* dgamma_data = dgamma->defined() ? dgamma->template data_ptr() : nullptr; T* dbeta_data = dbeta->defined() ? dbeta->template data_ptr() : nullptr; - - if (M < 128) { + if (M < 512) { // For small batch size, do colwise reduce directly. const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; GammaBetaBackwardSimpleCUDAKernel @@ -889,42 +778,19 @@ void LayerNormBackwardKernelImplInternal( dbeta_data); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - if ((M % kWarpSize == 0) && (N % kWarpSize == 0)) { - // This implementation relies on warp primitives and requires that M and N divide - // exactly to warp size. - dim3 threads{kWarpSize, kWarpSize}; - int blocks = (N + threads.x - 1) / threads.x; - - // If M and N divide by 32, we can use warp shuffles for the final reduction. That requires - // transposing values in shared memory, so we apply a padding to reduce bank conflicts. 
- size_t shmem_sz = 2 * sizeof(T_ACC) * (threads.x + 1) * threads.y; - GammaBetaBackwardCUDAKernel_32x32 - <<>>( - M, - N, - dY_data, - X_data, - mean_data, - rstd_data, - dgamma_data, - dbeta_data); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } else { - dim3 threads{16, 32}; - int blocks = (N + threads.x - 1) / threads.x; - size_t shmem_sz = 2 * sizeof(T_ACC) * threads.x * threads.y; - GammaBetaBackwardCUDAKernel - <<>>( - M, - N, - dY_data, - X_data, - mean_data, - rstd_data, - dgamma_data, - dbeta_data); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - } + dim3 threads{16, 32}; + int blocks = (N + threads.x-1)/threads.x; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } } From 7455967dbdfa6db019638b9cb8886114647e2349 Mon Sep 17 00:00:00 2001 From: HDCharles Date: Tue, 25 Oct 2022 22:15:46 -0700 Subject: [PATCH 0186/1922] [ao][ns] Replacing List[QConfigMapping] in PNP (#86922) Summary: Added QConfigMultiMapping which is essentially a List[QConfigMapping] with set methods and dedicated handling to avoid unwanted matches and improve UX. note: the from __future__ import annotations line caused weird errors when the QConfigMultiMapping class was put in _numeric_suite_fx.py so it was moved. 
Test Plan: python test/test_quantization.py TestFxNumericSuiteNShadows Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/86922 Approved by: https://github.com/vkuzo --- test/quantization/fx/test_numeric_suite_fx.py | 229 +++++++++++++++-- torch/ao/ns/_numeric_suite_fx.py | 18 +- torch/ao/ns/fx/qconfig_multi_mapping.py | 242 ++++++++++++++++++ 3 files changed, 452 insertions(+), 37 deletions(-) create mode 100644 torch/ao/ns/fx/qconfig_multi_mapping.py diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py index 27fe772d2e228..7f46cf0a442b3 100644 --- a/test/quantization/fx/test_numeric_suite_fx.py +++ b/test/quantization/fx/test_numeric_suite_fx.py @@ -31,6 +31,7 @@ LSTMwithHiddenDynamicModel, SparseNNModel, skip_if_no_torchvision, + TwoLayerLinearModel ) from torch.ao.quantization.quantization_mappings import ( get_default_static_quant_module_mappings, @@ -82,6 +83,7 @@ loggers_set_enabled, loggers_set_save_activations, ) +from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping from torch.ao.quantization.backend_config import get_native_backend_config from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers @@ -2096,6 +2098,7 @@ def _test_impl(self, m, example_input, qconfig_mappings): results = extract_results_n_shadows_model(msq) print_comparisons_n_shadows_model(results) + return msq def test_linear_mod(self): class M(nn.Module): @@ -2110,9 +2113,8 @@ def forward(self, x): m = M().eval() example_input = (torch.randn(2, 2),) - qconfig_mappings = [ - QConfigMapping().set_global(torch.quantization.default_qconfig), - ] + qconfig_mappings = \ + QConfigMultiMapping().set_global([torch.quantization.default_qconfig]) self._test_impl(m, example_input, qconfig_mappings) def test_linear_relu_mod(self): @@ -2132,10 +2134,12 @@ def forward(self, x): m = M().eval() example_input = (torch.randn(2, 2),) - qconfig_mappings = [ - 
QConfigMapping().set_global(torch.quantization.default_qconfig), - QConfigMapping().set_global(torch.quantization.default_dynamic_qconfig), - ] + qconfig_mappings = ( + QConfigMultiMapping().set_global([ + torch.quantization.default_qconfig, + torch.quantization.default_dynamic_qconfig + ]) + ) self._test_impl(m, example_input, qconfig_mappings) def test_conv_bn_relu_mod(self): @@ -2154,10 +2158,12 @@ def forward(self, x): m = M().eval() example_input = (torch.randn(32, 1, 16, 16),) - qconfig_mappings = [ - QConfigMapping().set_global(torch.quantization.default_qconfig), - QConfigMapping().set_global(torch.quantization.default_per_channel_qconfig), - ] + + qconfig_mappings = QConfigMultiMapping() \ + .set_global([ + torch.quantization.default_qconfig, + torch.quantization.default_per_channel_qconfig + ]) self._test_impl(m, example_input, qconfig_mappings) def test_functions(self): @@ -2194,10 +2200,8 @@ def forward(self, x): m = M().eval() example_input = (torch.randn(2, 2),) - qconfig_mappings = [ - QConfigMapping().set_global(torch.quantization.default_qconfig), - # QConfigMapping().set_global(torch.quantization.default_per_channel_qconfig), - ] + qconfig_mappings = QConfigMultiMapping() \ + .set_global([torch.quantization.default_qconfig]) self._test_impl(m, example_input, qconfig_mappings) def test_partial_qconfig_mapping(self): @@ -2220,19 +2224,17 @@ def forward(self, x): example_input = (torch.randn(2, 2),) qconfig = torch.ao.quantization.default_qconfig - qconfig_mappings = [ - QConfigMapping().set_global(None) - .set_object_type(F.linear, qconfig) - .set_object_type(F.relu, qconfig), - ] + qconfig_mappings = QConfigMultiMapping() \ + .set_object_type(F.linear, [qconfig]) \ + .set_object_type(F.relu, [qconfig]) self._test_impl(m, example_input, qconfig_mappings) def test_logger_enabled_and_save_activations_flags(self): m = nn.Sequential(nn.Linear(1, 1)).eval() example_input = (torch.randn(1, 1),) - qconfig_mappings = [ - 
QConfigMapping().set_global(torch.quantization.default_qconfig), - ] + + qconfig_mappings = QConfigMultiMapping() \ + .set_global([torch.quantization.default_qconfig]) backend_config = get_native_backend_config() msp = prepare_n_shadows_model( @@ -2281,12 +2283,187 @@ def test_mobilenet_v2(self): pretrained=False, quantize=False).eval() example_input = (torch.randn(1, 3, 224, 224),) - qconfig_mappings = [ + qconfig_mappings = QConfigMultiMapping() \ + .set_global([torch.quantization.default_qconfig, torch.quantization.default_dynamic_qconfig]) + + self._test_impl(m, example_input, qconfig_mappings) + + def test_qconfig_multi_mapping_deduplication(self): + # check that insertion deduplicates qconfigs + qconfig_multi_mapping = QConfigMultiMapping().set_global( + [torch.quantization.default_qconfig, torch.quantization.default_qconfig] + ) + self.assertEqual(len(qconfig_multi_mapping.qconfig_mappings_list), 1) + + def test_qconfig_multi_mapping_insert_padding(self): + # test that inserting a higher priority qconfig style with fewer elements than a lower priority qconfig will + # result in adding None to the extra QConfigMappings at that same style+key + qconfig_multi_mapping = ( + QConfigMultiMapping() + .set_global( + [ + torch.quantization.default_qconfig, + torch.quantization.default_dynamic_qconfig, + ] + ) + .set_object_type(torch.nn.Linear, [torch.quantization.default_qconfig]) + .set_module_name_regex("fc", [torch.quantization.default_qconfig]) + .set_module_name("fc2", [torch.quantization.default_qconfig]) + .set_module_name_object_type_order( + "", nn.Linear, 0, [torch.quantization.default_qconfig] + ) + ) + + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].object_type_qconfigs[ + torch.nn.Linear + ], + None, + ) + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].module_name_regex_qconfigs[ + "fc" + ], + None, + ) + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"], + None, + ) + 
self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[ + 1 + ].module_name_object_type_order_qconfigs[("", nn.Linear, 0)], + None, + ) + + def test_qconfig_multi_mapping_retroactive_padding(self): + # test that inserting a lower priority qconfig style with more elements thhan lower priority qconfig styles + # will result in the new QConfigMapping having None at all previously existing styles+keys + qconfig_multi_mapping = ( + QConfigMultiMapping() + .set_object_type(torch.nn.Linear, [torch.quantization.default_qconfig]) + .set_module_name_regex("fc", [torch.quantization.default_qconfig]) + .set_module_name("fc2", [torch.quantization.default_qconfig]) + .set_module_name_object_type_order( + "", nn.Linear, 0, [torch.quantization.default_qconfig] + ) + .set_global( + [ + torch.quantization.default_qconfig, + torch.quantization.default_dynamic_qconfig, + ] + ) + ) + + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].object_type_qconfigs[ + torch.nn.Linear + ], + None, + ) + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].module_name_regex_qconfigs[ + "fc" + ], + None, + ) + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"], + None, + ) + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[ + 1 + ].module_name_object_type_order_qconfigs[("", nn.Linear, 0)], + None, + ) + + def test_qconfig_multi_mapping_end_to_end(self): + # test that the prepare/convert_n_shadows_model works as expected + # with qconfig_multi_mapping and avoids unwanted matches + + m = TwoLayerLinearModel().eval() + example_input = m.get_example_inputs() + + qconfig_multi_mapping = ( + QConfigMultiMapping() + .set_global( + [ + torch.quantization.default_qconfig, + torch.quantization.default_dynamic_qconfig, + ] + ) + .set_module_name("fc2", [None, torch.quantization.default_qconfig]) + ) + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"], + None, + ) + 
msq = self._test_impl(m, example_input, qconfig_multi_mapping) + + self.checkQuantizedLinear(msq.shadow_wrapper_0_1.mod_0) + self.checkDynamicQuantizedLinear(msq.shadow_wrapper_0_2.mod_0, torch.qint8) + self.checkQuantizedLinear(msq.shadow_wrapper_1_1.mod_0) + self.assertRaisesRegex(AttributeError, ".*", lambda: msq.shadow_wrapper_1_2) + + def test_qconfig_multi_mapping_from_list(self): + # test QConfigMultiMapping.from_list_qconfig_mapping works as expected + + m = TwoLayerLinearModel().eval() + example_input = m.get_example_inputs() + + qconfig_mappings_list = [ QConfigMapping().set_global(torch.quantization.default_qconfig), - QConfigMapping().set_global(torch.quantization.default_dynamic_qconfig), + QConfigMapping() + .set_global(torch.quantization.default_dynamic_qconfig) + .set_module_name("fc2", torch.quantization.default_qconfig), ] - self._test_impl(m, example_input, qconfig_mappings) + qconfig_multi_mapping = QConfigMultiMapping().from_list_qconfig_mapping( + qconfig_mappings_list + ) + self.assertEqual( + qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"], + None, + ) + + msq = self._test_impl(m, example_input, qconfig_multi_mapping) + + self.checkQuantizedLinear(msq.shadow_wrapper_0_1.mod_0) + self.checkDynamicQuantizedLinear(msq.shadow_wrapper_0_2.mod_0, torch.qint8) + self.checkQuantizedLinear(msq.shadow_wrapper_1_1.mod_0) + self.assertRaisesRegex(AttributeError, ".*", lambda: msq.shadow_wrapper_1_2) + + def test_qconfig_multi_mapping_ordering(self): + # test that the module ordering ignores None + + m = TwoLayerLinearModel().eval() + example_input = m.get_example_inputs() + qconfig_multi_mapping = ( + QConfigMultiMapping() + .set_global( + [ + torch.ao.quantization.default_qconfig, + torch.ao.quantization.default_dynamic_qconfig, + ] + ) + .set_module_name( + "fc2", + [ + None, + torch.ao.quantization.default_dynamic_qconfig, + torch.ao.quantization.default_qat_qconfig_v2, + ], + ) + ) + 
self.assertEqual(len(qconfig_multi_mapping.qconfig_mappings_list), 2) + msq = self._test_impl(m, example_input, qconfig_multi_mapping) + + self.checkQuantizedLinear(msq.shadow_wrapper_0_1.mod_0) + self.checkDynamicQuantizedLinear(msq.shadow_wrapper_0_2.mod_0, torch.qint8) + self.checkDynamicQuantizedLinear(msq.shadow_wrapper_1_1.mod_0, torch.qint8) + self.checkQuantizedLinear(msq.shadow_wrapper_1_2.mod_0) class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase): """ diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py index f586de58531a7..860430c40b9f9 100644 --- a/torch/ao/ns/_numeric_suite_fx.py +++ b/torch/ao/ns/_numeric_suite_fx.py @@ -119,10 +119,6 @@ NSResultsType, NSNodeTargetType, ) - -from torch.ao.quantization import ( - QConfigMapping, -) from torch.ao.quantization.backend_config.utils import get_fusion_pattern_to_root_node_getter from torch.ao.quantization.backend_config import BackendConfig from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers @@ -138,6 +134,7 @@ print_n_shadows_summary, handle_subgraph, ) +from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping from typing import Dict, Tuple, Callable, List, Optional, Set, Any, Type @@ -753,7 +750,7 @@ def extend_logger_results_with_comparison( def prepare_n_shadows_model( model: torch.nn.Module, example_inputs: Any, - qconfig_mappings: List[QConfigMapping], + qconfig_multi_mapping: QConfigMultiMapping, backend_config: BackendConfig, ) -> torch.nn.Module: """ @@ -770,9 +767,9 @@ def prepare_n_shadows_model( args_kwargs_m -> op_m -> output_m | | - |---------------------------> mod_with_op_m_transformed_with_qconfig_i + |---------------------------> mod_with_op_m_transformed_with_qconfig_n - Where mod_with_op_m_transformed_with_qconfig_i is a submodule, and its + Where mod_with_op_m_transformed_with_qconfig_n is a submodule, and its inner graph looks like .. 
code:: @@ -790,8 +787,7 @@ def prepare_n_shadows_model( 1. add deduplication for qconfigs per subgraph 2. figure out a better way to name the output structure 3. return a results data structure instead of printing it out - 4. make specifying sets of QConfigMapping more user friendly - 5. add examples to docblocks + 4. add examples to docblocks """ tracer = quantize_fx.QuantizationTracer([], []) @@ -822,7 +818,7 @@ def prepare_n_shadows_model( # generate node to qconfig for each subgraph # TODO(future PR): deduplicate repeating entries list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]] = [] - for qconfig_mapping in qconfig_mappings: + for qconfig_mapping in qconfig_multi_mapping.qconfig_mappings_list: node_name_to_qconfig = generate_node_name_to_qconfig( mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope) list_of_node_name_to_qconfig.append(node_name_to_qconfig) @@ -838,7 +834,7 @@ def prepare_n_shadows_model( enumerate(subgraphs_dedup.items()): handle_subgraph( mt, subgraph_idx, match_name, nodes_in_this_subgraph, - qconfig_mappings, list_of_node_name_to_qconfig) + qconfig_multi_mapping.qconfig_mappings_list, list_of_node_name_to_qconfig) mt.recompile() return mt diff --git a/torch/ao/ns/fx/qconfig_multi_mapping.py b/torch/ao/ns/fx/qconfig_multi_mapping.py new file mode 100644 index 0000000000000..bff2640e1feb3 --- /dev/null +++ b/torch/ao/ns/fx/qconfig_multi_mapping.py @@ -0,0 +1,242 @@ +from __future__ import annotations + +import copy +from typing import Any, Callable, Dict, List, Union + +import torch +from torch.ao.quantization import QConfigMapping +from torch.ao.quantization.qconfig import QConfigAny + +__all__ = ["QConfigMultiMapping"] + +_QCONFIG_STYLE_ORDER: List[str] = [ + "global_qconfig", + "object_type_qconfigs", + "module_name_regex_qconfigs", + "module_name_qconfigs", + "module_name_object_type_order_qconfigs", +] + +_QCONFIG_STYLE_TO_METHOD: Dict[str, str] = { + "global_qconfig": "set_global", + "object_type_qconfigs": 
"set_object_type", + "module_name_regex_qconfigs": "set_module_name_regex", + "module_name_qconfigs": "set_module_name", + "module_name_object_type_order_qconfigs": "set_module_name_object_type_order", +} + +def _remove_duplicates_and_none(qconfig_list: List[QConfigAny]) -> None: + to_remove = [] + for index, cur_qconfig in enumerate(qconfig_list): + if cur_qconfig is None: + to_remove.append(index) + break + for checked_qconfig in qconfig_list[:index]: + if torch.ao.quantization.qconfig_equals(cur_qconfig, checked_qconfig): + to_remove.append(index) + break + for index in to_remove[::-1]: + qconfig_list.pop(index) + +class QConfigMultiMapping: + """ + This class, used with the prepare_n_shadows_model API, stores a list of :class:`torch.ao.quantization.QConfigMapping`s + so that multiple QConfigs can be specified for each QConfig matching style. + + The user can specify QConfigs using the following methods (in increasing match priority): + + ``set_global`` : sets the global (default) QConfigs + + ``set_object_type`` : sets the QConfigs for a given module type, function, or method name + + ``set_module_name_regex`` : sets the QConfigs for modules matching the given regex string + + ``set_module_name`` : sets the QConfigs for modules matching the given module name + + ``set_module_name_object_type_order`` : sets the QConfigs for modules matching a combination + of the given module name, object type, and the index at which the module appears + + Note: Usage of set methods is the same as in QConfigMapping except with a passed in list of QConfigs rather than a + single QConfig. 
+ + Example usage:: + + qconfig_mapping = QConfigMultiMapping() + .set_global([qconfig1, qconfig2]) + .set_object_type(torch.nn.Linear, [qconfig2, qconfig3]) + .set_object_type(torch.nn.ReLU, [qconfig1]) + .set_module_name_regex("foo.*bar.*conv[0-9]+", [qconfig2]) + .set_module_name_regex("foo.*", [qconfig1, qconfig2, qconfig3]) + .set_module_name("module1", [None]) + .set_module_name("module2", [qconfig2]) + .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, [qconfig3]) + + """ + + def __init__(self): + # initialize this with 1 QConfigMapping to avoid corner cases + self.qconfig_mappings_list: List[QConfigMapping] = [QConfigMapping()] + + def _handle_list_size_mismatch( + self, qconfig_list: List[QConfigAny], style: str + ) -> None: + # this method handles cases where the size of qconfig_list does not match + # the size of qconfig_mappings_list. + # Issue: Consider a user inserting global_qconfig A and B first, then inserting + # qconfig C as an object_type_qconfig for conv ops. If we internally store + # 1 QConfigMapping with A and C and another with just B, then the + # second QConfigMapping will match B to conv ops (which is not wanted), since B is global. + + # we avoid this by maintaining the invariant that if any QConfigMapping + # has a qconfig style+key with a qconfig in it, all QConfigMappings must + # have either a qconfig or None for that same style+key. 
In the above + # example, a None qconfig would prevent the unwanted match in the + # second QConfigMapping + + if len(qconfig_list) > len(self.qconfig_mappings_list): + # Case: we have more qconfigs (in qconfig_list) than QConfigMappings + + # Add new QConfigMappings (initialized so we maintain the `invariant`) + + new_qconfig_mapping = QConfigMapping() + # searches other QConfigMappings for qconfig style+keys + # that need to be inserted as `None` into the new QConfigMapping + for qconfig_mapping in self.qconfig_mappings_list: + + # global_qconfig has None by default + for check_style in _QCONFIG_STYLE_ORDER[1:]: + qconfigs_dict = getattr(qconfig_mapping, check_style) + target_qconfigs_dict = getattr(new_qconfig_mapping, check_style) + for key in qconfigs_dict: + target_qconfigs_dict[key] = None + break + + # insert copies of this new QConfigMapping until all entires + # in qconfig_list can fit among the QConfigMappings + while len(qconfig_list) > len(self.qconfig_mappings_list): + self.qconfig_mappings_list.append(copy.deepcopy(new_qconfig_mapping)) + else: + # Case: we have fewer qconfigs in qconfig_list than QConfigMappings + + # pad qconfig_list with `None` until length is same + while len(qconfig_list) < len(self.qconfig_mappings_list): + qconfig_list.append(None) + + # this function applies the insertion method across each QConfigMapping + def _insert_qconfig_list( + self, + style: str, + args: List[Union[str, int, Callable]], + qconfig_list: List[QConfigAny], + ) -> None: + + # we remove duplicates and None to make the ordering of qconfigs + # deterministic upon insertion. 
+ _remove_duplicates_and_none(qconfig_list) + + self._handle_list_size_mismatch(qconfig_list, style) + method_name = _QCONFIG_STYLE_TO_METHOD[style] + for qconfig_mapping, qconfig in zip(self.qconfig_mappings_list, qconfig_list): + # uses QConfigMapping set method to insert qconfig + set_method = getattr(qconfig_mapping, method_name) + set_method(*args, qconfig) + + def set_global(self, global_qconfig_list: List[QConfigAny]) -> QConfigMultiMapping: + """ + Set global QConfigs + see :func:`~torch.ao.quantization.QConfigMapping.set_global()` for more info + """ + self._insert_qconfig_list("global_qconfig", [], global_qconfig_list) + return self + + def set_object_type( + self, object_type: Union[Callable, str], qconfig_list: List[QConfigAny] + ) -> QConfigMultiMapping: + """ + Set object type QConfigs + see :func:`~torch.ao.quantization.QConfigMapping.set_object_type()` for more info + """ + self._insert_qconfig_list("object_type_qconfigs", [object_type], qconfig_list) + return self + + def set_module_name_regex( + self, module_name_regex: str, qconfig_list: List[QConfigAny] + ) -> QConfigMultiMapping: + """ + Set module_name_regex QConfigs + see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_regex()` for more info + """ + self._insert_qconfig_list( + "module_name_regex_qconfigs", [module_name_regex], qconfig_list + ) + return self + + def set_module_name( + self, module_name: str, qconfig_list: List[QConfigAny] + ) -> QConfigMultiMapping: + """ + Set module_name QConfigs + see :func:`~torch.ao.quantization.QConfigMapping.set_module_name()` for more info + """ + self._insert_qconfig_list("module_name_qconfigs", [module_name], qconfig_list) + return self + + def set_module_name_object_type_order( + self, + module_name: str, + object_type: Callable, + index: int, + qconfig_list: List[QConfigAny], + ) -> QConfigMultiMapping: + """ + Set module_name QConfigs + see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_object_type_order()` for more 
info + """ + self._insert_qconfig_list( + "module_name_object_type_order_qconfigs", + [module_name, object_type, index], + qconfig_list, + ) + return self + + @classmethod + def from_list_qconfig_mapping( + cls, qconfig_mapping_list: List[QConfigMapping] + ) -> QConfigMultiMapping: + """ + Creates a QConfigMultiMapping from a list of QConfigMappings + """ + new_qconfig_multi_mapping = cls() + + new_qconfig_multi_mapping.qconfig_mappings_list = copy.deepcopy( + qconfig_mapping_list + ) + + # we need to avoid the issue described in _handle_list_size_mismatch, + # so we reinsert all the qconfigs using the QConfigMultiMapping + # set methods + + # go through all qconfig styles + # note: global can be ignored since it is None by default + for style in _QCONFIG_STYLE_ORDER[1:]: + + # gather all key+qconfigs for current style + # into qconfig_dict_list + qconfig_dict_list: Dict[Any, List[QConfigAny]] = {} + for qconfig_mapping in qconfig_mapping_list: + qconfig_dict = getattr(qconfig_mapping, style) + for key, qconfig in qconfig_dict.items(): + if key not in qconfig_dict_list: + qconfig_dict_list[key] = [] + qconfig_dict_list[key].append(qconfig) + + # reinsert all gathered key+qconfigs + set_method_name = _QCONFIG_STYLE_TO_METHOD[style] + set_method = getattr(new_qconfig_multi_mapping, set_method_name) + for key, qconfig_list in qconfig_dict_list.items(): + if isinstance(key, tuple): + set_method(*key, qconfig_list) + else: + set_method(key, qconfig_list) + + return new_qconfig_multi_mapping From c65a40d5ded783b3f762c8ca8b0d4e5a1aa3251e Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Wed, 26 Oct 2022 16:13:20 +0000 Subject: [PATCH 0187/1922] Enable some PyTorch core tests with inductor (#87490) Summary: 1) Graph break on torch.random.set_rng_state since it blocks running inductor core tests; 2) Add several inductor-specific skips; 3) Enable several core tests for inductor CI; cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request 
resolved: https://github.com/pytorch/pytorch/pull/87490 Approved by: https://github.com/eellison --- .jenkins/pytorch/test.sh | 11 ++++------- test/dynamo/test_repros.py | 2 ++ test/test_modules.py | 6 +++++- test/test_ops.py | 6 ++++++ test/test_ops_gradients.py | 6 ++++-- torch/_dynamo/variables/torch.py | 3 +++ 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 94896701771c6..89fbd764201a1 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -251,13 +251,10 @@ test_dynamo_shard() { test_inductor() { - echo "TODO: enable inductor unit tests" - # time python test/run_test.py --core --exclude test_autograd --continue-through-error --verbose - - # PYTORCH_TEST_WITH_DYNAMO and PYTORCH_TEST_WITH_INDUCTOR are only needed for PyTorch tests not written with - # using dynamo/inductor. For dynamo/inductor unit tests, specifiying them will trigger an error like - # "Detected two calls to `torchdynamo.optimize(...)` with a different backend compiler arguments." 
- # PYTORCH_TEST_WITH_DYNAMO=0 PYTORCH_TEST_WITH_INDUCTOR=0 pytest test/inductor + python test/test_modules.py --verbose + # TODO: investigate "RuntimeError: CUDA driver API confirmed a leak" + # seen intest_ops_gradients.py + # pytest test/test_ops_gradients.py --verbose -k "not _complex and not test_inplace_grad_acos_cuda_float64" } test_inductor_huggingface_shard() { diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 66fc19895dd62..41564952a7444 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -1016,6 +1016,8 @@ def test_create_rand_mask_from_inputs(self): self.assertEqual(cnt.frame_count, 1) self.assertEqual(cnt.op_count, 8) + # TODO: make set_rng_state work with FakeTensor/aot_autograd + @patch.object(torch._dynamo.config, "fake_tensor_propagation", False) def test_rng_state(self): def fn(): state = torch.get_rng_state() diff --git a/test/test_modules.py b/test/test_modules.py index e06f0cc617d99..2f5008244d548 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -11,7 +11,8 @@ instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta) from torch.testing._internal.common_modules import module_db, modules, TrainEvalMode from torch.testing._internal.common_utils import ( - TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, gradgradcheck, skipIfMps) + TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, + gradgradcheck, skipIfMps, skipIfTorchInductor) from unittest.mock import patch, call @@ -326,6 +327,7 @@ def inner_zero_grad(obj): @skipIfMps @modules(module_db) + @skipIfTorchInductor("to be fixed") def test_non_contiguous_tensors(self, device, dtype, module_info, training): # Check modules work with non-contiguous tensors @@ -489,6 +491,7 @@ def test_gradgrad(self, device, dtype, module_info, training): @toleranceOverride({torch.float32: tol(5e-2, 0), torch.float64: tol(4e-4, 0)}) @modules(module_db) + 
@skipIfTorchInductor("to be fixed") def test_cpu_gpu_parity(self, device, dtype, module_info, training): # TODO: RNN / GRU / LSTM don't support backwards on eval mode for cuDNN; skip this in a # nicer way for eval mode only. @@ -579,6 +582,7 @@ def check_backward(cpu_output, gpu_output): @skipIfMps @modules(module_db) + @skipIfTorchInductor("to be fixed") def test_memory_format(self, device, dtype, module_info, training): is_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6) # TODO tighten it to a specific module diff --git a/test/test_ops.py b/test/test_ops.py index 5e9371e982341..0e5b6f1d607dd 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -36,6 +36,7 @@ first_sample, parametrize, skipIfSlowGradcheckEnv, + skipIfTorchInductor, slowTest, ) from torch.testing._internal.common_methods_invocations import ( @@ -209,6 +210,7 @@ def to_cpu(arg): @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @onlyNativeDeviceTypes @ops(python_ref_db) + @skipIfTorchInductor("Takes too long for inductor") def test_python_ref_meta(self, device, dtype, op): with FakeTensorMode() as mode: pass @@ -374,6 +376,7 @@ def _distance(a, b): @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @onlyNativeDeviceTypes @ops(python_ref_db) + @skipIfTorchInductor("Takes too long for inductor") def test_python_ref(self, device, dtype, op): # In this test, primTorch refs call into the refs namespace # For example, a ref with torch.foo in it will calls refs.foo instead @@ -386,6 +389,7 @@ def test_python_ref(self, device, dtype, op): @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @onlyNativeDeviceTypes @ops(python_ref_db) + @skipIfTorchInductor("Takes too long for inductor") def test_python_ref_torch_fallback(self, device, dtype, op): # In this test, refs call into the torch namespace (after the initial invocation) # For example, a ref with torch.foo in it will call torch.foo instead of refs.foo @@ -397,6 +401,7 @@ def 
test_python_ref_torch_fallback(self, device, dtype, op): @skipCUDAIfRocm @ops(python_ref_db) @parametrize('executor', ['aten', 'nvfuser']) + @skipIfTorchInductor("Takes too long for inductor") def test_python_ref_executor(self, device, dtype, op, executor): # TODO: Not all dtypes are supported with nvfuser from torch._prims_common import _torch_dtype_to_nvfuser_dtype_map @@ -457,6 +462,7 @@ def test_errors(self, device, op): @skipMeta @onlyNativeDeviceTypes @ops([op for op in python_ref_db if op.error_inputs_func is not None], dtypes=OpDTypes.none) + @skipIfTorchInductor("Takes too long for inductor") def test_python_ref_errors(self, device, op): mode = FakeTensorMode() with mode: diff --git a/test/test_ops_gradients.py b/test/test_ops_gradients.py index 0411f043df9c0..6d517c7a7f8b1 100644 --- a/test/test_ops_gradients.py +++ b/test/test_ops_gradients.py @@ -4,8 +4,9 @@ from itertools import chain import torch -from torch.testing._internal.common_utils import \ - (TestCase, is_iterable_of_tensors, run_tests, gradcheck, gradgradcheck, is_slow_gradcheck_env) +from torch.testing._internal.common_utils import ( + TestCase, is_iterable_of_tensors, run_tests, gradcheck, gradgradcheck, is_slow_gradcheck_env, + skipIfTorchInductor) from torch.testing._internal.common_methods_invocations import op_db from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, ops, OpDTypes) @@ -253,6 +254,7 @@ def test_forward_mode_AD(self, device, dtype, op): self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False) @_gradcheck_ops(op_db) + @skipIfTorchInductor("to be fixed") def test_inplace_forward_mode_AD(self, device, dtype, op): self._skip_helper(op, device, dtype) diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index e0c88b2cf059a..36ca6591189de 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -320,6 +320,9 @@ def get_state_from_generator(): assert 
isinstance(args[0], TensorVariable) if config.fake_tensor_propagation: + unimplemented( + "TODO: make torch.random.set_rng_state work with FakeTensor/aot_autograd" + ) # In fake tensor case, this state doesn't matter, but # it needs to be valid to not segfault. Pull a real tensor out. # The value won't matter since we are running with fake tensors anyway, so rng doesn't matter. From 3ea401402e31a80ac31a30d500d00be6e680ddf4 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 26 Oct 2022 19:23:55 +0000 Subject: [PATCH 0188/1922] Set check-latest to false when setup python and pip cache in CI (#87621) I missed the fine print in https://github.com/actions/setup-python/blob/main/README.md#caching-packages-dependencies when setting up the cache using setup-python GHA > Restored cache will not be used if the requirements.txt file is not updated for a long time and a newer version of the dependency is available which can lead to an increase in total build time. The latter part is important because it implies that even with the cache, pip will still try to check if a newer version exists and that part can be flaky, i.e. https://github.com/pytorch/pytorch/actions/runs/3313764038/jobs/5472180293 This undesired behavior can be turned off by setting the advance option `check-latest` to false https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md#check-latest-version. Per my understanding, this should tell pip install in these workflows to use the local cached copy of the package avoiding the need to query pypi every single time. `check-latest` was added quite recently https://github.com/actions/setup-python/pull/406, so `actionlint-1.6.15` fails to recognize it. Thus, this PR also upgrades `actionlint` to the latest 1.6.21 to pass the linter check. 
Here is an example error from 1.6.15 from https://github.com/pytorch/pytorch/actions/runs/3315388073/jobs/5475918454: ``` >>> Lint for .github/workflows/lint.yml: Error (ACTIONLINT) [action] input "check-latest" is not defined in action "actions/setup-python@v4". available inputs are "architecture", "cache", "cache-dependency-path", "python-version", "python-version-file", "token" 25 | with: 26 | python-version: 3.8 27 | architecture: x64 >>> 28 | check-latest: false 29 | cache: pip 30 | cache-dependency-path: | 31 | **/.github/requirements-gha-cache.txt ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87621 Approved by: https://github.com/ZainRizvi --- .github/actions/setup-win/action.yml | 3 ++- .github/workflows/lint.yml | 23 ++++++++++++++--------- .github/workflows/pr-labels.yml | 3 ++- .github/workflows/revert.yml | 3 ++- .github/workflows/trymerge.yml | 3 ++- .github/workflows/tryrebase.yml | 3 ++- .github/workflows/update-viablestrict.yml | 1 + tools/linter/adapters/s3_init_config.json | 8 ++++---- 8 files changed, 29 insertions(+), 18 deletions(-) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index d442343430c7d..4447e9203d504 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -57,7 +57,8 @@ runs: - name: Setup Python3 uses: actions/setup-python@v4 with: - python-version: "3.x" + python-version: 3.x + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 669977b143a5e..17ffb239b15a7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -25,16 +25,14 @@ jobs: with: python-version: 3.8 architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt - - name: Install lintrunner - uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482 - with: - timeout_minutes: 5 - max_attempts: 3 
- command: pip install lintrunner==0.9.2 + - name: Install requirements + run: | + pip install -r .github/requirements-gha-cache.txt --user - name: Initialize lint dependencies run: lintrunner init @@ -87,6 +85,7 @@ jobs: with: python-version: 3.x architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -151,6 +150,7 @@ jobs: with: python-version: 3.x architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -242,7 +242,8 @@ jobs: with: python-version: 3.8 architecture: x64 - cache: 'pip' + check-latest: false + cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt @@ -275,6 +276,7 @@ jobs: with: python-version: 3.8 architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -320,21 +322,24 @@ jobs: with: python-version: 3.5 architecture: x64 + check-latest: false cache: pip cache-dependency-path: | - **/.github/requirements-gha-cache.txt + **/requirements.txt - name: Setup Python 3.8 if: matrix.test_type != 'older_python_version' uses: actions/setup-python@v4 with: python-version: 3.8 architecture: x64 + check-latest: false cache: pip cache-dependency-path: | - **/.github/requirements-gha-cache.txt + **/requirements.txt - name: Install torch if: matrix.test_type == 'with_torch' run: | + pip install -r requirements.txt # Doesn't really matter what torch version, we just need ANY torch installed pip install 'torch==1.*' - name: Run collect_env.py diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml index aa8cf4472b784..de6da1feec02a 100644 --- a/.github/workflows/pr-labels.yml +++ b/.github/workflows/pr-labels.yml @@ -17,7 +17,8 @@ jobs: - name: Set up python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: 3.10 + check-latest: false cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt diff --git a/.github/workflows/revert.yml 
b/.github/workflows/revert.yml index d207840f383b4..6468f3b8c804c 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -23,7 +23,8 @@ jobs: with: python-version: 3.8 architecture: x64 - cache: 'pip' + check-latest: false + cache: pip - run: pip install pyyaml==6.0 - name: Setup committer id diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index dff92303f5056..372b442163df0 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -22,7 +22,8 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - cache: 'pip' + check-latest: false + cache: pip architecture: x64 - run: pip install pyyaml==6.0 diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml index fed9000c420e9..dd32069932678 100644 --- a/.github/workflows/tryrebase.yml +++ b/.github/workflows/tryrebase.yml @@ -22,7 +22,8 @@ jobs: with: python-version: 3.8 architecture: x64 - cache: 'pip' + check-latest: false + cache: pip - run: pip install pyyaml==6.0 - name: Setup committer id diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 5901b1f4cda1b..4be70de020a3b 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -24,6 +24,7 @@ jobs: with: python-version: 3.8 architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/.circleci/docker/requirements-ci.txt diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json index 0b0e87e8e26cf..d48f264f83d5d 100644 --- a/tools/linter/adapters/s3_init_config.json +++ b/tools/linter/adapters/s3_init_config.json @@ -27,12 +27,12 @@ }, "actionlint": { "Darwin": { - "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Darwin_amd64/actionlint", - "hash": "e9a0e0b17e54cfefe7964b6aa1da8921b1f8f2318c31c0eb1a17ea3e8ab10db2" + "download_url": 
"https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Darwin_amd64/actionlint", + "hash": "b354db83815384d3c3a07f68f44b30cb0a70899757a0d185d7322de9952e8813" }, "Linux": { - "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Linux_arm64/actionlint", - "hash": "d6b45ae67f29a2bf9ddd226071ddd8f158fdf2992e8515a06838e5fef90f3a2d" + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint", + "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76" } } } From 9451c581f3e05868bdc1d73a9512ad0d4f8795dc Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 26 Oct 2022 19:29:05 +0000 Subject: [PATCH 0189/1922] Fix typos under aten directory (#87754) This PR fixes typos in `.md` files under aten directory Pull Request resolved: https://github.com/pytorch/pytorch/pull/87754 Approved by: https://github.com/kit1980 --- aten/src/ATen/native/README.md | 6 +++--- aten/src/ATen/native/cpu/README.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 01a25e3a978cc..c355423ea7501 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -445,7 +445,7 @@ By default, ATen code generation will generate device check, which will ensure all the tensor parameters passed to kernel are on the same device. -However, in some cases, checking the device is unncessary, because, +However, in some cases, checking the device is unnecessary, because, e.g., you call a function allows to work on multiple devices. In that case, code generation of the device check can be disabled by adding `device_check: NoCheck` to your function definition. @@ -556,7 +556,7 @@ Here're steps to follow to decide the right dispatch keyword: Note: to support training, you're required to write a formula in derivatives.yaml since your backend implementations don't support autograd. 
- - Yes: you're likely calling other `at::` ops in the implemetation. Go to step 2. + - Yes: you're likely calling other `at::` ops in the implementation. Go to step 2. 2. Think about training: does your kernel support autograd? [check autograd support](#will-your-function-be-automatically-differentiable) - Yes: in other words, you're providing a `CompositeImplicitAutograd` kernel which supports both inference and autograd. @@ -610,7 +610,7 @@ It shows for a certain operator, what the computed dispatch table looks like aft 4. TODO: AutogradCPUOrCUDA Note that in native_functions.yaml you can mix using backend keywords and alias keywords above for one op: - - direct registration to backend always has higher precendence than alias + - direct registration to backend always has higher precedence than alias - DO NOT provide multiple alias keywords to the same op: alias keywords have precedence `CompositeExplicitAutograd > CompositeImplicitAutograd`, e.g. adding both `CompositeImplicitAutograd` and `CompositeExplicitAutograd` kernels for one op will completely ignore `CompositeImplicitAutograd` kernel for both inference and training. Thus this will trigger an error when native_functions.yaml is parsed. diff --git a/aten/src/ATen/native/cpu/README.md b/aten/src/ATen/native/cpu/README.md index ab2f9d3d02609..2cf6fa0a13320 100644 --- a/aten/src/ATen/native/cpu/README.md +++ b/aten/src/ATen/native/cpu/README.md @@ -64,7 +64,7 @@ within 256bit & 512bits registers. vec defines various operators such as As an example `ReduceOpsKernel.cpp` implements a generic `kernel_` that reduces an entire array using a given associative binary operation such as +. -More explicity, calling `kernel_` with template argument `std::plus` will cause +More explicitly, calling `kernel_` with template argument `std::plus` will cause it to sum up the entire array into a single value. 
`ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which @@ -73,7 +73,7 @@ generic code, which will be compiled under multipled compilation settings. `../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains a generic definition of `sumImplAll`. This function allows the user to reduce -over a dimension or all dimensions. The appropiate capability is chosen at +over a dimension or all dimensions. The appropriate capability is chosen at runtime using cpuinfo. If the current platform has AVX2, `sumImpl` will be set to `sumImplAll`. From 24ec5083be807bcf668ba85eb3543bc6eb35ef94 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Wed, 26 Oct 2022 04:34:38 +0000 Subject: [PATCH 0190/1922] Fix missing weight init and clean up helper (#87760) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87760 Approved by: https://github.com/davidberard98 --- test/distributed/test_dynamo_distributed.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 43a4a23039175..36a459b6f00c3 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -12,6 +12,10 @@ from torch._dynamo.utils import same from torch.nn.parallel import DistributedDataParallel as DDP +def init_weights(m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + m.bias.data.fill_(0.01) class ToyModel(nn.Module): def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5): @@ -19,7 +23,7 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5): self.net = nn.Sequential( *[nn.Linear(in_feat, hidden_feat), nn.ReLU()] + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden - + [nn.Linear(hidden_feat, 5), nn.ReLU()] + + [nn.Linear(hidden_feat, out_feat), nn.ReLU()] ) def forward(self, inputs): @@ -63,9 +67,10 @@ def 
tearDownClass(cls): dist.destroy_process_group() super().tearDownClass() - def get_model(self): - m = ToyModel().to(self.device) - inputs = torch.randn(20, 10).to(self.device) + def get_model(self, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5): + m = ToyModel(in_feat=in_feat, hidden_feat=hidden_feat, out_feat=out_feat).to(self.device) + m.apply(init_weights) + inputs = torch.rand(bsz, in_feat).to(self.device) outputs = m(inputs) return m, inputs, outputs @@ -161,11 +166,8 @@ def test_no_split(self): introducing graph splits. (Based on model parmeters fitting in the bucket) """ # DDP will always do a 'first bucket' with a really small size; so only a tiny model will escape this - m = ToyModel(hidden_feat=5).to(self.device) - inputs = torch.randn(20, 10).to(self.device) - correct_outputs = m(inputs) + m, inputs, correct_outputs = self.get_model(hidden_feat=5) ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250) - check_splits_compiler = CheckSplitsCompiler() @torch._dynamo.optimize(check_splits_compiler.compile_fn) @@ -233,7 +235,8 @@ def forward(self, x): return self.seq(x) m = MyModule().to(self.device) - inputs = torch.randn((512, 512)).to(self.device) + m.apply(init_weights) + inputs = torch.rand((512, 512)).to(self.device) correct_outputs = m(inputs) ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=1) From e9dba4d8e21aa5468c3f5453f0e265fe665ac6f1 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Wed, 26 Oct 2022 04:34:41 +0000 Subject: [PATCH 0191/1922] Add dynamo_optimize_ddp arg to dist bench (#87768) cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87768 Approved by: https://github.com/davidberard98 --- benchmarks/dynamo/distributed.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py index b4332556c7bb3..c2db15563348a 100644 --- a/benchmarks/dynamo/distributed.py +++ 
b/benchmarks/dynamo/distributed.py @@ -63,6 +63,8 @@ def move_tensor(maybe_tensor): if args.dynamo: if args.verbose: dynamo.config.verbose = True + if args.dynamo_optimize_ddp: + dynamo.config.optimize_ddp = True def print_compile(gm, ex): print( @@ -129,6 +131,11 @@ def experiment(fn, key, world_size, results): parser.add_argument( "--world_size", type=int, default=2, help="Number of ranks/gpus for experiments" ) + parser.add_argument( + "--dynamo_optimize_ddp", + action="store_true", + help="Enable dynamo's ddp optimizer", + ) parser.add_argument( "--fsdp_checkpoint", action="store_true", From eb5c5077fa0c63c867feeda72364d1f20a82ab1b Mon Sep 17 00:00:00 2001 From: jpvillam Date: Wed, 26 Oct 2022 19:39:21 +0000 Subject: [PATCH 0192/1922] ROCm enable sparse_sampled_addmm (#86401) Enables: test_comprehensive_sparse_sampled_addmm_cuda_complex128 test_comprehensive_sparse_sampled_addmm_cuda_complex64 test_comprehensive_sparse_sampled_addmm_cuda_float32 test_comprehensive_sparse_sampled_addmm_cuda_float64 test_dispatch_meta_sparse_sampled_addmm_cuda_complex128 test_dispatch_meta_sparse_sampled_addmm_cuda_complex64 test_dispatch_meta_sparse_sampled_addmm_cuda_float32 test_dispatch_meta_sparse_sampled_addmm_cuda_float64 test_meta_sparse_sampled_addmm_cuda_complex128 test_meta_sparse_sampled_addmm_cuda_complex64 test_meta_sparse_sampled_addmm_cuda_float32 test_meta_sparse_sampled_addmm_cuda_float64 Pull Request resolved: https://github.com/pytorch/pytorch/pull/86401 Approved by: https://github.com/ngimel --- aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp | 2 +- torch/testing/_internal/common_cuda.py | 7 +++++++ torch/testing/_internal/common_methods_invocations.py | 6 ++++-- torch/utils/hipify/cuda_to_hip_mappings.py | 4 ++++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp index 379640bad56b9..833fd41eb6a02 100644 --- 
a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp @@ -1401,7 +1401,7 @@ void sampled_addmm_out_sparse_csr( const Scalar& beta, const Scalar& alpha, const at::sparse_csr::SparseCsrTensor& C) { -#if !AT_USE_CUSPARSE_GENERIC_SDDMM() +#if !(AT_USE_CUSPARSE_GENERIC_SDDMM() || AT_USE_HIPSPARSE_GENERIC_52_API()) TORCH_CHECK( false, "Calling sampled_addmm with sparse GPU tensors requires compiling ", diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 1ee8e40ebd062..b226c7af58e51 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -173,6 +173,13 @@ def _get_torch_cuda_version(): cuda_version = str(torch.version.cuda) return tuple(int(x) for x in cuda_version.split(".")) +def _get_torch_rocm_version(): + if not TEST_WITH_ROCM: + return (0, 0) + rocm_version = str(torch.version.hip) + rocm_version = rocm_version.split("-")[0] # ignore git sha + return tuple(int(x) for x in rocm_version.split(".")) + def _check_cusparse_generic_available(): version = _get_torch_cuda_version() min_supported_version = (10, 1) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 00f454bdf454a..94c12f5bc93d0 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -26,7 +26,7 @@ toleranceOverride, tol) from torch.testing._internal.common_cuda import ( CUDA11OrLater, SM53OrLater, SM60OrLater, with_tf32_off, TEST_CUDNN, - _get_torch_cuda_version) + _get_torch_cuda_version, _get_torch_rocm_version) from torch.testing._internal.common_utils import ( make_fullrank_matrices_with_distinct_singular_values, TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY, @@ -9392,7 +9392,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_autograd=True, 
sample_inputs_func=sample_inputs_sparse_sampled_addmm, decorators=[ - skipCUDAIf(_get_torch_cuda_version() < (11, 3), "cusparseSDDMM was added in 11.2.1"), + skipCUDAIf(not ((_get_torch_cuda_version() >= (11, 3)) + or (_get_torch_rocm_version() >= (5, 2))), + "cusparseSDDMM was added in 11.2.1"), skipCPUIfNoMklSparse, ], skips=( # NotImplementedError: Tensors of type SparseCsrTensorImpl do not have is_contiguous diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index 3b3a047a7f9b5..33e14e9e0572e 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -7920,6 +7920,9 @@ ("cusparseSpGEMM_createDescr", ("hipsparseSpGEMM_createDescr", CONV_MATH_FUNC, API_SPARSE)), ("cusparseDnMatSetStridedBatch", ("hipsparseDnMatSetStridedBatch", CONV_MATH_FUNC, API_SPARSE)), ("cusparseSpGEMM_copy", ("hipsparseSpGEMM_copy", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseSDDMM_bufferSize", ("hipsparseSDDMM_bufferSize", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseSDDMM_preprocess", ("hipsparseSDDMM_preprocess", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseSDDMM", ("hipsparseSDDMM", CONV_MATH_FUNC, API_SPARSE)), ("cusparseSpGEMM_compute", ("hipsparseSpGEMM_compute", CONV_MATH_FUNC, API_SPARSE)), ("cusparseSpGEMM_workEstimation", ("hipsparseSpGEMM_workEstimation", CONV_MATH_FUNC, API_SPARSE)), ("cusparseSpMatGetSize", ("hipsparseSpMatGetSize", CONV_MATH_FUNC, API_SPARSE)), @@ -7947,6 +7950,7 @@ ("CUSPARSE_COOMV_ALG", ("HIPSPARSE_COOMV_ALG", CONV_NUMERIC_LITERAL, API_SPARSE)), ("CUSPARSE_CSRMM_ALG1", ("HIPSPARSE_CSRMM_ALG1", CONV_NUMERIC_LITERAL, API_SPARSE)), ("CUSPARSE_SPGEMM_DEFAULT", ("HIPSPARSE_SPGEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_SPARSE)), + ("CUSPARSE_SDDMM_ALG_DEFAULT", ("HIPSPARSE_SDDMM_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPARSE)), ( "CUSPARSE_STATUS_SUCCESS", ("HIPSPARSE_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_SPARSE), From b1a7e8c4dc0a3893955f2add7cc265e4f15c254a Mon Sep 17 
00:00:00 2001 From: PyTorch MergeBot Date: Wed, 26 Oct 2022 19:40:51 +0000 Subject: [PATCH 0193/1922] Revert "Set check-latest to false when setup python and pip cache in CI (#87621)" This reverts commit 4080b1db284fd531654bcb2984a7fe0ff3b310cd. Reverted https://github.com/pytorch/pytorch/pull/87621 on behalf of https://github.com/huydhn due to Somehow setup-python treats Python 3.10 as Python 3.1 in pr-label.yml. I missed this signal because this is only run at push --- .github/actions/setup-win/action.yml | 3 +-- .github/workflows/lint.yml | 23 +++++++++-------------- .github/workflows/pr-labels.yml | 3 +-- .github/workflows/revert.yml | 3 +-- .github/workflows/trymerge.yml | 3 +-- .github/workflows/tryrebase.yml | 3 +-- .github/workflows/update-viablestrict.yml | 1 - tools/linter/adapters/s3_init_config.json | 8 ++++---- 8 files changed, 18 insertions(+), 29 deletions(-) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 4447e9203d504..d442343430c7d 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -57,8 +57,7 @@ runs: - name: Setup Python3 uses: actions/setup-python@v4 with: - python-version: 3.x - check-latest: false + python-version: "3.x" cache: pip cache-dependency-path: | **/requirements.txt diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 17ffb239b15a7..669977b143a5e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -25,14 +25,16 @@ jobs: with: python-version: 3.8 architecture: x64 - check-latest: false cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt - - name: Install requirements - run: | - pip install -r .github/requirements-gha-cache.txt --user + - name: Install lintrunner + uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482 + with: + timeout_minutes: 5 + max_attempts: 3 + command: pip install lintrunner==0.9.2 - name: Initialize lint dependencies run: lintrunner init @@ 
-85,7 +87,6 @@ jobs: with: python-version: 3.x architecture: x64 - check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -150,7 +151,6 @@ jobs: with: python-version: 3.x architecture: x64 - check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -242,8 +242,7 @@ jobs: with: python-version: 3.8 architecture: x64 - check-latest: false - cache: pip + cache: 'pip' cache-dependency-path: | **/.github/requirements-gha-cache.txt @@ -276,7 +275,6 @@ jobs: with: python-version: 3.8 architecture: x64 - check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -322,24 +320,21 @@ jobs: with: python-version: 3.5 architecture: x64 - check-latest: false cache: pip cache-dependency-path: | - **/requirements.txt + **/.github/requirements-gha-cache.txt - name: Setup Python 3.8 if: matrix.test_type != 'older_python_version' uses: actions/setup-python@v4 with: python-version: 3.8 architecture: x64 - check-latest: false cache: pip cache-dependency-path: | - **/requirements.txt + **/.github/requirements-gha-cache.txt - name: Install torch if: matrix.test_type == 'with_torch' run: | - pip install -r requirements.txt # Doesn't really matter what torch version, we just need ANY torch installed pip install 'torch==1.*' - name: Run collect_env.py diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml index de6da1feec02a..aa8cf4472b784 100644 --- a/.github/workflows/pr-labels.yml +++ b/.github/workflows/pr-labels.yml @@ -17,8 +17,7 @@ jobs: - name: Set up python uses: actions/setup-python@v4 with: - python-version: 3.10 - check-latest: false + python-version: '3.10' cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index 6468f3b8c804c..d207840f383b4 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -23,8 +23,7 @@ jobs: with: python-version: 3.8 architecture: x64 - 
check-latest: false - cache: pip + cache: 'pip' - run: pip install pyyaml==6.0 - name: Setup committer id diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 372b442163df0..dff92303f5056 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -22,8 +22,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - check-latest: false - cache: pip + cache: 'pip' architecture: x64 - run: pip install pyyaml==6.0 diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml index dd32069932678..fed9000c420e9 100644 --- a/.github/workflows/tryrebase.yml +++ b/.github/workflows/tryrebase.yml @@ -22,8 +22,7 @@ jobs: with: python-version: 3.8 architecture: x64 - check-latest: false - cache: pip + cache: 'pip' - run: pip install pyyaml==6.0 - name: Setup committer id diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 4be70de020a3b..5901b1f4cda1b 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -24,7 +24,6 @@ jobs: with: python-version: 3.8 architecture: x64 - check-latest: false cache: pip cache-dependency-path: | **/.circleci/docker/requirements-ci.txt diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json index d48f264f83d5d..0b0e87e8e26cf 100644 --- a/tools/linter/adapters/s3_init_config.json +++ b/tools/linter/adapters/s3_init_config.json @@ -27,12 +27,12 @@ }, "actionlint": { "Darwin": { - "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Darwin_amd64/actionlint", - "hash": "b354db83815384d3c3a07f68f44b30cb0a70899757a0d185d7322de9952e8813" + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Darwin_amd64/actionlint", + "hash": "e9a0e0b17e54cfefe7964b6aa1da8921b1f8f2318c31c0eb1a17ea3e8ab10db2" }, "Linux": { - "download_url": 
"https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint", - "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76" + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Linux_arm64/actionlint", + "hash": "d6b45ae67f29a2bf9ddd226071ddd8f158fdf2992e8515a06838e5fef90f3a2d" } } } From bcb94062d5a801cf1e6f18039b9e0a348b55ec5a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 26 Oct 2022 20:08:29 +0000 Subject: [PATCH 0194/1922] Set check-latest to false when setup python and pip cache in CI (#87621) I missed the fine print in https://github.com/actions/setup-python/blob/main/README.md#caching-packages-dependencies when setting up the cache using setup-python GHA > Restored cache will not be used if the requirements.txt file is not updated for a long time and a newer version of the dependency is available which can lead to an increase in total build time. The latter part is important because it implies that even with the cache, pip will still try to check if a newer version exists and that part can be flaky, i.e. https://github.com/pytorch/pytorch/actions/runs/3313764038/jobs/5472180293 This undesired behavior can be turned off by setting the advance option `check-latest` to false https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md#check-latest-version. Per my understanding, this should tell pip install in these workflows to use the local cached copy of the package avoiding the need to query pypi every single time. `check-latest` was added quite recently https://github.com/actions/setup-python/pull/406, so `actionlint-1.6.15` fails to recognize it. Thus, this PR also upgrades `actionlint` to the latest 1.6.21 to pass the linter check. 
Here is an example error from 1.6.15 from https://github.com/pytorch/pytorch/actions/runs/3315388073/jobs/5475918454: ``` >>> Lint for .github/workflows/lint.yml: Error (ACTIONLINT) [action] input "check-latest" is not defined in action "actions/setup-python@v4". available inputs are "architecture", "cache", "cache-dependency-path", "python-version", "python-version-file", "token" 25 | with: 26 | python-version: 3.8 27 | architecture: x64 >>> 28 | check-latest: false 29 | cache: pip 30 | cache-dependency-path: | 31 | **/.github/requirements-gha-cache.txt ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87621 Approved by: https://github.com/ZainRizvi --- .github/actions/setup-win/action.yml | 3 +- .github/workflows/lint.yml | 37 +++++++++++++---------- .github/workflows/pr-labels.yml | 1 + .github/workflows/revert.yml | 5 +-- .github/workflows/trymerge.yml | 5 +-- .github/workflows/tryrebase.yml | 5 +-- .github/workflows/update-viablestrict.yml | 3 +- tools/linter/adapters/s3_init_config.json | 8 ++--- 8 files changed, 39 insertions(+), 28 deletions(-) diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index d442343430c7d..6dc1a1b6c6fe2 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -57,7 +57,8 @@ runs: - name: Setup Python3 uses: actions/setup-python@v4 with: - python-version: "3.x" + python-version: '3.x' + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 669977b143a5e..cff22d72d4d24 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -23,18 +23,16 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt - - name: Install lintrunner - uses: 
nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482 - with: - timeout_minutes: 5 - max_attempts: 3 - command: pip install lintrunner==0.9.2 + - name: Install requirements + run: | + pip install -r .github/requirements-gha-cache.txt --user - name: Initialize lint dependencies run: lintrunner init @@ -85,8 +83,9 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.x + python-version: '3.x' architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -149,8 +148,9 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.x + python-version: '3.x' architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -240,9 +240,10 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' architecture: x64 - cache: 'pip' + check-latest: false + cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt @@ -273,8 +274,9 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/requirements.txt @@ -318,23 +320,26 @@ jobs: if: matrix.test_type == 'older_python_version' uses: actions/setup-python@v4 with: - python-version: 3.5 + python-version: '3.5' architecture: x64 + check-latest: false cache: pip cache-dependency-path: | - **/.github/requirements-gha-cache.txt + **/requirements.txt - name: Setup Python 3.8 if: matrix.test_type != 'older_python_version' uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' architecture: x64 + check-latest: false cache: pip cache-dependency-path: | - **/.github/requirements-gha-cache.txt + **/requirements.txt - name: Install torch if: matrix.test_type == 'with_torch' run: | + pip install -r requirements.txt # Doesn't really matter what torch version, we just need ANY torch 
installed pip install 'torch==1.*' - name: Run collect_env.py diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml index aa8cf4472b784..9afa0e721ac60 100644 --- a/.github/workflows/pr-labels.yml +++ b/.github/workflows/pr-labels.yml @@ -18,6 +18,7 @@ jobs: uses: actions/setup-python@v4 with: python-version: '3.10' + check-latest: false cache: pip cache-dependency-path: | **/.github/requirements-gha-cache.txt diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index d207840f383b4..2a2fff27044ea 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -21,9 +21,10 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' architecture: x64 - cache: 'pip' + check-latest: false + cache: pip - run: pip install pyyaml==6.0 - name: Setup committer id diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index dff92303f5056..3d1d92967d885 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -21,8 +21,9 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 - cache: 'pip' + python-version: '3.8' + check-latest: false + cache: pip architecture: x64 - run: pip install pyyaml==6.0 diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml index fed9000c420e9..53434310c3d00 100644 --- a/.github/workflows/tryrebase.yml +++ b/.github/workflows/tryrebase.yml @@ -20,9 +20,10 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' architecture: x64 - cache: 'pip' + check-latest: false + cache: pip - run: pip install pyyaml==6.0 - name: Setup committer id diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 5901b1f4cda1b..12bf4e271f927 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -22,8 +22,9 
@@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.8' architecture: x64 + check-latest: false cache: pip cache-dependency-path: | **/.circleci/docker/requirements-ci.txt diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json index 0b0e87e8e26cf..d48f264f83d5d 100644 --- a/tools/linter/adapters/s3_init_config.json +++ b/tools/linter/adapters/s3_init_config.json @@ -27,12 +27,12 @@ }, "actionlint": { "Darwin": { - "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Darwin_amd64/actionlint", - "hash": "e9a0e0b17e54cfefe7964b6aa1da8921b1f8f2318c31c0eb1a17ea3e8ab10db2" + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Darwin_amd64/actionlint", + "hash": "b354db83815384d3c3a07f68f44b30cb0a70899757a0d185d7322de9952e8813" }, "Linux": { - "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Linux_arm64/actionlint", - "hash": "d6b45ae67f29a2bf9ddd226071ddd8f158fdf2992e8515a06838e5fef90f3a2d" + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint", + "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76" } } } From 427e641d694e032365b7001d0a49ce721e038d65 Mon Sep 17 00:00:00 2001 From: Sherlock Huang Date: Wed, 26 Oct 2022 17:38:05 +0000 Subject: [PATCH 0195/1922] Fix meta for index_add and index_put (#87775) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87775 Approved by: https://github.com/ezyang, https://github.com/ngimel --- test/test_meta.py | 64 ++++++++++++++++++++++++++++++++++++ torch/_meta_registrations.py | 2 +- torch/_refs/__init__.py | 5 ++- 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/test/test_meta.py b/test/test_meta.py index 23e7025140138..2431042e01728 100644 --- a/test/test_meta.py +++ b/test/test_meta.py @@ -1,5 +1,6 @@ # Owner(s): 
["module: primTorch"] +import itertools import torch import os from enum import Enum @@ -20,6 +21,7 @@ from torch.testing._internal.common_device_type import ( ops, instantiate_device_type_tests, + onlyCUDA, ) from torch.testing._internal.common_methods_invocations import op_db from torchgen.utils import YamlLoader @@ -187,6 +189,8 @@ def test_tensor_outlives_converter(self): CHECK_STRIDES = { torch.Tensor.__getitem__, + torch.ops.aten.index_put, + torch.ops.aten.index_add, } def should_check_strides(func): @@ -1023,6 +1027,66 @@ def test_fill_alias_relationship(self): r2 = torch.ops.aten.fill(inps, 1.0) self.assertNotEqual(id(inps), id(r2)) + def get_stride_variants(self, t): + results = [] + + # contiguous + results.append(t) + + # transposed + if t.ndim > 1: + perm = list(reversed(range(t.ndim))) + transposed = torch.empty(t.shape[::-1], device=t.device, dtype=t.dtype).permute(perm).copy_(t) + results.append(transposed) + + # nondense + nondense = torch.repeat_interleave(t, 2, dim=-1)[..., ::2] + results.append(nondense) + + return results + + @onlyCUDA + def test_index_add_stride(self, device): + to_meta = MetaConverter() + + x = torch.ones(5, 3, device=device) + t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float, device=device) + index = torch.tensor([0, 4, 2], device=device) + + xs = self.get_stride_variants(x) + ts = self.get_stride_variants(t) + + for x, t in itertools.product(xs, ts): + args = (x, 0, index, t) + meta_args = tree_map(to_meta, args) + + r = torch.ops.aten.index_add(*args) + meta_r = torch.ops.aten.index_add(*meta_args) + + self.assertEqual(r.size(), meta_r.size()) + self.assertEqual(r.stride(), meta_r.stride()) + + @onlyCUDA + def test_index_put_stride(self, device): + to_meta = MetaConverter() + + x = torch.rand(5, 5, device=device) + t = torch.rand(5, device=device) + index = torch.tensor([True, False, True, True, False], device=device) + + xs = self.get_stride_variants(x) + ts = self.get_stride_variants(t) + + for x, t 
in itertools.product(xs, ts): + args = (x, [index], t) + meta_args = tree_map(to_meta, args) + + r = torch.ops.aten.index_put(*args) + meta_r = torch.ops.aten.index_put(*meta_args) + + self.assertEqual(r.size(), meta_r.size()) + self.assertEqual(r.stride(), meta_r.stride()) + def test_map_location_deserialize(self): import io diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index 0af6813ce4a00..cde0ac96a2d84 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -1105,7 +1105,7 @@ def meta_relu_(self): @register_meta(aten.index_put.default) def meta_index_put(self, indices, values, accumulate=False): - return self.new_empty(self.size()) + return torch.empty_like(self) @register_meta(aten.masked_fill_.Scalar) diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index 44b75bb92df48..5cee1c9a684bb 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -3280,7 +3280,10 @@ def index_add( *, alpha: NumberType = 1, ): - return x.clone().index_add_(dim, index, tensor, alpha=alpha) # type: ignore[arg-type] + # index_add always returns a new contiguous tensor + return x.clone(memory_format=torch.contiguous_format).index_add_( + dim, index, tensor, alpha=alpha # type: ignore[arg-type] + ) @register_decomposition(torch.ops.aten.index_select) From 62b83f2428ad64e96129a040f0a761672e26e38f Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 26 Oct 2022 20:42:06 +0000 Subject: [PATCH 0196/1922] [ONNX] Deprecate operators.py (#87798) Deprecate `torch.onnx.operators` because it's only for backwards compatibility Pull Request resolved: https://github.com/pytorch/pytorch/pull/87798 Approved by: https://github.com/BowenBao --- torch/onnx/operators.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/onnx/operators.py b/torch/onnx/operators.py index e5f12444c3559..07f89b2e41a60 100644 --- a/torch/onnx/operators.py +++ b/torch/onnx/operators.py @@ -9,12 +9,15 @@ """ import torch -import 
torch.onnx +from torch.onnx import _deprecation +# 180-day deprecation period +@_deprecation.deprecated("1.14", "1.16", "use torch._shape_as_tensor") def shape_as_tensor(x): return torch._shape_as_tensor(x) +@_deprecation.deprecated("1.14", "1.16", "use torch._reshape_from_tensor") def reshape_from_tensor_shape(x, shape): return torch._reshape_from_tensor(x, shape) From e1c85fa44efe828564f0b332b0765f59cad6545f Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Wed, 26 Oct 2022 13:59:07 +0000 Subject: [PATCH 0197/1922] Clean up CPU test in test_torchinductor.py for fbcode (#87783) cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87783 Approved by: https://github.com/bertmaher --- test/inductor/test_torchinductor.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index a675fc476672b..8e8b371c2780e 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -19,6 +19,7 @@ from torch.fx.experimental.proxy_tensor import make_fx from torch.nn import functional as F from torch.testing._internal.common_utils import ( + IS_FBCODE, TEST_WITH_ASAN, TEST_WITH_ROCM, TestCase as TorchTestCase, @@ -54,6 +55,9 @@ HAS_CPU = False try: + if IS_FBCODE: + raise torch._inductor.exc.CppCompileError + from subprocess import CalledProcessError from torch._inductor.codecache import CppCodeCache @@ -410,13 +414,6 @@ def populate(cls): cls.gen_template(name1, name2) -class SweepInputsCpuTest(SweepInputs2, TestCase): - gen = InputGen(10, "cpu") - - -SweepInputsCpuTest.populate() - - class TestIndexingSimplification(TorchTestCase): def test_indexing_simplification(self): sizevars = SizeVarAllocator() @@ -4027,6 +4024,11 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None): if HAS_CPU: + class SweepInputsCpuTest(SweepInputs2, TestCase): + gen = 
InputGen(10, "cpu") + + SweepInputsCpuTest.populate() + class CpuTests(TestCase): common = check_model device = "cpu" From 3d6c0ba18bb3ee3261ceffb16281dbccccf834db Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Wed, 26 Oct 2022 20:54:25 +0000 Subject: [PATCH 0198/1922] Update XLA hash (#87818) This is a re-creation of https://github.com/pytorch/pytorch/pull/87808 so we don't have to wait. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87818 Approved by: https://github.com/clee2000 --- .github/ci_commit_pins/xla.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index e75cb6ffbe979..86063843174d2 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -1812b1d19477707ed027e7b597ff23a46176dab8 +79131e9d31290744afdf3d85118251863e16ab0e From bde87a1d2e25c2b40922a044b6e70f343fc4a632 Mon Sep 17 00:00:00 2001 From: Zafar Date: Wed, 26 Oct 2022 20:55:10 +0000 Subject: [PATCH 0199/1922] [ao] Fixing tests for block pruning shapes (#87326) The current unittests were only checking the tensors whose shapes were already multiples of the block size. That caused some hidden bugs to creep in. Specifically, for the shapes that would require padding for the mask/data, the sparsifier would try to apply shape-mismatching tensors onto each other. This caused segfaults as well as silent failures. This makes minor adjustments to the code to make sure the masks and data shapes are aligned, as well as fixing the tests to catch this. 
Test Plan: ```python python test/test_ao_sparsity.py ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87326 Approved by: https://github.com/jcaip --- test/ao/sparsity/test_sparsifier.py | 8 +++++--- torch/ao/pruning/sparsifier/weight_norm_sparsifier.py | 9 +++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/test/ao/sparsity/test_sparsifier.py b/test/ao/sparsity/test_sparsifier.py index 415679337ff2e..512c58b188367 100644 --- a/test/ao/sparsity/test_sparsifier.py +++ b/test/ao/sparsity/test_sparsifier.py @@ -18,14 +18,16 @@ class Model(nn.Module): def __init__(self): super().__init__() self.seq = nn.Sequential( - nn.Linear(16, 16) + nn.Linear(37, 39) ) - self.linear = nn.Linear(16, 16) - self.head = nn.Linear(16, 4) + self.linear = nn.Linear(39, 33) + self.head = nn.Linear(33, 13) def forward(self, x): x = self.seq(x) + x = torch.relu(x) x = self.linear(x) + x = torch.relu(x) x = self.head(x) return x diff --git a/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py b/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py index 8a66280cc852d..2ba2584616e21 100644 --- a/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py +++ b/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py @@ -99,7 +99,7 @@ def _make_tensor_mask(self, data, input_shape, sparsity_level, sparse_block_shap dw = (block_w - w % block_w) % block_w if mask is None: - mask = torch.ones(h, w, device=data.device) + mask = torch.ones(h + dh, w + dw, device=data.device) if sparsity_level >= 1.0: mask.data = torch.zeros_like(mask) @@ -141,14 +141,15 @@ def _make_block_mask(self, data, sparse_block_shape, zeros_per_block, mask=None) In this context the `zeros_per_block` describes the number of zeroed-out elements within a patch. 
""" - if mask is None: - mask = torch.ones(data.shape, device=data.device) h, w = data.shape[-2:] block_h, block_w = sparse_block_shape dh = (block_h - h % block_h) % block_h dw = (block_w - w % block_w) % block_w values_per_block = reduce((lambda x, y: x * y), sparse_block_shape) + if mask is None: + mask = torch.ones((h + dh, w + dw), device=data.device) + if values_per_block == zeros_per_block: # Everything should be sparsified mask.data = torch.zeros_like(mask) @@ -168,7 +169,7 @@ def _make_block_mask(self, data, sparse_block_shape, zeros_per_block, mask=None) dim=1, indices=sorted_idx, output_shape=padded_data.shape, block_shape=sparse_block_shape, mask=mask_reshape ) - mask.data = mask_reshape.squeeze().reshape(mask.shape)[:h, :w].contiguous() + mask.data = mask_reshape.squeeze().reshape(mask.shape).contiguous() return mask def update_mask(self, module, tensor_name, sparsity_level, sparse_block_shape, From fa1b988f9c28fa295abc3a48d5306fa056425198 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Wed, 26 Oct 2022 21:01:09 +0000 Subject: [PATCH 0200/1922] Revert "Disable linux-bionic-py3_7-clang8-xla-test (#87737)" This reverts commit 21f7e7d040c646b4ce7f4a4e973da97660462bdc. 
Reverted https://github.com/pytorch/pytorch/pull/87737 on behalf of https://github.com/kit1980 due to Re-enable XLA tests after https://github.com/pytorch/pytorch/pull/87818 --- .github/workflows/pull.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index cc25bfc1326d1..849e70dc9f29d 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -219,7 +219,6 @@ jobs: ]} linux-bionic-py3_7-clang8-xla-test: - if: false name: linux-bionic-py3_7-clang8-xla uses: ./.github/workflows/_linux-test.yml needs: linux-bionic-py3_7-clang8-xla-build From 94185b9b7c7dd066e5878e25baa6470a819073e9 Mon Sep 17 00:00:00 2001 From: soulitzer Date: Wed, 26 Oct 2022 13:34:34 -0400 Subject: [PATCH 0201/1922] Expose API for backward execution order (#87507) In this PR: - graph_task stores graph roots on construction so that we can later traverse through the graph - before the nodes are returned, they needed to be converted from raw_ptr to shared_ptr, and this should be OK because the graph is guaranteed to be alive Pull Request resolved: https://github.com/pytorch/pytorch/pull/87507 Approved by: https://github.com/albanD --- test/test_autograd.py | 122 ++++++++++++++++++ torch/csrc/Module.cpp | 26 ++++ torch/csrc/autograd/engine.cpp | 93 +++++++++++-- torch/csrc/autograd/function.h | 3 + torch/csrc/autograd/graph_task.h | 4 + .../autograd/engine/dist_engine.cpp | 8 ++ 6 files changed, 245 insertions(+), 11 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 03cc78dc242fb..43f31ae63ed32 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3073,6 +3073,128 @@ def hook(_): self.assertEqual(torch._C._current_graph_task_id(), -1) + def test_current_graph_task_execution_order(self): + predicted = [None] + + def hook(_): + predicted[0] = torch._C._current_graph_task_execution_order() + + def names(nodes): + return ", ".join([node.name().split(' ')[-1] for node in nodes]) + '\n' + + 
def grad_fns(*tensors): + # or grad accumulator + out = [] + for t in tensors: + if t.requires_grad and t.grad_fn is None: + out.append(t.clone().grad_fn.next_functions[0][0]) + else: + out.append(t.grad_fn) + return out + + actual = [] + + def register_logging_hooks(*tensors): + # register hooks that log the order in which they are called + def get_hook(i): + def hook(t_): + actual.append(tensors[i]) + return hook + + for i, t in enumerate(tensors): + t.register_hook(get_hook(i)) + + # Basic example: single path + t = torch.tensor(1., requires_grad=True).clone().sin().exp() + t.register_hook(hook) + with torch.autograd.set_multithreading_enabled(False): + t.backward() + self.assertExpectedInline(names(predicted[0]), """\ +ExpBackward0, SinBackward0, CloneBackward0, torch::autograd::AccumulateGrad +""") + + # We don't exactly follow sequence_nr order + a = torch.tensor(1., requires_grad=True) + b = torch.tensor(2., requires_grad=True) + c = b.sin() + d = a.cos() + out = c * d + register_logging_hooks(a, b, c, d, out) + out.register_hook(hook) + with torch.autograd.set_multithreading_enabled(False): + out.backward() + self.assertEqual(predicted[0], grad_fns(*actual)) + actual = [] + + # Multiple roots are also OK + a = torch.tensor(1., requires_grad=True) + b = a * 2 + out = b.sin() + out2 = b.cos() + out3 = b.cos() + register_logging_hooks(a, b, out, out2, out3) + out3.register_hook(hook) + with torch.autograd.set_multithreading_enabled(False): + torch.autograd.grad((out, out3, out2), inputs=(a,)) + self.assertExpectedInline(names(predicted[0]), """\ +CosBackward0, CosBackward0, SinBackward0, MulBackward0, torch::autograd::AccumulateGrad +""") + # TODO: Uncomment after update to hooks behavior + # self.assertEqual(predicted[0], grad_fns(*actual)) + actual = [] + + # Case where next node is nullptr + a = torch.tensor(1., requires_grad=True) + b = a * 2 + out = b.sin() + register_logging_hooks(a, b, out) + out.register_hook(hook) + with 
torch.autograd.set_multithreading_enabled(False): + out.backward() + self.assertEqual(predicted[0], grad_fns(*actual)) + actual = [] + + # Case where two `inputs` on the same path + a = torch.tensor(1., requires_grad=True) + b = a * 2 + out = b.sin() + register_logging_hooks(a, b, out) + out.register_hook(hook) + with torch.autograd.set_multithreading_enabled(False): + torch.autograd.grad((out,), inputs=(a, b,)) + self.assertEqual(names(predicted[0]), """\ +SinBackward0, MulBackward0, torch::autograd::AccumulateGrad +""") + # TODO: Uncomment after update to hooks behavior + # self.assertEqual(predicted[0], grad_fns(*actual)) + actual = [] + + # Case where `inputs` specifies a subgraph + a = torch.tensor(1., requires_grad=True) + b = torch.tensor(1., requires_grad=True) + c = a * b + out = c.sin() + register_logging_hooks(a, b, c, out) + out.register_hook(hook) + with torch.autograd.set_multithreading_enabled(False): + torch.autograd.grad((out,), inputs=(a,)) + self.assertEqual(names(predicted[0]), """\ +SinBackward0, MulBackward0, torch::autograd::AccumulateGrad +""") + # TODO: Uncomment after update to hooks behavior + # self.assertEqual(predicted[0], grad_fns(*actual)) + actual = [] + + # Errors when not called in a backward + with self.assertRaisesRegex(RuntimeError, "should only be called during the backward pass"): + torch._C._current_graph_task_execution_order() + + # Errors when context manager not enabled + t = torch.tensor(1., requires_grad=True).clone().sin().exp() + t.register_hook(hook) + with self.assertRaisesRegex(RuntimeError, "expects the current backward to be executed with multithreading disabled"): + t.backward() + def test_profiler(self): x = torch.randn(10, 10) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index e41f0305a2e11..98589a31eaced 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -813,6 +813,28 @@ PyObject* THPModule_willEngineExecuteNode(PyObject* _unused, PyObject* arg) { END_HANDLE_TH_ERRORS } 
+PyObject* THPModule_getCurrentGraphTaskExecutionOrder( + PyObject* _unused, + PyObject* noargs) { + HANDLE_TH_ERRORS + std::vector nodes = + torch::autograd::get_current_graph_task_execution_order(); + TORCH_CHECK( + nodes.size(), + "_current_graph_task_execution_order should only be called during the backward pass"); + auto list = THPObjectPtr(PyList_New(nodes.size())); + if (!list) + return nullptr; + for (const auto i : c10::irange(nodes.size())) { + // This node is guaranteed to be alive since the backward is still running + PyObject* pyobj_node = + torch::autograd::functionToPyObject(nodes[i]->getptr()); + PyList_SET_ITEM(list.get(), i, pyobj_node); + } + return list.release(); + END_HANDLE_TH_ERRORS +} + PyObject* THPModule_getCurrentGraphTaskId(PyObject* _unused, PyObject* noargs) { HANDLE_TH_ERRORS return THPUtils_packInt64(torch::autograd::get_current_graph_task_id()); @@ -1019,6 +1041,10 @@ static PyMethodDef TorchMethods[] = { THPModule_willEngineExecuteNode, METH_O, nullptr}, + {"_current_graph_task_execution_order", + THPModule_getCurrentGraphTaskExecutionOrder, + METH_NOARGS, + nullptr}, {"_current_graph_task_id", THPModule_getCurrentGraphTaskId, METH_NOARGS, diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index ca9ae4e443df5..0a2298efc1282 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -398,6 +398,66 @@ void add_node_to_current_graph_task_exec_info(Node* fn) { current_graph_task->exec_info_[fn].needed_ = true; } +// NB: The engine itself does not use the outputs of this function. 
+std::vector<Node*> get_current_graph_task_execution_order() { + std::shared_ptr<GraphTask> task = current_graph_task; + if (!task) { + return {}; + } + + // We could potentially check if there is only a single device here + // but explicitly requiring this context doesn't seem bad either + TORCH_CHECK( + !c10::AutogradState::get_tls_state().get_multithreading_enabled(), + "get_current_graph_task_execution_order expects the current backward to be " + "executed with multithreading disabled, e.g. by running:\n\n" + ">>> with torch.autograd.set_multithreading_enabled(False):\n" + "... torch.autograd.grad(...)\n"); + + const bool check_exec_info = !task->exec_info_.empty(); + std::vector<Node*> out{}; + std::unordered_set<Node*> seen{}; + + auto compare_seq_nr = [](Node* n1, Node* n2) { + return n1->sequence_nr() < n2->sequence_nr(); + }; + std::priority_queue<Node*, std::vector<Node*>, decltype(compare_seq_nr)> heap( + compare_seq_nr); + + for (Node* ptr : task->graph_roots_) { + heap.push(ptr); + } + + // Implementation notes: + // - Don't need to count dependencies because we have sequence_nr + // - Don't need to check topological_nr because we have exec_info + while (!heap.empty()) { + Node* fn = heap.top(); + heap.pop(); + + const bool was_inserted = seen.insert(fn).second; + if (!was_inserted) { + continue; + } + + out.push_back(fn); + for (const auto& edge : fn->next_edges()) { + Node* next_ptr = edge.function.get(); + if (!next_ptr) { + continue; + } + if (check_exec_info) { + auto it = task->exec_info_.find(next_ptr); + if (it == task->exec_info_.end() || !it->second.should_execute()) { + continue; + } + } + heap.push(next_ptr); + } + } + return out; +} + // NOTE: graph_tasks do not necessarily form a stack.
Imagine this // case: // @@ -1050,7 +1110,7 @@ auto Engine::compute_dependencies( } auto Engine::execute( - const edge_list& roots, + const edge_list& root_edges, const variable_list& inputs, bool keep_graph, bool create_graph, @@ -1058,9 +1118,9 @@ auto Engine::execute( const edge_list& outputs) -> variable_list { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) validate_outputs( - roots, const_cast(inputs), [](const std::string& msg) { - return msg; - }); + root_edges, + const_cast(inputs), + [](const std::string& msg) { return msg; }); if (accumulate_grad && create_graph) { TORCH_WARN_ONCE( "Using backward() with create_graph=True will create a reference cycle " @@ -1083,17 +1143,25 @@ auto Engine::execute( init_local_ready_queue(); bool not_reentrant_backward_call = worker_device == NO_DEVICE; + // Store root nodes so we can traverse through the graph later + // e.g., for get_current_graph_task_execution_order + c10::SmallVector temp_roots{root_edges.size()}; + for (const auto i : c10::irange(root_edges.size())) { + temp_roots[i] = root_edges[i].function.get(); + } + auto graph_task = std::make_shared( /* keep_graph */ keep_graph, /* create_graph */ create_graph, /* depth */ not_reentrant_backward_call ? 0 : total_depth + 1, - /* cpu_ready_queue */ local_ready_queue); + /* cpu_ready_queue */ local_ready_queue, + /* graph_roots */ std::move(temp_roots)); // If we receive a single root, skip creating extra root node - bool skip_dummy_node = roots.size() == 1; + bool skip_dummy_node = root_edges.size() == 1; auto graph_root = skip_dummy_node - ? roots.at(0).function - : std::make_shared(roots, inputs); + ? 
root_edges.at(0).function + : std::make_shared(root_edges, inputs); auto min_topo_nr = compute_min_topological_nr(outputs); // Now compute the dependencies for all executable functions @@ -1106,14 +1174,17 @@ auto Engine::execute( // Queue the root if (skip_dummy_node) { - InputBuffer input_buffer(roots.at(0).function->num_inputs()); + InputBuffer input_buffer(root_edges.at(0).function->num_inputs()); auto input = inputs.at(0); const auto input_stream = InputMetadata(input).stream(); const auto opt_next_stream = - roots.at(0).function->stream(c10::DeviceType::CUDA); + root_edges.at(0).function->stream(c10::DeviceType::CUDA); input_buffer.add( - roots.at(0).input_nr, std::move(input), input_stream, opt_next_stream); + root_edges.at(0).input_nr, + std::move(input), + input_stream, + opt_next_stream); execute_with_graph_task(graph_task, graph_root, std::move(input_buffer)); } else { diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index aa82e3ad2c77c..bb5f4b1eaad09 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -143,6 +143,9 @@ struct TORCH_API Node : std::enable_shared_from_this { Node& operator=(Node&& other) = delete; virtual ~Node() = default; + std::shared_ptr getptr() { + return shared_from_this(); + } /// Evaluates the function on the given inputs and returns the result of the /// function call. variable_list operator()(variable_list&& inputs) { diff --git a/torch/csrc/autograd/graph_task.h b/torch/csrc/autograd/graph_task.h index 8eb122313d0a0..4efbc905fed37 100644 --- a/torch/csrc/autograd/graph_task.h +++ b/torch/csrc/autograd/graph_task.h @@ -37,6 +37,7 @@ struct GraphTask : std::enable_shared_from_this { // Records the nodes that are in the graph std::unordered_set nodes_in_graph_; + c10::SmallVector graph_roots_; // Note [Exec info] // Exec info is created for each GraphTask, which allows filtering paths on // the graph that are not needed. It has a bit complicated semantics. 
If it's @@ -164,8 +165,10 @@ struct GraphTask : std::enable_shared_from_this { bool grad_mode, int reentrant_depth, std::shared_ptr cpu_ready_queue, + c10::SmallVector graph_roots, bool exit_on_error = false) : keep_graph_(keep_graph), + graph_roots_(std::move(graph_roots)), owner_(NO_DEVICE), reentrant_depth_(reentrant_depth), exit_on_error_(exit_on_error), @@ -198,6 +201,7 @@ get_current_graph_task_exec_info(); TORCH_API const std::unordered_set* get_current_graph_task_nodes_in_graph(); TORCH_API bool get_current_graph_task_keep_graph(); +TORCH_API std::vector get_current_graph_task_execution_order(); TORCH_API int get_current_graph_task_id(); void add_node_to_current_graph_task_exec_info(Node* fn); diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp index 2da315644845c..06c6927e4c467 100644 --- a/torch/csrc/distributed/autograd/engine/dist_engine.cpp +++ b/torch/csrc/distributed/autograd/engine/dist_engine.cpp @@ -185,6 +185,13 @@ void DistEngine::computeDependencies( bool retainGraph) { TORCH_INTERNAL_ASSERT(graphRoot, "graphRoot is null!"); + // Store root nodes so we can traverse through the graph later + // e.g., for get_current_graph_task_execution_order + c10::SmallVector temp_roots{rootEdges.size()}; + for (const auto i : c10::irange(rootEdges.size())) { + temp_roots[i] = rootEdges[i].function.get(); + } + // Build the graph task and graph root. // NOTE: we don't need to build and pass a cpu_ready_queue to GraphTask // as we use execute_graph_task_until_ready_queue_empty, which will build @@ -194,6 +201,7 @@ void DistEngine::computeDependencies( /* create_graph */ false, /* depth */ 0, /* cpu_ready_queue */ global_cpu_ready_queue_, + /* graph_roots */ temp_roots, /* exit_on_error */ true); // Run BFS to traverse the graph locally. 
The roots of the graph are From a4e7fdb112024668185b1dc01f9995a3618cf381 Mon Sep 17 00:00:00 2001 From: Cameron Voisey Date: Wed, 26 Oct 2022 21:34:13 +0000 Subject: [PATCH 0202/1922] Simplify installation instruction in contributing file (#87460) Simplification of one of the installation instructions in CONTRIBUTING.md that I found tricky to parse at first. Also adds a link to the "Make no-op build fast" section to make it easier to navigate to. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87460 Approved by: https://github.com/ngimel --- CONTRIBUTING.md | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 05e98c3b9a673..c43d64c4610d6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -118,21 +118,9 @@ git submodule sync --recursive git submodule update --init --recursive --jobs 0 ``` -If you want to have no-op incremental rebuilds (which are fast), see the section below titled "Make no-op build fast." +If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. -3. Follow the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source), except when it's time to install PyTorch instead of invoking `setup.py install` you'll want to call `setup.py develop` instead: - -Specifically, the change you have to make is to replace - -```bash -python setup.py install -``` - -with - -```bash -python setup.py develop -``` +3. Follow the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source), but instead of installing PyTorch via `python setup.py install`, use `python setup.py develop`. This mode will symlink the Python files from the current local source tree into the Python install. 
This way when you modify a Python file, you From 966df362cdbd79b3eaa50fc5a356368d7e7978ae Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 26 Oct 2022 21:51:13 +0000 Subject: [PATCH 0203/1922] [CI] Delete `nnpack` installation from conda (#87813) Not sure why it was there to begin with and I really hope none of our CI depend on the package that was last updated 5 years ago, see https://anaconda.org/killeent/nnpack Pull Request resolved: https://github.com/pytorch/pytorch/pull/87813 Approved by: https://github.com/atalman, https://github.com/kit1980, https://github.com/ZainRizvi --- .circleci/docker/common/install_conda.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index 713aad4729110..84f9538ce1248 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -104,9 +104,6 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install magma-cuda$(TMP=${CUDA_VERSION/./};echo ${TMP%.*[0-9]}) -c pytorch fi - # TODO: This isn't working atm - conda_install nnpack -c killeent - # Install some other packages, including those needed for Python test reporting pip_install -r /opt/conda/requirements-ci.txt From 3eab0f406c455a58c87ea40726812e06ea2c62ce Mon Sep 17 00:00:00 2001 From: Will Constable Date: Wed, 26 Oct 2022 04:34:41 +0000 Subject: [PATCH 0204/1922] Enable graph_split_inductor test as it runs now (#87762) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87762 Approved by: https://github.com/davidberard98 --- test/distributed/test_dynamo_distributed.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 36a459b6f00c3..4e8c6ffa981ac 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -141,8 +141,6 @@ def opt_fn(inputs): self.assertTrue(same(correct_outputs, 
opt_outputs)) self.assertEqual(check_splits_compiler.compiler_called, 3) - # hangs/crashes with inductor currently - @unittest.skip("hangs/crashes with inductor currently") @patch.object(config, "optimize_ddp", True) def test_graph_split_inductor(self): """ From a5e255725f1ee102bb3ce900a5fd5bc2d7e96ac2 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Wed, 26 Oct 2022 22:10:10 +0000 Subject: [PATCH 0205/1922] print stderr for ghstack rebase (#87795) current output tends to be empty on failure, which makes it hard to debug Pull Request resolved: https://github.com/pytorch/pytorch/pull/87795 Approved by: https://github.com/huydhn, https://github.com/ZainRizvi --- .github/scripts/tryrebase.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py index 1b69f653e525a..2e8987e9faaa1 100755 --- a/.github/scripts/tryrebase.py +++ b/.github/scripts/tryrebase.py @@ -69,6 +69,7 @@ def rebase_ghstack_onto(pr: GitHubPR, repo: GitRepo, onto_branch: str, dry_run: push_result = ghstack_result.stdout.decode("utf-8") print(push_result) if ghstack_result.returncode != 0: + print(ghstack_result.stderr.decode("utf-8")) raise Exception(f"\n```{push_result}```") # The contents of a successful push result should look like: # Summary of changes (ghstack 0.6.0) From a070f7b0601a4a2fdf6de78ca4d580e15b12bc35 Mon Sep 17 00:00:00 2001 From: Jiewen Tan Date: Wed, 26 Oct 2022 22:41:19 +0000 Subject: [PATCH 0206/1922] [LTC] Remove tensor.storage_ (#87645) Summary: Since LTC now supports functionalization, we don't need to fake a storage to support is_alias_of anymore. Let's remove it. 
Test Plan: ./build/bin/test_lazy --gtest_filter=LazyOpsTest.IsAliasOf Pull Request resolved: https://github.com/pytorch/pytorch/pull/87645 Approved by: https://github.com/JackCaoG, https://github.com/bdhirsh --- torch/csrc/lazy/core/tensor.cpp | 11 ++--------- torch/csrc/lazy/core/tensor.h | 15 --------------- torch/csrc/lazy/core/tensor_impl.h | 9 --------- torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp | 4 +--- 4 files changed, 3 insertions(+), 36 deletions(-) diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp index bf673a72361d3..0a114d0e71179 100644 --- a/torch/csrc/lazy/core/tensor.cpp +++ b/torch/csrc/lazy/core/tensor.cpp @@ -83,12 +83,7 @@ LazyTensor::LazyTensor( const BackendDevice& device) : LazyTensor(std::make_shared(std::move(view), device)) {} -LazyTensor::LazyTensor(std::shared_ptr data) - : data_(std::move(data)), - storage_(c10::Storage( - {}, - 0, - c10::DataPtr(nullptr, backendDeviceToAtenDevice(data_->device)))) {} +LazyTensor::LazyTensor(std::shared_ptr data) : data_(std::move(data)) {} LazyTensor::Data* LazyTensor::data() const { TORCH_CHECK(data_ != nullptr, "Trying to access a null cursor"); @@ -353,9 +348,7 @@ std::shared_ptr LazyTensor::CreateView(ViewInfo view_info) const { } LazyTensorPtr LazyTensor::CreateViewTensor(ViewInfo view_info) const { - auto new_tensor = Create(CreateView(std::move(view_info)), GetDevice()); - new_tensor->storage_ = Storage(); - return new_tensor; + return Create(CreateView(std::move(view_info)), GetDevice()); } at::Tensor LazyTensor::ToTensor(bool detached) { diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h index 12cfdd2827d74..052b84b4a60cc 100644 --- a/torch/csrc/lazy/core/tensor.h +++ b/torch/csrc/lazy/core/tensor.h @@ -143,15 +143,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { // Applies the queue of operations in preparation for using the data. 
void ApplyPendingGraph(); - const c10::Storage& Storage() const { - return storage_; - } - // This is currently only used by outlier view ops such as expand that - // don't go through CreateViewTensor to support Tensor.is_alias_of. - void SetStorage(const c10::Storage& storage) { - storage_ = storage; - } - private: LazyTensor(const at::Tensor& tensor, const BackendDevice& device); LazyTensor(Value ir_value, const BackendDevice& device); @@ -196,12 +187,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { static int64_t GetNextTensorId(); std::shared_ptr data_; - // Temporarily used to suport Tensor.is_alias_of(). - // This is a fake storage that doesn't store anything. - // Instead it serves as a marker to mark LazyTensors that - // points to the same storage, and thus alias of each other. - // FIXME(alanwaketan): Remove this once we have functionalization (bdhirsh). - c10::Storage storage_; }; // Utils to convert at::Tensor to LazyTensor, and vice versa. diff --git a/torch/csrc/lazy/core/tensor_impl.h b/torch/csrc/lazy/core/tensor_impl.h index de1191a3de3e2..710230605cc1f 100644 --- a/torch/csrc/lazy/core/tensor_impl.h +++ b/torch/csrc/lazy/core/tensor_impl.h @@ -49,15 +49,6 @@ class TORCH_API LTCTensorImpl final : public c10::TensorImpl { c10::SymIntArrayRef sym_sizes_custom() const override; c10::SymIntArrayRef sym_strides_custom() const override; -#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY - const at::Storage& storage() const override { - return tensor_->Storage(); - } - bool has_storage() const override { - return tensor_->Storage(); - } -#endif // C10_DISABLE_TENSORIMPL_EXTENSIBILITY - private: void setup_size_properties(); diff --git a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp index 534a9bca130db..3f5882f471f5d 100644 --- a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp +++ b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp @@ -105,14 +105,12 @@ torch::lazy::LazyTensorPtr expand( const 
torch::lazy::LazyTensorPtr& input, std::vector size) { auto input_shape = input->shape(); - auto output = torch::lazy::LazyTensor::Create( + return torch::lazy::LazyTensor::Create( torch::lazy::MakeExpand( input->GetIrValue(), GetExpandDimensions(input_shape.Get(), std::move(size)), /*is_scalar_expand=*/false), input->GetDevice()); - output->SetStorage(input->Storage()); - return output; } void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value) { From 2a26a514299e315a9cde9aa486588697b661dedb Mon Sep 17 00:00:00 2001 From: Driss Guessous Date: Wed, 26 Oct 2022 22:42:39 +0000 Subject: [PATCH 0207/1922] Add logging for nested tensor usage tracking (#87632) # Summary Add logging message so that we can track nested tensor adoption. Pull Request resolved: https://github.com/pytorch/pytorch/pull/87632 Approved by: https://github.com/cpuhrsch --- aten/src/ATen/NestedTensorImpl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 94c9c8d073a94..c0199da124c36 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -172,6 +173,7 @@ NestedTensorImpl::NestedTensorImpl( nested_stride_tensor_(std::move(nested_stride_tensor)), storage_offsets_(std::move(offsets)), opt_sizes_(construct_opt_sizes(nested_size_tensor_)) { + C10_LOG_API_USAGE_ONCE("Using torch.NestedTensor"); TORCH_WARN_ONCE( "The PyTorch API of nested tensors is in prototype stage and will change " "in the near future."); From f3bc3a57ce7650be933b056f48194b5a5a96c6b5 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 26 Oct 2022 23:16:29 +0000 Subject: [PATCH 0208/1922] [BE] Don't build CUDA-10.2 docker images (#87819) As CUDA-10.2 should no longer be used in CI/CD Test Plan: ` grep cuda10.2 .github -R|grep -v mock` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87819 Approved by:
https://github.com/kit1980, https://github.com/ZainRizvi --- .github/workflows/docker-builds.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 572d8146ebe51..dd59d44e8a9d3 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,6 @@ jobs: strategy: matrix: include: - - docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9 - docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 @@ -42,7 +41,6 @@ jobs: - docker-image-name: pytorch-linux-focal-rocm5.2-py3.8 - docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12 - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - - docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c - docker-image-name: pytorch-linux-xenial-py3-clang5-asan From e9fbb69aa25bc532ac4e0066a076d0b47c155d5f Mon Sep 17 00:00:00 2001 From: Nikita Karetnikov Date: Wed, 26 Oct 2022 07:36:02 +0200 Subject: [PATCH 0209/1922] [primTorch] Check `error_regex` in `test_python_ref_errors` (#86987) cc @ezyang @mruberry @ngimel @Lezcano @fdrocha Pull Request resolved: https://github.com/pytorch/pytorch/pull/86987 Approved by: https://github.com/lezcano, https://github.com/mruberry --- test/test_ops.py | 3 +- .../_internal/common_methods_invocations.py | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 0e5b6f1d607dd..1d20151c20e89 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -477,8 +477,7 @@ def _to_tensormeta(x): for ei in error_inputs: si = ei.sample_input meta_sample = si.transform(_to_tensormeta) - # TODO: 
match strings - with self.assertRaisesRegex(ei.error_type, ""): + with self.assertRaisesRegex(ei.error_type, ei.error_regex): op(meta_sample.input, *meta_sample.args, **meta_sample.kwargs) # Tests that the function produces the same result when called with diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 94c12f5bc93d0..900c0987d2f2c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -16570,6 +16570,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): aliases=('moveaxis',), torch_opinfo_name="movedim", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), PythonRefInfo( "_refs.bucketize", @@ -16765,6 +16768,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ElementwiseUnaryPythonRefInfo( "_refs.neg", torch_opinfo_name="neg", + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), ElementwiseUnaryPythonRefInfo( "_refs.positive", @@ -16977,10 +16983,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1): PythonRefInfo( "_refs.nn.functional.poisson_nll_loss", torch_opinfo_name="nn.functional.poisson_nll_loss", + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), ElementwiseUnaryPythonRefInfo( "_refs.nn.functional.prelu", torch_opinfo_name="nn.functional.prelu", + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), ElementwiseUnaryPythonRefInfo( "_refs.nn.functional.relu", @@ -17699,6 +17711,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "_refs.dsplit", torch_opinfo_name="dsplit", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), PythonRefInfo( "_refs.diag", @@ -17724,6 +17739,9 @@ def 
reference_flatten(input, start_dim=0, end_dim=-1): "_refs.dstack", torch_opinfo_name="dstack", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), PythonRefInfo( "_refs.expand", @@ -17808,6 +17826,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "_refs.reshape", torch_opinfo_name="reshape", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), PythonRefInfo( "_refs.reshape_as", @@ -17856,6 +17877,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "_refs.vsplit", torch_opinfo_name="vsplit", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), PythonRefInfo( "_refs.transpose", @@ -17889,6 +17913,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "_refs.view", torch_opinfo_name="view", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), PythonRefInfo( "_refs.view_as", @@ -17913,6 +17940,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "_refs.unbind", torch_opinfo_name="unbind", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), # # Reduction Reference OpInfos @@ -17924,10 +17954,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ReductionPythonRefInfo( "_refs.amax", torch_opinfo_name="amax", + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), ReductionPythonRefInfo( "_refs.amin", torch_opinfo_name="amin", + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), ReductionPythonRefInfo( "_refs.any", @@ -17937,6 +17973,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "_refs.mean", torch_opinfo_name="mean", supports_out=True, + skips=( + 
DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), ReductionPythonRefInfo( "_refs.std", @@ -18215,12 +18254,18 @@ def reference_flatten(input, start_dim=0, end_dim=-1): "_refs.masked_fill", torch_opinfo_name="masked_fill", supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'), + ), ), PythonRefInfo( "_refs.where", torch_opinfo_name="where", op=lambda self, condition, other: refs.where(condition, self, other), supports_nvfuser=False, + skips=( + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors', device_type='cuda'), + ), ), PythonRefInfo( "_refs.index_select", From ef1b1733e94adcf486041c38627eb43f9f6fb019 Mon Sep 17 00:00:00 2001 From: wchen61 <183351030@qq.com> Date: Wed, 26 Oct 2022 23:44:13 +0000 Subject: [PATCH 0210/1922] Synchronize before change cuda stream (#82050) (#82056) Summary: Fixes https://github.com/pytorch/pytorch/issues/82050 Need synchronize before change cuda stream ### Description ### Issue ### Testing Pull Request resolved: https://github.com/pytorch/pytorch/pull/82056 Approved by: https://github.com/ngimel --- torch/testing/_internal/common_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 5da1ffefaba91..2f85b8af1d81f 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1430,6 +1430,7 @@ def __enter__(self): for d in range(torch.cuda.device_count()): self.beforeStreams.append(torch.cuda.current_stream(d)) deviceStream = torch.cuda.Stream(device=d) + self.beforeStreams[-1].synchronize() torch._C._cuda_setStream(deviceStream._cdata) torch._C._cuda_setDevice(beforeDevice) From ee9ad1320491aca3d826e7b06eeeae04cd9c5636 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 27 Oct 2022 00:01:10 +0000 Subject: [PATCH 0211/1922] Fix typos under .github directory (#87828) This PR 
fixes typos in `.md` files under .github directory Pull Request resolved: https://github.com/pytorch/pytorch/pull/87828 Approved by: https://github.com/clee2000 --- .github/requirements/README.md | 2 +- .github/scripts/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/requirements/README.md b/.github/requirements/README.md index 654bb04558b9b..9093b92c62d29 100644 --- a/.github/requirements/README.md +++ b/.github/requirements/README.md @@ -4,7 +4,7 @@ At the moment, the installation of conda and pip dependencies happens at different places in the CI depending at the whim of different developers, which makes it very challenging to handle issues like network flakiness or upstream dependency failures gracefully. So, this -center directory is created to gradually include all the conda enviroment +center directory is created to gradually include all the conda environment and pip requirement files that are used to setup CI jobs. Not only it gives a clear picture of all the dependencies required by different CI jobs, but it also allows them to be cached properly to improve CI diff --git a/.github/scripts/README.md b/.github/scripts/README.md index 22099c3732ea5..73bec509c2c41 100644 --- a/.github/scripts/README.md +++ b/.github/scripts/README.md @@ -36,7 +36,7 @@ New generated binary workflows can be added in the `.github/scripts/generate_ci_ examples from that script in order to add the workflow to the stream that is relevant to what you particularly care about. -Different parameters can be used to acheive different goals, i.e. running jobs on a cron, running only on trunk, etc. +Different parameters can be used to achieve different goals, i.e. running jobs on a cron, running only on trunk, etc. 
#### ciflow (trunk) From f10e85b2dcb7982c37d957e82c85961e5ff0e3c0 Mon Sep 17 00:00:00 2001 From: Valentin Andrei Date: Thu, 27 Oct 2022 00:18:16 +0000 Subject: [PATCH 0212/1922] [pytorch] Layer norm backward speed gain with warp shuffles (#87814) Summary: Improved native layer norm backward performance. Rewrote `GammaBetaBackwardCUDAKernel` to use shared memory only for the reduction step, but not for loading `mean` and `rstd`. The previous implementation used only `threadIdx.x = 0` to load `mean` and `rstd` into shared memory, and then all threads would access the values in order to do loop unrolling. This approach increased register usage and decreased occupancy, without much benefit from using shared memory (this is because the values were already cached in L1). The new implementation is simpler and register usage is smaller, thus occupancy is better. Added another implementation called `GammaBetaBackwardCUDAKernel_32x32` which is only for shapes dividing exactly to a (32 x 32) block. This permits using warp shuffles for speeding up loading `mean` and `rstd` as well as for the final reduction stage. The effective bandwidth of this implementation is equal to STREAM Triad. Observed that we can get additional benefit if we lower the threshold for calling `GammaBetaBackwardSimpleCUDAKernel` (simple col-wise reduction implementation) from `512` to `128`. Test Plan: Wrote a simple CUDA app that calls the previous implementation of `GammaBetaBackwardCUDAKernel` and the current one, using FP32 values and compares the results. The epsilon value we used for FP comparison is 0.00001 for the weight and 0.0001 for the bias. Ran the benchmark for various sizes on an A100 GPU and got the results below. Almost all sizes show good speedup. ``` Size (32, 32); Mismatches: dg = 0 db = 0 out of 32.
reference = 0.0107 (ms); optimized = 0.0107 (ms); bw_opt = 1.50 GB/s; speedup = 0.22% Size (256, 128); Mismatches: dg = 0 db = 0 out of 128. reference = 0.0323 (ms); optimized = 0.0075 (ms); bw_opt = 32.89 GB/s; speedup = 330.16% Size (512, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0103 (ms); optimized = 0.0089 (ms); bw_opt = 440.54 GB/s; speedup = 15.82% Size (1024, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0197 (ms); optimized = 0.0136 (ms); bw_opt = 1151.44 GB/s; speedup = 44.91% Size (2048, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0416 (ms); optimized = 0.0283 (ms); bw_opt = 1105.31 GB/s; speedup = 47.01% Size (4096, 16384); Mismatches: dg = 0 db = 0 out of 16384. reference = 0.4420 (ms); optimized = 0.3915 (ms); bw_opt = 1277.58 GB/s; speedup = 12.90% Size (70000, 64); Mismatches: dg = 0 db = 0 out of 64. reference = 0.5908 (ms); optimized = 0.6850 (ms); bw_opt = 49.49 GB/s; speedup = -13.75% Size (131072, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 1.1961 (ms); optimized = 0.9234 (ms); bw_opt = 542.54 GB/s; speedup = 29.53% Size (1000, 520); Mismatches: dg = 0 db = 0 out of 520. reference = 0.0132 (ms); optimized = 0.0113 (ms); bw_opt = 343.83 GB/s; speedup = 16.88% Size (4005, 4005); Mismatches: dg = 0 db = 0 out of 4005. reference = 0.1441 (ms); optimized = 0.1054 (ms); bw_opt = 1134.36 GB/s; speedup = 36.71% Size (10000, 1000); Mismatches: dg = 0 db = 0 out of 1000. reference = 0.1293 (ms); optimized = 0.1248 (ms); bw_opt = 597.71 GB/s; speedup = 3.63% Size (1024, 10000); Mismatches: dg = 0 db = 0 out of 10000. reference = 0.0738 (ms); optimized = 0.0735 (ms); bw_opt = 1039.40 GB/s; speedup = 0.45% Size (8192, 4096); Mismatches: dg = 0 db = 0 out of 4096. reference = 0.2673 (ms); optimized = 0.2223 (ms); bw_opt = 1125.01 GB/s; speedup = 20.25% Size (10000, 10000); Mismatches: dg = 0 db = 0 out of 10000. 
reference = 0.7331 (ms); optimized = 0.8940 (ms); bw_opt = 833.54 GB/s; speedup = -18.00% Size (3072, 10000); Mismatches: dg = 0 db = 0 out of 10000. reference = 0.2087 (ms); optimized = 0.2364 (ms); bw_opt = 968.64 GB/s; speedup = -11.71% Size (6144, 10000); Mismatches: dg = 0 db = 0 out of 10000. reference = 0.4197 (ms); optimized = 0.5118 (ms); bw_opt = 894.63 GB/s; speedup = -18.00% Size (1024, 20000); Mismatches: dg = 0 db = 0 out of 20000. reference = 0.1480 (ms); optimized = 0.1297 (ms); bw_opt = 1177.68 GB/s; speedup = 14.12% Size (1024, 20000); Mismatches: dg = 0 db = 0 out of 20000. reference = 0.1483 (ms); optimized = 0.1278 (ms); bw_opt = 1195.26 GB/s; speedup = 16.04% Size (512, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0104 (ms); optimized = 0.0091 (ms); bw_opt = 646.72 GB/s; speedup = 14.44% Size (512, 6144); Mismatches: dg = 0 db = 0 out of 6144. reference = 0.0219 (ms); optimized = 0.0156 (ms); bw_opt = 1506.30 GB/s; speedup = 40.52% Size (512, 10240); Mismatches: dg = 0 db = 0 out of 10240. reference = 0.0424 (ms); optimized = 0.0370 (ms); bw_opt = 1057.84 GB/s; speedup = 14.63% Size (1000, 1000); Mismatches: dg = 0 db = 0 out of 1000. reference = 0.0139 (ms); optimized = 0.0119 (ms); bw_opt = 627.51 GB/s; speedup = 16.83% Size (2000, 2000); Mismatches: dg = 0 db = 0 out of 2000. reference = 0.0421 (ms); optimized = 0.0412 (ms); bw_opt = 724.10 GB/s; speedup = 2.20% Size (10240, 10240); Mismatches: dg = 0 db = 0 out of 10240. reference = 0.7210 (ms); optimized = 0.6098 (ms); bw_opt = 1281.40 GB/s; speedup = 18.24% Size (384, 128); Mismatches: dg = 0 db = 0 out of 128. reference = 0.0449 (ms); optimized = 0.0089 (ms); bw_opt = 41.50 GB/s; speedup = 403.48% Size (2048, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0208 (ms); optimized = 0.0169 (ms); bw_opt = 925.70 GB/s; speedup = 23.13% Size (267, 513); Mismatches: dg = 0 db = 0 out of 513. 
reference = 0.0342 (ms); optimized = 0.0090 (ms); bw_opt = 114.18 GB/s; speedup = 280.64% Size (67, 123479); Mismatches: dg = 0 db = 0 out of 123479. reference = 0.0562 (ms); optimized = 0.0552 (ms); bw_opt = 1133.46 GB/s; speedup = 1.81% Size (1024, 123479); Mismatches: dg = 0 db = 0 out of 123479. reference = 0.8573 (ms); optimized = 0.9245 (ms); bw_opt = 1020.02 GB/s; speedup = -7.27% Size (2048, 66679); Mismatches: dg = 0 db = 0 out of 66679. reference = 0.8778 (ms); optimized = 0.8590 (ms); bw_opt = 1185.05 GB/s; speedup = 2.19% Size (200, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0215 (ms); optimized = 0.0066 (ms); bw_opt = 58.49 GB/s; speedup = 226.81% Size (1000, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0109 (ms); optimized = 0.0092 (ms); bw_opt = 208.27 GB/s; speedup = 18.65% Size (6000, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0394 (ms); optimized = 0.0301 (ms); bw_opt = 381.90 GB/s; speedup = 30.98% Size (6272, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0403 (ms); optimized = 0.0300 (ms); bw_opt = 400.48 GB/s; speedup = 34.34% Size (200, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0218 (ms); optimized = 0.0066 (ms); bw_opt = 116.33 GB/s; speedup = 229.96% Size (1000, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0110 (ms); optimized = 0.0094 (ms); bw_opt = 407.29 GB/s; speedup = 17.26% Size (6000, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0535 (ms); optimized = 0.0594 (ms); bw_opt = 386.05 GB/s; speedup = -9.95% Size (6272, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0573 (ms); optimized = 0.0387 (ms); bw_opt = 619.62 GB/s; speedup = 48.06% Size (200, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0221 (ms); optimized = 0.0069 (ms); bw_opt = 222.78 GB/s; speedup = 220.76% Size (1000, 1024); Mismatches: dg = 0 db = 0 out of 1024. 
reference = 0.0113 (ms); optimized = 0.0097 (ms); bw_opt = 787.79 GB/s; speedup = 16.46% Size (6000, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0723 (ms); optimized = 0.0715 (ms); bw_opt = 640.95 GB/s; speedup = 1.10% Size (6272, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0751 (ms); optimized = 0.0572 (ms); bw_opt = 837.57 GB/s; speedup = 31.30% Size (200, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0232 (ms); optimized = 0.0071 (ms); bw_opt = 323.97 GB/s; speedup = 226.51% Size (1000, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0125 (ms); optimized = 0.0114 (ms); bw_opt = 1005.84 GB/s; speedup = 9.62% Size (6000, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0807 (ms); optimized = 0.0830 (ms); bw_opt = 828.02 GB/s; speedup = -2.76% Size (6272, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0836 (ms); optimized = 0.0695 (ms); bw_opt = 1033.62 GB/s; speedup = 20.27% Size (200, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0224 (ms); optimized = 0.0075 (ms); bw_opt = 408.58 GB/s; speedup = 198.10% Size (1000, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0165 (ms); optimized = 0.0135 (ms); bw_opt = 1132.42 GB/s; speedup = 22.26% Size (6000, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0993 (ms); optimized = 0.0989 (ms); bw_opt = 926.35 GB/s; speedup = 0.41% Size (6272, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.1033 (ms); optimized = 0.0826 (ms); bw_opt = 1159.55 GB/s; speedup = 25.09% Size (200, 3072); Mismatches: dg = 0 db = 0 out of 3072. reference = 0.0230 (ms); optimized = 0.0076 (ms); bw_opt = 605.09 GB/s; speedup = 202.51% Size (1000, 3072); Mismatches: dg = 0 db = 0 out of 3072. reference = 0.0207 (ms); optimized = 0.0213 (ms); bw_opt = 1076.45 GB/s; speedup = -2.69% Size (6000, 3072); Mismatches: dg = 0 db = 0 out of 3072. 
reference = 0.1198 (ms); optimized = 0.1274 (ms); bw_opt = 1078.58 GB/s; speedup = -5.95% Size (6272, 3072); Mismatches: dg = 0 db = 0 out of 3072. reference = 0.1293 (ms); optimized = 0.1189 (ms); bw_opt = 1207.95 GB/s; speedup = 8.76% Average speedup = 52.88% ``` For additional numerical validation used the following script: ``` def run_model_on_device(fs, X, gO, device_string, numeric_type): ln = torch.nn.LayerNorm((fs,), device=device_string, dtype=numeric_type) ln.reset_parameters() X.grad = None ln.zero_grad(set_to_none=True) out = ln(X) out.backward(gO) return (ln.weight.grad, ln.bias.grad) def run_correctness_test(eps_weight, eps_bias): dtype = torch.float for fs in (512, 1024, 2048, 4096, 8192, 10000, 500, 1000, 2001, 4005, 8117): for bs in (512, 1024, 2048, 4096, 525, 1033, 2064, 3000): mean_adjustment = torch.randn(fs, device="cpu", dtype=torch.float) X = mean_adjustment * torch.randn( bs, fs, device="cpu", dtype=torch.float, requires_grad=True ) X = X.detach().requires_grad_() gO = torch.rand_like(X) X_gpu = X.to("cuda") X_gpu = X_gpu.detach().requires_grad_() gO_gpu = gO.to("cuda") gO_gpu = gO_gpu.detach().requires_grad_() grad_cpu_ref = run_model_on_device(fs, X, gO, "cpu", dtype) grad_gpu = run_model_on_device(fs, X_gpu, gO_gpu, "cuda", dtype) weight_grad_gpu_target = grad_gpu[0].detach().to("cpu") bias_grad_gpu_target = grad_gpu[1].detach().to("cpu") weight_delta = torch.abs(grad_cpu_ref[0] - weight_grad_gpu_target) weight_mismatches = (weight_delta >= eps_weight).nonzero() weight_mismatch_pct = len(weight_mismatches) / len(weight_delta) * 100 bias_delta = torch.abs(grad_cpu_ref[1] - bias_grad_gpu_target) bias_mismatches = (bias_delta >= eps_bias).nonzero() bias_mismatch_pct = len(bias_mismatches) / len(bias_delta) * 100 print( "Size ({} x {}) mismatch percentage: weight {:3.2f} bias {:3.2f}".format( fs, bs, weight_mismatch_pct, bias_mismatch_pct ) ) ``` `NVFuserTest.FusionMagicSchedulerLayerNormBackward_CUDA` test also does additional numerical 
validation and it passes. Differential Revision: D40730981 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87814 Approved by: https://github.com/weiwangmeta --- .../src/ATen/native/cuda/layer_norm_kernel.cu | 242 ++++++++++++++---- 1 file changed, 188 insertions(+), 54 deletions(-) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index ae09f0aaad8f8..fa70f075d4fa7 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -33,6 +33,7 @@ namespace { constexpr int kCUDANumThreads = 256; constexpr int kColwiseReduceTileSize = 32; +constexpr unsigned int kWarpSize = 32; constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh) @@ -555,8 +556,108 @@ __global__ void GammaBetaBackwardCUDAKernel1( } } +template +__global__ void GammaBetaBackwardCUDAKernel_32x32( + int64_t M, + int64_t N, + const T* dY, + const T* X, + const T_ACC* mean, + const T_ACC* rstd, + T* dg, + T* db) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T_ACC* s_data_typed = reinterpret_cast(&s_data1); + T_ACC* s_dg; + T_ACC* s_db; + T_ACC dg_sum = 0; + T_ACC db_sum = 0; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + + if (j < N) { + constexpr int unroll_factor = 8; + int laneId = threadIdx.x & 0x1f; + + T_ACC mean_reg, mean_reg_tmp; + T_ACC rstd_reg, rstd_reg_tmp; + T dY_reg; + T X_reg; + + // Main loop + int bcounter; + for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); + bcounter++) { + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor; + + if (laneId < unroll_factor) { + mean_reg_tmp = mean[offset + laneId]; + rstd_reg_tmp = rstd[offset + laneId]; + } +#if !defined(USE_ROCM) + // Volta and newer architectures allow lane divergence within a 
warp. + __syncwarp(); +#endif + + #pragma unroll + for (int ii = 0; ii < unroll_factor; ++ii) { + dY_reg = dY[(offset + ii) * N + j]; + X_reg = X[(offset + ii) * N + j]; + mean_reg = WARP_SHFL(mean_reg_tmp, ii, kWarpSize); + rstd_reg = WARP_SHFL(rstd_reg_tmp, ii, kWarpSize); + dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg; + db_sum += dY_reg; + } + } + + // Remainder loop + int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor; + for (int ii = 0; ii < unroll_factor; ii++) { + if ((offset + ii) < M) { + mean_reg = mean[offset + ii]; + rstd_reg = rstd[offset + ii]; + dY_reg = dY[(offset + ii) * N + j]; + X_reg = X[(offset + ii) * N + j]; + dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg; + db_sum += dY_reg; + } + } + + // This kernel uses a block of (32 x 32) and gets called when M; N + // divide by 32. We can use warp shuffles for the final reduction + // step. This removes 4 shmem loads and stores with their + // corresponding __syncthreads() + + // This greatly reduces bank conflicts at the expense of a little + // extra shared memory. 
It does not impact occupancy + int padded_bx = (1 + blockDim.x); + + s_dg = s_data_typed; + s_db = s_data_typed + (padded_bx * blockDim.y); + s_dg[threadIdx.y * padded_bx + threadIdx.x] = dg_sum; + s_db[threadIdx.y * padded_bx + threadIdx.x] = db_sum; + __syncthreads(); + + // Load transposed so that a warp holds an entire column + T_ACC reg_dg = s_dg[threadIdx.x * padded_bx + threadIdx.y]; + T_ACC reg_db = s_db[threadIdx.x * padded_bx + threadIdx.y]; + for (int delta = 16; delta >= 1; delta /= 2) { + reg_dg += WARP_SHFL_XOR(reg_dg, delta, kWarpSize); + reg_db += WARP_SHFL_XOR(reg_db, delta, kWarpSize); + } + + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (dg) { + dg[j] = reg_dg; + } + if (db) { + db[j] = reg_db; + } + } + } +} template __global__ void GammaBetaBackwardCUDAKernel( @@ -569,66 +670,75 @@ __global__ void GammaBetaBackwardCUDAKernel( T* dg, T* db) { alignas(sizeof(double)) extern __shared__ char s_data1[]; - T_ACC * s_data_typed = reinterpret_cast(&s_data1); + T_ACC* s_data_typed = reinterpret_cast(&s_data1); + T_ACC* s_dg; + T_ACC* s_db; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; - constexpr int unroll = 8; - T dYs[unroll]; - T Xs[unroll]; - T_ACC * means = s_data_typed; - T_ACC * rstds = s_data_typed + unroll * blockDim.y; + T_ACC dg_sum = 0; T_ACC db_sum = 0; + if (j < N) { + constexpr int unroll_factor = 8; + + T_ACC mean_reg; + T_ACC rstd_reg; + T dY_reg; + T X_reg; + + // Main Loop int bcounter; - for (bcounter = 0; bcounter < M/(blockDim.y * unroll); bcounter++){ - int offset = (bcounter * blockDim.y + threadIdx.y) * unroll; - #pragma unroll - for (int ii=0; ii=1; offset /= 2){ + + for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) { if (threadIdx.y < offset) { - s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; - s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] += - 
s_data_typed[blockDim.x * blockDim.y + (threadIdx.y + offset) * blockDim.x + threadIdx.x]; - } + s_dg[threadIdx.y * blockDim.x + threadIdx.x] += + s_dg[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; + s_db[threadIdx.y * blockDim.x + threadIdx.x] += + s_db[(threadIdx.y + offset) * blockDim.x + threadIdx.x]; + } __syncthreads(); } + if (threadIdx.y == 0) { if (dg) { - dg[j] = s_data_typed[threadIdx.x]; + dg[j] = s_dg[threadIdx.x]; } if (db) { - db[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y]; + db[j] = s_db[threadIdx.x]; } } } @@ -763,7 +873,8 @@ void LayerNormBackwardKernelImplInternal( T* dgamma_data = dgamma->defined() ? dgamma->template data_ptr() : nullptr; T* dbeta_data = dbeta->defined() ? dbeta->template data_ptr() : nullptr; - if (M < 512) { + + if (M < 128) { // For small batch size, do colwise reduce directly. const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; GammaBetaBackwardSimpleCUDAKernel @@ -778,19 +889,42 @@ void LayerNormBackwardKernelImplInternal( dbeta_data); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - dim3 threads{16, 32}; - int blocks = (N + threads.x-1)/threads.x; - GammaBetaBackwardCUDAKernel - <<>>( - M, - N, - dY_data, - X_data, - mean_data, - rstd_data, - dgamma_data, - dbeta_data); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + if ((M % kWarpSize == 0) && (N % kWarpSize == 0)) { + // This implementation relies on warp primitives and requires that M and N divide + // exactly to warp size. + dim3 threads{kWarpSize, kWarpSize}; + int blocks = (N + threads.x - 1) / threads.x; + + // If M and N divide by 32, we can use warp shuffles for the final reduction. That requires + // transposing values in shared memory, so we apply a padding to reduce bank conflicts. 
+ size_t shmem_sz = 2 * sizeof(T_ACC) * (threads.x + 1) * threads.y; + GammaBetaBackwardCUDAKernel_32x32 + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + dim3 threads{16, 32}; + int blocks = (N + threads.x - 1) / threads.x; + size_t shmem_sz = 2 * sizeof(T_ACC) * threads.x * threads.y; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } } } } From 650b96d2873dcb0145eb994445716be32f5f99bd Mon Sep 17 00:00:00 2001 From: Charlie Yan Date: Wed, 26 Oct 2022 19:37:52 +0000 Subject: [PATCH 0213/1922] Enable mypy check for distributed.py, and fix type errors (#87543) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87543 Approved by: https://github.com/fduwjj --- torch/_C/_distributed_c10d.pyi | 68 ++++++++++++++++++++++--------- torch/nn/parallel/distributed.py | 25 ++++++++---- torch/nn/parallel/distributed.pyi | 21 ---------- 3 files changed, 65 insertions(+), 49 deletions(-) delete mode 100644 torch/nn/parallel/distributed.pyi diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index bdf0166b8daa9..493e1d8846e71 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -1,8 +1,9 @@ from datetime import timedelta from enum import Enum -from typing import Optional, List, Any, Tuple, overload, Union +from typing import Any, Dict, List, Optional, Tuple, Union, overload from torch import Tensor +from torch.futures import Future # This module is defined in torch/csrc/distributed/c10d/init.cpp @@ -32,13 +33,36 @@ class Reducer: self, params: List[Tensor], bucket_indices: List[List[int]], + per_bucket_size_limits: List[int], process_group: ProcessGroup, - expect_sparse_gradients: List[bool], - bucket_bytes_cap: int, - find_unused_parameters: bool, - gradient_as_bucket_view: bool, + expect_sparse_gradients: 
List[bool] = [], + bucket_bytes_cap: int = ..., # kDefaultBucketBytesCap in reducer.hpp + find_unused_parameters: bool = False, + gradient_as_bucket_view: bool = False, + param_to_name_mapping: Dict[int, str] = {}, + first_bucket_types_cap: int = ..., # kDefaultFirstBucketBytes in reducer.hpp ): ... - ... + def prepare_for_forward(self) -> None: ... + def prepare_for_backward(self, output: List[Tensor]) -> None: ... + def get_backward_stats(self) -> List[int]: ... + def _install_post_backward_futures(self, futures: List[Future]) -> None: ... + def _rebuild_buckets(self) -> bool: ... + def _get_zeros_like_grad_buckets(self) -> List[GradBucket]: ... + def _push_all_rebuilt_params(self) -> None: ... + def _set_forward_pass_work_handle( + self, work: Work, use_static_world_size: bool + ): ... + def _get_local_used_map(self) -> Tensor: ... + def _set_ddp_runtime_logging_sample_rate( + self, sample_rate: int + ) -> None: ... + def _set_static_graph(self) -> None: ... + def _run_comm_hook(self, bucket: GradBucket) -> Future: ... + def set_logger(self, logger: Logger) -> None: ... + +class DDPLoggingData: + strs_map: Dict[str, str] + ints_map: Dict[str, int] class Logger: def __init__(self, reducer: Reducer): ... @@ -49,8 +73,14 @@ class Logger: output_device: int, broadcast_buffers: bool, has_sync_bn: bool, + static_graph: bool, ): ... - ... + def set_runtime_stats_and_log(self) -> None: ... + def set_error_and_log(self, error: str) -> None: ... + def _get_ddp_logging_data(self) -> DDPLoggingData: ... + def _set_comm_hook_name(self, comm_hook: str) -> None: ... + def _set_uneven_input_join(self) -> None: ... + def _set_static_graph(self) -> None: ... def get_debug_level(): ... def set_debug_level(): ... @@ -118,7 +148,9 @@ class Store: def set(self, key: str, value: str): ... def get(self, key: str) -> bytes: ... def add(self, key: str, value: int) -> int: ... - def compare_set(self, key: str, expected_value: str, desired_value: str) -> bytes: ... 
+ def compare_set( + self, key: str, expected_value: str, desired_value: str + ) -> bytes: ... def delete_key(self, key: str) -> bool: ... def num_keys(self) -> int: ... def set_timeout(self, timeout: timedelta): ... @@ -142,7 +174,7 @@ class TCPStore(Store): is_master: bool = ..., timeout: timedelta = ..., wait_for_workers: bool = ..., - multi_tenant: bool = ... + multi_tenant: bool = ..., ): ... @property def host(self) -> str: ... @@ -167,6 +199,7 @@ class Work: class ProcessGroup: class Options: ... + def __init__(self): ... def rank(self) -> int: ... def size(self) -> int: ... @@ -235,7 +268,7 @@ class ProcessGroup: self, output: Tensor, input: Tensor, - opts = AllGatherOptions(), + opts=AllGatherOptions(), ) -> Work: ... def allgather_coalesced( self, @@ -343,6 +376,7 @@ def _round_robin_process_groups( class ProcessGroupGloo(ProcessGroup): class Device: ... class Options: ... + def __init__( self, store: Store, @@ -358,16 +392,12 @@ class ProcessGroupGloo(ProcessGroup): ... class _ProcessGroupWrapper(ProcessGroup): - def __init__( - self, - pg: ProcessGroup, - gloo_pg: ProcessGroupGloo - ): ... + def __init__(self, pg: ProcessGroup, gloo_pg: ProcessGroupGloo): ... wrapped_pg: ProcessGroup - class ProcessGroupNCCL(ProcessGroup): class Options: ... + def __init__( self, store: Store, @@ -402,9 +432,9 @@ class ProcessGroupMPI(ProcessGroup): def _compute_bucket_assignment_by_size( tensors: List[Tensor], - bucket_size: int, - expect_sparse_gradient: List[bool], - tensor_indices: List[int], + bucket_size_limits: List[int], + expect_sparse_gradient: List[bool] = [], + tensor_indices: List[int] = [], ) -> Tuple[List[List[int]], List[int]]: ... 
def _broadcast_coalesced( process_group: ProcessGroup, diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 23625d9d20014..514b89aad28d6 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -1,7 +1,7 @@ import sys import copy from dataclasses import dataclass -from typing import Callable, Any, Type +from typing import Any, Callable, Optional, Type from enum import Enum, auto import inspect import itertools @@ -37,7 +37,7 @@ from ..modules import Module from ._replicated_tensor_ddp_utils import _ddp_with_replicated_tensor_enabled -from .scatter_gather import gather, is_namedtuple, scatter_kwargs # noqa: F401 +from .scatter_gather import gather, scatter_kwargs # noqa: F401 __all__ = ["DistributedDataParallel"] @@ -194,6 +194,7 @@ def __init__(self, ddp, divide_by_initial_world_size): "DDP join hook requires passing in a DistributedDataParallel " "instance as the state" ) + assert ddp.logger is not None ddp.logger._set_uneven_input_join() self.ddp = ddp self.ddp._divide_by_initial_world_size = divide_by_initial_world_size @@ -555,7 +556,7 @@ def __init__( super(DistributedDataParallel, self).__init__() Joinable.__init__(self) - self.logger = None + self.logger: Optional[dist.Logger] = None if not any((p.requires_grad for p in module.parameters())): self._log_and_throw( RuntimeError, @@ -836,6 +837,7 @@ def __setstate__(self, state): ) if self.static_graph: self.reducer._set_static_graph() + assert self.logger is not None self.logger._set_static_graph() def _build_params_for_reducer(self): @@ -863,7 +865,7 @@ def _build_params_for_reducer(self): # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed. (m, p) for m, p in modules_and_parameters - if p not in memo and not memo.add(p) + if p not in memo and not memo.add(p) # type: ignore[func-returns-value] ] # Build list of parameters. 
@@ -1044,7 +1046,7 @@ def _run_ddp_forward(self, *inputs, **kwargs): self.use_side_stream_for_tensor_copies, ) with self._inside_ddp_forward(): - return module_to_run(*inputs[0], **kwargs[0]) + return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index] else: with self._inside_ddp_forward(): return module_to_run(*inputs, **kwargs) @@ -1054,6 +1056,7 @@ def forward(self, *inputs, **kwargs): "DistributedDataParallel.forward" ): if torch.is_grad_enabled() and self.require_backward_grad_sync: + assert self.logger is not None self.logger.set_runtime_stats_and_log() self.num_iterations += 1 self.reducer.prepare_for_forward() @@ -1063,7 +1066,7 @@ def forward(self, *inputs, **kwargs): work = Join.notify_join_context(self) if work: self.reducer._set_forward_pass_work_handle( - work, self._divide_by_initial_world_size + work, self._divide_by_initial_world_size # type: ignore[arg-type] ) # Calling _rebuild_buckets before forward compuation, @@ -1171,7 +1174,7 @@ def gather(self, outputs, output_device): def train(self, mode=True): super(DistributedDataParallel, self).train(mode) if self._use_replicated_tensor_module: - self._replicated_tensor_module.train(mode) + self._replicated_tensor_module.train(mode) # type: ignore[union-attr] return self # When running in join mode, schedules an allreduce to notify joined ranks @@ -1392,7 +1395,7 @@ def join_process_group(self): def _register_buffer_comm_hook( self, state, - hook: callable, + hook: Callable, comm_hook_location=_BufferCommHookLocation.POST_FORWARD, ): r""" @@ -1438,7 +1441,7 @@ def _register_buffer_comm_hook( buffer_comm_hook_location=comm_hook_location, ) - def register_comm_hook(self, state: object, hook: callable): + def register_comm_hook(self, state: object, hook: Callable): r""" Registers a communication hook which is an enhancement that provides a flexible hook to users where they can specify how DDP aggregates gradients @@ -1518,6 +1521,7 @@ def register_comm_hook(self, state: object, hook: callable): 
>>> ddp.register_comm_hook(state=None, hook=encode_and_decode) """ self._check_comm_hook(hook) + assert self.logger is not None self.logger._set_comm_hook_name(hook.__qualname__) dist._register_comm_hook(self.reducer, state, hook) @@ -1544,6 +1548,7 @@ def _register_builtin_comm_hook(self, comm_hook_type): >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS) """ + assert self.logger is not None self.logger._set_comm_hook_name(str(comm_hook_type)) dist._register_builtin_comm_hook(self.reducer, comm_hook_type) @@ -1808,6 +1813,7 @@ def _get_ddp_logging_data(self): these metrics are. This is a prototype interface and subject to change in the future. """ + assert self.logger is not None ddp_logging_data = self.logger._get_ddp_logging_data() return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map} @@ -1842,6 +1848,7 @@ def _set_static_graph(self): return self.static_graph = True self.reducer._set_static_graph() + assert self.logger is not None self.logger._set_static_graph() if self.find_unused_parameters: warnings.warn( diff --git a/torch/nn/parallel/distributed.pyi b/torch/nn/parallel/distributed.pyi deleted file mode 100644 index a75713afb8282..0000000000000 --- a/torch/nn/parallel/distributed.pyi +++ /dev/null @@ -1,21 +0,0 @@ -from ..modules import Module -from typing import Any, Optional -from .common_types import _devices_t, _device_t - - -class DistributedDataParallel(Module): - process_group: Any = ... - dim: int = ... - module: Module = ... - device_ids: _devices_t = ... - output_device: _device_t = ... - broadcast_buffers: bool = ... - check_reduction: bool = ... - broadcast_bucket_size: float = ... - bucket_bytes_cap: float = ... 
- - # TODO type process_group once `distributed` module is stubbed - def __init__(self, module: Module, device_ids: Optional[_devices_t] = ..., - output_device: Optional[_device_t] = ..., dim: int = ..., - broadcast_buffers: bool = ..., process_group: Optional[Any] = ..., bucket_cap_mb: float = ..., - find_unused_parameters: bool = ..., check_reduction: bool = ...) -> None: ... From 3016f05f5f4ce8302e4df00e2ac0be00ea90d978 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 27 Oct 2022 00:59:40 +0000 Subject: [PATCH 0214/1922] [profiler] Standard performance event names for the profiler (#87538) Summary: The goal is to create a hardware/backend independent event abstraction on which a standard set of tooling can be developed. Test Plan: CI Reviewed By: kimishpatel Differential Revision: D40238034 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87538 Approved by: https://github.com/salilsdesai, https://github.com/kirklandsign --- torch/csrc/profiler/events.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 torch/csrc/profiler/events.h diff --git a/torch/csrc/profiler/events.h b/torch/csrc/profiler/events.h new file mode 100644 index 0000000000000..a1a956f132793 --- /dev/null +++ b/torch/csrc/profiler/events.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include + +namespace torch { +namespace profiler { + +/* A vector type to hold a list of performance counters */ +using perf_counters_t = std::vector; + +/* Standard list of performance events independent of hardware or backend */ +constexpr std::array ProfilerPerfEvents = { + /* + * Number of Processing Element (PE) cycles between two points of interest + * in time. This should correlate positively with wall-time. Measured in + * uint64_t. PE can be non cpu. TBD reporting behavior for multiple PEs + * participating (i.e. threadpool). + */ + "cycles", + + /* Number of PE instructions between two points of interest in time.
This + * should correlate positively with wall time and the amount of computation + * (i.e. work). Across repeat executions, the number of instructions should + * be more or less invariant. Measured in uint64_t. PE can be non cpu. + */ + "instructions"}; +} // namespace profiler +} // namespace torch From 03524ae0d5941f11f07ca24a8bbd8c25b4ac9b0f Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 26 Oct 2022 14:43:41 -0700 Subject: [PATCH 0215/1922] [quant][fx] Add _convert_to_reference_decomposed (#87094) Summary: _convert_to_reference_decomposed is a private convert function in fx graph mode quantization flow to convert a calibrated/trained model to a reference quantized model with decomposed quantized tensor representations. Test Plan: python test/test_quantization.py TestQuantizeFx.test__convert_to_reference_decomposed_fx Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/87094 Approved by: https://github.com/andrewor14 --- test/quantization/fx/test_quantize_fx.py | 26 +++++++++++ torch/ao/quantization/fx/convert.py | 55 ++++++++++++++++------- torch/ao/quantization/fx/utils.py | 57 +++++++++++++++++++----- torch/ao/quantization/quantize_fx.py | 55 +++++++++++++++++++++++ torch/ao/quantization/utils.py | 12 +++++ 5 files changed, 177 insertions(+), 28 deletions(-) diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 2746b1c9a0173..3f39e4bfbbb41 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -18,10 +18,12 @@ prepare_fx, convert_fx, convert_to_reference_fx, + _convert_to_reference_decomposed_fx, prepare_qat_fx, fuse_fx, ) + from torch.ao.quantization.fx.quantization_patterns import DefaultNodeQuantizeHandler from torch.ao.quantization.fx.match_utils import ( @@ -5237,6 +5239,30 @@ def test_get_default_qconfig_valid_backend(self): with self.assertRaisesRegex(AssertionError, "not supported"): qconfig_mapping = 
get_default_qat_qconfig_mapping(invalid_backend) + def test__convert_to_reference_decomposed_fx(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 10) + + def forward(self, x): + return self.linear(x) + + m = M().eval() + qconfig_mapping = get_default_qconfig_mapping("fbgemm") + example_inputs = (torch.randn(1, 5),) + m = prepare_fx(m, qconfig_mapping, example_inputs) + m = _convert_to_reference_decomposed_fx(m) + expected_occurrence = { + ns.call_function(torch.ops.quantized_decomposed.quantize_per_tensor): 2, + ns.call_function(torch.ops.quantized_decomposed.dequantize_per_tensor): 2, + } + self.checkGraphModuleNodes( + m, + expected_node_occurrence=expected_occurrence) + # make sure it runs + m(*example_inputs) + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): def setUp(self): diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py index aa402e882abc8..74eb8f1ca542b 100644 --- a/torch/ao/quantization/fx/convert.py +++ b/torch/ao/quantization/fx/convert.py @@ -69,6 +69,8 @@ PrepareCustomConfig, ) from .lower_to_fbgemm import lower_to_fbgemm +# importing the lib so that the quantized_decomposed ops are registered +from ._decomposed import quantized_decomposed_lib # noqa: F401 # TODO: revisit this list. 
Many helper methods shouldn't be public @@ -485,7 +487,8 @@ def convert( is_standalone_module: bool = False, _remove_qconfig_flag: bool = True, qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None, - backend_config: Union[BackendConfig, Dict[str, Any], None] = None) -> torch.nn.Module: + backend_config: Union[BackendConfig, Dict[str, Any], None] = None, + is_decomposed: bool = False) -> torch.nn.Module: """ We will convert an observed model (a module with observer calls) to a reference quantized model, the rule is simple: @@ -497,13 +500,21 @@ def convert( is stored in observed_node_names, we can decide whether we need to swap the module based on this set - standalone_module means it a submodule that is not inlined in - parent module, and will be quantized separately as one unit. - - Returns a quantized standalone module, whether input/output is quantized is - specified by prepare_custom_config, with - input_quantized_idxs, output_quantized_idxs, please - see docs for prepare_fx for details + Args: + * `is_standalone_module`: when this flag is True, it means we are quantizing + a submodule that is not inlined in parent module, and will be quantized + separately as one unit. 
+ + * `is_decomposed`: a boolean flag to indicate whether we want to use the + quantize operator for decomposed quantized tensor + (torch.ops.quantized_decomposed.quantize_per_tensor) or default/standalone + quantized tensor (torch.quantize_per_tensor) + + Returns: + a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config, with + input_quantized_idxs, output_quantized_idxs, please + see docs for :func:`~torch.ao.quantization.prepare_fx` for details """ if convert_custom_config is None: convert_custom_config = ConvertCustomConfig() @@ -595,7 +606,8 @@ def replace_observer_with_quantize_dequantize_node( node: Node, modules: Dict[str, torch.nn.Module], node_name_to_scope: Dict[str, Tuple[str, type]], - node_name_to_qconfig: Dict[str, QConfigAny]) -> None: + node_name_to_qconfig: Dict[str, QConfigAny], + is_decomposed: bool) -> None: """ Replace activation_post_process module call node with quantize and dequantize node @@ -608,7 +620,7 @@ def replace_observer_with_quantize_dequantize_node( assert isinstance(node.target, str) module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig) observer_module = modules[node.target] - maybe_quantize_node_info = get_quantize_node_info(observer_module) + maybe_quantize_node_info = get_quantize_node_info(observer_module, is_decomposed) # Skip replacing observers to quant/dequant nodes if the qconfigs of all # consumers and producers of this observer are None skip_replacement = all([ @@ -626,7 +638,7 @@ def replace_observer_with_quantize_dequantize_node( # replace observer node with quant - dequant node with graph.inserting_before(node): input_node = node.args[0] - inputs = [input_node] + quantize_op_inputs = [input_node] for key, value in qparams.items(): # TODO: we can add the information of whether a value needs to # be registered as an attribute in qparams dict itself @@ -634,13 +646,22 @@ def replace_observer_with_quantize_dequantize_node( # 
For scale and zero_point values we register them as buffers in the root module. # TODO: maybe need more complex attr name here qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value) - inputs.append(qparam_node) + quantize_op_inputs.append(qparam_node) else: # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph. - inputs.append(value) - - quantized_node = graph.create_node(node_type, quantize_op, tuple(inputs), {}) - dequantized_node = graph.call_method("dequantize", args=(quantized_node,)) + quantize_op_inputs.append(value) + + quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {}) + if is_decomposed: + # use the same qparams from quantize op + dq_inputs = [quantized_node] + quantize_op_inputs[1:] + dequantized_node = graph.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor, + tuple(dq_inputs), + {} + ) + else: + dequantized_node = graph.call_method("dequantize", args=(quantized_node,)) node.replace_all_uses_with(dequantized_node) graph.erase_node(node) @@ -711,7 +732,7 @@ def replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Gra else: replace_observer_with_quantize_dequantize_node( model, model.graph, node, modules, node_name_to_scope, - node_name_to_qconfig) + node_name_to_qconfig, is_decomposed) elif isinstance(mod, DeQuantStub): replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph) elif is_observed_standalone_module(mod): diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index f359bd90f9e61..7b838b64f41a8 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -17,6 +17,7 @@ activation_is_statically_quantized, is_per_tensor, is_per_channel, + to_underlying_dtype, ) from torch.ao.quantization.quantize import is_activation_post_process @@ -27,6 +28,8 @@ Node, ) from .custom_config import PrepareCustomConfig +# importing 
the lib so that the quantized_decomposed ops are registered +from ._decomposed import quantized_decomposed_lib # noqa: F401 from typing import Callable, Optional, List, Dict, Any, Set, Tuple, Union, Type from collections import namedtuple @@ -160,11 +163,22 @@ def get_per_tensor_qparams(activation_post_process): dtype = activation_post_process.dtype return scale, zero_point, dtype -def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[str, Union[Callable, str], Dict[str, Any]]]: - ''' Given an activation_post_process module, - return node_type(e.g. call_function), quantize op(e.g. quantize_per_tensor) and a dictionary - of extracted qparams from the module - ''' +def get_quantize_node_info( + activation_post_process: Callable, + is_decomposed: bool +) -> Optional[Tuple[str, Union[Callable[..., Any], str], Dict[str, Any]]]: + """ Extract information about quantize op from activation_post_process module + Args: + * `activation_post_process`: observer module instance or fake quant module instance + after calibration/QAT + * `is_decomposed`: a boolean flag to indicate whether we want to use the + quantize operator for decomposed quantized tensor (torch.ops.quantized_decomposed.quantize_per_tensor) or default/standalone + quantized tensor (torch.quantize_per_tensor) + + Returns + node_type(e.g. call_function), quantize op(e.g. 
quantize_per_tensor) and a dictionary + of extracted qparams from the module + """ dtype = activation_post_process.dtype # type: ignore[attr-defined] compute_dtype = None if hasattr(activation_post_process, "compute_dtype"): @@ -177,17 +191,36 @@ def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[ if is_per_channel(activation_post_process.qscheme): # type: ignore[attr-defined] ch_axis = int(activation_post_process.ch_axis) # type: ignore[attr-defined] qparams = {"_scale_": scale, "_zero_point_": zero_point, "_axis_": ch_axis, "_dtype_": dtype} - quantize_op = torch.quantize_per_channel + if is_decomposed: + raise NotImplementedError("decomposed quantize_per_channel op not implemented yet") + else: + quantize_op = torch.quantize_per_channel else: scale = float(scale) zero_point = int(zero_point) - qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype} - quantize_op = torch.quantize_per_tensor + if is_decomposed: + quant_min = activation_post_process.quant_min # type: ignore[attr-defined] + quant_max = activation_post_process.quant_max # type: ignore[attr-defined] + dtype = to_underlying_dtype(dtype) + qparams = { + "_scale_": scale, + "_zero_point_": zero_point, + "_quant_min": quant_max, + "_quant_max": quant_max, + "_dtype_": dtype + } + quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor + else: + qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype} + quantize_op = torch.quantize_per_tensor elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]: # TODO(future PR): switch compute_dtype to is_dynamic # dynamic quantization node_type = "call_function" - quantize_op = torch.quantize_per_tensor_dynamic + if is_decomposed: + raise NotImplementedError("decomposed quantize_per_tensor_dynamic op not implemented yet") + else: + quantize_op = torch.quantize_per_tensor_dynamic # TODO: get reduce range from observer # reduce_range = activation_post_process.reduce_range reduce_range = 
torch.backends.quantized.engine in ("fbgemm", "x86") @@ -199,8 +232,9 @@ def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[ else: warnings.warn(f"Unsupported activation_post_process in get_quantize_node_info: {activation_post_process}") return None - return node_type, quantize_op, qparams + return node_type, quantize_op, qparams # type: ignore[return-value] +# TODO: looks like this is not used, remove def quantize_node( in_node: Node, obs_module: torch.nn.Module, @@ -247,7 +281,8 @@ def quantize_node( module_path = "" root_module = modules[''] graph = quantized_graph - maybe_quantize_node_info = get_quantize_node_info(obs_module) + is_decomposed_qtensor = False + maybe_quantize_node_info = get_quantize_node_info(obs_module, is_decomposed_qtensor) assert maybe_quantize_node_info is not None, \ f"Expecting quantize node info not to be None, observer: {obs_module}" node_type, quantize_op, qparams = maybe_quantize_node_info diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py index fb6f3dc1fe574..abd1cf1b8edbc 100644 --- a/torch/ao/quantization/quantize_fx.py +++ b/torch/ao/quantization/quantize_fx.py @@ -530,6 +530,7 @@ def _convert_fx( _remove_qconfig: bool = True, qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None, backend_config: Union[BackendConfig, Dict[str, Any], None] = None, + is_decomposed: bool = False, ) -> torch.nn.Module: """ `is_standalone_module`: see docs in :func:`~torch.ao.quantization.prepare_standalone_module_fx` """ @@ -552,6 +553,7 @@ def _convert_fx( _remove_qconfig_flag=_remove_qconfig, qconfig_mapping=qconfig_mapping, backend_config=backend_config, + is_decomposed=is_decomposed, ) preserved_attributes = convert_custom_config.preserved_attributes @@ -676,6 +678,59 @@ def convert_to_reference_fx( backend_config=backend_config, ) +def _convert_to_reference_decomposed_fx( + graph_module: GraphModule, + convert_custom_config: Union[ConvertCustomConfig, Dict[str, 
Any], None] = None, + _remove_qconfig: bool = True, + qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None, + backend_config: Union[BackendConfig, Dict[str, Any], None] = None, +) -> torch.nn.Module: + r""" Convert a calibrated or trained model to a reference quantized model, with + decomposed representation for quantized Tensor + see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details, + reference quantized model is a standard representation of a quantized model provided + by FX Graph Mode Quantization, it can be further lowered to run on the target + hardware, like accelerators + + Note: this is not public API + + Args: + * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule) + + * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function. + See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert. + + * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization. + See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + * `backend_config` (BackendConfig): A configuration for the backend which describes how + operators should be quantized in the backend. See + :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details. + + Return: + A reference quantized model (GraphModule) with operators working with decomposed quantized Tensor + + Example:: + + # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training + # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack + # e.g.
backend_config = get_default_backend_config("fbgemm") + reference_quantized_model = _convert_to_reference_decomposed_fx(prepared_model) + + """ + torch._C._log_api_usage_once("quantization_api.quantize_fx._convert_to_reference_decomposed_fx") + return _convert_fx( + graph_module, + is_reference=True, + convert_custom_config=convert_custom_config, + _remove_qconfig=_remove_qconfig, + qconfig_mapping=qconfig_mapping, + backend_config=backend_config, + is_decomposed=True, + ) + def _convert_standalone_module_fx( graph_module: GraphModule, diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index 47ca7e64e329a..afa278a795dd0 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -140,6 +140,17 @@ def getattr_from_fqn(obj: Any, fqn: str) -> Any: """ return functools.reduce(getattr, fqn.split("."), obj) +def to_underlying_dtype(qdtype): + DTYPE_MAPPING = { + torch.quint8: torch.uint8, + torch.qint8: torch.int8, + torch.qint32: torch.int32, + torch.quint4x2: torch.uint8, + torch.quint2x4: torch.uint8, + } + assert qdtype in DTYPE_MAPPING, "Unsupported dtype: " + qdtype + return DTYPE_MAPPING[qdtype] + def get_qparam_dict(observer_or_fake_quant): qscheme = observer_or_fake_quant.qscheme if hasattr(observer_or_fake_quant, "qscheme") else None dtype = observer_or_fake_quant.dtype @@ -562,4 +573,5 @@ def _patched_module_call(self, *args, **kwargs): "calculate_qmin_qmax", "has_no_children_ignoring_parametrizations", "get_fqn_to_example_inputs", + "to_underlying_dtype", ] From f3f3650efc0bf61109625e2a6902b87d9651334d Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 27 Oct 2022 01:24:01 +0000 Subject: [PATCH 0216/1922] Limit ROCM option to Linux only (#87833) As it's available on neither Windows nor MacOS cc @jeffdaily @sunway513 @jithunnair-amd @ROCmSupport Pull Request resolved: https://github.com/pytorch/pytorch/pull/87833 Approved by: https://github.com/kit1980 --- CMakeLists.txt | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e2e3bf0e3f8d5..105e38e7c1acf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,7 +190,7 @@ option(USE_CUDA "Use CUDA" ON) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) option(USE_FAST_NVCC "Use parallel NVCC build" OFF) -option(USE_ROCM "Use ROCm" ON) +cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) cmake_dependent_option( USE_CUDNN "Use cuDNN" ON From 00fd26e35df0e8dd31a8465e7b13e806c4e0bed1 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 26 Oct 2022 14:43:42 -0700 Subject: [PATCH 0217/1922] [quant][be] Remove unused function `quantize_node` (#87153) Summary: att Test Plan: python test/test_quantization.py TestQuantizeFx Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/87153 Approved by: https://github.com/andrewor14 --- torch/ao/quantization/fx/utils.py | 67 ++----------------------------- 1 file changed, 3 insertions(+), 64 deletions(-) diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index 7b838b64f41a8..f2037d1590a93 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -65,7 +65,6 @@ "node_arg_is_weight", "NON_OBSERVABLE_ARG_DICT", "NON_QUANTIZABLE_WEIGHT_OPS", - "quantize_node", "return_arg_list", ] @@ -234,69 +233,9 @@ def get_quantize_node_info( return None return node_type, quantize_op, qparams # type: ignore[return-value] -# TODO: looks like this is not used, remove -def quantize_node( - in_node: Node, - obs_module: torch.nn.Module, - obs_node: Node, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - is_input: bool, - output_prefix: str = "_output") -> Node: - ''' Add quantization nodes (eg. 
quantize_per_tensor/per_channel) for given node to graph - with the qparams calculated from activation_post_process (obs_module). - The observer node (obs_node) is used to find the FQN of the user of act_post_process. - e.g. Given input `node` in `node = self.conv(x)`, insert node: - `quantized_node = torch.quantize_per_tensor(x, self._scale_0, self._zer_point_0, self._dtype_0)` - where self._scale_0, self._zero_point_0 and self._dtype_0 are - calculated from `obs_module` - ''' - # Find the first use of the observer node, we use this to get the scope of the module. - if is_input: - # if the quantize function is at the input of op, then we find the first user of the observer_node - # to get the path. If a linear call_function is in the user list, we return the first instance - # of linear node to get the FQN. - users = list(obs_node.users) - first_linear_use_or_first_use = users[0] if users else None - linear_node = None - for n in users: - if n.op == "call_function" and n.target == torch.nn.functional.linear: - linear_node = n - break - if linear_node: - first_linear_use_or_first_use = linear_node - prefix = "_input" - else: - # if the quantize function is at the output of the op, we use the observer input node to get the path - first_linear_use_or_first_use = in_node - prefix = output_prefix - - if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope: - module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name] - else: - # TODO: it's not used, so actually we can skip quantization - # but this requires changing return type of quantize_node - # we can fix it later if needed - module_path = "" - root_module = modules[''] - graph = quantized_graph - is_decomposed_qtensor = False - maybe_quantize_node_info = get_quantize_node_info(obs_module, is_decomposed_qtensor) - assert maybe_quantize_node_info is not None, \ - f"Expecting quantize node info not to be None, observer: {obs_module}" - node_type, quantize_op, qparams 
= maybe_quantize_node_info - inputs = [in_node] - - for key, value in qparams.items(): - if key in ['_scale_', '_zero_point_']: - # For scale and zero_point values we register them as buffers in the root module. - qparam_node = create_getattr_from_value(root_module, graph, module_path + prefix + key, value) - inputs.append(qparam_node) - else: - # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph. - inputs.append(value) - return graph.create_node(node_type, quantize_op, tuple(inputs), {}) +# Keep it here for BC in torch.quantization namespace, we can remove it after +# we deprecate the torch.quantization namespace +quantize_node = NotImplemented def get_custom_module_class_keys(custom_module_mapping: Dict[QuantType, Dict[Type, Type]]) -> List[Any]: r""" Get all the unique custom module keys in the custom config dict From 8b44aeba7b246188548d016bf480fc74b1d85102 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 26 Oct 2022 14:43:42 -0700 Subject: [PATCH 0218/1922] [fx][subgraph_rewriter] Change match_filter to be a List in replace_pattern_with_filters (#87257) Summary: att, this is experimental api so not marking it as bc-breaking. The match will be accepted only if all the filters in the list pass. Changing the filter arg to be a list also allows us to pass in an empty list, which means no filter, which makes user code cleaner.
Test Plan: python test/test_fx.py -k test_replace_pattern_with_filters Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/87257 Approved by: https://github.com/SherlockNoMad --- test/fx/test_subgraph_rewriter.py | 6 +++--- torch/fx/subgraph_rewriter.py | 20 +++++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/test/fx/test_subgraph_rewriter.py b/test/fx/test_subgraph_rewriter.py index ac3498458d600..ed6d50e44b4ac 100644 --- a/test/fx/test_subgraph_rewriter.py +++ b/test/fx/test_subgraph_rewriter.py @@ -773,7 +773,7 @@ def gemm_bias_mul_replacement_with_c(a, b, bias, c): self.assertEqual(repalcement_node_found, 2) - def test_replace_pattern_with_filter(self): + def test_replace_pattern_with_filters(self): class M(torch.nn.Module): def __init__(self): super().__init__() @@ -833,10 +833,10 @@ def num_repalcement_node_found(traced): # match with filter, should find 1 match traced = symbolic_trace(M()) - matches = subgraph_rewriter.replace_pattern_with_filter( + matches = subgraph_rewriter.replace_pattern_with_filters( traced, BinaryOpScalarReLUPattern, BinaryOpScalarReLUReplacement, - second_input_is_scalar) + [second_input_is_scalar]) self.assertEqual(len(matches), 1) self.assertEqual(num_repalcement_node_found(traced), 1) diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py index 09e5550c5930d..72bb7fd373516 100644 --- a/torch/fx/subgraph_rewriter.py +++ b/torch/fx/subgraph_rewriter.py @@ -8,7 +8,7 @@ from typing import Callable, Dict, List, NamedTuple, Optional, Set import torch -__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filter'] +__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filters'] @compatibility(is_backward_compatible=True) class Match(NamedTuple): @@ -185,11 +185,11 @@ def forward(self, x, w1, w2): # Experimental API, not backward compatible @compatibility(is_backward_compatible=False) -def replace_pattern_with_filter( +def 
replace_pattern_with_filters( gm: GraphModule, pattern: Callable, replacement: Callable, - match_filter: Callable[["InternalMatch", Graph, Graph], bool], # type: ignore[name-defined] + match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]], # type: ignore[name-defined] ) -> List[Match]: """ See replace_pattern for documentation. This function is an overload with an additional match_filter argument. @@ -200,18 +200,21 @@ def replace_pattern_with_filter( definition of InternalMatch. """ - return _replace_pattern(gm, pattern, replacement, match_filter) + return _replace_pattern(gm, pattern, replacement, match_filters) def _replace_pattern( gm: GraphModule, pattern: Callable, replacement: Callable, - match_filter: Optional[Callable[["InternalMatch", Graph, Graph], bool]] = None # type: ignore[name-defined] + match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]] = None # type: ignore[name-defined] ) -> List[Match]: from torch.fx.passes.utils.matcher_utils import SubgraphMatcher, InternalMatch + if match_filters is None: + match_filters = [] + # Get the graphs for `gm`, `pattern`, `replacement` original_graph: Graph = gm.graph pattern_graph: Graph = symbolic_trace(pattern).graph @@ -222,8 +225,11 @@ def _replace_pattern( _matches: List[InternalMatch] = matcher.match(original_graph) # Filter out matches that don't match the filter - if match_filter: - _matches = [m for m in _matches if match_filter(m, original_graph, pattern_graph)] + _matches = [ + m for m in _matches + if all(match_filter(m, original_graph, pattern_graph) + for match_filter in match_filters) + ] replacement_placeholders = [n for n in replacement_graph.nodes if n.op == "placeholder"] From dc72485a4c9479e9e6963249d96f637c7a560bcf Mon Sep 17 00:00:00 2001 From: Horace He Date: Wed, 26 Oct 2022 16:37:10 +0000 Subject: [PATCH 0219/1922] fix sym_storage conversion and some cleanup (#87718) cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request 
resolved: https://github.com/pytorch/pytorch/pull/87718 Approved by: https://github.com/ezyang --- test/test_proxy_tensor.py | 10 +++++++++- torch/fx/experimental/proxy_tensor.py | 12 +++--------- torch/fx/experimental/symbolic_shapes.py | 4 ++-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index fae55367ab192..1d5985a00da8c 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -857,7 +857,7 @@ def test_neg_shape(self): def f(a): return torch.empty(-a.shape[0] + 10) - r = str(make_fx(f, tracing_mode="symbolic")(torch.empty(1)).code).strip() + r = str(make_fx(f, tracing_mode="symbolic")(torch.empty(2)).code).strip() self.assertExpectedInline(r, """\ def forward(self, a_1): sym_size = torch.ops.aten.sym_size(a_1, 0); a_1 = None @@ -984,6 +984,14 @@ def f(a, b): fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(16), torch.randn(8)) self.assertExpectedInline(str(fx_g.shape_env.get_guard_expr()), "Eq(s1, 8) & Eq(s0, 2*s1)") + def test_sym_storage_offset(self): + def f(x, y): + return x + y + + inp = (torch.randn(8)[3:], torch.randn(5)) + fx_g = make_fx(f, tracing_mode="symbolic")(*inp) + inp = (torch.randn(8)[3:], torch.randn(5)) + self.assertEqual(fx_g(*inp), f(*inp)) def _assert_no_guards(self, fx_g, free_symbols): assert _get_free_symbols(fx_g.shape_env) == free_symbols, fx_g.shape_env.var_to_val diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index 2bf6f0cca004a..86d1e19550928 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -628,7 +628,7 @@ def wrapped(*args): proxy_mode = ProxyTorchDispatchMode(fx_tracer) - def wrap_fake_concrete(x): + def wrap_fake(x): if isinstance(x, torch.Tensor): return fake_tensor_mode.from_tensor(x) # type: ignore[attr-defined] @@ -636,16 +636,10 @@ def wrap_fake_concrete(x): sym_mode = proxy_mode.sym_mode - # todo: Figure out a more informative name for 
symints - def wrap_fake_symbolic(x): - if isinstance(x, torch.Tensor): - return fake_tensor_mode.from_tensor(x) - return x - wrap_fn_map = { "real": lambda x: x, - "fake": wrap_fake_concrete, - "symbolic": wrap_fake_symbolic, + "fake": wrap_fake, + "symbolic": wrap_fake, } args = pytree.tree_map(wrap_fn_map[tracing_mode], args) diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index a7030abbcfc41..0a03e5819a90a 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -1,6 +1,6 @@ import torch import torch.utils._pytree as pytree -from typing import Set, Dict, List, Type, Optional, cast, Union +from typing import Set, Dict, List, Type, Optional, cast import operator import builtins import math @@ -389,7 +389,7 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor): assert all(x is not None for x in stride) return [self.create_symintnode(i) for i in size], [self.create_symintnode(i) for i in stride] # type: ignore[arg-type] - def create_symintnode(self, expr: Union["sympy.Expr", int]): + def create_symintnode(self, expr: "sympy.Expr"): py_sym_int = PySymInt(expr, self) cpp_sym_int = torch.SymIntNode.new_symint(py_sym_int) # type: ignore[attr-defined] return cpp_sym_int From 2e97e4671820e43b454ef231ed3a7cd1fc7d1733 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Thu, 27 Oct 2022 04:23:43 +0000 Subject: [PATCH 0220/1922] [vision hash update] update the pinned vision hash (#87831) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml). Update the pinned vision hash. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87831 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/vision.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index d4dee5af2936d..4ee9517b28d7a 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -edb3a8069a0b86231f14e84ac9f26fd7c7bffb5f +add75968543f36818691f8b59880f5c04689a88e From a2049dafc6ca000d814c6586bca568db7c90bab4 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Thu, 27 Oct 2022 00:03:14 +0000 Subject: [PATCH 0221/1922] [FSDP] ufmt /fsdp (#87811) This applies `ufmt` to all of the FSDP files in the `torch/distributed/fsdp/` directory. **Test Plan** CI **Notes** For VSCode users, - Install `ufmt`: https://pypi.org/project/ufmt/ - Install VSCode `ufmt` extension: https://marketplace.visualstudio.com/items?itemName=omnilib.ufmt - Include in `settings.json`: ``` { "[python]": { "editor.defaultFormatter": "omnilib.ufmt", "editor.formatOnSave": true, }, } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87811 Approved by: https://github.com/rohan-varma, https://github.com/fegin --- torch/distributed/fsdp/_fsdp_extensions.py | 1 - torch/distributed/fsdp/_optim_utils.py | 11 +- torch/distributed/fsdp/_shard_utils.py | 6 +- torch/distributed/fsdp/_state_dict_utils.py | 67 +-- torch/distributed/fsdp/_symbolic_trace.py | 15 +- torch/distributed/fsdp/_utils.py | 13 +- torch/distributed/fsdp/flat_param.py | 4 +- .../fsdp/fully_sharded_data_parallel.py | 556 +++++++++++------- torch/distributed/fsdp/sharded_grad_scaler.py | 66 ++- torch/distributed/fsdp/wrap.py | 35 +- 10 files changed, 453 insertions(+), 321 deletions(-) diff --git a/torch/distributed/fsdp/_fsdp_extensions.py b/torch/distributed/fsdp/_fsdp_extensions.py index abe0d901f8ecc..1f087f44b5739 100644 --- a/torch/distributed/fsdp/_fsdp_extensions.py +++ 
b/torch/distributed/fsdp/_fsdp_extensions.py @@ -5,7 +5,6 @@ import torch.distributed as dist from torch.distributed._shard.sharded_tensor.api import ShardedTensor from torch.distributed._shard.sharded_tensor.shard import Shard - from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py index a5e1ab64278e5..f87f871042217 100644 --- a/torch/distributed/fsdp/_optim_utils.py +++ b/torch/distributed/fsdp/_optim_utils.py @@ -3,6 +3,7 @@ import functools from typing import ( Any, + cast, Dict, Iterable, Iterator, @@ -12,18 +13,18 @@ Sequence, Tuple, Union, - cast, ) import torch import torch.distributed as dist + # Import the entire FSDP file to avoid circular imports import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP import torch.nn as nn from torch.distributed._shard.sharded_tensor import ShardedTensor +from torch.distributed.fsdp._fsdp_extensions import _ext_chunk_tensor from torch.distributed.fsdp._shard_utils import _gather_state_dict from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle -from torch.distributed.fsdp._fsdp_extensions import _ext_chunk_tensor def sorted_items(dictionary: Dict[str, Any]) -> Iterator[Tuple[str, Any]]: @@ -298,9 +299,9 @@ def _flatten_optim_state_dict( unflat_osd_state = unflat_osd["state"] for param, unflat_param_names in param_to_unflat_param_names.items(): if isinstance(param, FlatParameter): # flatten FSDP parameters' states - assert param in flat_param_to_fsdp_module, ( - f"Check the `flat_param_to_fsdp_module` construction\nparam: {param}" - ) + assert ( + param in flat_param_to_fsdp_module + ), f"Check the `flat_param_to_fsdp_module` construction\nparam: {param}" fsdp_module = flat_param_to_fsdp_module[param] flat_state = _flatten_optim_state( unflat_osd_state, diff --git a/torch/distributed/fsdp/_shard_utils.py b/torch/distributed/fsdp/_shard_utils.py index 
b0382b41c6d20..0cc9dd656f16b 100644 --- a/torch/distributed/fsdp/_shard_utils.py +++ b/torch/distributed/fsdp/_shard_utils.py @@ -250,10 +250,8 @@ def _create_chunk_sharded_tensor( requires_grad=False, memory_format=torch.contiguous_format, pin_memory=tensor.is_pinned(), - ) + ), ) return ShardedTensor._init_from_local_shards_and_global_metadata( - local_shards, - sharded_tensor_metadata=sharded_tensor_metadata, - process_group=pg + local_shards, sharded_tensor_metadata=sharded_tensor_metadata, process_group=pg ) diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py index ed4b8f226c123..90083ef85b18e 100644 --- a/torch/distributed/fsdp/_state_dict_utils.py +++ b/torch/distributed/fsdp/_state_dict_utils.py @@ -6,25 +6,24 @@ import torch import torch.distributed as dist import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper + # Import the entire FSDP file to avoid circular imports import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP import torch.nn as nn import torch.nn.functional as F - from torch.distributed._shard.sharded_tensor import ( + init_from_local_shards, Shard, ShardedTensor, - init_from_local_shards, -) -from torch.distributed.utils import ( - _replace_by_prefix, ) +from torch.distributed.utils import _replace_by_prefix -from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform -from ._fsdp_extensions import _extensions as _user_extensions -from .flat_param import ( - FlatParamHandle, +from ._fsdp_extensions import ( + _ext_chunk_tensor, + _ext_pre_load_state_dict_transform, + _extensions as _user_extensions, ) +from .flat_param import FlatParamHandle def _full_post_state_dict_hook( @@ -53,16 +52,12 @@ def _full_post_state_dict_hook( # exiting `summon_full_params()` via the parameter shape. However, for # `NO_SHARD`, we cannot tell from the shape, so we do not return early. 
if ( - ( - not module._use_orig_params - and FSDP.FLAT_PARAM in module.module._parameters - ) - or ( - module._use_orig_params - and module._handles - and module._handles[0].uses_sharded_strategy - and module._handles[0].is_sharded(module._handles[0].flat_param) - ) + not module._use_orig_params and FSDP.FLAT_PARAM in module.module._parameters + ) or ( + module._use_orig_params + and module._handles + and module._handles[0].uses_sharded_strategy + and module._handles[0].is_sharded(module._handles[0].flat_param) ): return state_dict @@ -79,7 +74,7 @@ def _full_post_state_dict_hook( # do not have prefix considered as they are not computed in `state_dict` # call. if clean_key.startswith(clean_prefix): - clean_key = clean_key[len(clean_prefix):] + clean_key = clean_key[len(clean_prefix) :] # Clone non-ignored parameters before exiting the # `_summon_full_params()` context @@ -88,8 +83,9 @@ def _full_post_state_dict_hook( f"only has {state_dict.keys()}. prefix={prefix}, " f"module_name={module_name} param_name={param_name} rank={module.rank}." ) - if clean_key not in module._ignored_param_names and \ - not getattr(state_dict[fqn], "_has_been_cloned", False): + if clean_key not in module._ignored_param_names and not getattr( + state_dict[fqn], "_has_been_cloned", False + ): try: state_dict[fqn] = state_dict[fqn].clone().detach() state_dict[fqn]._has_been_cloned = True # type: ignore[attr-defined] @@ -129,11 +125,9 @@ def _full_pre_load_state_dict_hook( ) -> None: # We do not expect to be calling pre-hooks twice without post-hook # call in between. - assert getattr(module, '_full_param_ctx', None) is None + assert getattr(module, "_full_param_ctx", None) is None # Note that it needs writeback=True to persist. 
- module._full_param_ctx = module._summon_full_params( - recurse=False, writeback=True - ) + module._full_param_ctx = module._summon_full_params(recurse=False, writeback=True) module._full_param_ctx.__enter__() _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}") @@ -141,7 +135,7 @@ def _full_pre_load_state_dict_hook( def _full_post_load_state_dict_hook(module, *args, **kwargs) -> None: # We should exit summon_full_params context. module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS]) - assert getattr(module, '_full_param_ctx', None) is not None + assert getattr(module, "_full_param_ctx", None) is not None module._full_param_ctx.__exit__(None, None, None) module._full_param_ctx = None @@ -189,7 +183,9 @@ def _local_post_load_state_dict_hook(module, *args, **kwargs) -> None: def _local_pre_load_state_dict_hook( - module, state_dict: Dict[str, Any], prefix: str, + module, + state_dict: Dict[str, Any], + prefix: str, ) -> None: """ This hook finds the local flat_param for this FSDP module from the @@ -253,7 +249,7 @@ def _sharded_post_state_dict_hook( rank=module.rank, world_size=module.world_size, num_devices_per_node=torch.cuda.device_count(), - pg=module.process_group + pg=module.process_group, ) if module._state_dict_config.offload_to_cpu: sharded_tensor = sharded_tensor.cpu() @@ -271,7 +267,9 @@ def _sharded_post_load_state_dict_hook(module, *args, **kwargs) -> None: def _sharded_pre_load_state_dict_hook( - module, state_dict: Dict[str, Any], prefix: str, + module, + state_dict: Dict[str, Any], + prefix: str, ) -> None: """ The hook combines the unflattened, sharded parameters (ShardedTensor) to @@ -331,7 +329,9 @@ def _sharded_pre_load_state_dict_hook( # Get the chunk from the loaded flat_param for the local rank. 
loaded_flat_tensor, num_to_pad = FlatParamHandle._get_shard( - loaded_flat_param, module.rank, module.world_size, + loaded_flat_param, + module.rank, + module.world_size, ) loaded_flat_tensor.to(flat_param.device) assert all(s1 == s2 for s1, s2 in zip(loaded_shapes, flat_param._shapes)), ( @@ -377,10 +377,7 @@ def _post_state_dict_hook( # back to their mixed precision type. This is because buffers are cast # during lazy_init() and stay at their mixed precision type before/after # forward/backward. As a result state_dict() should maintain this. - if ( - fsdp_module._is_root - and fsdp_module._mixed_precision_enabled_for_buffers() - ): + if fsdp_module._is_root and fsdp_module._mixed_precision_enabled_for_buffers(): fsdp_module._cast_buffers(recurse=True) return processed_state_dict diff --git a/torch/distributed/fsdp/_symbolic_trace.py b/torch/distributed/fsdp/_symbolic_trace.py index 026595fd7def0..f6fe5e432252e 100644 --- a/torch/distributed/fsdp/_symbolic_trace.py +++ b/torch/distributed/fsdp/_symbolic_trace.py @@ -5,7 +5,6 @@ import torch - __all__ = ["TracingConfig"] @@ -140,13 +139,18 @@ def _patched_create_proxy( if args is not None: named_params: List[Tuple[str, torch.nn.Parameter]] = [] for arg in args: - if isinstance(arg, torch.fx.Proxy) and arg.node.target in prefixed_param_name_to_param: + if ( + isinstance(arg, torch.fx.Proxy) + and arg.node.target in prefixed_param_name_to_param + ): param = prefixed_param_name_to_param[arg.node.target] named_params.append((arg.node.target, param)) if param not in set(execution_info.param_exec_order): execution_info.param_exec_order.append(param) if named_params: - execution_info.module_to_execution_infos[module].append((module, named_params)) + execution_info.module_to_execution_infos[module].append( + (module, named_params) + ) elif kind == "call_module": named_params = list(module.named_parameters()) if named_params: @@ -234,7 +238,10 @@ def _patch_tracer( ) prefixed_param_name_to_param = 
dict(root_module.named_parameters()) tracer.create_proxy = functools.partial( - _patched_create_proxy, original_create_proxy, execution_info, prefixed_param_name_to_param + _patched_create_proxy, + original_create_proxy, + execution_info, + prefixed_param_name_to_param, ) try: yield diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py index bd37ce5695984..eb72042b65573 100644 --- a/torch/distributed/fsdp/_utils.py +++ b/torch/distributed/fsdp/_utils.py @@ -10,14 +10,11 @@ ) from torch.nn.utils.rnn import PackedSequence - FSDP_FLATTENED = "_fsdp_flattened" def _contains_batchnorm(module): - return any( - isinstance(mod, _BatchNorm) for mod in module.modules() - ) + return any(isinstance(mod, _BatchNorm) for mod in module.modules()) def _override_batchnorm_mixed_precision(module): @@ -27,11 +24,14 @@ def _override_batchnorm_mixed_precision(module): def _apply_to_tensors( - fn: Callable, container: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence] + fn: Callable, + container: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence], ) -> Any: """Recursively apply to all tensor in different kinds of container types.""" - def apply(x: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence]) -> Any: + def apply( + x: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence] + ) -> Any: if torch.is_tensor(x): return fn(x) elif hasattr(x, "__dataclass_fields__"): @@ -75,6 +75,7 @@ def _apply_to_modules( module prefix name (e.g. "module.submodule." just like in model state dict) and makes that available to ``module_fn``. 
""" + def f(module: torch.nn.Module, prefix: str, *args, **kwargs): # Call the module function before recursing over children (pre-order) module_fn(module, prefix, *args, **kwargs) diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py index 266dc80b4ed42..3e4eca07df7fa 100644 --- a/torch/distributed/fsdp/flat_param.py +++ b/torch/distributed/fsdp/flat_param.py @@ -33,7 +33,6 @@ p_assert, ) - __all__ = [ "FlatParameter", "FlatParamHandle", @@ -1507,7 +1506,8 @@ def _writeback_orig_params(self) -> bool: # memory and owns the gradient storage, so it will never # require gradient writeback. flat_param_grad = ( - flat_param.grad if self.uses_sharded_strategy or not self._config.offload_params + flat_param.grad + if self.uses_sharded_strategy or not self._config.offload_params else flat_param._cpu_grad # type: ignore[attr-defined] ) needs_grad_writeback = flat_param_grad is None or not _same_storage( diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 5fb2e5cdf0f6b..8cd18474d959d 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -8,10 +8,11 @@ import warnings from contextlib import contextmanager from dataclasses import dataclass -from enum import Enum, auto +from enum import auto, Enum from typing import ( Any, Callable, + cast, Deque, Dict, Generator, @@ -22,7 +23,6 @@ Set, Tuple, Union, - cast, ) import torch @@ -35,15 +35,10 @@ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( _CHECKPOINT_PREFIX, ) -from torch.distributed.algorithms._comm_hooks import ( - LOW_PRECISION_HOOKS, - default_hooks, -) +from torch.distributed.algorithms._comm_hooks import default_hooks, LOW_PRECISION_HOOKS from torch.distributed.distributed_c10d import _get_default_group -from torch.distributed.utils import ( - _sync_params_and_buffers, - _to_kwargs, -) +from 
torch.distributed.utils import _sync_params_and_buffers, _to_kwargs + from ._optim_utils import ( _broadcast_pos_dim_tensor_states, _broadcast_processed_optim_state_dict, @@ -57,9 +52,9 @@ _rekey_sharded_optim_state_dict, ) from ._state_dict_utils import ( + _post_load_state_dict_hook, _post_state_dict_hook, _pre_load_state_dict_hook, - _post_load_state_dict_hook, ) from ._utils import ( _apply_to_modules, @@ -78,10 +73,10 @@ HandleTrainingState, ) from .wrap import ( - ParamExecOrderWrapPolicy, _or_policy, _recursive_wrap, _wrap_batchnorm_individually, + ParamExecOrderWrapPolicy, ) _TORCHDISTX_AVAIL = True @@ -94,18 +89,23 @@ if not hasattr(torch, "fx"): _TORCH_FX_AVAIL = False if _TORCH_FX_AVAIL: - from ._symbolic_trace import ( - TracingConfig, - _init_execution_info, - _patch_tracer, - ) + from ._symbolic_trace import _init_execution_info, _patch_tracer, TracingConfig __all__ = [ - "FullyShardedDataParallel", "ShardingStrategy", "MixedPrecision", - "CPUOffload", "BackwardPrefetch", "StateDictType", "StateDictConfig", - "FullStateDictConfig", "LocalStateDictConfig", "ShardedStateDictConfig", - "OptimStateKeyType", "TrainingState_", "clean_tensor_name", + "FullyShardedDataParallel", + "ShardingStrategy", + "MixedPrecision", + "CPUOffload", + "BackwardPrefetch", + "StateDictType", + "StateDictConfig", + "FullStateDictConfig", + "LocalStateDictConfig", + "ShardedStateDictConfig", + "OptimStateKeyType", + "TrainingState_", + "clean_tensor_name", ] @@ -148,6 +148,7 @@ class ShardingStrategy(Enum): ``NO_SHARD`` inter-node. """ + FULL_SHARD = auto() SHARD_GRAD_OP = auto() NO_SHARD = auto() @@ -197,6 +198,7 @@ class MixedPrecision: would occur in the `param_dtype` precision, if given, otherwise, in the original parameter precision. """ + # maintain a tensor of this dtype that the fp32 param shard will be cast to. # Will control the precision of model params, inputs, and thus compute as # well. 
@@ -309,6 +311,7 @@ class StateDictConfig: order to configure settings for the particular type of ``state_dict`` implementation FSDP will use. """ + offload_to_cpu: bool = False @@ -340,6 +343,7 @@ class FullStateDictConfig(StateDictConfig): >>> fsdp = FSDP(model, device_id=torch.cuda.current_device(), auto_wrap_policy=..., sync_module_states=True) >>> # After this point, all ranks have FSDP model with loaded checkpoint. """ + rank0_only: bool = False @@ -366,9 +370,10 @@ class OptimStateKeyType(Enum): class _ExecOrderWarnStatus(Enum): """Used internally for execution order validation.""" - NONE = auto() # no deviation yet + + NONE = auto() # no deviation yet WARNING = auto() # deviated this iteration; currently issuing warnings - WARNED = auto() # deviated in a previous iteration + WARNED = auto() # deviated in a previous iteration class _ExecOrderData: @@ -403,9 +408,10 @@ def __init__( self._forward_prefetch_limit = forward_prefetch_limit # Data structures for execution order validation - self._checking_order: bool = ( - debug_level in [dist.DebugLevel.INFO, dist.DebugLevel.DETAIL] - ) + self._checking_order: bool = debug_level in [ + dist.DebugLevel.INFO, + dist.DebugLevel.DETAIL, + ] self.process_group: Optional[dist.ProcessGroup] = None self.world_size: Optional[int] = None self.all_handles: List[FlatParamHandle] = [] @@ -454,7 +460,9 @@ def get_handles_to_backward_prefetch( prefetch given the current handles key. If there are no valid handles keys to prefetch, then this returns an empty :class:`list`. 
""" - current_index = self.handles_to_post_forward_order_index.get(current_handles_key, None) + current_index = self.handles_to_post_forward_order_index.get( + current_handles_key, None + ) if current_index is None: return None target_index = current_index - 1 @@ -462,9 +470,7 @@ def get_handles_to_backward_prefetch( for _ in range(self._backward_prefetch_limit): if target_index < 0: break - target_handles_keys.append( - self.handles_post_forward_order[target_index] - ) + target_handles_keys.append(self.handles_post_forward_order[target_index]) target_index -= 1 return target_handles_keys @@ -477,7 +483,9 @@ def get_handles_to_forward_prefetch( prefetch given the current handles key. If there are no valid handles keys to prefetch, then this returns an empty :class:`list`. """ - current_index = self.handles_to_pre_forward_order_index.get(current_handles_key, None) + current_index = self.handles_to_pre_forward_order_index.get( + current_handles_key, None + ) if current_index is None: return None target_index = current_index + 1 @@ -485,9 +493,7 @@ def get_handles_to_forward_prefetch( for _ in range(self._forward_prefetch_limit): if target_index >= len(self.handles_pre_forward_order): break - target_handles_keys.append( - self.handles_pre_forward_order[target_index] - ) + target_handles_keys.append(self.handles_pre_forward_order[target_index]) target_index += 1 return target_handles_keys @@ -511,7 +517,9 @@ def record_post_forward(self, handles: List[FlatParamHandle]) -> None: self.handles_to_post_forward_order_index[handles_key] = index self.handles_post_forward_order.append(handles_key) - def record_pre_forward(self, handles: List[FlatParamHandle], is_training: bool) -> None: + def record_pre_forward( + self, handles: List[FlatParamHandle], is_training: bool + ) -> None: """ Records ``handles`` in the pre-forward order, where ``handles`` should be a group of handles used in the same module's forward. 
If ``handles`` @@ -597,7 +605,7 @@ def _check_order(self, handles_key: _HandlesKey, is_training: bool) -> None: ( rank, world_indices[ - rank * num_valid_indices: (rank + 1) * num_valid_indices + rank * num_valid_indices : (rank + 1) * num_valid_indices ], ) for rank in range(self.world_size) @@ -683,7 +691,9 @@ def _get_names_from_handle_indices( continue handle = self.all_handles[index] flat_param = handle.flat_param - prefixed_param_names.append(self.flat_param_to_prefixed_param_names[flat_param]) + prefixed_param_names.append( + self.flat_param_to_prefixed_param_names[flat_param] + ) return prefixed_param_names def _get_names_from_handles( @@ -700,7 +710,9 @@ def _get_names_from_handles( flat_param = handle.flat_param if flat_param not in self.flat_param_to_prefixed_param_names: continue - prefixed_param_names.append(self.flat_param_to_prefixed_param_names[flat_param]) + prefixed_param_names.append( + self.flat_param_to_prefixed_param_names[flat_param] + ) return prefixed_param_names def next_iter(self): @@ -970,6 +982,7 @@ class FullyShardedDataParallel(nn.Module): the sharded strategies that schedule all-gathers. Enabling this can help lower the number of CUDA malloc retries. 
""" + def __init__( self, module: nn.Module, @@ -1062,10 +1075,16 @@ def __init__( self._buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {} self._check_single_device_module(module, ignored_params) - device_from_device_id: Optional[torch.device] = self._get_device_from_device_id(device_id) - self._materialize_module(module, param_init_fn, ignored_params, device_from_device_id) + device_from_device_id: Optional[torch.device] = self._get_device_from_device_id( + device_id + ) + self._materialize_module( + module, param_init_fn, ignored_params, device_from_device_id + ) self._move_module_to_device(module, ignored_params, device_from_device_id) - self.compute_device = self._get_compute_device(module, ignored_params, device_from_device_id) + self.compute_device = self._get_compute_device( + module, ignored_params, device_from_device_id + ) params_to_flatten = list(self._get_orig_params(module, ignored_params)) if sync_module_states: self._sync_module_states(module, params_to_flatten) @@ -1098,7 +1117,10 @@ def __init__( self.params.append(handle.flat_param) self._register_param_handle(handle) handle.shard() - if self.cpu_offload.offload_params and handle.flat_param.device != torch.device("cpu"): + if ( + self.cpu_offload.offload_params + and handle.flat_param.device != torch.device("cpu") + ): handle.flat_param_to(torch.device("cpu")) if not use_orig_params: self._check_orig_params_flattened(ignored_params) @@ -1301,8 +1323,7 @@ def _get_device_from_device_id( self, device_id: Optional[Union[int, torch.device]], ) -> Optional[torch.device]: - """ - """ + """ """ if device_id is None: return None device = ( @@ -1341,11 +1362,15 @@ def _materialize_module( ``reset_parameters()``, and for torchdistX fake tensors, this calls ``deferred_init.materialize_module()``. 
""" - is_meta_module = any(p.is_meta for p in self._get_orig_params(module, ignored_params)) + is_meta_module = any( + p.is_meta for p in self._get_orig_params(module, ignored_params) + ) is_torchdistX_deferred_init = ( not is_meta_module and _TORCHDISTX_AVAIL - and any(fake.is_fake(p) for p in self._get_orig_params(module, ignored_params)) + and any( + fake.is_fake(p) for p in self._get_orig_params(module, ignored_params) + ) ) if ( is_meta_module or is_torchdistX_deferred_init @@ -1357,7 +1382,9 @@ def _materialize_module( param_init_fn(module) elif is_meta_module: # Run default meta device initialization - materialization_device = device_from_device_id or torch.cuda.current_device() + materialization_device = ( + device_from_device_id or torch.cuda.current_device() + ) module.to_empty(device=materialization_device) try: with torch.no_grad(): @@ -1483,7 +1510,10 @@ def _sync_module_states( module_states.append(buffer.detach()) module_states.extend(param.detach() for param in params) _sync_params_and_buffers( - self.process_group, module_states, _PARAM_BROADCAST_BUCKET_SIZE, src=0, + self.process_group, + module_states, + _PARAM_BROADCAST_BUCKET_SIZE, + src=0, ) def _get_orig_params( @@ -1573,7 +1603,7 @@ def _reshard( p_assert( len(handles) == len(free_unsharded_flat_params), "Expects both lists to have equal length but got " - f"{len(handles)} and {len(free_unsharded_flat_params)}" + f"{len(handles)} and {len(free_unsharded_flat_params)}", ) for handle, free_unsharded_flat_param in zip( handles, @@ -1651,9 +1681,10 @@ def fsdp_modules( the input ``module``. 
""" return [ - submodule for submodule in module.modules() - if isinstance(submodule, FullyShardedDataParallel) and - (not root_only or submodule.check_is_root()) + submodule + for submodule in module.modules() + if isinstance(submodule, FullyShardedDataParallel) + and (not root_only or submodule.check_is_root()) ] def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel": @@ -1728,6 +1759,7 @@ def _cast_fp_inputs_to_dtype( precision given by ``dtype``, while respecting the existing ``requires_grad`` on the tensors. """ + def cast_fn(x: torch.Tensor) -> torch.Tensor: if not torch.is_floating_point(x): return x @@ -1741,7 +1773,7 @@ def cast_fn(x: torch.Tensor) -> torch.Tensor: with torch.no_grad(): return ( _apply_to_tensors(cast_fn, args), - _apply_to_tensors(cast_fn, kwargs) + _apply_to_tensors(cast_fn, kwargs), ) def _cast_buffers( @@ -1775,9 +1807,15 @@ def _cast_buffers( if memo is None: memo = set() for module in self.modules(): - if module is not self and isinstance(module, FullyShardedDataParallel) and recurse: + if ( + module is not self + and isinstance(module, FullyShardedDataParallel) + and recurse + ): # Allow any child FSDP instances to handle their own buffers. 
- module._cast_buffers(device=device, dtype=dtype, memo=memo, recurse=recurse) + module._cast_buffers( + device=device, dtype=dtype, memo=memo, recurse=recurse + ) elif module not in memo: memo.add(module) for name, buf in module.named_buffers(recurse=False): @@ -1863,7 +1901,9 @@ def _lazy_init(self) -> None: fsdp_module.limit_all_gathers = self.limit_all_gathers fsdp_module._free_event_queue = self._free_event_queue fsdp_module._handles_prefetched = self._handles_prefetched - fsdp_module._needs_pre_backward_unshard = self._needs_pre_backward_unshard + fsdp_module._needs_pre_backward_unshard = ( + self._needs_pre_backward_unshard + ) for handle in fsdp_module._handles: fsdp_module._init_param_attributes(handle) if inconsistent_limit_all_gathers: @@ -1936,13 +1976,11 @@ def _init_param_attributes(self, handle: FlatParamHandle) -> None: # fwd/bwd, it is freed and we only hold on to the full precision shard. # As a result, this reduced precision shard is not allocated if we are # not in the forward/backward pass. - if ( - self._mixed_precision_enabled_for_params() - ): + if self._mixed_precision_enabled_for_params(): p._mp_shard = torch.zeros_like( p._local_shard, device=self.compute_device, - dtype=self.mixed_precision.param_dtype + dtype=self.mixed_precision.param_dtype, ) _free_storage(p._mp_shard) @@ -1957,7 +1995,8 @@ def _init_param_attributes(self, handle: FlatParamHandle) -> None: # into full_param_padded it can occur without issues and result in # full_param_padded having the expected param_dtype. 
full_param_dtype = ( - p.dtype if not self._mixed_precision_enabled_for_params() + p.dtype + if not self._mixed_precision_enabled_for_params() else self.mixed_precision.param_dtype ) p._full_param_padded = torch.zeros( # type: ignore[attr-defined] @@ -2024,7 +2063,9 @@ def _prefetch_handles( for handles_key in handles_to_prefetch: # Prefetch the next set of handles without synchronizing to allow # the sync to happen as late as possible to maximize overlap - self._unshard(handles_key, self._streams["unshard"], self._streams["pre_unshard"]) + self._unshard( + handles_key, self._streams["unshard"], self._streams["pre_unshard"] + ) self._handles_prefetched[handles_key] = True def _get_handles_to_prefetch( @@ -2048,33 +2089,31 @@ def _get_handles_to_prefetch( p_assert( training_state in valid_training_states, f"Prefetching is only supported in {valid_training_states} but " - f"currently in {training_state}" + f"currently in {training_state}", ) eod = self._exec_order_data target_handles_keys: List[_HandlesKey] = [] if ( - ( - training_state == HandleTrainingState.BACKWARD_PRE - and self.backward_prefetch == BackwardPrefetch.BACKWARD_PRE - ) - or ( - training_state == HandleTrainingState.BACKWARD_POST - and self.backward_prefetch == BackwardPrefetch.BACKWARD_POST - ) + training_state == HandleTrainingState.BACKWARD_PRE + and self.backward_prefetch == BackwardPrefetch.BACKWARD_PRE + ) or ( + training_state == HandleTrainingState.BACKWARD_POST + and self.backward_prefetch == BackwardPrefetch.BACKWARD_POST ): target_handles_keys = [ - target_handles_key for target_handles_key in - eod.get_handles_to_backward_prefetch(current_handles_key) + target_handles_key + for target_handles_key in eod.get_handles_to_backward_prefetch( + current_handles_key + ) if self._needs_pre_backward_unshard.get(target_handles_key, False) and not self._handles_prefetched.get(target_handles_key, False) ] - elif ( - training_state == HandleTrainingState.FORWARD - and self.forward_prefetch - ): + elif 
training_state == HandleTrainingState.FORWARD and self.forward_prefetch: target_handles_keys = [ - target_handles_key for target_handles_key in - eod.get_handles_to_forward_prefetch(current_handles_key) + target_handles_key + for target_handles_key in eod.get_handles_to_forward_prefetch( + current_handles_key + ) if self._needs_pre_forward_unshard.get(target_handles_key, False) and not self._handles_prefetched.get(target_handles_key, False) ] @@ -2089,7 +2128,7 @@ def _get_training_state( training_states = set(handle._training_state for handle in handles_key) p_assert( len(training_states) == 1, - f"Expects uniform training state but got {training_states}" + f"Expects uniform training state but got {training_states}", ) return next(iter(training_states)) @@ -2157,7 +2196,9 @@ def set_state_dict_type( "All FSDP modules should have the same type of state_dict_config." ) - expected_state_dict_config_type = _state_dict_type_to_config[state_dict_type] + expected_state_dict_config_type = _state_dict_type_to_config[ + state_dict_type + ] if expected_state_dict_config_type != type(state_dict_config): raise RuntimeError( f"Expected state_dict_config of type {expected_state_dict_config_type} " @@ -2200,10 +2241,11 @@ def state_dict_type( prev_state_dict_type = None prev_state_dict_config = None try: - prev_state_dict_type, prev_state_dict_config = ( - FullyShardedDataParallel.set_state_dict_type( - module, state_dict_type, state_dict_config - ) + ( + prev_state_dict_type, + prev_state_dict_config, + ) = FullyShardedDataParallel.set_state_dict_type( + module, state_dict_type, state_dict_config ) yield except Exception as e: @@ -2233,18 +2275,14 @@ def _convert_to_wrapped_module_name(self, module_name: str) -> str: def _param_fqns(self) -> Iterator[Tuple[str, str, str]]: if not self._has_params: return - for param_name, module_name in ( - self._handles[0].parameter_module_names() - ): + for param_name, module_name in self._handles[0].parameter_module_names(): module_name = 
self._convert_to_wrapped_module_name(module_name) fqn = f"{module_name}{param_name}" yield fqn, param_name, module_name @property def _shared_param_fqns(self) -> Iterator[Tuple[str, str, str]]: - for param_name, module_name in ( - self._handles[0].shared_parameter_module_names() - ): + for param_name, module_name in self._handles[0].shared_parameter_module_names(): module_name = self._convert_to_wrapped_module_name(module_name) fqn = f"{module_name}{param_name}" yield fqn, param_name, module_name @@ -2297,17 +2335,21 @@ def state_dict(self, *args, **kwargs): if self._state_dict_type == StateDictType.FULL_STATE_DICT: # Get config args full_state_dict_config = ( - self._state_dict_config if self._state_dict_config is not None + self._state_dict_config + if self._state_dict_config is not None else FullStateDictConfig() ) rank0_only = full_state_dict_config.rank0_only offload_to_cpu = full_state_dict_config.offload_to_cpu summon_ctx = ( self._summon_full_params( - recurse=False, writeback=False, offload_to_cpu=offload_to_cpu, rank0_only=rank0_only + recurse=False, + writeback=False, + offload_to_cpu=offload_to_cpu, + rank0_only=rank0_only, ) - if self.training_state != TrainingState_.SUMMON_FULL_PARAMS else - contextlib.suppress() + if self.training_state != TrainingState_.SUMMON_FULL_PARAMS + else contextlib.suppress() ) with summon_ctx: # Since buffers are not sharded and stay casted, restore them to their @@ -2316,10 +2358,7 @@ def state_dict(self, *args, **kwargs): # buffers stay casted after forward/backward. We must have the # call here instead of above because _summon_full_params itself # calls _lazy_init() which would cast the buffers. 
- if ( - self._is_root - and self._mixed_precision_enabled_for_buffers() - ): + if self._is_root and self._mixed_precision_enabled_for_buffers(): self._cast_buffers( dtype=self._buffer_name_to_orig_dtype, recurse=False ) @@ -2332,13 +2371,10 @@ def state_dict(self, *args, **kwargs): return {} elif ( - self._state_dict_type == StateDictType.LOCAL_STATE_DICT or - self._state_dict_type == StateDictType.SHARDED_STATE_DICT + self._state_dict_type == StateDictType.LOCAL_STATE_DICT + or self._state_dict_type == StateDictType.SHARDED_STATE_DICT ): - if ( - self._has_params and - not self._handles[0].uses_sharded_strategy - ): + if self._has_params and not self._handles[0].uses_sharded_strategy: raise RuntimeError( "sharded_state_dict/local_state_dict can only be called " "when parameters are flatten and sharded." @@ -2352,17 +2388,22 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: Runs the forward pass for the wrapped module, inserting FSDP-specific pre- and post-forward sharding logic. 
""" - with torch.autograd.profiler.record_function("FullyShardedDataParallel.forward"): + with torch.autograd.profiler.record_function( + "FullyShardedDataParallel.forward" + ): self._lazy_init() args, kwargs = self._fsdp_root_pre_forward(*args, **kwargs) unused = None - unshard_fn = functools.partial(self._pre_forward_unshard, handles=self._handles) + unshard_fn = functools.partial( + self._pre_forward_unshard, handles=self._handles + ) # Do not free the root's parameters in the post-forward for # `FULL_SHARD` with the intention that they are immediately used # for backward computation (though this may not be true) free_unsharded_flat_params = [ not self._is_root - and handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD + and handle._config.sharding_strategy + == HandleShardingStrategy.FULL_SHARD for handle in self._handles ] reshard_fn = functools.partial( @@ -2375,7 +2416,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: p_assert( handle.flat_param.device == self.compute_device, "Expected `FlatParameter` to be on the compute device " - f"{self.compute_device} but got {handle.flat_param.device}" + f"{self.compute_device} but got {handle.flat_param.device}", ) output = self._fsdp_wrapped_module(*args, **kwargs) return self._post_forward(self._handles, reshard_fn, unused, unused, output) @@ -2418,7 +2459,9 @@ def _pre_forward_unshard( ) -> None: """Unshards parameters in the pre-forward.""" if handles: - self._unshard(handles, self._streams["unshard"], self._streams["pre_unshard"]) + self._unshard( + handles, self._streams["unshard"], self._streams["pre_unshard"] + ) handles_key = tuple(handles) self._needs_pre_forward_unshard[handles_key] = False torch.cuda.current_stream().wait_stream(self._streams["unshard"]) @@ -2476,7 +2519,9 @@ def _cast_forward_inputs(self, *args, **kwargs): if self._mixed_precision_enabled_for_params(): input_dtype = self.mixed_precision.param_dtype args, kwargs = self._cast_fp_inputs_to_dtype( - input_dtype, 
*args, **kwargs, + input_dtype, + *args, + **kwargs, ) return args, kwargs @@ -2525,7 +2570,7 @@ def summon_full_params( offload_to_cpu: bool = False, with_grads: bool = False, ) -> Generator: - r""" A context manager to expose full params for FSDP instances. + r"""A context manager to expose full params for FSDP instances. Can be useful *after* forward/backward for a model to get the params for additional processing or checking. It can take a non-FSDP module and will summon full params for all contained FSDP modules as @@ -2663,7 +2708,9 @@ def _summon_full_params( handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS self._clear_grads_if_needed() - free_unsharded_flat_params = [handle.needs_unshard() for handle in self._handles] + free_unsharded_flat_params = [ + handle.needs_unshard() for handle in self._handles + ] # No need to call `wait_stream()` since we unshard in the computation # stream directly computation_stream = torch.cuda.current_stream() @@ -2742,7 +2789,7 @@ def _writeback_to_local_shard( handle.rank, handle.world_size, ) - handle.flat_param._local_shard[:param_shard.numel()].copy_(param_shard) + handle.flat_param._local_shard[: param_shard.numel()].copy_(param_shard) if writeback_grad: existing_grad = handle.sharded_grad if existing_grad is not None: @@ -2751,7 +2798,7 @@ def _writeback_to_local_shard( handle.rank, handle.world_size, ) - existing_grad[:grad_shard.numel()].copy_(grad_shard) + existing_grad[: grad_shard.numel()].copy_(grad_shard) @contextlib.contextmanager def _unflatten_as_params(self) -> Generator: @@ -2825,7 +2872,7 @@ def _deregister_orig_params(self): p_assert( len(self._handles) <= 1, "Expects <=1 handle per FSDP instance; needs to be refactored " - "for >1 handle (e.g. non-recursive wrapping)" + "for >1 handle (e.g. 
non-recursive wrapping)", ) if not self._handles: return @@ -2833,7 +2880,7 @@ def _deregister_orig_params(self): p_assert( handle._use_orig_params, f"Inconsistent `_use_orig_params` -- FSDP: {self._use_orig_params} " - f"handle: {handle._use_orig_params}" + f"handle: {handle._use_orig_params}", ) handle._deregister_orig_params() self._register_flat_param() @@ -2973,7 +3020,9 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None: # If the handles have been prefetched, this `_unshard()` simply # switches to using the unsharded parameter - self._unshard(_handles, self._streams["unshard"], self._streams["pre_unshard"]) + self._unshard( + _handles, self._streams["unshard"], self._streams["pre_unshard"] + ) torch.cuda.current_stream().wait_stream(self._streams["unshard"]) # Set this to `False` to ensure that a mistargeted prefetch @@ -3022,7 +3071,7 @@ def _register_post_backward_hooks( p_assert( temp_flat_param.grad_fn is not None, "The `grad_fn` is needed to access the `AccumulateGrad` and " - "register the post-backward hook" + "register the post-backward hook", ) acc_grad = temp_flat_param.grad_fn.next_functions[0][0] hook_handle = acc_grad.register_hook( @@ -3055,11 +3104,16 @@ def _post_backward_hook( ): # First hook callback will see PRE state. If we have multiple params, # then subsequent hook callbacks will see POST state. - self._assert_state([TrainingState_.BACKWARD_PRE, TrainingState_.BACKWARD_POST]) + self._assert_state( + [TrainingState_.BACKWARD_PRE, TrainingState_.BACKWARD_POST] + ) self.training_state = TrainingState_.BACKWARD_POST handle._training_state = HandleTrainingState.BACKWARD_POST - if self._use_param_exec_order_policy() and self._param_exec_order_prep_stage: + if ( + self._use_param_exec_order_policy() + and self._param_exec_order_prep_stage + ): # In self._fsdp_params_exec_order, the parameters are ordered based on # the execution order in the backward pass in the first iteration. 
self._fsdp_params_exec_order.append(param) @@ -3103,7 +3157,9 @@ def _post_backward_hook( # TODO: Make this a communication hook when communication hooks # are implemented for FSDP. Note that this is a noop if the # reduce_dtype matches the param dtype. - param.grad.data = param.grad.data.to(self.mixed_precision.reduce_dtype) + param.grad.data = param.grad.data.to( + self.mixed_precision.reduce_dtype + ) if self._exec_order_data.is_first_iter: # For all sharding strategies communication is performed through `_communication_hook`: @@ -3112,11 +3168,11 @@ def _post_backward_hook( # and `_communication_hook_state`, required for communication not `None`.` p_assert( self._communication_hook is not None, - "Communication hook should not be None" + "Communication hook should not be None", ) p_assert( self._communication_hook_state is not None, - "Communication hook state should not be None" + "Communication hook state should not be None", ) grad = param.grad.data if handle.uses_sharded_strategy: @@ -3138,7 +3194,9 @@ def _post_backward_hook( num_pad = self.world_size * chunks[0].numel() - grad.numel() input_flattened = F.pad(grad_flatten, [0, num_pad]) output = torch.zeros_like(chunks[0]) - self._communication_hook(self._communication_hook_state, input_flattened, output) + self._communication_hook( + self._communication_hook_state, input_flattened, output + ) self._cast_grad_to_param_dtype(output, param) @@ -3153,13 +3211,13 @@ def _post_backward_hook( param._saved_grad_shard.shape == output.shape, # type: ignore[attr-defined] "Shape mismatch when accumulating gradients: " # type: ignore[attr-defined] f"existing grad shape={param._saved_grad_shard.shape} " - f"new grad shape={output.shape}" # type: ignore[attr-defined] + f"new grad shape={output.shape}", # type: ignore[attr-defined] ) p_assert( param._saved_grad_shard.device == output.device, # type: ignore[attr-defined] "Device mismatch when accumulating gradients: " # type: ignore[attr-defined] f"existing grad 
device={param._saved_grad_shard.device} " - f"new grad device={output.device}" # type: ignore[attr-defined] + f"new grad device={output.device}", # type: ignore[attr-defined] ) param._saved_grad_shard += output # type: ignore[attr-defined] else: @@ -3167,7 +3225,9 @@ def _post_backward_hook( grad = param._saved_grad_shard # type: ignore[attr-defined] else: if self.sharding_strategy == ShardingStrategy.NO_SHARD: - self._communication_hook(self._communication_hook_state, param.grad) + self._communication_hook( + self._communication_hook_state, param.grad + ) # For NO_SHARD keeping grads in the reduced precision, we # can simply omit the cast as needed, we can't do this for @@ -3221,12 +3281,9 @@ def _cast_grad_to_param_dtype( dtype cast happens in the hook instead. """ self._assert_state(TrainingState_.BACKWARD_POST) - if ( - not self._low_precision_hook_enabled() - and ( - self._mixed_precision_enabled_for_params() - or self._mixed_precision_enabled_for_reduce() - ) + if not self._low_precision_hook_enabled() and ( + self._mixed_precision_enabled_for_params() + or self._mixed_precision_enabled_for_reduce() ): low_prec_grad_data = grad.data grad.data = grad.data.to(dtype=param.dtype) @@ -3236,9 +3293,8 @@ def _cast_grad_to_param_dtype( def _should_free_unsharded_flat_param(self, handle: FlatParamHandle): return ( - (self._sync_gradients and handle.uses_sharded_strategy) - or handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD - ) + self._sync_gradients and handle.uses_sharded_strategy + ) or handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD def _queue_wait_for_post_backward(self) -> None: """ @@ -3247,7 +3303,7 @@ def _queue_wait_for_post_backward(self) -> None: """ p_assert( self._is_root, - "`_queue_wait_for_post_backward()` should be called on the root FSDP instance" + "`_queue_wait_for_post_backward()` should be called on the root FSDP instance", ) if self._post_backward_callback_queued: return @@ -3295,18 +3351,21 @@ def 
_catch_all_reshard(fsdp_module: FullyShardedDataParallel) -> None: # TODO: This already-resharded check is brittle: # https://github.com/pytorch/pytorch/issues/83956 already_resharded = ( - handle.flat_param.data_ptr() == handle.flat_param._local_shard.data_ptr() + handle.flat_param.data_ptr() + == handle.flat_param._local_shard.data_ptr() ) if already_resharded: continue - free_unsharded_flat_params.append(self._should_free_unsharded_flat_param(handle)) + free_unsharded_flat_params.append( + self._should_free_unsharded_flat_param(handle) + ) handles_to_reshard.append(handle) self._reshard(handles_to_reshard, free_unsharded_flat_params) except Exception as e: p_assert( False, f"Got exception while resharding module {fsdp_module}: {str(e)}", - raise_assertion_error=False + raise_assertion_error=False, ) raise e @@ -3318,7 +3377,7 @@ def _finalize_params(fsdp_module: FullyShardedDataParallel) -> None: if hasattr(p, "_post_backward_hook_state"): p_assert( len(p._post_backward_hook_state) == 2, # type: ignore[attr-defined] - "p._post_backward_hook_state fields are not valid." + "p._post_backward_hook_state fields are not valid.", ) p._post_backward_hook_state[1].remove() # type: ignore[attr-defined] delattr(p, "_post_backward_hook_state") @@ -3331,8 +3390,8 @@ def _finalize_params(fsdp_module: FullyShardedDataParallel) -> None: continue handle.prepare_gradient_for_optim() p_assert( - hasattr(p, '_post_backward_called'), - "Expected flag _post_backward_called to be set on param." + hasattr(p, "_post_backward_called"), + "Expected flag _post_backward_called to be set on param.", ) # Reset _post_backward_called in preparation for the next iteration. 
p._post_backward_called = False @@ -3479,22 +3538,25 @@ def clip_grad_norm_( norm_type = float(norm_type) # Compute the local gradient norm (only including this rank's shard # of the gradients) - local_norm = _get_grad_norm(self.parameters(), norm_type).to(self.compute_device) + local_norm = _get_grad_norm(self.parameters(), norm_type).to( + self.compute_device + ) # Reconstruct the total gradient norm depending on the norm type if norm_type == math.inf: total_norm = local_norm - dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group) + dist.all_reduce( + total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group + ) else: - total_norm = local_norm ** norm_type + total_norm = local_norm**norm_type dist.all_reduce(total_norm, group=self.process_group) total_norm = total_norm ** (1.0 / norm_type) if self.cpu_offload.offload_params: total_norm = total_norm.cpu() - clip_coef = ( - torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device) - / (total_norm + 1e-6) - ) + clip_coef = torch.tensor( + max_norm, dtype=total_norm.dtype, device=total_norm.device + ) / (total_norm + 1e-6) # Multiplying by the clamped coefficient is meaningless when it is # equal to 1, but it avoids the host-device sync that would result from # `if clip_coef < 1` @@ -3537,9 +3599,12 @@ def _raise_on_use_orig_params_optim_checkpoint(model: nn.Module): def full_optim_state_dict( model: torch.nn.Module, optim: torch.optim.Optimizer, - optim_input: Optional[Union[ - List[Dict[str, Any]], Iterable[torch.nn.Parameter], - ]] = None, + optim_input: Optional[ + Union[ + List[Dict[str, Any]], + Iterable[torch.nn.Parameter], + ] + ] = None, rank0_only: bool = True, group: Optional[dist.ProcessGroup] = None, ) -> Dict[str, Any]: @@ -3592,7 +3657,8 @@ def full_optim_state_dict( FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model) FullyShardedDataParallel._warn_optim_input(optim_input) using_optim_input = 
FullyShardedDataParallel._is_using_optim_input( - optim_input, optim, + optim_input, + optim, ) return _optim_state_dict( model=model, @@ -3610,7 +3676,8 @@ def sharded_optim_state_dict( optim: torch.optim.Optimizer, optim_input: Optional[ Union[ - List[Dict[str, Any]], Iterable[torch.nn.Parameter], + List[Dict[str, Any]], + Iterable[torch.nn.Parameter], ] ] = None, group: Optional[dist.ProcessGroup] = None, @@ -3629,7 +3696,8 @@ def sharded_optim_state_dict( FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model) FullyShardedDataParallel._warn_optim_input(optim_input) using_optim_input = FullyShardedDataParallel._is_using_optim_input( - optim_input, optim, + optim_input, + optim, ) # TODO: The ultimate goal of the optimizer state APIs should be the same # as state_dict/load_state_dict -- using one API to get optimizer states @@ -3655,7 +3723,8 @@ def shard_full_optim_state_dict( model: torch.nn.Module, optim_input: Optional[ Union[ - List[Dict[str, Any]], Iterable[torch.nn.Parameter], + List[Dict[str, Any]], + Iterable[torch.nn.Parameter], ] ] = None, optim: Optional[torch.optim.Optimizer] = None, @@ -3717,13 +3786,20 @@ def shard_full_optim_state_dict( FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model) FullyShardedDataParallel._warn_optim_input(optim_input) using_optim_input = FullyShardedDataParallel._is_using_optim_input( - optim_input, optim, + optim_input, + optim, ) sharded_osd = _flatten_optim_state_dict( - full_optim_state_dict, model, True, + full_optim_state_dict, + model, + True, ) return _rekey_sharded_optim_state_dict( - sharded_osd, model, optim, optim_input, using_optim_input, + sharded_osd, + model, + optim, + optim_input, + using_optim_input, ) @staticmethod @@ -3732,7 +3808,8 @@ def flatten_sharded_optim_state_dict( model: torch.nn.Module, optim_input: Optional[ Union[ - List[Dict[str, Any]], Iterable[torch.nn.Parameter], + List[Dict[str, Any]], + Iterable[torch.nn.Parameter], ] ] = None, optim: 
Optional[torch.optim.Optimizer] = None, @@ -3756,7 +3833,8 @@ def flatten_sharded_optim_state_dict( FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model) FullyShardedDataParallel._warn_optim_input(optim_input) using_optim_input = FullyShardedDataParallel._is_using_optim_input( - optim_input, optim, + optim_input, + optim, ) # TODO: The implementation is the same as ``shard_full_optim_state_dict``. # See the TODO in ``shard_full_optim_state_dict`` for the future @@ -3767,16 +3845,23 @@ def flatten_sharded_optim_state_dict( shard_state=True, ) return _rekey_sharded_optim_state_dict( - flattened_osd, model, optim, optim_input, using_optim_input, + flattened_osd, + model, + optim, + optim_input, + using_optim_input, ) @staticmethod def scatter_full_optim_state_dict( full_optim_state_dict: Optional[Dict[str, Any]], model: torch.nn.Module, - optim_input: Optional[Union[ - List[Dict[str, Any]], Iterable[torch.nn.Parameter], - ]] = None, + optim_input: Optional[ + Union[ + List[Dict[str, Any]], + Iterable[torch.nn.Parameter], + ] + ] = None, optim: Optional[torch.optim.Optimizer] = None, group: Optional[Any] = None, ) -> Dict[str, Any]: @@ -3838,7 +3923,8 @@ def scatter_full_optim_state_dict( FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model) FullyShardedDataParallel._warn_optim_input(optim_input) using_optim_input = FullyShardedDataParallel._is_using_optim_input( - optim_input, optim, + optim_input, + optim, ) # Try to use the passed-in process group, the model's process group, # or the default process group (i.e. 
`None`) in that priority order @@ -3848,8 +3934,9 @@ def scatter_full_optim_state_dict( world_size = dist.get_world_size(group) # Check for a valid broadcast device, preferring GPU when available using_nccl = dist.distributed_c10d._check_for_nccl_backend(group) - broadcast_device = torch.device("cuda") if torch.cuda.is_available() \ - else torch.device("cpu") + broadcast_device = ( + torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + ) if using_nccl and not torch.cuda.is_available(): raise RuntimeError("NCCL requires a GPU for collectives") # Flatten the optimizer state dict and construct a copy with the @@ -3867,18 +3954,28 @@ def scatter_full_optim_state_dict( # Broadcast the optim state dict without positive-dimension tensor # state and the FSDP parameter IDs from rank 0 to all ranks processed_osd = _broadcast_processed_optim_state_dict( - processed_osd if rank == 0 else None, rank, group, + processed_osd if rank == 0 else None, + rank, + group, ) # Broadcast positive-dimension tensor state (both sharded tensors for # FSDP parameters and unsharded tensors for non-FSDP parameters) sharded_osd = _broadcast_pos_dim_tensor_states( - processed_osd, flat_osd if rank == 0 else None, rank, world_size, - group, broadcast_device, + processed_osd, + flat_osd if rank == 0 else None, + rank, + world_size, + group, + broadcast_device, ) # Rekey the optimizer state dict to use parameter IDs according to this # rank's `optim` sharded_osd = _rekey_sharded_optim_state_dict( - sharded_osd, model, optim, optim_input, using_optim_input, + sharded_osd, + model, + optim, + optim_input, + using_optim_input, ) return sharded_osd @@ -3887,9 +3984,12 @@ def rekey_optim_state_dict( optim_state_dict: Dict[str, Any], optim_state_key_type: OptimStateKeyType, model: torch.nn.Module, - optim_input: Optional[Union[ - List[Dict[str, Any]], Iterable[torch.nn.Parameter], - ]] = None, + optim_input: Optional[ + Union[ + List[Dict[str, Any]], + Iterable[torch.nn.Parameter], 
+ ] + ] = None, optim: Optional[torch.optim.Optimizer] = None, ) -> Dict[str, Any]: """ @@ -3926,30 +4026,30 @@ def rekey_optim_state_dict( """ FullyShardedDataParallel._warn_optim_input(optim_input) using_optim_input = FullyShardedDataParallel._is_using_optim_input( - optim_input, optim, + optim_input, + optim, ) assert optim_state_key_type in ( - OptimStateKeyType.PARAM_NAME, OptimStateKeyType.PARAM_ID, + OptimStateKeyType.PARAM_NAME, + OptimStateKeyType.PARAM_ID, ) osd = optim_state_dict # alias # Validate that the existing parameter keys are uniformly typed - uses_param_name_mask = [ - type(param_key) is str for param_key in osd["state"] - ] - uses_param_id_mask = [ - type(param_key) is int for param_key in osd["state"] - ] - if ( - (any(uses_param_name_mask) and not all(uses_param_name_mask)) - or (any(uses_param_id_mask) and not all(uses_param_id_mask)) + uses_param_name_mask = [type(param_key) is str for param_key in osd["state"]] + uses_param_id_mask = [type(param_key) is int for param_key in osd["state"]] + if (any(uses_param_name_mask) and not all(uses_param_name_mask)) or ( + any(uses_param_id_mask) and not all(uses_param_id_mask) ): error_msg = f"Invalid parameter keys: {osd['state'].keys()}" raise ValueError(error_msg) # Return directly if the existing key type matches the target key type - if (optim_state_key_type == OptimStateKeyType.PARAM_NAME and - all(uses_param_name_mask)) or \ - (optim_state_key_type == OptimStateKeyType.PARAM_ID and - all(uses_param_id_mask)): + if ( + optim_state_key_type == OptimStateKeyType.PARAM_NAME + and all(uses_param_name_mask) + ) or ( + optim_state_key_type == OptimStateKeyType.PARAM_ID + and all(uses_param_id_mask) + ): return osd # Otherwise, actually perform the re-keying new_osd = {} @@ -3969,10 +4069,12 @@ def rekey_optim_state_dict( } new_osd["param_groups"] = copy.deepcopy(osd["param_groups"]) for param_group in new_osd["param_groups"]: - param_group["params"] = sorted([ - param_id_to_param_name[param_id] - for 
param_id in param_group["params"] - ]) + param_group["params"] = sorted( + [ + param_id_to_param_name[param_id] + for param_id in param_group["params"] + ] + ) return new_osd elif optim_state_key_type == OptimStateKeyType.PARAM_ID: # name -> ID param_name_to_param = _get_param_name_to_param(model) @@ -3994,10 +4096,12 @@ def rekey_optim_state_dict( } new_osd["param_groups"] = copy.deepcopy(osd["param_groups"]) for param_group in new_osd["param_groups"]: - param_group["params"] = sorted([ - param_name_to_param_id[param_name] - for param_name in param_group["params"] - ]) + param_group["params"] = sorted( + [ + param_name_to_param_id[param_name] + for param_name in param_group["params"] + ] + ) return new_osd return new_osd # should never reach here @@ -4056,12 +4160,17 @@ def register_comm_hook(self, state: object, hook: callable): """ if not self.check_is_root(): - raise AssertionError("register_comm_hook can only be called on a root instance.") + raise AssertionError( + "register_comm_hook can only be called on a root instance." 
+ ) for submodule in self.fsdp_modules(self): - assert not submodule._hook_registered, "communication hook can be only registered once" + assert ( + not submodule._hook_registered + ), "communication hook can be only registered once" submodule._hook_registered = True - assert submodule._communication_hook == self._get_default_comm_hook(),\ - f"communication hook should be default, but it is {submodule._communication_hook.__name__} instead" + assert ( + submodule._communication_hook == self._get_default_comm_hook() + ), f"communication hook should be default, but it is {submodule._communication_hook.__name__} instead" submodule._communication_hook_state = state submodule._communication_hook = hook @@ -4073,10 +4182,7 @@ def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None: assert ( auto_wrap_policy.tracing_config is None ), "tracing_config should be None when torch.fx is not enabled" - elif isinstance( - auto_wrap_policy.tracing_config, - TracingConfig - ): + elif isinstance(auto_wrap_policy.tracing_config, TracingConfig): tracer = auto_wrap_policy.tracing_config.tracer execution_info = _init_execution_info(module) @@ -4110,8 +4216,7 @@ def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None: # A list that stores the flatten parameters and its name based on the parameter execution order self._fsdp_params_exec_order: List[FlatParameter] = [] if _TORCH_FX_AVAIL and isinstance( - auto_wrap_policy.tracing_config, - TracingConfig + auto_wrap_policy.tracing_config, TracingConfig ): # Initialize a dict that maps each module to its parent FSDP wrap module_to_fsdp: Dict[nn.Module, FullyShardedDataParallel] = dict() @@ -4137,8 +4242,7 @@ def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None: def _use_param_exec_order_policy(self) -> bool: return ( - hasattr(self, "_param_exec_order_policy") - and self._param_exec_order_policy + hasattr(self, "_param_exec_order_policy") and self._param_exec_order_policy ) def 
_is_param_exec_order_prep_stage(self) -> bool: @@ -4148,8 +4252,8 @@ def _is_param_exec_order_prep_stage(self) -> bool: ) if not is_prep_stage: for p in self.parameters(): - assert ( - not hasattr(p, "_params_exec_order_hook_handle") + assert not hasattr( + p, "_params_exec_order_hook_handle" ), "When not in execution order prep stage, all _params_exec_order_hook_handle should be removed." return is_prep_stage @@ -4168,7 +4272,9 @@ def _get_grad_norm( grads = [param.grad for param in params_with_grad] grad_dtypes = set(grad.dtype for grad in grads) if len(grad_dtypes) != 1: - raise ValueError(f"Requires uniform dtype across all gradients but got {grad_dtypes}") + raise ValueError( + f"Requires uniform dtype across all gradients but got {grad_dtypes}" + ) # Compute the gradient norm in FP32, where we treat the gradients as a # single vector grad_norm = torch.linalg.vector_norm( @@ -4206,15 +4312,14 @@ def _get_param_to_unflat_param_names( in the module walk order; if ``False``, then includes all of the unflattened parameter names. 
""" + def module_fn(module, prefix, param_to_unflat_param_names): for param_name, param in module.named_parameters(recurse=False): module_prefixed_param_names = ( - param._fqns if type(param) is FlatParameter - else [param_name] + param._fqns if type(param) is FlatParameter else [param_name] ) # prefixed from `module` fully_prefixed_param_names = [ - clean_tensor_name(prefix + name) - for name in module_prefixed_param_names + clean_tensor_name(prefix + name) for name in module_prefixed_param_names ] # fully prefixed from the top level including `prefix` # If this parameter has already been visited, then it is a # shared parameter; then, only take the first parameter name @@ -4229,7 +4334,10 @@ def return_fn(param_to_unflat_param_names): param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {} return _apply_to_modules( - model, module_fn, return_fn, param_to_unflat_param_names, + model, + module_fn, + return_fn, + param_to_unflat_param_names, ) @@ -4250,16 +4358,16 @@ def _get_param_to_param_name( """ param_to_param_names = _get_param_to_unflat_param_names(model) for param_names in param_to_param_names.values(): - assert len(param_names) > 0, "`_get_param_to_unflat_param_names()` " \ - "should not construct empty lists" + assert len(param_names) > 0, ( + "`_get_param_to_unflat_param_names()` " "should not construct empty lists" + ) if len(param_names) > 1: raise RuntimeError( "Each parameter should only map to one parameter name but got " f"{len(param_names)}: {param_names}" ) param_to_param_name = { - param: param_names[0] - for param, param_names in param_to_param_names.items() + param: param_names[0] for param, param_names in param_to_param_names.items() } return param_to_param_name diff --git a/torch/distributed/fsdp/sharded_grad_scaler.py b/torch/distributed/fsdp/sharded_grad_scaler.py index 27ba44e6c1516..86dbfd7edc16e 100644 --- a/torch/distributed/fsdp/sharded_grad_scaler.py +++ b/torch/distributed/fsdp/sharded_grad_scaler.py @@ -1,12 +1,12 @@ 
-from collections import abc, defaultdict import logging +from collections import abc, defaultdict from typing import Dict, List, Optional, Union import torch +import torch.distributed as dist from torch.cuda import FloatTensor # type: ignore[attr-defined] -from torch.cuda.amp.grad_scaler import GradScaler, OptState, _MultiDeviceReplicator +from torch.cuda.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState from torch.distributed.distributed_c10d import ProcessGroup -import torch.distributed as dist from torch.optim.sgd import SGD @@ -23,6 +23,7 @@ class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator): Lazily serves tensor to request device. This class extends _MultiDeviceReplicator to allow support for "cpu" as a device. """ + def __init__(self, master_tensor: torch.Tensor) -> None: assert _is_supported_device(master_tensor) self.master = master_tensor @@ -77,9 +78,10 @@ class ShardedGradScaler(GradScaler): process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD): process group for sharding """ + def __init__( self, - init_scale: float = 2.0 ** 16, + init_scale: float = 2.0**16, backoff_factor: float = 0.5, growth_factor: float = 2.0, growth_interval: int = 2000, @@ -97,7 +99,9 @@ def __init__( self.process_group = process_group self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) - def scale(self, outputs: Union[torch.Tensor, List[torch.Tensor]]) -> Union[torch.Tensor, List[torch.Tensor]]: + def scale( + self, outputs: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: if not self._enabled: return outputs @@ -106,7 +110,9 @@ def scale(self, outputs: Union[torch.Tensor, List[torch.Tensor]]) -> Union[torch if self._scale is None: self._lazy_init_scale_growth_tracker(outputs.device) assert self._scale is not None - scaled_output = outputs * self._scale.to(device=outputs.device, non_blocking=True) + scaled_output = outputs * self._scale.to( + device=outputs.device, 
non_blocking=True + ) # Here we ensure the return dtype is the same as the outputs dtype. # For the FSDP + Mixed Precision use case, the loss output is in the Mixed Precision # format (fp16, bf16) and so the scaled loss should be of the same dtype. @@ -114,7 +120,9 @@ def scale(self, outputs: Union[torch.Tensor, List[torch.Tensor]]) -> Union[torch stash: List[_GeneralMultiDeviceReplicator] = [] - def apply_scale(val: Union[torch.Tensor, abc.Iterable]) -> Union[torch.Tensor, abc.Iterable]: + def apply_scale( + val: Union[torch.Tensor, abc.Iterable] + ) -> Union[torch.Tensor, abc.Iterable]: if isinstance(val, torch.Tensor): assert _is_supported_device(val) if len(stash) == 0: @@ -150,20 +158,30 @@ def _foreach_non_finite_check_and_unscale_cpu_( for grad in grads: for tensor in grad: if tensor.device != expected_device: - logging.error("tensor device is %s and expected device is %s" % (tensor.device, expected_device)) + logging.error( + "tensor device is %s and expected device is %s" + % (tensor.device, expected_device) + ) raise ValueError("Gradients must be on the same device.") # check for non_overlapping_and_dense doesn't exist in the python world # as remarked here https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/AmpKernels.cu#L108 # we assume tensor is not MTA(multi tensor apply) safe. 
iterate through each item regardless of dtype - if torch.isinf(tensor).any().item() is True or torch.isnan(tensor).any().item() is True: + if ( + torch.isinf(tensor).any().item() is True + or torch.isnan(tensor).any().item() is True + ): found_inf.data = torch.tensor([1.0]) break else: tensor.data *= inv_scale.item() def _unscale_grads_( - self, optimizer: SGD, inv_scale: torch.Tensor, found_inf: torch.Tensor, allow_fp16: bool = True + self, + optimizer: SGD, + inv_scale: torch.Tensor, + found_inf: torch.Tensor, + allow_fp16: bool = True, ) -> Dict[torch.device, torch.Tensor]: per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale) per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf) @@ -195,7 +213,9 @@ def _unscale_grads_( else: to_unscale = param.grad - per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(to_unscale) + per_device_and_dtype_grads[to_unscale.device][ + to_unscale.dtype + ].append(to_unscale) for device, per_dtype_grads in per_device_and_dtype_grads.items(): for grads in per_dtype_grads.values(): @@ -222,16 +242,22 @@ def unscale_(self, optimizer: SGD) -> None: optimizer_state = self._per_optimizer_states[id(optimizer)] if optimizer_state["stage"] is OptState.UNSCALED: - raise RuntimeError("unscale_() has already been called on this optimizer since the last update().") + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update()." + ) elif optimizer_state["stage"] is OptState.STEPPED: raise RuntimeError("unscale_() is being called after step().") # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. 
assert self._scale is not None inv_scale = self._scale.double().reciprocal().float() - found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device) + found_inf = torch.full( + (1,), 0.0, dtype=torch.float32, device=self._scale.device + ) - optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, True) + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + optimizer, inv_scale, found_inf, True + ) optimizer_state["stage"] = OptState.UNSCALED # Synchronize the detected inf across the ranks @@ -241,10 +267,18 @@ def unscale_(self, optimizer: SGD) -> None: for v in optimizer_state["found_inf_per_device"].values(): if v.device.type == "cpu": v_on_cuda = v.cuda() - future_handles.append(dist.all_reduce(v_on_cuda, async_op=True, group=self.process_group).get_future()) + future_handles.append( + dist.all_reduce( + v_on_cuda, async_op=True, group=self.process_group + ).get_future() + ) v.copy_(v_on_cuda.cpu()) else: - future_handles.append(dist.all_reduce(v, async_op=True, group=self.process_group).get_future()) + future_handles.append( + dist.all_reduce( + v, async_op=True, group=self.process_group + ).get_future() + ) # Make sure that the calls are done before moving out. 
if future_handles: diff --git a/torch/distributed/fsdp/wrap.py b/torch/distributed/fsdp/wrap.py index 8013da8e37ea1..c529bcde8c859 100644 --- a/torch/distributed/fsdp/wrap.py +++ b/torch/distributed/fsdp/wrap.py @@ -5,22 +5,11 @@ import contextlib from dataclasses import dataclass -from typing import ( - Any, - Callable, - Dict, - Generator, - Optional, - Set, - Tuple, - Type, - cast, -) +from typing import Any, Callable, cast, Dict, Generator, Optional, Set, Tuple, Type import torch.nn as nn from torch.nn.modules.batchnorm import _BatchNorm - __all__ = [ "always_wrap_policy", "lambda_auto_wrap_policy", @@ -41,11 +30,9 @@ def always_wrap_policy(*args, **kwargs) -> bool: """ return True + def lambda_auto_wrap_policy( - module: nn.Module, - recurse: bool, - unwrapped_params: int, - lambda_fn: Callable + module: nn.Module, recurse: bool, unwrapped_params: int, lambda_fn: Callable ) -> bool: """ A convenient auto wrap policy to wrap submodules based on an arbitrary user @@ -78,6 +65,7 @@ def lambda_auto_wrap_policy( # if not recursing, decide whether we should wrap for the leaf node or reminder return lambda_fn(module) + def transformer_auto_wrap_policy( module: nn.Module, recurse: bool, @@ -121,6 +109,7 @@ def transformer_auto_wrap_policy( # if not recursing, decide whether we should wrap for the leaf node or reminder return isinstance(module, tuple(transformer_layer_cls)) + def _wrap_batchnorm_individually( module: nn.Module, recurse: bool, @@ -138,6 +127,7 @@ def _wrap_batchnorm_individually( # BN layer or not. return isinstance(module, _BatchNorm) + def _or_policy( module: nn.Module, recurse: bool, @@ -148,9 +138,7 @@ def _or_policy( A policy that wraps ``module`` if any policy in the passed in iterable of ``policies`` returns ``True``. 
""" - return any( - policy(module, recurse, unwrapped_params) for policy in policies - ) + return any(policy(module, recurse, unwrapped_params) for policy in policies) def size_based_auto_wrap_policy( @@ -333,13 +321,14 @@ class ParamExecOrderWrapPolicy: ``full``, ``full_like``, ``eye``, ``empty``, ``tensor``). For those cases, users can set ``tracing_config = None`` to disable symbolic tracing. """ + init_policy: Callable = always_wrap_policy tracing_config: Any = None def _wrap(module: nn.Module, wrapper_cls: Callable, **kwargs) -> nn.Module: assert wrapper_cls is not None - if hasattr(module, '_wrap_overrides'): + if hasattr(module, "_wrap_overrides"): # If module has a _wrap_overrides attribute, we force overriding the # FSDP config with these attributes for this module. Currently this # is only used to disable mixed precision for BatchNorm when @@ -357,7 +346,7 @@ def _recursive_wrap( ignored_modules: Set[nn.Module], ignored_params: Set[nn.Parameter], only_wrap_children: bool = False, - **kwargs: Any + **kwargs: Any, ) -> Tuple[nn.Module, int]: """ Automatically wrap child modules of *module* that meet the given @@ -389,9 +378,7 @@ def _recursive_wrap( pass # We count all params, assuming none of them are already wrapped. - num_params = sum( - p.numel() for p in module.parameters() if p not in ignored_params - ) + num_params = sum(p.numel() for p in module.parameters() if p not in ignored_params) assert auto_wrap_policy is not None if auto_wrap_policy(module=module, recurse=True, unwrapped_params=num_params): From a58191df0b6f325ad418d5fa8bd28630d0dda675 Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Thu, 27 Oct 2022 00:03:15 +0000 Subject: [PATCH 0222/1922] [FSDP] ufmt FSDP test (#87812) This applies `ufmt` to all of the FSDP test files in the `test/distributed/fsdp/` directory. 
**Test Plan** CI **Notes** For VSCode users, - Install `ufmt`: https://pypi.org/project/ufmt/ - Install VSCode `ufmt` extension: https://marketplace.visualstudio.com/items?itemName=omnilib.ufmt - Include in `settings.json`: ``` { "[python]": { "editor.defaultFormatter": "omnilib.ufmt", "editor.formatOnSave": true, }, } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87812 Approved by: https://github.com/rohan-varma --- .../fsdp/test_checkpoint_wrapper.py | 121 ++++--- .../fsdp/test_distributed_checkpoint.py | 25 +- test/distributed/fsdp/test_fsdp_apply.py | 5 +- test/distributed/fsdp/test_fsdp_checkpoint.py | 60 ++-- .../fsdp/test_fsdp_clip_grad_norm.py | 21 +- test/distributed/fsdp/test_fsdp_comm.py | 66 ++-- test/distributed/fsdp/test_fsdp_comm_hooks.py | 220 +++++++------ test/distributed/fsdp/test_fsdp_core.py | 14 +- test/distributed/fsdp/test_fsdp_exec_order.py | 2 +- .../fsdp/test_fsdp_freezing_weights.py | 8 +- test/distributed/fsdp/test_fsdp_grad_acc.py | 74 ++--- .../fsdp/test_fsdp_ignored_modules.py | 27 +- test/distributed/fsdp/test_fsdp_input.py | 7 +- test/distributed/fsdp/test_fsdp_memory.py | 7 +- test/distributed/fsdp/test_fsdp_meta.py | 68 ++-- test/distributed/fsdp/test_fsdp_misc.py | 94 +++--- .../fsdp/test_fsdp_mixed_precision.py | 130 +++++--- .../fsdp/test_fsdp_multiple_forward.py | 8 +- .../fsdp/test_fsdp_multiple_wrapping.py | 3 +- .../distributed/fsdp/test_fsdp_optim_state.py | 307 ++++++++++++------ test/distributed/fsdp/test_fsdp_overlap.py | 7 +- test/distributed/fsdp/test_fsdp_pure_fp16.py | 3 +- .../fsdp/test_fsdp_sharded_grad_scaler.py | 69 ++-- test/distributed/fsdp/test_fsdp_state_dict.py | 144 +++++--- .../fsdp/test_fsdp_summon_full_params.py | 50 ++- test/distributed/fsdp/test_fsdp_traversal.py | 15 +- test/distributed/fsdp/test_fsdp_uneven.py | 7 +- .../fsdp/test_fsdp_use_orig_params.py | 34 +- test/distributed/fsdp/test_utils.py | 11 +- test/distributed/fsdp/test_wrap.py | 95 +++--- 
torch/testing/_internal/common_fsdp.py | 66 ++-- 31 files changed, 1061 insertions(+), 707 deletions(-) diff --git a/test/distributed/fsdp/test_checkpoint_wrapper.py b/test/distributed/fsdp/test_checkpoint_wrapper.py index 8bd2b74695d3b..d8e005fcf82be 100644 --- a/test/distributed/fsdp/test_checkpoint_wrapper.py +++ b/test/distributed/fsdp/test_checkpoint_wrapper.py @@ -1,30 +1,25 @@ # Owner(s): ["oncall: distributed"] +import unittest from copy import deepcopy from functools import partial import torch import torch.nn as nn from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( - checkpoint_wrapper, - offload_wrapper, apply_activation_checkpointing, + checkpoint_wrapper, + CheckpointImpl, CheckpointWrapper, + offload_wrapper, OffloadWrapper, - CheckpointImpl ) - +from torch.testing._internal.common_utils import run_tests, TestCase from torch.utils.checkpoint import checkpoint -from torch.testing._internal.common_utils import ( - run_tests, - TestCase, -) - -import unittest +_SAVED_PREFIX = "_saved_" +GRAD_FN_NEXT_FUNCTIONS = "next_functions" -_SAVED_PREFIX = '_saved_' -GRAD_FN_NEXT_FUNCTIONS = 'next_functions' class CheckpointWrapperTest(TestCase): def setUp(self): @@ -66,13 +61,7 @@ def __init__(self): self.lin = nn.Linear(10, 10) def forward(self, a, b, c=None, d=None, **kwargs): - return ( - self.lin(a), - self.lin(b), - self.lin(c), - self.lin(d) - ) - + return (self.lin(a), self.lin(b), self.lin(c), self.lin(d)) for wrapper in [ partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.REENTRANT), @@ -113,7 +102,6 @@ def forward(self, *, a=None, b=None): out = model(a=inp, b=inp) self.assertEqual(2, len(out)) - @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA") def test_checkpoint_wrapper_parity(self): """ @@ -122,13 +110,14 @@ def test_checkpoint_wrapper_parity(self): results in the same maximum memory usage, i.e. they are equivalent memory usage wise. 
""" + class Model(nn.Module): def __init__( self, n: int, use_cp: bool, use_wrapper: bool = False, - use_reentrant: bool = True + use_reentrant: bool = True, ): super().__init__() self.layers = nn.ModuleList() @@ -138,10 +127,14 @@ def __init__( self.use_reentrant = use_reentrant wrp = partial( checkpoint_wrapper, - checkpoint_impl=CheckpointImpl.REENTRANT if use_reentrant else CheckpointImpl.NO_REENTRANT + checkpoint_impl=CheckpointImpl.REENTRANT + if use_reentrant + else CheckpointImpl.NO_REENTRANT, ) for i in range(self.n): - l = nn.Sequential(nn.Linear(256, 256), nn.Linear(256, 256), nn.Linear(256, 256)) + l = nn.Sequential( + nn.Linear(256, 256), nn.Linear(256, 256), nn.Linear(256, 256) + ) use_checkpoint_wrapper = self.use_wrapper if use_checkpoint_wrapper: l = wrp(l) @@ -149,29 +142,41 @@ def __init__( def forward(self, x): for i in range(self.n): - if ( - self.use_wrapper or - not self.use_cp - ): + if self.use_wrapper or not self.use_cp: x = self.layers[i](x) else: - x = checkpoint(self.layers[i], x, use_reentrant=self.use_reentrant) + x = checkpoint( + self.layers[i], x, use_reentrant=self.use_reentrant + ) return x def test(use_checkpointing, use_wrapper, use_reentrant): - a = Model(8, use_checkpointing, use_wrapper=use_wrapper, use_reentrant=use_reentrant).cuda() + a = Model( + 8, + use_checkpointing, + use_wrapper=use_wrapper, + use_reentrant=use_reentrant, + ).cuda() x = torch.randn(10000, 256, requires_grad=True).cuda() torch.cuda.reset_peak_memory_stats() loss = a(x).sum() loss.backward() return torch.cuda.max_memory_allocated() - functional_no_reentrant = test(use_checkpointing=True, use_wrapper=False, use_reentrant=False) - wrapper_no_reentrant = test(use_checkpointing=False, use_wrapper=True, use_reentrant=False) + functional_no_reentrant = test( + use_checkpointing=True, use_wrapper=False, use_reentrant=False + ) + wrapper_no_reentrant = test( + use_checkpointing=False, use_wrapper=True, use_reentrant=False + ) 
self.assertEqual(functional_no_reentrant, wrapper_no_reentrant) - functional_reentrant = test(use_checkpointing=True, use_wrapper=False, use_reentrant=True) - wrapper_reentrant = test(use_checkpointing=False, use_wrapper=True, use_reentrant=True) + functional_reentrant = test( + use_checkpointing=True, use_wrapper=False, use_reentrant=True + ) + wrapper_reentrant = test( + use_checkpointing=False, use_wrapper=True, use_reentrant=True + ) self.assertEqual(functional_reentrant, wrapper_reentrant) def test_forward_missing_attributes(self): @@ -181,8 +186,8 @@ def test_forward_missing_attributes(self): # Test indexing is forwarded self.assertEqual(wrapped[0], lin) # Test missing attributes are forwarded. - m._foo = 'bar' - self.assertEqual(wrapped._foo, 'bar') + m._foo = "bar" + self.assertEqual(wrapped._foo, "bar") def test_apply_activation_checkpointing(self): """ @@ -190,6 +195,7 @@ def test_apply_activation_checkpointing(self): to swap modules for their checkpoint-wrapped counterparts given a model. 
""" + class LinearWithBatchNorm(nn.Module): def __init__(self): super().__init__() @@ -210,7 +216,6 @@ def __init__(self): def forward(self, x): return self.seq(x) - def check_fn(l): return isinstance(l, nn.Linear) @@ -231,13 +236,27 @@ def check_fn(l): apply_activation_checkpointing( model, checkpoint_wrapper_fn=wrapper, check_fn=check_fn ) - n_linear_wrapped = sum(1 if isinstance(x, nn.Linear) else 0 for x in model.modules()) - n_checkpointed = sum(1 if isinstance(x, (CheckpointWrapper, OffloadWrapper)) else 0 for x in model.modules()) + n_linear_wrapped = sum( + 1 if isinstance(x, nn.Linear) else 0 for x in model.modules() + ) + n_checkpointed = sum( + 1 if isinstance(x, (CheckpointWrapper, OffloadWrapper)) else 0 + for x in model.modules() + ) self.assertEqual(n_checkpointed, n_linear_wrapped) self.assertEqual(n_linear, n_linear_wrapped) for j in range(3): - self.assertTrue(isinstance(model.seq[j].lin, (CheckpointWrapper, OffloadWrapper))) - self.assertTrue(isinstance(model.seq[j].nested_linear[0], (CheckpointWrapper, OffloadWrapper))) + self.assertTrue( + isinstance( + model.seq[j].lin, (CheckpointWrapper, OffloadWrapper) + ) + ) + self.assertTrue( + isinstance( + model.seq[j].nested_linear[0], + (CheckpointWrapper, OffloadWrapper), + ) + ) inp = torch.randn(4, 10, requires_grad=True) for i in range(6): @@ -249,9 +268,22 @@ def check_fn(l): for j in range(3): weight_lin = model.seq[j].lin._checkpoint_wrapped_module.weight bias_lin = model.seq[j].lin._checkpoint_wrapped_module.bias - weight_nested_lin = model.seq[j].nested_linear[0]._checkpoint_wrapped_module.weight - bias_nested_lin = model.seq[j].nested_linear[0]._checkpoint_wrapped_module.bias - for param in [weight_lin, bias_lin, weight_nested_lin, bias_nested_lin]: + weight_nested_lin = ( + model.seq[j] + .nested_linear[0] + ._checkpoint_wrapped_module.weight + ) + bias_nested_lin = ( + model.seq[j] + .nested_linear[0] + ._checkpoint_wrapped_module.bias + ) + for param in [ + weight_lin, + bias_lin, + 
weight_nested_lin, + bias_nested_lin, + ]: self.assertTrue(param.requires_grad) self.assertFalse(param.grad is None) @@ -287,7 +319,7 @@ def testing_cpu_offload_unpack_hook(packed): model = offload_wrapper(model) - inp = torch.randn(3, 10, device='cuda') + inp = torch.randn(3, 10, device="cuda") loss = model(inp).sum() # All autograd saved tensors should be offloaded to CPU. @@ -314,5 +346,6 @@ def dfs(grad_fn): torch.autograd.graph.saved_tensors_hooks.__init__ = orig_init + if __name__ == "__main__": run_tests() diff --git a/test/distributed/fsdp/test_distributed_checkpoint.py b/test/distributed/fsdp/test_distributed_checkpoint.py index ef95973764c43..e64fd358a305e 100644 --- a/test/distributed/fsdp/test_distributed_checkpoint.py +++ b/test/distributed/fsdp/test_distributed_checkpoint.py @@ -8,20 +8,14 @@ from torch.distributed._shard.checkpoint import ( FileSystemReader, FileSystemWriter, - save_state_dict, load_state_dict, + save_state_dict, ) -from torch.distributed.fsdp import ( - FullyShardedDataParallel as FSDP, - StateDictType, -) +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel from torch.distributed.fsdp.wrap import enable_wrap, wrap from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, - SkipModel, -) +from torch.testing._internal.common_fsdp import FSDPTest, SkipModel from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, @@ -29,7 +23,6 @@ TEST_WITH_DEV_DBG_ASAN, ) - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) @@ -75,16 +68,16 @@ def test_distributed_checkpoint(self, state_dict_type) -> None: path = paths[0] writer = FileSystemWriter(path) reader = FileSystemReader(path) - with FSDP.state_dict_type( - model, state_dict_type - ), 
FSDP.state_dict_type(new_model, state_dict_type): + with FSDP.state_dict_type(model, state_dict_type), FSDP.state_dict_type( + new_model, state_dict_type + ): state_dict = model.state_dict() save_state_dict(state_dict, writer) - with FSDP.state_dict_type( - model, state_dict_type - ), FSDP.state_dict_type(new_model, state_dict_type): + with FSDP.state_dict_type(model, state_dict_type), FSDP.state_dict_type( + new_model, state_dict_type + ): state_dict = new_model.state_dict() load_state_dict(state_dict, reader) new_model.load_state_dict(state_dict) diff --git a/test/distributed/fsdp/test_fsdp_apply.py b/test/distributed/fsdp/test_fsdp_apply.py index d72d57d133b0d..d44239a329344 100644 --- a/test/distributed/fsdp/test_fsdp_apply.py +++ b/test/distributed/fsdp/test_fsdp_apply.py @@ -14,10 +14,7 @@ NestedWrappedModule, TransformerWithSharedParams, ) -from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, - run_tests, -) +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py index 50a5573f901f8..f0e8188641459 100644 --- a/test/distributed/fsdp/test_fsdp_checkpoint.py +++ b/test/distributed/fsdp/test_fsdp_checkpoint.py @@ -1,37 +1,31 @@ # Owner(s): ["oncall: distributed"] import contextlib +import sys from copy import deepcopy from functools import partial -import sys import torch import torch.distributed as dist import torch.nn as nn -from torch.distributed.fsdp.fully_sharded_data_parallel import ( - FullyShardedDataParallel as FSDP, - CPUOffload, -) from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( checkpoint_wrapper, offload_wrapper, ) -from torch.testing._internal.common_distributed import ( - skip_if_lt_x_gpu, -) -from torch.testing._internal.common_fsdp import ( - FSDPTest, - 
_maybe_wrap_fsdp, +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + CPUOffload, + FullyShardedDataParallel as FSDP, ) +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import _maybe_wrap_fsdp, FSDPTest from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, - run_tests, - parametrize, instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, ) from torch.utils.checkpoint import checkpoint - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) @@ -44,10 +38,13 @@ sys.exit(0) - _save_on_cpu_called = False + + def get_patched_save_on_cpu(): - orig_save_on_cpu = torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu + orig_save_on_cpu = ( + torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu + ) def patched_save_on_cpu(*args, **kwargs): global _save_on_cpu_called @@ -56,14 +53,22 @@ def patched_save_on_cpu(*args, **kwargs): return patched_save_on_cpu + @contextlib.contextmanager def patch_save_on_cpu(new_save_on_cpu): - orig_save_on_cpu = torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu - torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = new_save_on_cpu + orig_save_on_cpu = ( + torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu + ) + torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = ( + new_save_on_cpu + ) try: yield finally: - torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = orig_save_on_cpu + torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = ( + orig_save_on_cpu + ) + class TestFSDPCheckpoint(FSDPTest): class SequentialModule(nn.Module): @@ -143,7 +148,8 @@ def test_checkpoint_fsdp_wrapping( fsdp_kwargs = {"cpu_offload": cpu_offload, "use_orig_params": use_orig_params} ckpt_sequential_wrapped_fsdp = 
wrapper_to_use( TestFSDPCheckpoint.SequentialModule( - wrap_fsdp=True, **fsdp_kwargs, + wrap_fsdp=True, + **fsdp_kwargs, ), ) # Test FSDP(checkpoint(layer1)), FSDP(checkpoint(layer2)), .... @@ -155,7 +161,8 @@ def test_checkpoint_fsdp_wrapping( ) baseline = TestFSDPCheckpoint.SequentialModule( - wrap_fsdp=True, **fsdp_kwargs, + wrap_fsdp=True, + **fsdp_kwargs, ) # note that reentrant-based checkpointing requires inputs to have grad @@ -223,7 +230,9 @@ def test_basic_checkpoint_end_to_end( # note that reentrant-based checkpointing requires inputs to have grad # flag set. - inp = torch.randn(10, 3, device=torch.cuda.current_device(), requires_grad=True) + inp = torch.randn( + 10, 3, device=torch.cuda.current_device(), requires_grad=True + ) models = [ fsdp_only_seq, @@ -237,7 +246,9 @@ def test_basic_checkpoint_end_to_end( losses = [] outputs = [] for m in models: - check_offload = m != fsdp_only_seq and i == 0 and offload_activations + check_offload = ( + m != fsdp_only_seq and i == 0 and offload_activations + ) if m == fsdp_call_checkpoint: # _save_on_cpu should not be called yet self.assertFalse(_save_on_cpu_called) @@ -265,6 +276,7 @@ def test_basic_checkpoint_end_to_end( dist.barrier() + instantiate_parametrized_tests(TestFSDPCheckpoint) if __name__ == "__main__": diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py index 3af5a83cdde42..ddba50a9e4561 100644 --- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py +++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py @@ -8,8 +8,8 @@ import torch import torch.nn as nn from torch import distributed as dist -from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload from torch.distributed.fsdp.fully_sharded_data_parallel import ( + CPUOffload, FullyShardedDataParallel as FSDP, ) from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy @@ -23,9 +23,9 @@ TransformerWithSharedParams, ) from torch.testing._internal.common_utils 
import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -53,6 +53,7 @@ def test_non_root(self): Tests that calling ``clip_grad_norm_()`` on a non-root FSDP instance raises an error. """ + class Model(nn.Module): def __init__(self) -> None: super().__init__() @@ -132,18 +133,26 @@ def _test_ddp_parity( # Multiply gradients by a large factor to ensure that gradients will # actually be clipped for param in itertools.chain(ddp_model.parameters(), fsdp_model.parameters()): - if param.grad is not None: # gradients may be `None` for `use_orig_params=True` + if ( + param.grad is not None + ): # gradients may be `None` for `use_orig_params=True` param.grad *= LARGE_FACTOR - orig_ddp_grads = [param.grad.detach().clone() for param in ddp_model.parameters()] + orig_ddp_grads = [ + param.grad.detach().clone() for param in ddp_model.parameters() + ] orig_fsdp_grads = [ param.grad.detach().clone() if param.grad is not None else None for param in fsdp_model.parameters() ] ddp_total_norm = torch.nn.utils.clip_grad_norm_( - ddp_model.parameters(), max_norm=max_norm, norm_type=norm_type, + ddp_model.parameters(), + max_norm=max_norm, + norm_type=norm_type, + ) + fsdp_total_norm = fsdp_model.clip_grad_norm_( + max_norm=max_norm, norm_type=norm_type ) - fsdp_total_norm = fsdp_model.clip_grad_norm_(max_norm=max_norm, norm_type=norm_type) self.assertEqual(ddp_total_norm, fsdp_total_norm) # Check that the gradients were modified by `clip_grad_norm_()` diff --git a/test/distributed/fsdp/test_fsdp_comm.py b/test/distributed/fsdp/test_fsdp_comm.py index d19617e31acd3..117e756da252e 100644 --- a/test/distributed/fsdp/test_fsdp_comm.py +++ b/test/distributed/fsdp/test_fsdp_comm.py @@ -2,7 +2,7 @@ import sys from contextlib import suppress -from enum import Enum, auto +from enum import auto, Enum from typing import Optional from unittest.mock import patch @@ -19,10 +19,10 @@ TransformerWithSharedParams, ) from 
torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -45,6 +45,7 @@ class PassType(Enum): class TestCommunication(FSDPTest): """Tests ``FullyShardedDataParallel``'s collective communication usage.""" + def _init_model( self, nested_model: bool, @@ -106,7 +107,8 @@ def _get_ref_num_all_gathers( pass_type, is_first_iter, is_last_iter_no_sync, - ) for pass_type in PassType + ) + for pass_type in PassType ) def _get_ref_num_all_gathers_in_pass( @@ -121,9 +123,11 @@ def _get_ref_num_all_gathers_in_pass( if sharding_strategy is None: sharding_strategy = ShardingStrategy.FULL_SHARD # default # Forward pass: - if pass_type == PassType.FWD and \ - sharding_strategy == ShardingStrategy.SHARD_GRAD_OP and \ - is_last_iter_no_sync: + if ( + pass_type == PassType.FWD + and sharding_strategy == ShardingStrategy.SHARD_GRAD_OP + and is_last_iter_no_sync + ): # Modules do not free the full parameters in the last # iteration's backward pass if it was in `no_sync()` num_all_gathers = 0 @@ -132,21 +136,27 @@ def _get_ref_num_all_gathers_in_pass( # forward pass num_all_gathers = num_fsdp # Backward pass: - elif pass_type == PassType.BWD and \ - sharding_strategy == ShardingStrategy.FULL_SHARD: + elif ( + pass_type == PassType.BWD + and sharding_strategy == ShardingStrategy.FULL_SHARD + ): # Root does not free the full parameters at the end of the # forward pass num_all_gathers = num_fsdp - 1 - elif pass_type == PassType.BWD and \ - sharding_strategy == ShardingStrategy.SHARD_GRAD_OP: + elif ( + pass_type == PassType.BWD + and sharding_strategy == ShardingStrategy.SHARD_GRAD_OP + ): # Modules do not free the full parameters at the end of the # forward pass num_all_gathers = 0 else: - assert 0, f"Unsupported: add a branch for pass_type={pass_type} " \ - f"is_first_iter={is_first_iter} " \ - f"is_last_iter_no_sync={is_last_iter_no_sync} " \ + assert 0, ( + 
f"Unsupported: add a branch for pass_type={pass_type} " + f"is_first_iter={is_first_iter} " + f"is_last_iter_no_sync={is_last_iter_no_sync} " f"sharding_strategy={sharding_strategy}" + ) if is_first_iter and pass_type == PassType.FWD: # With execution order validation, on the first iteration, we have # an additional two all-gathers before every actual all-gather in @@ -167,7 +177,10 @@ def _print_ref_num_all_gathers_in_pass( if self.rank != 0: return # only print on one rank num_all_gathers = self._get_ref_num_all_gathers_in_pass( - num_fsdp, sharding_strategy, pass_type, is_first_iter, + num_fsdp, + sharding_strategy, + pass_type, + is_first_iter, is_last_iter_no_sync, ) print( @@ -211,8 +224,7 @@ def test_communication( # Count the number of FSDP instances that manage parameters since the # number of collectives are a function of this number num_fsdp = sum( - (isinstance(m, FSDP) and len(m.params) > 0) - for m in fsdp_model.modules() + (isinstance(m, FSDP) and len(m.params) > 0) for m in fsdp_model.modules() ) # If `use_no_sync=True`, we run `num_iters` iterations inside @@ -220,11 +232,16 @@ def test_communication( # and if `use_no_sync=False`, we only run `num_iters` iterations # outside `no_sync()` num_iters = 3 - with patch("torch.distributed.all_gather_into_tensor") as mock_all_gather, \ - patch("torch.distributed.reduce_scatter_tensor") as mock_reduce_scatter: + with patch( + "torch.distributed.all_gather_into_tensor" + ) as mock_all_gather, patch( + "torch.distributed.reduce_scatter_tensor" + ) as mock_reduce_scatter: + def reset_mocks(): mock_all_gather.reset_mock() mock_reduce_scatter.reset_mock() + # Check the communication cost when using `no_sync()` if use_no_sync: for i in range(num_iters): @@ -233,11 +250,14 @@ def reset_mocks(): num_all_gathers = mock_all_gather.call_count num_reduce_scatters = mock_reduce_scatter.call_count ref_num_all_gathers = self._get_ref_num_all_gathers( - num_fsdp, sharding_strategy, is_first_iter=i == 0, + num_fsdp, + 
sharding_strategy, + is_first_iter=i == 0, is_last_iter_no_sync=i > 0, ) ref_num_reduce_scatters = self._get_ref_num_reduce_scatters( - num_fsdp, in_no_sync=True, + num_fsdp, + in_no_sync=True, ) self.assertEqual(num_all_gathers, ref_num_all_gathers) self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters) @@ -248,12 +268,14 @@ def reset_mocks(): num_all_gathers = mock_all_gather.call_count num_reduce_scatters = mock_reduce_scatter.call_count ref_num_all_gathers = self._get_ref_num_all_gathers( - num_fsdp, sharding_strategy, + num_fsdp, + sharding_strategy, is_first_iter=not use_no_sync and i == 0, is_last_iter_no_sync=use_no_sync and i == 0, ) ref_num_reduce_scatters = self._get_ref_num_reduce_scatters( - num_fsdp, in_no_sync=False, + num_fsdp, + in_no_sync=False, ) self.assertEqual(num_all_gathers, ref_num_all_gathers) self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters) diff --git a/test/distributed/fsdp/test_fsdp_comm_hooks.py b/test/distributed/fsdp/test_fsdp_comm_hooks.py index bfd710cdac486..125606fbff5cb 100644 --- a/test/distributed/fsdp/test_fsdp_comm_hooks.py +++ b/test/distributed/fsdp/test_fsdp_comm_hooks.py @@ -7,10 +7,9 @@ import torch.nn as nn import torch.nn.functional as F from torch import distributed as dist -from torch.distributed.distributed_c10d import _get_default_group from torch.distributed.algorithms._comm_hooks import default_hooks -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import MixedPrecision +from torch.distributed.distributed_c10d import _get_default_group +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy from torch.testing._internal.common_distributed import ( requires_nccl, @@ -26,7 +25,6 @@ run_tests, ) - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) @@ -35,10 +33,11 @@ 
BFLOAT16_AVAILABLE = ( torch.cuda.is_available() and torch.version.cuda is not None - and int(torch.version.cuda.split('.')[0]) >= 11) + and int(torch.version.cuda.split(".")[0]) >= 11 +) -class Net(nn.Module): +class Net(nn.Module): def __init__(self, has_wrapping, sharding_strategy, mixed_precision=None): # to ensure determinism torch.manual_seed(0) @@ -46,45 +45,40 @@ def __init__(self, has_wrapping, sharding_strategy, mixed_precision=None): super().__init__() if has_wrapping: - self.net = FSDP(nn.Sequential( - nn.Linear(8, 16), - nn.ReLU(), - FSDP( - nn.Linear(16, 8), - device_id=torch.cuda.current_device(), - sharding_strategy=sharding_strategy, - mixed_precision=mixed_precision, - ) - ), + self.net = FSDP( + nn.Sequential( + nn.Linear(8, 16), + nn.ReLU(), + FSDP( + nn.Linear(16, 8), + device_id=torch.cuda.current_device(), + sharding_strategy=sharding_strategy, + mixed_precision=mixed_precision, + ), + ), device_id=torch.cuda.current_device(), sharding_strategy=sharding_strategy, mixed_precision=mixed_precision, ) else: - self.net = nn.Sequential( - nn.Linear(8, 16), - nn.ReLU(), - nn.Linear(16, 8) - ) + self.net = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 8)) self.out = nn.Linear(8, 4) def forward(self, x): return self.out(F.relu(self.net(x))) + class DummyState(object): - __slots__ = [ - "process_group", - "noise" - ] + __slots__ = ["process_group", "noise"] def __init__(self, process_group: dist.ProcessGroup, noise: int): self.process_group = process_group self.noise = noise -class DummyHook(object): +class DummyHook(object): def dummy_hook_for_no_shard_fsdp(self, state: DummyState, grad: torch.Tensor): """ This communication hook is for illustration and testing purpose only. 
@@ -104,7 +98,9 @@ def custom_reduce_scatter(self, output, input, group=None): """ pass - def dummy_hook_for_sharded_fsdp(self, state: DummyState, grad: torch.Tensor, output: torch.Tensor): + def dummy_hook_for_sharded_fsdp( + self, state: DummyState, grad: torch.Tensor, output: torch.Tensor + ): """ This communication hook is for illustration and testing purposes only. This communication hook is used during FSDP ``FULL_SHARD`` or ``SHARD_GRAD_OP`` training. @@ -112,23 +108,21 @@ def dummy_hook_for_sharded_fsdp(self, state: DummyState, grad: torch.Tensor, out ``reduce_scatter`` for gradient communication and stores a sharded gradient in ``output``. """ grad.add_(state.noise) - self.custom_reduce_scatter( - output, grad, group=state.process_group - ) + self.custom_reduce_scatter(output, grad, group=state.process_group) -class TestCommunicationHooks(FSDPTest): +class TestCommunicationHooks(FSDPTest): @skip_if_lt_x_gpu(2) @parametrize( "sharding_strategy", [ ShardingStrategy.NO_SHARD, ShardingStrategy.FULL_SHARD, - ShardingStrategy.SHARD_GRAD_OP - ]) + ShardingStrategy.SHARD_GRAD_OP, + ], + ) def test_default_communication_hook_behavior( - self, - sharding_strategy: Optional[ShardingStrategy] + self, sharding_strategy: Optional[ShardingStrategy] ): """ Tests FSDP's default communication hook's behavior and correctness. 
@@ -148,14 +142,16 @@ def test_default_communication_hook_behavior( net_default_hook = FSDP( net, device_id=torch.cuda.current_device(), - sharding_strategy=sharding_strategy + sharding_strategy=sharding_strategy, ).to(self.rank) # Check that default hook is set to `all_reduce` for `NO_SHARD` # or `reduce_scatter` for sharded cases - default_hook = default_hooks.reduce_scatter_hook\ - if sharding_strategy != ShardingStrategy.NO_SHARD\ + default_hook = ( + default_hooks.reduce_scatter_hook + if sharding_strategy != ShardingStrategy.NO_SHARD else default_hooks.allreduce_hook + ) for entry in FSDP.fsdp_modules(net_default_hook): self.assertEqual(entry._communication_hook, default_hook) @@ -176,11 +172,13 @@ def test_default_communication_hook_behavior( self.assertEqual( grad[0].item(), expected_grad, - msg=f"Expected hook grad of {expected_grad} but got {grad[0].item()}") + msg=f"Expected hook grad of {expected_grad} but got {grad[0].item()}", + ) def _get_submodules(self, fsdp_net): return [ - submodule for submodule in FSDP.fsdp_modules(fsdp_net) + submodule + for submodule in FSDP.fsdp_modules(fsdp_net) if not submodule.check_is_root() ] @@ -201,12 +199,11 @@ def _init_model(self, core, sharding_strategy, mixed_precision=None): [ ShardingStrategy.NO_SHARD, ShardingStrategy.FULL_SHARD, - ShardingStrategy.SHARD_GRAD_OP - ]) + ShardingStrategy.SHARD_GRAD_OP, + ], + ) def test_default_communication_hook_initialization( - self, - has_wrapping: bool, - sharding_strategy: Optional[ShardingStrategy] + self, has_wrapping: bool, sharding_strategy: Optional[ShardingStrategy] ): """ Tests FSDP's communication hook interface behavior. 
@@ -219,45 +216,39 @@ def test_default_communication_hook_initialization( # Initialize a model fsdp_model_with_hook = self._init_model( Net(has_wrapping=has_wrapping, sharding_strategy=sharding_strategy), - sharding_strategy=sharding_strategy + sharding_strategy=sharding_strategy, ) # Check that default hook is set to `all_reduce` for `NO_SHARD` # or `reduce_scatter` for sharded cases - default_hook = default_hooks.reduce_scatter_hook\ - if sharding_strategy != ShardingStrategy.NO_SHARD\ + default_hook = ( + default_hooks.reduce_scatter_hook + if sharding_strategy != ShardingStrategy.NO_SHARD else default_hooks.allreduce_hook + ) for entry in FSDP.fsdp_modules(fsdp_model_with_hook): self.assertEqual(entry._communication_hook, default_hook) dummy_state = DummyState(process_group=None, noise=1234) - dummy_hook = DummyHook.dummy_hook_for_no_shard_fsdp\ - if sharding_strategy != ShardingStrategy.NO_SHARD\ + dummy_hook = ( + DummyHook.dummy_hook_for_no_shard_fsdp + if sharding_strategy != ShardingStrategy.NO_SHARD else DummyHook.dummy_hook_for_sharded_fsdp - - fsdp_model_with_hook.register_comm_hook( - dummy_state, - dummy_hook ) + fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook) + # Check that we can't register comm hook twice - with self.assertRaisesRegex(AssertionError, '^communication hook can be only registered once$'): - fsdp_model_with_hook.register_comm_hook( - dummy_state, - dummy_hook - ) + with self.assertRaisesRegex( + AssertionError, "^communication hook can be only registered once$" + ): + fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook) # Check dummy hook was registered for the root and all submodules if any for entry in FSDP.fsdp_modules(fsdp_model_with_hook): - self.assertEqual( - entry._communication_hook, - dummy_hook - ) - self.assertEqual( - entry._communication_hook_state, - dummy_state - ) + self.assertEqual(entry._communication_hook, dummy_hook) + self.assertEqual(entry._communication_hook_state, dummy_state) for 
entry in FSDP.fsdp_modules(fsdp_model_with_hook): entry._communication_hook = None @@ -277,18 +268,17 @@ def test_default_communication_hook_initialization( with self.assertRaises(AssertionError): loss.backward() - @skip_if_lt_x_gpu(2) @parametrize( "sharding_strategy", [ ShardingStrategy.NO_SHARD, ShardingStrategy.FULL_SHARD, - ShardingStrategy.SHARD_GRAD_OP - ]) + ShardingStrategy.SHARD_GRAD_OP, + ], + ) def test_registering_hook_non_root( - self, - sharding_strategy: Optional[ShardingStrategy] + self, sharding_strategy: Optional[ShardingStrategy] ): """ Tests FSDP's communication hook registering for submodules. @@ -301,16 +291,21 @@ def test_registering_hook_non_root( fsdp_model_with_hook = self._init_model( Net(has_wrapping=True, sharding_strategy=sharding_strategy), - sharding_strategy=sharding_strategy + sharding_strategy=sharding_strategy, ) dummy_state = DummyState(process_group=None, noise=1234) - dummy_hook = DummyHook.dummy_hook_for_no_shard_fsdp\ - if sharding_strategy != ShardingStrategy.NO_SHARD\ + dummy_hook = ( + DummyHook.dummy_hook_for_no_shard_fsdp + if sharding_strategy != ShardingStrategy.NO_SHARD else DummyHook.dummy_hook_for_sharded_fsdp + ) # Creating a list of non-root submodules to test submodules = self._get_submodules(fsdp_model_with_hook) # Check that assertion is raised for registering a comm hook on a non-root - with self.assertRaisesRegex(AssertionError, '^register_comm_hook can only be called on a root instance.$'): + with self.assertRaisesRegex( + AssertionError, + "^register_comm_hook can only be called on a root instance.$", + ): submodules[1].register_comm_hook(dummy_state, dummy_hook) @skip_if_lt_x_gpu(2) @@ -319,11 +314,11 @@ def test_registering_hook_non_root( [ ShardingStrategy.NO_SHARD, ShardingStrategy.FULL_SHARD, - ShardingStrategy.SHARD_GRAD_OP - ]) + ShardingStrategy.SHARD_GRAD_OP, + ], + ) def test_registering_hook_submodules( - self, - sharding_strategy: Optional[ShardingStrategy] + self, sharding_strategy: 
Optional[ShardingStrategy] ): """ Tests FSDP's communication hook registering for submodules. @@ -336,24 +331,28 @@ def test_registering_hook_submodules( fsdp_model_with_hook = self._init_model( Net(has_wrapping=True, sharding_strategy=sharding_strategy), - sharding_strategy=sharding_strategy + sharding_strategy=sharding_strategy, ) dummy_state = DummyState(process_group=None, noise=1234) - dummy_hook = DummyHook.dummy_hook_for_no_shard_fsdp\ - if sharding_strategy != ShardingStrategy.NO_SHARD\ + dummy_hook = ( + DummyHook.dummy_hook_for_no_shard_fsdp + if sharding_strategy != ShardingStrategy.NO_SHARD else DummyHook.dummy_hook_for_sharded_fsdp + ) submodules = self._get_submodules(fsdp_model_with_hook) # Simulate a registration of a hook on a submodule submodules[1]._hook_registered = True # Check that an error is raised when some of submodules have a non-default hook assigned - with self.assertRaisesRegex(AssertionError, '^communication hook can be only registered once$'): + with self.assertRaisesRegex( + AssertionError, "^communication hook can be only registered once$" + ): fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook) # Reinitialize the model fsdp_model_with_hook = self._init_model( Net(has_wrapping=True, sharding_strategy=sharding_strategy), - sharding_strategy=sharding_strategy + sharding_strategy=sharding_strategy, ) submodules = self._get_submodules(fsdp_model_with_hook) submodules[1]._communication_hook = dummy_hook @@ -361,29 +360,32 @@ def test_registering_hook_submodules( # Check that an error is raised when some of submodules have a non-default hook assigned with self.assertRaisesRegex( AssertionError, - f'^communication hook should be default, but it is {submodules[1]._communication_hook.__name__} instead$' + f"^communication hook should be default, but it is {submodules[1]._communication_hook.__name__} instead$", ): - fsdp_model_with_hook.register_comm_hook( - dummy_state, - dummy_hook - ) + 
fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook) - def _check_low_precision_hook(self, state, hook, sharding_strategy, dtype, has_wrapping): + def _check_low_precision_hook( + self, state, hook, sharding_strategy, dtype, has_wrapping + ): # keep everything deterministic for input data torch.manual_seed(0) torch.cuda.manual_seed(0) fsdp_with_hook = self._init_model( Net(has_wrapping=has_wrapping, sharding_strategy=sharding_strategy), - sharding_strategy=sharding_strategy + sharding_strategy=sharding_strategy, ) fsdp_with_hook.register_comm_hook(state, hook) mp_only_grad = MixedPrecision(reduce_dtype=dtype) fsdp_with_mp = self._init_model( - Net(has_wrapping=has_wrapping, sharding_strategy=sharding_strategy, mixed_precision=mp_only_grad), + Net( + has_wrapping=has_wrapping, + sharding_strategy=sharding_strategy, + mixed_precision=mp_only_grad, + ), sharding_strategy=sharding_strategy, - mixed_precision=mp_only_grad + mixed_precision=mp_only_grad, ) optim_hook = torch.optim.SGD(fsdp_with_hook.parameters(), lr=0.1) @@ -403,7 +405,9 @@ def _check_low_precision_hook(self, state, hook, sharding_strategy, dtype, has_w dist.barrier() - for hook_param, mp_param in zip(fsdp_with_hook.parameters(), fsdp_with_mp.parameters()): + for hook_param, mp_param in zip( + fsdp_with_hook.parameters(), fsdp_with_mp.parameters() + ): self.assertEqual(hook_param.grad, mp_param.grad) @requires_nccl() @@ -414,18 +418,19 @@ def _check_low_precision_hook(self, state, hook, sharding_strategy, dtype, has_w [ ShardingStrategy.NO_SHARD, ShardingStrategy.FULL_SHARD, - ShardingStrategy.SHARD_GRAD_OP - ]) + ShardingStrategy.SHARD_GRAD_OP, + ], + ) def test_fp16_hook( - self, - has_wrapping: bool, - sharding_strategy: Optional[ShardingStrategy] + self, has_wrapping: bool, sharding_strategy: Optional[ShardingStrategy] ): state = default_hooks.LowPrecisionState(process_group=_get_default_group()) hook = default_hooks.fp16_compress_hook - self._check_low_precision_hook(state, hook, 
sharding_strategy, torch.float16, has_wrapping) + self._check_low_precision_hook( + state, hook, sharding_strategy, torch.float16, has_wrapping + ) @requires_nccl() @requires_nccl_version((2, 10), "Need NCCL 2.10+ for BF16_COMPRESS") @@ -441,18 +446,19 @@ def test_fp16_hook( [ ShardingStrategy.NO_SHARD, ShardingStrategy.FULL_SHARD, - ShardingStrategy.SHARD_GRAD_OP - ]) + ShardingStrategy.SHARD_GRAD_OP, + ], + ) def test_bf16_hook( - self, - has_wrapping: bool, - sharding_strategy: Optional[ShardingStrategy] + self, has_wrapping: bool, sharding_strategy: Optional[ShardingStrategy] ): state = default_hooks.LowPrecisionState(process_group=_get_default_group()) hook = default_hooks.bf16_compress_hook - self._check_low_precision_hook(state, hook, sharding_strategy, torch.bfloat16, has_wrapping) + self._check_low_precision_hook( + state, hook, sharding_strategy, torch.bfloat16, has_wrapping + ) instantiate_parametrized_tests(TestCommunicationHooks) diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py index 9557f2abcfbcb..93d5e4f45ad28 100644 --- a/test/distributed/fsdp/test_fsdp_core.py +++ b/test/distributed/fsdp/test_fsdp_core.py @@ -24,14 +24,14 @@ MixtureOfExperts, NestedWrappedModule, NestedWrappedModuleWithDelay, - TransformerWithSharedParams, subtest_name, + TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -47,7 +47,11 @@ params = "cpu_offload,sharding_strategy" cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] -sharding_strategy_config = [None, ShardingStrategy.SHARD_GRAD_OP, ShardingStrategy.NO_SHARD] +sharding_strategy_config = [ + None, + ShardingStrategy.SHARD_GRAD_OP, + ShardingStrategy.NO_SHARD, +] configs = list(itertools.product(cpu_offload_config, sharding_strategy_config)) test_name_mapping = { 
str(CPUOffload(offload_params=True)): "offload_true", @@ -259,7 +263,7 @@ def test_mixture_of_experts_with_delay_before_free( ref_init_fn=self._dummy_ddp_fn, cpu_offload=cpu_offload, sharding_strategy=sharding_strategy, - init_kwargs={"delay_before_free_ms": 250} + init_kwargs={"delay_before_free_ms": 250}, ) @@ -397,7 +401,7 @@ def test_transformer_no_grad(self, mixed_precision): fsdp_model, num_steps=1, autocast=False, - mixed_precision=fsdp_kwargs["mixed_precision"] + mixed_precision=fsdp_kwargs["mixed_precision"], ) input = fsdp_model.module.get_input(torch.device("cuda")) # Run a forward in eval mode diff --git a/test/distributed/fsdp/test_fsdp_exec_order.py b/test/distributed/fsdp/test_fsdp_exec_order.py index eaf3066d1bad0..6cd00e5302181 100644 --- a/test/distributed/fsdp/test_fsdp_exec_order.py +++ b/test/distributed/fsdp/test_fsdp_exec_order.py @@ -11,10 +11,10 @@ from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import FSDPTest from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): diff --git a/test/distributed/fsdp/test_fsdp_freezing_weights.py b/test/distributed/fsdp/test_fsdp_freezing_weights.py index 23836130818c9..430e47adf71e0 100644 --- a/test/distributed/fsdp/test_fsdp_freezing_weights.py +++ b/test/distributed/fsdp/test_fsdp_freezing_weights.py @@ -10,18 +10,14 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, - get_full_params, -) +from torch.testing._internal.common_fsdp import FSDPTest, get_full_params from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + 
TEST_WITH_DEV_DBG_ASAN, ) - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) diff --git a/test/distributed/fsdp/test_fsdp_grad_acc.py b/test/distributed/fsdp/test_fsdp_grad_acc.py index 1e44f865027d0..ef20d2a2db76e 100644 --- a/test/distributed/fsdp/test_fsdp_grad_acc.py +++ b/test/distributed/fsdp/test_fsdp_grad_acc.py @@ -8,8 +8,7 @@ import torch from torch import distributed as dist -from torch.distributed.fsdp import CPUOffload -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP from torch.distributed.fsdp.fully_sharded_data_parallel import ( BackwardPrefetch, ShardingStrategy, @@ -22,10 +21,10 @@ TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -53,16 +52,14 @@ class _GradAccConfig: manager as the way to accumulate gradients. num_iters (int): Number of iterations to accumulate gradients. """ + use_no_sync: bool num_iters: int def __repr__(self) -> str: # Override to remove any spaces in the string to appease the internal # build's test name parser - return ( - f"(use_no_sync={self.use_no_sync}," - f"num_iters={self.num_iters})" - ) + return f"(use_no_sync={self.use_no_sync}," f"num_iters={self.num_iters})" @dataclass @@ -71,14 +68,13 @@ class _GradAccConfigs: This wraps a :class:`list` of :class:`_GradAccConfig` instances with the sole purpose of overriding :meth:`__repr__` to remove spaces. 
""" + configs: List[_GradAccConfig] def __repr__(self) -> str: # Override to remove any spaces in the string to appease the internal # build's test name parser - return ( - "[" + ",".join(config.__repr__() for config in self.configs) + "]" - ) + return "[" + ",".join(config.__repr__() for config in self.configs) + "]" class TestGradAcc(FSDPTest): @@ -118,9 +114,8 @@ def _test_grad_acc( """ # Gradient accumulation outside `no_sync()` is not currently compatible # with CPU offloading - if ( - cpu_offload.offload_params - and any(not config.use_no_sync for config in configs) + if cpu_offload.offload_params and any( + not config.use_no_sync for config in configs ): return old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 @@ -144,7 +139,9 @@ def _test_grad_acc( ) device = torch.device("cuda") optim = torch.optim.SGD( - fsdp_model.parameters(), lr=0.01, momentum=0.9, + fsdp_model.parameters(), + lr=0.01, + momentum=0.9, ) # Generate the sequence of batches, each containing the same data @@ -152,16 +149,16 @@ def _test_grad_acc( def permute_tensor(x: torch.Tensor): return x.view(-1)[torch.randperm(x.numel())].view_as(x) - batch: Tuple[torch.Tensor, ...] = \ - fsdp_model.module.get_input(device) + batch: Tuple[torch.Tensor, ...] = fsdp_model.module.get_input(device) batches: List[Tuple[torch.Tensor, ...]] = [batch] num_iters_to_acc = sum(config.num_iters for config in configs) for _ in range(num_iters_to_acc - 1): batches.append(tuple(permute_tensor(t) for t in batch)) for (batch1, batch2) in itertools.combinations(batches, r=2): for t1, t2 in zip(batch1, batch2): - assert not torch.all(t1 == t2), \ - "Check the test to make sure that batches are distinct" + assert not torch.all( + t1 == t2 + ), "Check the test to make sure that batches are distinct" # Concatenate the batches along the given batch dimension concat_batch: Tuple[torch.Tensor, ...] 
= tuple( @@ -173,17 +170,18 @@ def permute_tensor(x: torch.Tensor): output = fsdp_model(*concat_batch) ref_loss = fsdp_model.module.get_loss(concat_batch, output) ref_loss.backward() - ref_grads = [ - p.grad.detach().clone() for p in fsdp_model.parameters() - ] + ref_grads = [p.grad.detach().clone() for p in fsdp_model.parameters()] # Compute and accumulate the gradients fsdp_model.zero_grad() losses = [] batch_idx = 0 for config in configs: - sync_context = fsdp_model.no_sync() if config.use_no_sync \ + sync_context = ( + fsdp_model.no_sync() + if config.use_no_sync else contextlib.suppress() + ) with sync_context: for _ in range(config.num_iters): if batch_idx == num_iters_to_acc - 1: @@ -199,9 +197,7 @@ def permute_tensor(x: torch.Tensor): loss.backward() losses.append(loss) acc_loss = sum(losses) - acc_grads = [ - p.grad.detach().clone() for p in fsdp_model.parameters() - ] + acc_grads = [p.grad.detach().clone() for p in fsdp_model.parameters()] # Compare the losses and gradients torch.testing.assert_close(ref_loss, acc_loss) @@ -231,17 +227,21 @@ def _get_subtest_config(self) -> Dict[str, List[Any]]: @parametrize( "configs", [ - _GradAccConfigs([ - _GradAccConfig(use_no_sync=True, num_iters=3), - _GradAccConfig(use_no_sync=False, num_iters=3), - _GradAccConfig(use_no_sync=True, num_iters=3), - ]), - _GradAccConfigs([ - _GradAccConfig(use_no_sync=False, num_iters=3), - _GradAccConfig(use_no_sync=True, num_iters=3), - _GradAccConfig(use_no_sync=False, num_iters=3), - ]), - ] + _GradAccConfigs( + [ + _GradAccConfig(use_no_sync=True, num_iters=3), + _GradAccConfig(use_no_sync=False, num_iters=3), + _GradAccConfig(use_no_sync=True, num_iters=3), + ] + ), + _GradAccConfigs( + [ + _GradAccConfig(use_no_sync=False, num_iters=3), + _GradAccConfig(use_no_sync=True, num_iters=3), + _GradAccConfig(use_no_sync=False, num_iters=3), + ] + ), + ], ) @parametrize( "cpu_offload", @@ -253,7 +253,7 @@ def _get_subtest_config(self) -> Dict[str, List[Any]]: 
ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP, ShardingStrategy.NO_SHARD, - ] + ], ) def test_grad_acc( self, diff --git a/test/distributed/fsdp/test_fsdp_ignored_modules.py b/test/distributed/fsdp/test_fsdp_ignored_modules.py index 60c3fd6f88110..83babee7d482f 100644 --- a/test/distributed/fsdp/test_fsdp_ignored_modules.py +++ b/test/distributed/fsdp/test_fsdp_ignored_modules.py @@ -14,10 +14,10 @@ TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -74,12 +74,15 @@ def forward(self, x): class ModelWithIgnoredModules(Model): """Adds a variable number of :class:`IgnoredModule` to ``self.layer1``.""" + def __init__(self, num_ignored: int) -> None: assert num_ignored >= 0 super().__init__() - layer1_modules = [torch.nn.Linear(5, 4), torch.nn.Linear(4, 4)] + \ - [IgnoredModule(4, 4) for _ in range(num_ignored)] + \ - [torch.nn.Linear(4, 4)] + layer1_modules = ( + [torch.nn.Linear(5, 4), torch.nn.Linear(4, 4)] + + [IgnoredModule(4, 4) for _ in range(num_ignored)] + + [torch.nn.Linear(4, 4)] + ) self.layer1 = torch.nn.Sequential(*layer1_modules) @@ -143,9 +146,7 @@ def test_ignored_modules_nested(self): # the ignored nested sequential's parameters nonwrapped_model = Model() total_numel = sum(p.numel() for p in nonwrapped_model.parameters()) - ignored_numel = sum( - p.numel() for p in nonwrapped_model.layer1.parameters() - ) + ignored_numel = sum(p.numel() for p in nonwrapped_model.layer1.parameters()) nonignored_numel = total_numel - ignored_numel with FSDP.summon_full_params(wrapped_model): flat_param_numel = wrapped_model.params[0].numel() @@ -176,7 +177,9 @@ def test_ignored_modules_invalid(self): @skip_if_lt_x_gpu(2) @parametrize("pass_ignored_modules_to_root", [False, True]) - def test_diff_ignored_modules_across_ranks(self, pass_ignored_modules_to_root: bool): + def 
test_diff_ignored_modules_across_ranks( + self, pass_ignored_modules_to_root: bool + ): """ Tests ignoring different modules across ranks. @@ -196,9 +199,11 @@ def test_diff_ignored_modules_across_ranks(self, pass_ignored_modules_to_root: b ] model.layer1 = FSDP(model.layer1, ignored_modules=layer1_ignored_modules) model.layer3 = FSDP(model.layer3) - model_ignored_modules = [ - m for m in model.modules() if isinstance(m, IgnoredModule) - ] if pass_ignored_modules_to_root else [] + model_ignored_modules = ( + [m for m in model.modules() if isinstance(m, IgnoredModule)] + if pass_ignored_modules_to_root + else [] + ) wrapped_model = FSDP(model, ignored_modules=model_ignored_modules) optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3) self._train_model(wrapped_model, optim, 3) diff --git a/test/distributed/fsdp/test_fsdp_input.py b/test/distributed/fsdp/test_fsdp_input.py index 136b65c3b28ec..06a516faaa97b 100644 --- a/test/distributed/fsdp/test_fsdp_input.py +++ b/test/distributed/fsdp/test_fsdp_input.py @@ -8,18 +8,15 @@ from torch.nn import Linear, Module from torch.optim import SGD from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, -) +from torch.testing._internal.common_fsdp import FSDPTest from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, subtest, + TEST_WITH_DEV_DBG_ASAN, ) - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) diff --git a/test/distributed/fsdp/test_fsdp_memory.py b/test/distributed/fsdp/test_fsdp_memory.py index b26aa249dc798..fe2ad8879ad1b 100644 --- a/test/distributed/fsdp/test_fsdp_memory.py +++ b/test/distributed/fsdp/test_fsdp_memory.py @@ -8,18 +8,15 @@ from torch import distributed as dist from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.testing._internal.common_distributed 
import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, -) +from torch.testing._internal.common_fsdp import FSDPTest from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) from torch.utils.checkpoint import checkpoint - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) diff --git a/test/distributed/fsdp/test_fsdp_meta.py b/test/distributed/fsdp/test_fsdp_meta.py index 1aa426800db62..09e5c7ae83292 100644 --- a/test/distributed/fsdp/test_fsdp_meta.py +++ b/test/distributed/fsdp/test_fsdp_meta.py @@ -6,20 +6,19 @@ import torch.distributed as dist import torch.nn as nn from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp.wrap import always_wrap_policy as always_wrap -from torch.distributed.fsdp.wrap import wrap, enable_wrap -from torch.testing._internal.common_fsdp import ( - FSDPTest, +from torch.distributed.fsdp.wrap import ( + always_wrap_policy as always_wrap, + enable_wrap, + wrap, ) +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, - run_tests, - parametrize, instantiate_parametrized_tests, + parametrize, + run_tests, sandcastle_skip_if, -) -from torch.testing._internal.common_distributed import ( - skip_if_lt_x_gpu, + TEST_WITH_DEV_DBG_ASAN, ) _TORCHDISTX_AVAIL = True @@ -47,10 +46,12 @@ def _reset_params_if_meta(is_meta, model): if is_meta: model.reset_parameters() + class MyLinear(nn.Linear): """ Linear layer with deterministic reset_parameters for testing. 
""" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -58,6 +59,7 @@ def reset_parameters(self, *args, **kwargs): with torch.no_grad(): self.weight.fill_(1) + class MyModel(nn.Module): def __init__(self, device): super().__init__() @@ -90,6 +92,7 @@ def reset_parameters(self): if not isinstance(m, FSDP): m.reset_parameters() + def _init_with_reset_params(module): """ to_empty + reset_parameters() init function example for modules @@ -101,6 +104,7 @@ def _init_with_reset_params(module): with torch.no_grad(): module.reset_parameters() + def _init_with_torchdistX(module): """ torchdistX-based deferred module initialization function example @@ -113,6 +117,7 @@ def check_fn(k): deferred_init.materialize_module(module, check_fn=check_fn) + class TestFSDPWithMetaDevice(FSDPTest): @property def world_size(self): @@ -148,7 +153,7 @@ def _test_simple_model_with_meta_device(self, meta_module_fn, init_fn=None): regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) self._compare_fsdp(fsdp_meta, fsdp_regular) - inp = torch.randn(10, 2, device='cuda') + inp = torch.randn(10, 2, device="cuda") fsdp_meta(inp).sum().backward() fsdp_regular(inp).sum().backward() meta_opt.step() @@ -176,6 +181,7 @@ def _test_simple_model_with_meta_device(self, meta_module_fn, init_fn=None): def test_simple_model_with_meta_device_reset_params(self): def meta_module_fn(): return MyModel(device="meta") + self._test_simple_model_with_meta_device( meta_module_fn, _init_with_reset_params ) @@ -184,11 +190,13 @@ def meta_module_fn(): def test_simple_model_with_meta_device_default_init(self): def meta_module_fn(): return MyModel(device="meta") + self._test_simple_model_with_meta_device(meta_module_fn) @skip_if_lt_x_gpu(2) @sandcastle_skip_if( - not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + not _TORCHDISTX_AVAIL, + "Test requires torchdistX: https://github.com/pytorch/torchdistX", ) def 
test_simple_model_with_torchdistX_default_init(self): def meta_module_fn(): @@ -198,15 +206,20 @@ def meta_module_fn(): @skip_if_lt_x_gpu(2) @sandcastle_skip_if( - not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + not _TORCHDISTX_AVAIL, + "Test requires torchdistX: https://github.com/pytorch/torchdistX", ) def test_simple_model_with_torchdistX_init_fn(self): def meta_module_fn(): return deferred_init.deferred_init(MyModel, device="cuda") - self._test_simple_model_with_meta_device(meta_module_fn, init_fn=_init_with_torchdistX) + self._test_simple_model_with_meta_device( + meta_module_fn, init_fn=_init_with_torchdistX + ) - def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn=None): + def _test_nested_model_with_meta_device( + self, auto_wrap, meta_module_fn, init_fn=None + ): if auto_wrap: module = meta_module_fn() is_meta = next(module.parameters()).is_meta @@ -225,7 +238,8 @@ def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) else: with enable_wrap( - wrapper_cls=FSDP, param_init_fn=init_fn, + wrapper_cls=FSDP, + param_init_fn=init_fn, ): module = meta_module_fn() is_meta = next(module.parameters()).is_meta @@ -246,7 +260,7 @@ def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn # Compare it before training self._compare_fsdp(fsdp_meta, fsdp_regular) - inp = torch.randn(10, 2, device='cuda') + inp = torch.randn(10, 2, device="cuda") fsdp_meta(inp).sum().backward() fsdp_regular(inp).sum().backward() meta_opt.step() @@ -260,7 +274,9 @@ def meta_module_fn(): return NestedModel(device="meta") self._test_nested_model_with_meta_device( - auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_reset_params + auto_wrap=auto_wrap, + meta_module_fn=meta_module_fn, + init_fn=_init_with_reset_params, ) @skip_if_lt_x_gpu(2) @@ -270,12 +286,14 @@ def meta_module_fn(): 
return NestedModel(device="meta") self._test_nested_model_with_meta_device( - auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, + auto_wrap=auto_wrap, + meta_module_fn=meta_module_fn, ) @skip_if_lt_x_gpu(2) @sandcastle_skip_if( - not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + not _TORCHDISTX_AVAIL, + "Test requires torchdistX: https://github.com/pytorch/torchdistX", ) @parametrize("auto_wrap", [True, False]) def test_nested_model_with_torchdistX_default_init(self, auto_wrap): @@ -288,7 +306,8 @@ def meta_module_fn(): @skip_if_lt_x_gpu(2) @sandcastle_skip_if( - not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + not _TORCHDISTX_AVAIL, + "Test requires torchdistX: https://github.com/pytorch/torchdistX", ) @parametrize("auto_wrap", [True, False]) def test_nested_model_with_torchdistX_init_fn(self, auto_wrap): @@ -296,7 +315,9 @@ def meta_module_fn(): return deferred_init.deferred_init(NestedModel, device="cuda") self._test_nested_model_with_meta_device( - auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_torchdistX, + auto_wrap=auto_wrap, + meta_module_fn=meta_module_fn, + init_fn=_init_with_torchdistX, ) def _test_bad_arg(self, meta_module_fn): @@ -306,7 +327,8 @@ def _test_bad_arg(self, meta_module_fn): @skip_if_lt_x_gpu(2) @sandcastle_skip_if( - not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + not _TORCHDISTX_AVAIL, + "Test requires torchdistX: https://github.com/pytorch/torchdistX", ) def test_bad_arg_torchdistx(self): def meta_module_fn(): diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py index ca566b984002a..98cd6488ae5e7 100644 --- a/test/distributed/fsdp/test_fsdp_misc.py +++ b/test/distributed/fsdp/test_fsdp_misc.py @@ -1,36 +1,36 @@ # Owner(s): ["oncall: distributed"] -from copy import deepcopy import functools import sys from collections import namedtuple from 
contextlib import suppress +from copy import deepcopy import torch import torch.distributed as dist import torch.nn as nn -from torch.distributed.fsdp import FlatParameter -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import ShardingStrategy, CPUOffload -from torch.distributed.fsdp.wrap import ( - always_wrap_policy, - transformer_auto_wrap_policy, +from torch.distributed.fsdp import ( + CPUOffload, + FlatParameter, + FullyShardedDataParallel as FSDP, + ShardingStrategy, ) +from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( + _assert_module_states, CUDAInitMode, FSDPInitMode, FSDPTest, NestedWrappedModule, TransformerWithSharedParams, - _assert_module_states, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -71,9 +71,7 @@ def forward(self, x): t = torch.ones(1, device="cuda", requires_grad=True) MyOutputType = namedtuple( - "MyOutputType", - ["a", "b", "c", "d"], - defaults=(t, t, t, t) + "MyOutputType", ["a", "b", "c", "d"], defaults=(t, t, t, t) ) inp = MyOutputType() @@ -89,7 +87,6 @@ def forward(self, x): @skip_if_lt_x_gpu(2) def test_fsdp_not_all_outputs_used_in_loss(self): - class MyModule(nn.Module): def __init__(self): super().__init__() @@ -108,10 +105,7 @@ def _check_resharded(fsdp_module): full_param = param._full_param_padded self.assertEqual(full_param.storage().size(), 0) - self.assertEqual( - param.data_ptr(), - param._local_shard.data_ptr() - ) + self.assertEqual(param.data_ptr(), param._local_shard.data_ptr()) def _check_equal(local, fsdp): with FSDP.summon_full_params(fsdp): @@ -121,7 +115,7 @@ def _check_equal(local, fsdp): for 
sharding_strategy in [ ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP, - ShardingStrategy.NO_SHARD + ShardingStrategy.NO_SHARD, ]: with self.subTest(sharding_strategy=sharding_strategy): fsdp_ctor = functools.partial(FSDP, sharding_strategy=sharding_strategy) @@ -160,7 +154,10 @@ def _check_equal(local, fsdp): # Ensure at least some change from previous params, otherwise # above check would be vacuously true. self.assertTrue( - any(not torch.equal(p1, p2) for p1, p2 in zip(prev_params, m_local.parameters())) + any( + not torch.equal(p1, p2) + for p1, p2 in zip(prev_params, m_local.parameters()) + ) ) prev_params = [p.clone() for p in local_m.parameters()] opt.zero_grad() @@ -168,7 +165,6 @@ def _check_equal(local, fsdp): dist.barrier() - @skip_if_lt_x_gpu(2) @parametrize("use_second_layer", [True, False]) @parametrize("sharding_strategy", [ShardingStrategy.NO_SHARD, None]) @@ -193,10 +189,10 @@ def forward(self, x, y): fsdp = FSDP( MyModel().cuda(), sharding_strategy=sharding_strategy, - auto_wrap_policy=always_wrap_policy + auto_wrap_policy=always_wrap_policy, ) - x = torch.randn(10, 10, device='cuda') - y = torch.randn(10, 10, device='cuda') + x = torch.randn(10, 10, device="cuda") + y = torch.randn(10, 10, device="cuda") for i in range(4): if use_second_layer: a, b = fsdp(x, y) @@ -241,6 +237,7 @@ def test_fsdp_device_id_cpu_offload(self): Ensures that even if device_id is specified but we have CPU offload, module is on CPU after init. """ + class MyModel(nn.Module): def __init__(self): super().__init__() @@ -256,7 +253,7 @@ def forward(self, x): model, auto_wrap_policy=always_wrap_policy, cpu_offload=CPUOffload(offload_params=True), - device_id=torch.cuda.current_device() + device_id=torch.cuda.current_device(), ) cpu_device = torch.device("cpu") @@ -281,7 +278,8 @@ def test_fsdp_device_id(self, use_index): without specifying a device ID (i.e. 
``torch.device("cuda")``) warns """ dev_id = ( - torch.cuda.current_device() if use_index + torch.cuda.current_device() + if use_index else torch.device("cuda", torch.cuda.current_device()) ) @@ -289,8 +287,7 @@ def _check_device_matches(module, device_id): """Checks that the ``FlatParameter``s in ``module`` have device matching ``device_id``.""" devices = { - p.device for p in module.parameters() - if isinstance(p, FlatParameter) + p.device for p in module.parameters() if isinstance(p, FlatParameter) } assert len(devices) > 0 self.assertEqual(1, len(devices)) @@ -328,11 +325,10 @@ def _check_device_matches(module, device_id): self.process_group, FSDPInitMode.RECURSIVE, CUDAInitMode.CUDA_BEFORE, - fsdp_kwargs={"device_id": torch.device("cuda")} + fsdp_kwargs={"device_id": torch.device("cuda")}, ) _check_device_matches( - nested_wrapped_module, - torch.device("cuda", torch.cuda.current_device()) + nested_wrapped_module, torch.device("cuda", torch.cuda.current_device()) ) @skip_if_lt_x_gpu(2) @@ -340,10 +336,9 @@ def test_module_device_mismatches_device_id(self): """Tests that specifying a ``device_id`` argument to FSDP for a GPU module that does not match the GPU device ID raises an error.""" context = ( - self.assertRaisesRegex( - ValueError, - f"cuda:{self.rank} vs cuda:0" - ) if self.rank != 0 else suppress() + self.assertRaisesRegex(ValueError, f"cuda:{self.rank} vs cuda:0") + if self.rank != 0 + else suppress() ) with context: NestedWrappedModule.init( @@ -360,6 +355,7 @@ def test_module_device_mismatches_device_id(self): def test_multi_device_not_supported(self): """Tests that wrapping a multi-device module (i.e. with submodules on both GPU and CPU) with FSDP raises an error.""" + class MultiDeviceModule(nn.Module): def __init__(self): super().__init__() @@ -392,11 +388,14 @@ def test_no_params(self): # is computed as torch.cuda.current_device when there are no params. 
no_params = nn.ReLU().cuda() context = ( - self.assertRaisesRegex( - ValueError, - f"Inconsistent.*cuda:{self.rank} vs cuda:0" + ( + self.assertRaisesRegex( + ValueError, f"Inconsistent.*cuda:{self.rank} vs cuda:0" + ) ) - ) if self.rank != 0 else suppress() + if self.rank != 0 + else suppress() + ) with context: module = FSDP(no_params, device_id=0) @@ -439,7 +438,7 @@ def test_cpu_init_with_sync_module_states(self): ) with self.assertRaisesRegex( ValueError, - "Module has CPU parameters, but sync_module_states=True is specified." + "Module has CPU parameters, but sync_module_states=True is specified.", ): FSDP(nested_wrapped_module, self.process_group, sync_module_states=True) @@ -457,6 +456,7 @@ def test_fsdp_same_model_across_ranks(self): FSDP broadcasts model from rank 0 to ensure it starts off with the same values. """ + class MyModel(nn.Module): def __init__(self, rank): super().__init__() @@ -467,19 +467,27 @@ def __init__(self, rank): self.register_buffer("buffer", torch.ones(1) * rank) m = MyModel(self.rank).cuda() - _assert_module_states(m, process_group=self.process_group, assert_fn=self.assertNotEqual) + _assert_module_states( + m, process_group=self.process_group, assert_fn=self.assertNotEqual + ) # Passing sync_module_states into FSDP makes model the same during init. fsdp = FSDP(m, sync_module_states=True) with fsdp.summon_full_params(fsdp): - _assert_module_states(fsdp, process_group=self.process_group, assert_fn=self.assertEqual) + _assert_module_states( + fsdp, process_group=self.process_group, assert_fn=self.assertEqual + ) # sync_module_states also works with CPU module with device_id passed in m = MyModel(self.rank) - _assert_module_states(m, process_group=self.process_group, assert_fn=self.assertNotEqual) + _assert_module_states( + m, process_group=self.process_group, assert_fn=self.assertNotEqual + ) # Passing sync_module_states into FSDP makes model the same during init. 
fsdp = FSDP(m, device_id=torch.cuda.current_device(), sync_module_states=True) with fsdp.summon_full_params(fsdp): - _assert_module_states(fsdp, process_group=self.process_group, assert_fn=self.assertEqual) + _assert_module_states( + fsdp, process_group=self.process_group, assert_fn=self.assertEqual + ) instantiate_parametrized_tests(TestFSDPMisc) diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py index 4440e394179ab..a65d0378a3a94 100644 --- a/test/distributed/fsdp/test_fsdp_mixed_precision.py +++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py @@ -11,9 +11,13 @@ import torch.nn as nn import torch.nn.functional as F from torch import distributed as dist -from torch.distributed.fsdp import BackwardPrefetch, CPUOffload -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import MixedPrecision, ShardingStrategy +from torch.distributed.fsdp import ( + BackwardPrefetch, + CPUOffload, + FullyShardedDataParallel as FSDP, + MixedPrecision, + ShardingStrategy, +) from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy from torch.nn.modules.batchnorm import _BatchNorm @@ -23,19 +27,20 @@ CUDAInitMode, FSDPInitMode, FSDPTest, - TransformerWithSharedParams, subtest_name, + TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, sandcastle_skip_if, + TEST_WITH_DEV_DBG_ASAN, ) try: import torchvision + HAS_TORCHVISION = True except ImportError: HAS_TORCHVISION = False @@ -66,7 +71,9 @@ mp_only_reduce = MixedPrecision(reduce_dtype=torch.float16) # Only parameters are cast (thus comm should happen in the param_dtype precision) -mp_only_param_and_buf = MixedPrecision(param_dtype=torch.float16, buffer_dtype=torch.float16) +mp_only_param_and_buf = MixedPrecision( + 
param_dtype=torch.float16, buffer_dtype=torch.float16 +) # Nothing is cast (thus param, comm, grad, and buffer should be in the full precision) mp_no_mixed_precision = MixedPrecision() @@ -80,7 +87,7 @@ mp_diff_buffer_and_reduce = MixedPrecision( param_dtype=torch.float16, buffer_dtype=torch.bfloat16, - reduce_dtype=torch.float32 + reduce_dtype=torch.float32, ) mp_configs.extend([mp_diff_buffer_and_reduce]) @@ -88,18 +95,18 @@ _BUFFER_ORIG_DTYPE = torch.float64 params = "mp_config,cpu_offload,full_precision_param_dtype,enable_sharded_grad_scaler" -cpu_offload_config = [ - CPUOffload(offload_params=True), CPUOffload(offload_params=False) -] +cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] full_precision_param_dtype_config = [torch.float32, torch.float64] enable_sharded_grad_scaler = ["enable_sharded_grad_scaler", None] -configs = list(product( - mp_configs, - cpu_offload_config, - full_precision_param_dtype_config, - enable_sharded_grad_scaler, -)) +configs = list( + product( + mp_configs, + cpu_offload_config, + full_precision_param_dtype_config, + enable_sharded_grad_scaler, + ) +) test_name_mapping = { str(CPUOffload(offload_params=True)): "offload_true", @@ -110,18 +117,21 @@ str(mp_no_mixed_precision): "mp_no_mp", str(torch.float32): "fp32", str(torch.float64): "fp64", - "enable_sharded_grad_scaler": "enable_sharded_grad_scaler" + "enable_sharded_grad_scaler": "enable_sharded_grad_scaler", } if nccl_supports_bf16: - test_name_mapping.update({ - str(mp_diff_buffer_and_reduce): "mp_diff_buffer_reduce", - }) + test_name_mapping.update( + { + str(mp_diff_buffer_and_reduce): "mp_diff_buffer_reduce", + } + ) subtest_name = partial(subtest_name, test_name_mapping) _CURRENT_FULL_PRECISION_PARAM_DTYPE = None + @contextlib.contextmanager def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype): """ @@ -138,14 +148,16 @@ def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype): 
dist.reduce_scatter_tensor = orig_reduce_scatter _CURRENT_FULL_PRECISION_PARAM_DTYPE = None + class LinearMixedPrecision(nn.Module): """ A linear module with extra checks for mixed precision training. """ + def __init__(self, param_dtype): super().__init__() self.lin = nn.Linear(10, 10, bias=False).to(param_dtype) - self.register_buffer('buffer', torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE)) + self.register_buffer("buffer", torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE)) self._orig_param_type = param_dtype self._orig_buffer_dtype = _BUFFER_ORIG_DTYPE @@ -153,11 +165,13 @@ def forward(self, tup): # Param and input should be the mixed precision type inp, cls, fsdp, mp_config, full_precision_param_dtype = tup expected_param_type = ( - mp_config.param_dtype if mp_config.param_dtype is not None + mp_config.param_dtype + if mp_config.param_dtype is not None else self._orig_param_type ) expected_buffer_type = ( - mp_config.buffer_dtype if mp_config.buffer_dtype is not None + mp_config.buffer_dtype + if mp_config.buffer_dtype is not None else self._orig_buffer_dtype ) cls.assertEqual(inp.dtype, expected_param_type) @@ -193,7 +207,7 @@ def forward(self, tup): if mp_config.param_dtype is not None: cls.assertEqual(0, param._mp_shard.storage().size()) else: - cls.assertFalse(hasattr(param, '_mp_shard')) + cls.assertFalse(hasattr(param, "_mp_shard")) elif param_is_sharded: # This FSDP unit is not active as full param has been # freed or not yet allocated. 
Ensure param points to full @@ -219,7 +233,9 @@ def world_size(self): def _get_simple_nested_model(self, param_dtype, *fsdp_args, **fsdp_kwargs): model = FSDP( nn.Sequential( - FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs), + FSDP( + LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs + ), LinearMixedPrecision(param_dtype).cuda(), ), *fsdp_args, @@ -228,7 +244,9 @@ def _get_simple_nested_model(self, param_dtype, *fsdp_args, **fsdp_kwargs): return model def _get_simple_model(self, param_dtype, *fsdp_args, **fsdp_kwargs): - model = FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs) + model = FSDP( + LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs + ) return model def _validate_no_mp_shard(self, fsdp_model): @@ -239,7 +257,7 @@ def _validate_no_mp_shard(self, fsdp_model): fsdp_units = FSDP.fsdp_modules(fsdp_model) for fsdp in fsdp_units: for param in fsdp.params: - self.assertFalse(hasattr(param, '_mp_shard')) + self.assertFalse(hasattr(param, "_mp_shard")) def _validate_mp_shard_freed(self, fsdp_model): """ @@ -251,11 +269,7 @@ def _validate_mp_shard_freed(self, fsdp_model): self.assertEqual(0, param._mp_shard.storage().size()) def _reduce_scatter_validate_mp( - self, - orig_reduce_scatter, - mp_config, - *args, - **kwargs + self, orig_reduce_scatter, mp_config, *args, **kwargs ): """ Runs reduce-scatter but verifies mixed precision settings before. This @@ -278,9 +292,11 @@ def _reduce_scatter_validate_mp( # If reduce_dtype is not specified (is None) we comm. in the param_dtype # if that is specified, otherwise full precision dtype. 
expected_dtype = ( - mp_config.reduce_dtype if mp_config.reduce_dtype is not None + mp_config.reduce_dtype + if mp_config.reduce_dtype is not None else ( - mp_config.param_dtype if mp_config.param_dtype is not None + mp_config.param_dtype + if mp_config.param_dtype is not None else _CURRENT_FULL_PRECISION_PARAM_DTYPE ) ) @@ -357,14 +373,18 @@ def _run_test_mixed_precision_e2e( # Patch reduce_scatter to add validation for mixed precision types. orig_reduce_scatter = dist.reduce_scatter_tensor test_reduce_scatter = partial( - self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config, + self._reduce_scatter_validate_mp, + orig_reduce_scatter, + mp_config, ) with patch_reduce_scatter(test_reduce_scatter, full_precision_param_dtype): scaler = ShardedGradScaler(enabled=enable_sharded_grad_scaler) optim = torch.optim.Adam(model.parameters()) for _ in range(3): - inp = torch.randn(3, 10, device='cuda', dtype=full_precision_param_dtype) + inp = torch.randn( + 3, 10, device="cuda", dtype=full_precision_param_dtype + ) # Forward pass of LinearMixedPrecision check casting of # inputs, params, buffers. 
act, *_ = model( @@ -409,7 +429,9 @@ def _run_test_mixed_precision_e2e( for param in model.parameters(): self.assertEqual(param.dtype, full_precision_param_dtype) if param.grad is not None: - self.assertEqual(param.grad.dtype, full_precision_param_dtype) + self.assertEqual( + param.grad.dtype, full_precision_param_dtype + ) # Unscale the gradients and step scaler.step(optim) @@ -448,8 +470,9 @@ def _run_test_mixed_precision_e2e( self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE) else: self.assertEqual( - tensor.dtype, full_precision_param_dtype, - f"{name}: {tensor.dtype} vs {full_precision_param_dtype}" + tensor.dtype, + full_precision_param_dtype, + f"{name}: {tensor.dtype} vs {full_precision_param_dtype}", ) # After state_dict, buffer's dtype should have been restored @@ -475,7 +498,7 @@ def _get_subtest_config(self) -> Dict[str, List[Any]]: None, BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, - ] + ], } @skip_if_lt_x_gpu(2) @@ -518,7 +541,9 @@ def _test_mixed_precision_embedding_table(self, mp_config): param_dtype = mp_config.param_dtype or torch.float32 orig_reduce_scatter = dist.reduce_scatter_tensor test_reduce_scatter = partial( - self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config, + self._reduce_scatter_validate_mp, + orig_reduce_scatter, + mp_config, ) with patch_reduce_scatter(test_reduce_scatter, param_dtype): # TODO: `test_mp_embedding_reduce()` fails if we do not wrap the @@ -570,9 +595,11 @@ def test_mp_embedding_params_and_reduce_diff(self): params_and_reduce_different = MixedPrecision( param_dtype=torch.float16, reduce_dtype=torch.float32, - buffer_dtype=torch.float16 + buffer_dtype=torch.float16, + ) + self._test_mixed_precision_embedding_table( + mp_config=params_and_reduce_different ) - self._test_mixed_precision_embedding_table(mp_config=params_and_reduce_different) @skip_if_lt_x_gpu(2) @skipIfNoTorchVision @@ -583,11 +610,12 @@ def test_mixed_precision_resnet(self): """ resnet_model = 
torchvision.models.resnet50().cuda() resnet_model = nn.SyncBatchNorm.convert_sync_batchnorm( - resnet_model, - process_group=dist.distributed_c10d._get_default_group() + resnet_model, process_group=dist.distributed_c10d._get_default_group() ) - n_bn = sum(1 if isinstance(x, _BatchNorm) else 0 for x in resnet_model.modules()) - inp = torch.ones(1, 3, 1000, 1000, device='cuda') + n_bn = sum( + 1 if isinstance(x, _BatchNorm) else 0 for x in resnet_model.modules() + ) + inp = torch.ones(1, 3, 1000, 1000, device="cuda") mp_config = MixedPrecision( param_dtype=torch.float16, reduce_dtype=torch.float16, @@ -596,7 +624,7 @@ def test_mixed_precision_resnet(self): fsdp = FSDP( resnet_model, auto_wrap_policy=size_based_auto_wrap_policy, - mixed_precision=mp_config + mixed_precision=mp_config, ) # Batchnorm units should be wrapped individually. Validate this by # ensuring there are equal no. of FSDP units that are BN as BN units @@ -652,7 +680,7 @@ def never_wrap_policy(*args, **kwargs): ) with self.assertWarnsRegex( expected_warning=UserWarning, - expected_regex="batch norm submodules will be wrapped as separate" + expected_regex="batch norm submodules will be wrapped as separate", ): model = FSDP( net, @@ -669,7 +697,7 @@ def never_wrap_policy(*args, **kwargs): self.assertEqual(no_mixed_precision, bn.mixed_precision) self.assertNotEqual(no_mixed_precision, model.mixed_precision) - inp = torch.randn((1, 2), device='cuda') + inp = torch.randn((1, 2), device="cuda") # Without FSDP BN mixed precision fix, this would result in # RuntimeError: Expected counts to have type Half but got Float # for syncBN @@ -680,6 +708,7 @@ class TestFSDPMixedPrecisionUnsharded(TestFSDPMixedPrecision): """ Smaller test suite for unshared param (i.e. world_size == 1) case. 
""" + @property def world_size(self): return 1 @@ -719,6 +748,7 @@ def test_mixed_precision_e2e_full_shard(self): enable_sharded_grad_scaler=False, ) + instantiate_parametrized_tests(TestFSDPMixedPrecisionSharded) if __name__ == "__main__": diff --git a/test/distributed/fsdp/test_fsdp_multiple_forward.py b/test/distributed/fsdp/test_fsdp_multiple_forward.py index c9afbd465f28e..7823f9349a005 100644 --- a/test/distributed/fsdp/test_fsdp_multiple_forward.py +++ b/test/distributed/fsdp/test_fsdp_multiple_forward.py @@ -9,12 +9,8 @@ from torch.nn.parallel import DistributedDataParallel from torch.optim import SGD from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, - get_full_params, -) -from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests - +from torch.testing._internal.common_fsdp import FSDPTest, get_full_params +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) diff --git a/test/distributed/fsdp/test_fsdp_multiple_wrapping.py b/test/distributed/fsdp/test_fsdp_multiple_wrapping.py index 0a3b9e2e2e068..58298fcce26ff 100644 --- a/test/distributed/fsdp/test_fsdp_multiple_wrapping.py +++ b/test/distributed/fsdp/test_fsdp_multiple_wrapping.py @@ -9,8 +9,7 @@ from torch.optim import SGD from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import FSDPTest -from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests - +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py index e4199ad532a6b..5fe75ee309fa5 100644 --- 
a/test/distributed/fsdp/test_fsdp_optim_state.py +++ b/test/distributed/fsdp/test_fsdp_optim_state.py @@ -2,7 +2,7 @@ import bisect import sys -from enum import Enum, auto +from enum import auto, Enum from typing import Any, Callable, Dict, List, Optional, Tuple, Type import torch @@ -25,15 +25,13 @@ TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) -STATE_DICT_TYPE = [ - StateDictType.FULL_STATE_DICT, StateDictType.SHARDED_STATE_DICT -] +STATE_DICT_TYPE = [StateDictType.FULL_STATE_DICT, StateDictType.SHARDED_STATE_DICT] if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) @@ -49,6 +47,7 @@ class _OSDCommMethod(Enum): """Method for communicating the optimizer state dict for internal tests.""" + BROADCAST_OBJECT_LIST = auto() SCATTER_FULL_OSD = auto() FLATTEN_SHARDED_OSD = auto() @@ -56,12 +55,14 @@ class _OSDCommMethod(Enum): class _ModelClass(Enum): """Different model type to test.""" + NESTED = auto() TRANSFORMER = auto() class Bias(torch.nn.Module): """This module applies a 1D additive bias with dimension ``dim``.""" + def __init__(self, dim: int) -> None: super().__init__() assert dim > 0 @@ -82,6 +83,7 @@ class BlockA(torch.nn.Module): Bias1 bias """ + def __init__(self, in_dim: int, out_dim: int) -> None: super().__init__() assert all(v > 0 for v in (in_dim, out_dim)) @@ -98,6 +100,7 @@ def forward(self, x): x = self.bias_module1(x) return x + class BlockB(torch.nn.Module): """ Used to define interesting nested structure for FSDP wrapping. 
@@ -108,6 +111,7 @@ class BlockB(torch.nn.Module): Bias bias """ + def __init__(self, in_dim: int, out_dim: int) -> None: super().__init__() assert all(v > 0 for v in (in_dim, out_dim)) @@ -166,21 +170,30 @@ def wrap( fsdp_kwargs = {} # Flatten Bias0; then flatten weight and Bias1 together into `block1` model.block1.bias_module0 = FSDP( - model.block1.bias_module0, process_group=group, **fsdp_kwargs, + model.block1.bias_module0, + process_group=group, + **fsdp_kwargs, ) model.block1 = FSDP(model.block1, process_group=group, **fsdp_kwargs) # Flatten Bias0; flatten Bias1; then flatten weight into `block2[1]` model.block2[1].bias_module0 = FSDP( - model.block2[1].bias_module0, process_group=group, **fsdp_kwargs, + model.block2[1].bias_module0, + process_group=group, + **fsdp_kwargs, ) model.block2[1].bias_module1 = FSDP( - model.block2[1].bias_module1, process_group=group, **fsdp_kwargs, + model.block2[1].bias_module1, + process_group=group, + **fsdp_kwargs, ) model.block2[1] = FSDP(model.block2[1], process_group=group, **fsdp_kwargs) # Flatten weight, Bias, bias into `block2[2]` ignored_modules = [model.block2[2].bias_module0] if ignore_modules else None model.block2[2] = FSDP( - model.block2[2], process_group=group, ignored_modules=ignored_modules, **fsdp_kwargs, + model.block2[2], + process_group=group, + ignored_modules=ignored_modules, + **fsdp_kwargs, ) return model @@ -193,7 +206,9 @@ def wrap_alt( if fsdp_kwargs is None: fsdp_kwargs = {} model.block0.bias_module0 = FSDP( - model.block0.bias_module0, process_group=group, **fsdp_kwargs, + model.block0.bias_module0, + process_group=group, + **fsdp_kwargs, ) model.block0 = FSDP(model.block0, process_group=group, **fsdp_kwargs) return model @@ -211,7 +226,8 @@ def wrap_with_unmanaged_params( # (`model.block2[2]`) or a module not to be wrapped with FSDP (`model`) register_module = model.block2[2] if add_to_fsdp_module else model register_module.register_parameter( - "unmanaged_param", unmanaged_param, + 
"unmanaged_param", + unmanaged_param, ) # For simplicity, we only add a single unmanaged parameter, but should # be easy to generalize if needed @@ -256,8 +272,7 @@ def param_group0(self) -> List[torch.nn.Parameter]: def param_group1(self) -> List[torch.nn.Parameter]: # Deviate from the `model.parameters()` order further by rearranging # `block2`'s parameters to be before `block0`'s parameters - return list(self.block2.parameters()) + \ - list(self.block0.parameters()) + return list(self.block2.parameters()) + list(self.block0.parameters()) class TestFSDPOptimState(FSDPTest): @@ -281,14 +296,17 @@ def _init_nested_model( ): model = NestedModel().to(device) if wrap: - model = NestedModel.wrap_alt(model, group, fsdp_kwargs) if wrap_alt \ + model = ( + NestedModel.wrap_alt(model, group, fsdp_kwargs) + if wrap_alt else NestedModel.wrap(model, group, fsdp_kwargs=fsdp_kwargs) + ) if not use_multiple_param_groups: optim_input = list(model.parameters()) else: optim_input = [ {"params": model.param_group0()}, - {"params": model.param_group1(), "weight_decay": 0.9} + {"params": model.param_group1(), "weight_decay": 0.9}, ] # Use a reversed parameter order for the optimizer input on odd ranks if use_diff_optim_inputs and self.rank % 2 == 1: @@ -353,7 +371,9 @@ def _broadcast_full_osd(self, full_osd: Dict[str, Any], group=None): ``torch.save()`` and ``torch.load()`` so that all ranks can have it.""" obj_list = [full_osd] dist.broadcast_object_list( - obj_list, src=0, group=group, + obj_list, + src=0, + group=group, ) full_osd = obj_list[0] return full_osd @@ -375,8 +395,9 @@ def _are_equal_states( # Check the values on CPU to be device-agnostic value1 = value1.cpu() value2 = value2.cpu() - if value1.shape != value2.shape or \ - not torch.all(torch.isclose(value1, value2)): + if value1.shape != value2.shape or not torch.all( + torch.isclose(value1, value2) + ): return False else: # non-tensor state if value1 != value2: @@ -422,10 +443,12 @@ def _check_same_state( # Check for at 
least one match (may be > 1 in toy edge cases, e.g. # multiple biases); nonetheless, each having >= 1 match and the two # lists having equal length imply that the list contents are equal - self.assertTrue(any( - self._are_equal_states(fsdp_osd_state, ref_osd_state) - for ref_osd_state in ref_osd_states - )) + self.assertTrue( + any( + self._are_equal_states(fsdp_osd_state, ref_osd_state) + for ref_osd_state in ref_osd_states + ) + ) def _check_same_param_groups( self, @@ -443,10 +466,12 @@ def _check_same_param_groups( full_osd_param_groups = full_osd["param_groups"] self.assertTrue(len(full_osd_param_groups), len(ref_osd_param_groups)) for full_osd_pg, ref_osd_pg in zip( - full_osd_param_groups, ref_osd_param_groups, + full_osd_param_groups, + ref_osd_param_groups, ): self.assertEqual( - set(full_osd_pg.keys()), set(ref_osd_pg.keys()), + set(full_osd_pg.keys()), + set(ref_osd_pg.keys()), ) for name, full_osd_value in full_osd_pg.items(): if name == "params" and not check_same_param_keys: @@ -508,18 +533,24 @@ def _test_optim_state_dict_nested( return # not supported NUM_ITERS = 3 model1, optim1, optim_input = self._init_nested_model( - wrap=True, use_multiple_param_groups=use_multiple_param_groups, + wrap=True, + use_multiple_param_groups=use_multiple_param_groups, use_diff_optim_inputs=use_diff_optim_inputs, ) losses1 = self._step_model(model1, optim1, num_iters=NUM_ITERS) if state_dict_type == StateDictType.FULL_STATE_DICT: if use_optim_input: fsdp_osd = FSDP.full_optim_state_dict( - model1, optim1, optim_input, rank0_only=rank0_only, + model1, + optim1, + optim_input, + rank0_only=rank0_only, ) else: fsdp_osd = FSDP.full_optim_state_dict( - model1, optim1, rank0_only=rank0_only, + model1, + optim1, + rank0_only=rank0_only, ) else: if use_optim_input: @@ -531,7 +562,8 @@ def _test_optim_state_dict_nested( self.assertEqual(len(fsdp_osd), 0) return model2, optim2, _ = self._init_nested_model( - wrap=False, use_multiple_param_groups=use_multiple_param_groups, + 
wrap=False, + use_multiple_param_groups=use_multiple_param_groups, use_diff_optim_inputs=use_diff_optim_inputs, ) losses2 = self._step_model(model2, optim2, num_iters=NUM_ITERS) @@ -544,10 +576,14 @@ def _test_optim_state_dict_nested( # parameter IDs check_same_param_keys = False self._check_same_param_groups( - fsdp_osd, ref_osd, check_same_param_keys=check_same_param_keys, + fsdp_osd, + ref_osd, + check_same_param_keys=check_same_param_keys, ) self._check_same_state( - fsdp_osd, ref_osd, check_same_param_keys=check_same_param_keys, + fsdp_osd, + ref_osd, + check_same_param_keys=check_same_param_keys, ) @skip_if_lt_x_gpu(2) @@ -562,12 +598,13 @@ def test_full_optim_state_dict_keys(self): # Add checkpointing to ensure optim_state_dict and state_dict strip out # checkpointing prefixes. apply_activation_checkpointing( - model, - check_fn=lambda module: isinstance(module, torch.nn.Sequential) + model, check_fn=lambda module: isinstance(module, torch.nn.Sequential) ) optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3) self._step_model(model, optim, device) - optim_state_dict = FSDP.full_optim_state_dict(wrapped_model, optim, rank0_only=False) + optim_state_dict = FSDP.full_optim_state_dict( + wrapped_model, optim, rank0_only=False + ) with FSDP.state_dict_type(wrapped_model, StateDictType.FULL_STATE_DICT): state_dict = wrapped_model.state_dict() self.assertEqual(optim_state_dict["state"].keys(), state_dict.keys()) @@ -771,11 +808,13 @@ def _test_load_optim_state( # First, run a wrapped model with full world size for a few iterations model1, optim1, optim_input1 = initializer( - wrap=True, use_multiple_param_groups=use_multiple_param_groups, + wrap=True, + use_multiple_param_groups=use_multiple_param_groups, ) self._step_model(model1, optim1, num_iters=NUM_ITERS) fsdp_osd1 = ( - osd_method(model1, optim1, optim_input1) if use_optim_input + osd_method(model1, optim1, optim_input1) + if use_optim_input else osd_method(model1, optim1) ) if halve_world_size: @@ 
-790,7 +829,8 @@ def _test_load_optim_state( # Second, run a wrapped model with (possibly) halved world size and # (possibly) differing `optim_input` across ranks model2, optim2, optim_input2 = initializer( - wrap=True, group=new_group, + wrap=True, + group=new_group, use_multiple_param_groups=use_multiple_param_groups, use_diff_optim_inputs=use_diff_optim_inputs, **new_model_kwargs, # specify `wrap_alt` to change wrapping @@ -807,13 +847,17 @@ def _test_load_optim_state( if osd_comm_method == _OSDCommMethod.BROADCAST_OBJECT_LIST: fsdp_osd1 = self._broadcast_full_osd(fsdp_osd1, group=new_group) sharded_osd1 = ( - FSDP.shard_full_optim_state_dict(fsdp_osd1, model2, optim_input=optim_input2) + FSDP.shard_full_optim_state_dict( + fsdp_osd1, model2, optim_input=optim_input2 + ) if use_optim_input else FSDP.shard_full_optim_state_dict(fsdp_osd1, model2, optim=optim2) ) fsdp_osd2 = self._broadcast_full_osd(fsdp_osd2, group=new_group) sharded_osd2 = ( - FSDP.shard_full_optim_state_dict(fsdp_osd2, model2, optim_input=optim_input2) + FSDP.shard_full_optim_state_dict( + fsdp_osd2, model2, optim_input=optim_input2 + ) if use_optim_input else FSDP.shard_full_optim_state_dict(fsdp_osd2, model2, optim=optim2) ) @@ -824,7 +868,8 @@ def _test_load_optim_state( model2, optim_input=optim_input2, group=new_group, - ) if use_optim_input + ) + if use_optim_input else FSDP.scatter_full_optim_state_dict( fsdp_osd1 if self.rank == 0 else None, model2, @@ -838,7 +883,8 @@ def _test_load_optim_state( model2, optim_input=optim_input2, group=new_group, - ) if use_optim_input + ) + if use_optim_input else FSDP.scatter_full_optim_state_dict( fsdp_osd2 if self.rank == 0 else None, model2, @@ -851,18 +897,28 @@ def _test_load_optim_state( elif osd_comm_method == _OSDCommMethod.FLATTEN_SHARDED_OSD: sharded_osd1 = ( FSDP.flatten_sharded_optim_state_dict( - fsdp_osd1, model2, optim_input=optim_input2, - ) if use_optim_input + fsdp_osd1, + model2, + optim_input=optim_input2, + ) + if use_optim_input 
else FSDP.flatten_sharded_optim_state_dict( - fsdp_osd1, model2, optim=optim2, + fsdp_osd1, + model2, + optim=optim2, ) ) sharded_osd2 = ( FSDP.flatten_sharded_optim_state_dict( - fsdp_osd2, model2, optim_input=optim_input2, - ) if use_optim_input + fsdp_osd2, + model2, + optim_input=optim_input2, + ) + if use_optim_input else FSDP.flatten_sharded_optim_state_dict( - fsdp_osd2, model2, optim=optim2, + fsdp_osd2, + model2, + optim=optim2, ) ) @@ -872,22 +928,26 @@ def _test_load_optim_state( local_osd2 = optim2.state_dict() check_same_param_keys = True # should all have matching parameter IDs self._check_same_param_groups( - sharded_osd2, local_osd2, + sharded_osd2, + local_osd2, check_same_param_keys=check_same_param_keys, ) self._check_same_state( - sharded_osd2, local_osd2, + sharded_osd2, + local_osd2, check_same_param_keys=check_same_param_keys, ) # Check that sharding the first model's full/sharded optimizer state dict # according to the second model is equivalent to the second model's # local optimizer state dict self._check_same_param_groups( - sharded_osd1, local_osd2, + sharded_osd1, + local_osd2, check_same_param_keys=check_same_param_keys, ) self._check_same_state( - sharded_osd1, local_osd2, + sharded_osd1, + local_osd2, check_same_param_keys=check_same_param_keys, ) # As a sanity check, check that we can load and run a few iterations @@ -955,7 +1015,8 @@ def _test_shard_full_optim_state_dict_unmanaged_params( device = torch.device("cuda") model = NestedModel().to(device) model, unmanaged_params = NestedModel.wrap_with_unmanaged_params( - model, add_to_fsdp_module, + model, + add_to_fsdp_module, ) optim_input = list(model.parameters()) optim = torch.optim.Adam(optim_input, lr=1e-3) @@ -965,21 +1026,31 @@ def _test_shard_full_optim_state_dict_unmanaged_params( # unflattened parameters with zero-dimensional tensor state (i.e. # Adam "step") and others without (i.e. 
the unmanaged parameters), # which triggers an error that we have to ensure correctness - error_prefix = "^(All unflattened parameters comprising a " \ - "single flattened parameter must have scalar state with the " \ + error_prefix = ( + "^(All unflattened parameters comprising a " + "single flattened parameter must have scalar state with the " "same value and dtype)" + ) with self.assertRaisesRegex(ValueError, error_prefix): if state_dict_type == StateDictType.FULL_STATE_DICT: ( - FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim_input=optim_input) + FSDP.shard_full_optim_state_dict( + fsdp_osd, model, optim_input=optim_input + ) if use_optim_input - else FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim=optim) + else FSDP.shard_full_optim_state_dict( + fsdp_osd, model, optim=optim + ) ) else: ( - FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim_input=optim_input) + FSDP.flatten_sharded_optim_state_dict( + fsdp_osd, model, optim_input=optim_input + ) if use_optim_input - else FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim=optim) + else FSDP.flatten_sharded_optim_state_dict( + fsdp_osd, model, optim=optim + ) ) else: # If we add the unmanaged parameters to a module not wrapped with @@ -988,20 +1059,28 @@ def _test_shard_full_optim_state_dict_unmanaged_params( # externally to FSDP if state_dict_type == StateDictType.FULL_STATE_DICT: flattened_osd = ( - FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim_input=optim_input) + FSDP.shard_full_optim_state_dict( + fsdp_osd, model, optim_input=optim_input + ) if use_optim_input else FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim=optim) ) else: flattened_osd = ( - FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim_input=optim_input) + FSDP.flatten_sharded_optim_state_dict( + fsdp_osd, model, optim_input=optim_input + ) if use_optim_input - else FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim=optim) + else FSDP.flatten_sharded_optim_state_dict( + 
fsdp_osd, model, optim=optim + ) ) # Add entries for the unmanaged parameters to be able to load for unmanaged_param in unmanaged_params: NestedModel.add_unmanaged_param_entry( - flattened_osd, unmanaged_param, NUM_ITERS, + flattened_osd, + unmanaged_param, + NUM_ITERS, ) # Check that we can load the optimizer state dict optim.load_state_dict(flattened_osd) @@ -1035,7 +1114,8 @@ def _test_rekey_optim_state_dict_to_ids( NUM_ITERS = 3 # Run a wrapped model for a few iterations model1, optim1, optim_input1 = self._init_nested_model( - wrap=True, use_multiple_param_groups=use_multiple_param_groups, + wrap=True, + use_multiple_param_groups=use_multiple_param_groups, ) self._step_model(model1, optim1, num_iters=NUM_ITERS) if state_dict_type == StateDictType.FULL_STATE_DICT: @@ -1055,28 +1135,39 @@ def _test_rekey_optim_state_dict_to_ids( ) # Run a non-wrapped model for a few iterations model2, optim2, optim_input2 = self._init_nested_model( - wrap=False, use_multiple_param_groups=use_multiple_param_groups, + wrap=False, + use_multiple_param_groups=use_multiple_param_groups, ) self._step_model(model2, optim2, num_iters=NUM_ITERS) # Re-key the wrapped model's optimizer state dict using parameter IDs # according to the non-wrapped model rekeyed_osd = ( FSDP.rekey_optim_state_dict( - fsdp_osd, OptimStateKeyType.PARAM_ID, model2, optim_input=optim_input2, + fsdp_osd, + OptimStateKeyType.PARAM_ID, + model2, + optim_input=optim_input2, ) if use_optim_input else FSDP.rekey_optim_state_dict( - fsdp_osd, OptimStateKeyType.PARAM_ID, model2, optim=optim2, + fsdp_osd, + OptimStateKeyType.PARAM_ID, + model2, + optim=optim2, ) ) # Check that the re-keyed dict and actual dict are the same osd = optim2.state_dict() check_same_param_keys = True self._check_same_param_groups( - rekeyed_osd, osd, check_same_param_keys=check_same_param_keys, + rekeyed_osd, + osd, + check_same_param_keys=check_same_param_keys, ) self._check_same_state( - rekeyed_osd, osd, 
check_same_param_keys=check_same_param_keys, + rekeyed_osd, + osd, + check_same_param_keys=check_same_param_keys, ) # As a sanity check, check that we can load and run a few iterations if state_dict_type != StateDictType.SHARDED_STATE_DICT: @@ -1106,12 +1197,14 @@ def _test_rekey_optim_state_dict_to_names( NUM_ITERS = 3 # Run a wrapped model for a few iterations model1, optim1, optim_input1 = self._init_nested_model( - wrap=True, use_multiple_param_groups=use_multiple_param_groups, + wrap=True, + use_multiple_param_groups=use_multiple_param_groups, ) self._step_model(model1, optim1, num_iters=NUM_ITERS) # Run a non-wrapped model for a few iterations model2, optim2, optim_input2 = self._init_nested_model( - wrap=False, use_multiple_param_groups=use_multiple_param_groups, + wrap=False, + use_multiple_param_groups=use_multiple_param_groups, ) self._step_model(model2, optim2, num_iters=NUM_ITERS) # Re-key the non-wrapped model's optimizer state dict using parameter @@ -1119,20 +1212,32 @@ def _test_rekey_optim_state_dict_to_names( osd2 = optim2.state_dict() rekeyed_osd = ( FSDP.rekey_optim_state_dict( - osd2, OptimStateKeyType.PARAM_NAME, model2, optim_input=optim_input2, - ) if use_optim_input + osd2, + OptimStateKeyType.PARAM_NAME, + model2, + optim_input=optim_input2, + ) + if use_optim_input else FSDP.rekey_optim_state_dict( - osd2, OptimStateKeyType.PARAM_NAME, model2, optim=optim2, + osd2, + OptimStateKeyType.PARAM_NAME, + model2, + optim=optim2, ) ) # Shard the non-wrapped model's re-keyed optimizer state dict, which # maps back to (flattened) parameter IDs sharded_osd = ( FSDP.shard_full_optim_state_dict( - rekeyed_osd, model1, optim_input=optim_input1, - ) if use_optim_input + rekeyed_osd, + model1, + optim_input=optim_input1, + ) + if use_optim_input else FSDP.shard_full_optim_state_dict( - rekeyed_osd, model1, optim=optim1, + rekeyed_osd, + model1, + optim=optim1, ) ) # Check that this sharded optimizer state dict matches the wrapped @@ -1140,10 +1245,14 @@ 
def _test_rekey_optim_state_dict_to_names( osd1 = optim1.state_dict() check_same_param_keys = True self._check_same_param_groups( - sharded_osd, osd1, check_same_param_keys=check_same_param_keys, + sharded_osd, + osd1, + check_same_param_keys=check_same_param_keys, ) self._check_same_state( - sharded_osd, osd1, check_same_param_keys=check_same_param_keys, + sharded_osd, + osd1, + check_same_param_keys=check_same_param_keys, ) # As a sanity check, check that we can load and run a few iterations optim1.load_state_dict(sharded_osd) @@ -1153,6 +1262,7 @@ def _test_rekey_optim_state_dict_to_names( def test_optim_input_warning(self): """Tests that passing the ``optim_input`` argument into optimizer state checkpointing APIs issues a warning.""" + def should_check_method(method_name: str): # Check every method since they all accept `optim_input` return True @@ -1163,12 +1273,15 @@ def get_warning_context(): expected_warning=UserWarning, expected_regex=warning_regex ) - self._run_on_all_optim_state_apis(should_check_method, get_warning_context, fsdp_kwargs=None) + self._run_on_all_optim_state_apis( + should_check_method, get_warning_context, fsdp_kwargs=None + ) @skip_if_lt_x_gpu(2) def test_use_orig_params_error(self): """Tests that the optimizer state checkpointing APIs raise an error when ``use_orig_params=True``.""" + def should_check_method(method_name: str): # Skip `rekey_optim_state_dict` since that does not depend on # `use_orig_params=True` @@ -1181,7 +1294,9 @@ def get_error_context(): ) fsdp_kwargs = {"use_orig_params": True} - self._run_on_all_optim_state_apis(should_check_method, get_error_context, fsdp_kwargs) + self._run_on_all_optim_state_apis( + should_check_method, get_error_context, fsdp_kwargs + ) def _run_on_all_optim_state_apis( self, @@ -1195,12 +1310,10 @@ def _run_on_all_optim_state_apis( via ``should_check_method_fn``, which gets passed the string name of the method. 
""" - wrapped_model, wrapped_optim, wrapped_optim_input = ( - self._init_nested_model( - wrap=True, - use_multiple_param_groups=False, - fsdp_kwargs=fsdp_kwargs, - ) + wrapped_model, wrapped_optim, wrapped_optim_input = self._init_nested_model( + wrap=True, + use_multiple_param_groups=False, + fsdp_kwargs=fsdp_kwargs, ) self._step_model(wrapped_model, wrapped_optim, num_iters=2) @@ -1208,14 +1321,18 @@ def _run_on_all_optim_state_apis( if should_check_method_fn("sharded_optim_state_dict"): with context_fn(): fsdp_osd = FSDP.sharded_optim_state_dict( - wrapped_model, wrapped_optim, optim_input=wrapped_optim_input, + wrapped_model, + wrapped_optim, + optim_input=wrapped_optim_input, ) if "fsdp_osd" not in locals(): fsdp_osd = {} # may not be defined due to previous method erroring if should_check_method_fn("flatten_sharded_optim_state_dict"): with context_fn(): FSDP.flatten_sharded_optim_state_dict( - fsdp_osd, wrapped_model, optim_input=wrapped_optim_input, + fsdp_osd, + wrapped_model, + optim_input=wrapped_optim_input, ) # Full optim state dict if should_check_method_fn("full_optim_state_dict"): @@ -1229,17 +1346,23 @@ def _run_on_all_optim_state_apis( if should_check_method_fn("shard_full_optim_state_dict"): with context_fn(): FSDP.shard_full_optim_state_dict( - fsdp_osd, wrapped_model, optim_input=wrapped_optim_input, + fsdp_osd, + wrapped_model, + optim_input=wrapped_optim_input, ) if should_check_method_fn("scatter_full_optim_state_dict"): with context_fn(): FSDP.scatter_full_optim_state_dict( - fsdp_osd, wrapped_model, optim_input=wrapped_optim_input, + fsdp_osd, + wrapped_model, + optim_input=wrapped_optim_input, ) # Rekey optim state dict - nonwrapped_model, nonwrapped_optim, nonwrapped_optim_input = ( - self._init_nested_model(wrap=False, use_multiple_param_groups=False) - ) + ( + nonwrapped_model, + nonwrapped_optim, + nonwrapped_optim_input, + ) = self._init_nested_model(wrap=False, use_multiple_param_groups=False) if 
should_check_method_fn("rekey_optim_state_dict"): with context_fn(): rekeyed_osd = FSDP.rekey_optim_state_dict( diff --git a/test/distributed/fsdp/test_fsdp_overlap.py b/test/distributed/fsdp/test_fsdp_overlap.py index 07e8eba09c6c2..8bd5354b2b701 100644 --- a/test/distributed/fsdp/test_fsdp_overlap.py +++ b/test/distributed/fsdp/test_fsdp_overlap.py @@ -11,16 +11,13 @@ from torch.cuda import Event from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, -) +from torch.testing._internal.common_fsdp import FSDPTest from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, get_cycles_per_ms, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py index ed4aef39da0f9..1c663f8263354 100644 --- a/test/distributed/fsdp/test_fsdp_pure_fp16.py +++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py @@ -12,10 +12,10 @@ NestedWrappedModule, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -31,7 +31,6 @@ class TestPureFP16(FSDPTest): - @property def world_size(self): # Test fails due to inaccuracies when using more than 5 GPUs diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py index 1c230cb7400c4..2124e6b0450f5 100644 --- a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py +++ b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py @@ -22,11 +22,11 @@ subtest_name, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, - TestCase, instantiate_parametrized_tests, parametrize, 
run_tests, + TEST_WITH_DEV_DBG_ASAN, + TestCase, ) if not dist.is_available(): @@ -47,21 +47,23 @@ sharding_strategy_config = [ShardingStrategy.SHARD_GRAD_OP, None] mixed_precision = ["enable_mixed_precision", None] -configs = list(itertools.product(cpu_offload_config, - sharding_strategy_config, - mixed_precision)) +configs = list( + itertools.product(cpu_offload_config, sharding_strategy_config, mixed_precision) +) test_name_mapping = { str(CPUOffload(offload_params=True)): "offload_true", str(CPUOffload(offload_params=False)): "offload_false", str(ShardingStrategy.SHARD_GRAD_OP): "shard_grad_op", - "enable_mixed_precision": "mixed_precision" + "enable_mixed_precision": "mixed_precision", } subtest_name = functools.partial(subtest_name, test_name_mapping) class TestShardGradScaler(TestCase): - @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + @unittest.skipIf( + amp_definitely_not_available(), "no supported device (cuda, xla) found" + ) def test_grad_scaling(self): pg = DummyProcessGroup(0, 1) scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) @@ -69,21 +71,26 @@ def test_grad_scaling(self): t1 = torch.full((1,), 8.0, dtype=torch.float32, device="cpu") outputs = [t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), t1.clone()]] outputs = scaler.scale(outputs) - self.assertTrue(outputs[0] == 16.0 and outputs[1][0] == 8.0 and outputs[1][1] == 16.0) + self.assertTrue( + outputs[0] == 16.0 and outputs[1][0] == 8.0 and outputs[1][1] == 16.0 + ) self.assertTrue(outputs[2][0] == 8.0 and outputs[2][1] == 16.0) self.assertTrue(scaler._scale.device == t1.device) - @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + @unittest.skipIf( + amp_definitely_not_available(), "no supported device (cuda, xla) found" + ) def test_scaling_unscaling_sparse(self): pg = DummyProcessGroup(0, 1) scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) inv_scale = 
torch.full((1,), 0.5, dtype=torch.float, device="cpu") found_inf = torch.full((1,), 0, dtype=torch.float, device="cpu") - i = torch.tensor([[0, 1, 1], - [2, 0, 2]], device="cpu", dtype=torch.int64) + i = torch.tensor([[0, 1, 1], [2, 0, 2]], device="cpu", dtype=torch.int64) v = torch.tensor([16.0, 32.0, 64.0], dtype=torch.float, device="cpu") - s = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float) + s = torch.sparse_coo_tensor( + i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float + ) # unscale sparse tensors s1 = s.clone() @@ -95,29 +102,34 @@ def test_scaling_unscaling_sparse(self): self.assertEqual(s1.grad.to_dense(), (s / 2).to_dense()) # unscale sparse tensor: inf - v = torch.tensor([16.0, 32.0, float('inf')], dtype=torch.float, device="cpu") - s1.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float) + v = torch.tensor([16.0, 32.0, float("inf")], dtype=torch.float, device="cpu") + s1.grad = torch.sparse_coo_tensor( + i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float + ) found_inf.zero_() found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] self.assertEqual(found_inf, 1.0) # unscale sparse tensor: overflow (marked as inf) - i = torch.tensor([[1, 1, 1], - [0, 0, 2]], device="cpu", dtype=torch.int64) + i = torch.tensor([[1, 1, 1], [0, 0, 2]], device="cpu", dtype=torch.int64) # coalescing sparse tensor here will cause the value to be Inf v = torch.tensor([2**15, 2**15, 1.0], dtype=torch.float16, device="cpu") - s1 = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float16) + s1 = torch.sparse_coo_tensor( + i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float16 + ) s1.grad = s1.clone() found_inf.zero_() found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] self.assertEqual(found_inf, 1.0) - @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + @unittest.skipIf( + 
amp_definitely_not_available(), "no supported device (cuda, xla) found" + ) def test_inf_gradients_skip_optim_step(self): pg = DummyProcessGroup(0, 1) scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) loss = torch.full((1,), 4.0, dtype=torch.float32, device="cpu") - t0 = torch.tensor([float('inf')], dtype=torch.float32, device="cpu") + t0 = torch.tensor([float("inf")], dtype=torch.float32, device="cpu") t0.grad = t0.clone() opt = torch.optim.SGD([t0], lr=1.0) scaler.scale(loss) @@ -127,10 +139,7 @@ def test_inf_gradients_skip_optim_step(self): class TestShardedGradScalerParityWithDDP(FSDPTest): def _get_init_modes_for_test(self, cpu_offload): - modes = [ - CUDAInitMode.CUDA_AFTER, - CUDAInitMode.CUDA_BEFORE - ] + modes = [CUDAInitMode.CUDA_AFTER, CUDAInitMode.CUDA_BEFORE] # Note that CUDAInitMode.CUDA_NEVER works currently only with CPU # offload as we explicitly bring the param back to CUDA device. In # general, it will not work since we try to all_gather p.data which is @@ -149,11 +158,15 @@ def test_fsdp_ddp_parity_with_grad_scaler( mixed_precision: Optional[str], ): init_modes = self._get_init_modes_for_test(cpu_offload) - mp = MixedPrecision( - param_dtype=torch.float16, - reduce_dtype=torch.float16, - buffer_dtype=torch.float16, - ) if mixed_precision is not None else None + mp = ( + MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + if mixed_precision is not None + else None + ) for cuda_init_mode in init_modes: self._test_fsdp_parity( NestedWrappedModule, diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py index 6592ec108f074..f5a401590414a 100644 --- a/test/distributed/fsdp/test_fsdp_state_dict.py +++ b/test/distributed/fsdp/test_fsdp_state_dict.py @@ -13,10 +13,10 @@ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( checkpoint_wrapper, ) -from torch.distributed.fsdp import 
FullyShardedDataParallel as FSDP from torch.distributed.fsdp import ( CPUOffload, FullStateDictConfig, + FullyShardedDataParallel as FSDP, LocalStateDictConfig, MixedPrecision, ShardedStateDictConfig, @@ -24,36 +24,27 @@ ) from torch.distributed.fsdp._shard_utils import _gather_state_dict from torch.distributed.fsdp.fully_sharded_data_parallel import FLAT_PARAM -from torch.distributed.fsdp.wrap import ( - enable_wrap, - transformer_auto_wrap_policy, - wrap, -) -from torch.nn import ( - Linear, - Module, - TransformerDecoderLayer, - TransformerEncoderLayer, -) +from torch.distributed.fsdp.wrap import enable_wrap, transformer_auto_wrap_policy, wrap +from torch.nn import Linear, Module, TransformerDecoderLayer, TransformerEncoderLayer from torch.nn.parallel import DistributedDataParallel from torch.optim import SGD from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( + _assert_module_states, + _get_state_dict, + _zero_model, CUDAInitMode, FSDPInitMode, FSDPTest, + get_full_params, SkipModel, TransformerWithSharedParams, - _assert_module_states, - _get_state_dict, - _zero_model, - get_full_params, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -71,7 +62,7 @@ OUTER_SHAPE = [4, 5] BUFFER_SHAPE = [5, 5] -NON_ROOT_FSDP_PREFIX = 'non_fsdp_lin' +NON_ROOT_FSDP_PREFIX = "non_fsdp_lin" _UNFLATTENED_STATE_DICT_IMPLS = ["state_dict", "sharded_state_dict"] _FLATTENED_STATE_DICT_IMPLS = ["local_state_dict"] @@ -96,7 +87,9 @@ def __init__(self, wrap_fsdp, register_buffers=False, ignore_inner=False): "non_persistent_buffer", torch.randn(BUFFER_SHAPE), persistent=False ) if wrap_fsdp: - self.inner = FSDP(self.inner, ignored_modules=([self.inner] if ignore_inner else [])) + self.inner = FSDP( + self.inner, ignored_modules=([self.inner] if ignore_inner else []) + ) 
self.outer = Linear(*OUTER_SHAPE) if register_buffers: self.outer.register_buffer("buffer", torch.randn(BUFFER_SHAPE)) @@ -135,7 +128,9 @@ def _compare_models(self, model, model_new, assert_fn, check_fp16=False): for tensor in model_new.parameters(): self.assertEqual(tensor.dtype, torch.float16) - def _get_simple_nested_model(self, *fsdp_args, wrap=True, checkpoint_wrap=False, **fsdp_kwargs): + def _get_simple_nested_model( + self, *fsdp_args, wrap=True, checkpoint_wrap=False, **fsdp_kwargs + ): if wrap: lin1 = nn.Linear(10, 10, bias=False).cuda() lin2 = nn.Linear(10, 10, bias=False).cuda() @@ -148,7 +143,8 @@ def _get_simple_nested_model(self, *fsdp_args, wrap=True, checkpoint_wrap=False, model = FSDP(seq, *fsdp_args, **fsdp_kwargs) else: model = nn.Sequential( - nn.Linear(10, 10, bias=False).cuda(), nn.Linear(10, 10, bias=False).cuda() + nn.Linear(10, 10, bias=False).cuda(), + nn.Linear(10, 10, bias=False).cuda(), ) return model @@ -225,20 +221,24 @@ def _validate_state_dict_contents( @skip_if_lt_x_gpu(2) @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS) @parametrize("checkpoint_wrap", ["first", "second", "both"]) - def test_fsdp_state_dict_with_activation_checkpoint(self, state_dict_type, checkpoint_wrap): + def test_fsdp_state_dict_with_activation_checkpoint( + self, state_dict_type, checkpoint_wrap + ): """Tests saving the state dict, zeroing a target model's parameters, and loading the state dict, where the source and target models may have a checkpoint wrapper.""" for model_call in [ partial(self._get_simple_model), - partial(self._get_simple_nested_model) + partial(self._get_simple_nested_model), ]: model = model_call(checkpoint_wrap=(checkpoint_wrap in ["first", "both"])) with FSDP.state_dict_type(model, STATE_DICT_MAPPING[state_dict_type]): state_dict = _gather_state_dict(_get_state_dict(model, False, False)) # Possibly wrap new model in activation checkpoint wrapper to test save/ # load with this wrapper - model_new = 
model_call(checkpoint_wrap=(checkpoint_wrap in ["second", "both"])) + model_new = model_call( + checkpoint_wrap=(checkpoint_wrap in ["second", "both"]) + ) _zero_model(model_new) self._compare_models(model, model_new, self.assertNotEqual) # Would fail if checkpoint_wrapper did not correctly implement state_dict pre/post hooks @@ -250,16 +250,14 @@ def test_fsdp_state_dict_with_activation_checkpoint(self, state_dict_type, check def test_state_dict_with_shared_parameters(self, state_dict_type): auto_wrap_policy = partial( transformer_auto_wrap_policy, - transformer_layer_cls={ - TransformerEncoderLayer, TransformerDecoderLayer - }, + transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer}, ) model_creator = partial( TransformerWithSharedParams.init, self.process_group, FSDPInitMode.RECURSIVE, CUDAInitMode.CUDA_BEFORE, - {"auto_wrap_policy": auto_wrap_policy} + {"auto_wrap_policy": auto_wrap_policy}, ) fsdp_model = model_creator() @@ -293,10 +291,14 @@ def test_state_dict_rank0_offload_save_load_flow(self, use_orig_params: bool): ) # Force model parameters and buffers to be nonzero with FSDP.summon_full_params(fsdp_model): - for tensor in itertools.chain(fsdp_model.parameters(), fsdp_model.buffers()): + for tensor in itertools.chain( + fsdp_model.parameters(), fsdp_model.buffers() + ): if torch.count_nonzero(tensor) == 0: with torch.no_grad(): - tensor.add_(torch.tensor(1, dtype=tensor.dtype, device=tensor.device)) + tensor.add_( + torch.tensor(1, dtype=tensor.dtype, device=tensor.device) + ) with self._get_state_dict_mgr(fsdp_model, "state_dict", True): state_dict = deepcopy(_get_state_dict(fsdp_model)) # Initialize a non-wrapped model on all ranks @@ -357,15 +359,26 @@ def test_basic_save_and_load_state_dict( with various configs such as fp16 and cpu offload and parameters match as expected. 
""" - if ( - (state_dict_rank0_and_offload and state_dict_type != "state_dict") - or (use_orig_params and state_dict_type not in _UNFLATTENED_STATE_DICT_IMPLS) + if (state_dict_rank0_and_offload and state_dict_type != "state_dict") or ( + use_orig_params and state_dict_type not in _UNFLATTENED_STATE_DICT_IMPLS ): return # not supported for model_call in [ - partial(self._get_non_fsdp_root_module, cpu_offload=cpu_offload, use_orig_params=use_orig_params), - partial(self._get_simple_nested_model, cpu_offload=cpu_offload, use_orig_params=use_orig_params), - partial(self._get_simple_model, cpu_offload=cpu_offload, use_orig_params=use_orig_params), + partial( + self._get_non_fsdp_root_module, + cpu_offload=cpu_offload, + use_orig_params=use_orig_params, + ), + partial( + self._get_simple_nested_model, + cpu_offload=cpu_offload, + use_orig_params=use_orig_params, + ), + partial( + self._get_simple_model, + cpu_offload=cpu_offload, + use_orig_params=use_orig_params, + ), ]: model = model_call() @@ -377,10 +390,15 @@ def test_basic_save_and_load_state_dict( model, cpu_offload.offload_params, fp16 ) - ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k] + ignore_keys = [ + k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k + ] self._validate_state_dict_contents( - model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys, + model, + fsdp_state_dict, + state_dict_rank0_and_offload, + ignore_keys=ignore_keys, ) if fp16: # Verify fp16 is the type @@ -465,7 +483,9 @@ def test_save_and_load_after_forward_state_dict( for sharded_tensor in state_dict.values(): shard = sharded_tensor._local_shards[0] shard.tensor = shard.tensor.clone().detach_() - self._validate_state_dict_contents(model, state_dict, state_dict_rank0_and_offload) + self._validate_state_dict_contents( + model, state_dict, state_dict_rank0_and_offload + ) _zero_model(model) # Ensure checkpointed params have the full param dtype @@ -562,7 +582,9 @@ def 
test_state_dict_save_load_flow(self, state_dict_type): for move_to_cpu in [True, False]: with self.subTest(move_to_cpu=move_to_cpu): fsdp_params = self._dist_train( - wrap_fsdp=True, state_dict_type=state_dict_type, move_to_cpu=move_to_cpu, + wrap_fsdp=True, + state_dict_type=state_dict_type, + move_to_cpu=move_to_cpu, ) ddp_params = self._dist_train(wrap_fsdp=False) self.assertEqual(ddp_params, fsdp_params) @@ -572,7 +594,9 @@ def test_state_dict_save_load_flow(self, state_dict_type): def test_fsdp_state_dict_keys(self, state_dict_type): state_dict = self._state_dict(self._initialize_model(True), state_dict_type) if state_dict_type == "local_state_dict": - self.assertEqual(set([FLAT_PARAM, f"inner.{FLAT_PARAM}"]), state_dict.keys()) + self.assertEqual( + set([FLAT_PARAM, f"inner.{FLAT_PARAM}"]), state_dict.keys() + ) elif state_dict_type in ("state_dict", "sharded_state_dict"): # Keys should match local model. local_model = self._initialize_model(wrap_fsdp=False, wrap_ddp=False) @@ -586,7 +610,10 @@ def test_fsdp_state_dict_keys(self, state_dict_type): @parametrize("state_dict_rank0_and_offload", [True, False]) @parametrize("fsdp_root", [True, False]) def test_state_dict_load_into_local_module( - self, state_dict_type, state_dict_rank0_and_offload, fsdp_root, + self, + state_dict_type, + state_dict_rank0_and_offload, + fsdp_root, ): """ Tests that FSDP's state_dict can be loaded into a local model. 
@@ -599,7 +626,9 @@ def test_state_dict_load_into_local_module( model = self._initialize_model(wrap_fsdp=True, register_buffers=True) optim = SGD(model.parameters(), lr=0.1) if not fsdp_root: - in_data = torch.randn(1, 10, requires_grad=True, device=torch.device("cuda")) + in_data = torch.randn( + 1, 10, requires_grad=True, device=torch.device("cuda") + ) else: in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda")) for _ in range(3): @@ -620,7 +649,10 @@ def test_state_dict_load_into_local_module( ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k] self._validate_state_dict_contents( - model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys, + model, + fsdp_state_dict, + state_dict_rank0_and_offload, + ignore_keys=ignore_keys, ) # Create zeroed local model if not fsdp_root: @@ -749,10 +781,14 @@ def test_wrong_state_dict_config(self): @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS) @parametrize("prefix", [True, False]) @parametrize("ignore_inner", [True, False]) - def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_inner): + def test_state_dict_with_ignored_modules( + self, state_dict_type, prefix, ignore_inner + ): # Initialize an FSDP-wrapped model with an ignored module that includes # both parameters and a buffer - model = Model(wrap_fsdp=True, register_buffers=True, ignore_inner=ignore_inner).cuda() + model = Model( + wrap_fsdp=True, register_buffers=True, ignore_inner=ignore_inner + ).cuda() ignored_modules = [model.outer] ignored_tensor_to_tensor_name = { model.outer.bias: "outer.bias", @@ -767,7 +803,8 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i # Note that when model.inner is not ignored this test also ensures # non-ignored buffers are not cloned. 
buffer_to_buffer_name = { - model.inner.buffer: "inner.buffer", model.outer.buffer: "outer.buffer", + model.inner.buffer: "inner.buffer", + model.outer.buffer: "outer.buffer", } fsdp_model = FSDP(model, ignored_modules=ignored_modules) prefix_str = "foo." if prefix else "" @@ -782,7 +819,11 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i }.items(): prefixed_tensor_name = f"{prefix_str}{tensor_name}" self.assertTrue(prefixed_tensor_name in sd1) - self.assertEqual(tensor.data_ptr(), sd1[prefixed_tensor_name].data_ptr(), f"{prefixed_tensor_name}") + self.assertEqual( + tensor.data_ptr(), + sd1[prefixed_tensor_name].data_ptr(), + f"{prefixed_tensor_name}", + ) # Check that the state dict can be loaded into a non-wrapped version of # the model nonwrapped_model = Model(wrap_fsdp=False, register_buffers=True).cuda() @@ -790,7 +831,7 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i with torch.no_grad(): param.zero_() - to_load = {k[len(prefix_str):] : v for k, v in sd1.items()} + to_load = {k[len(prefix_str) :]: v for k, v in sd1.items()} nonwrapped_model.load_state_dict(to_load, strict=True) local_params = list(nonwrapped_model.parameters()) for fsdp_param, local_param in zip(fsdp_params, local_params): @@ -806,7 +847,10 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i prefixed_tensor_name = f"{prefix_str}{tensor_name}" self.assertTrue(prefixed_tensor_name in sd2) self.assertEqual(tensor.data_ptr(), sd2[prefixed_tensor_name].data_ptr()) - self.assertEqual(sd1[prefixed_tensor_name].data_ptr(), sd2[prefixed_tensor_name].data_ptr()) + self.assertEqual( + sd1[prefixed_tensor_name].data_ptr(), + sd2[prefixed_tensor_name].data_ptr(), + ) @skip_if_lt_x_gpu(2) def test_state_dict_type(self): diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index 82fd8e1c0737b..5b995a9ab23f6 100644 --- 
a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -9,9 +9,12 @@ import torch import torch.nn as nn from torch import distributed as dist -from torch.distributed.fsdp import CPUOffload -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import MixedPrecision, ShardingStrategy +from torch.distributed.fsdp import ( + CPUOffload, + FullyShardedDataParallel as FSDP, + MixedPrecision, + ShardingStrategy, +) from torch.distributed.fsdp.flat_param import FlatParamHandle from torch.distributed.fsdp.wrap import enable_wrap, wrap from torch.nn.parallel.distributed import DistributedDataParallel as DDP @@ -25,10 +28,10 @@ TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -129,7 +132,9 @@ def test_summon_full_param_writeback(self): @skip_if_lt_x_gpu(2) @parametrize("mixed_precision", [True, False]) def test_summon_full_param_shard_value(self, mixed_precision): - mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + mixed_precision = ( + MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + ) raw_model = nn.Linear(10, 11) raw_model_size = self.get_model_param_count(raw_model) expected_shard_size = self.get_expected_sharded_size(raw_model_size) @@ -159,7 +164,9 @@ def test_summon_full_param_shard_value(self, mixed_precision): @parametrize("summon_outer", [True, False]) @parametrize("mixed_precision", [True, False]) def test_summon_full_param_recursive(self, recurse, summon_outer, mixed_precision): - mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + mixed_precision = ( + MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + ) model = FSDP( nn.Sequential( FSDP(nn.Linear(5, 5, 
bias=False), mixed_precision=mixed_precision), @@ -239,9 +246,7 @@ def test_summon_full_params_respects_reshard_after_forward(self): ) def _test_summon_full_params_respects_reshard_after_forward( - self, - mixed_precision: Optional[MixedPrecision], - use_orig_params: bool + self, mixed_precision: Optional[MixedPrecision], use_orig_params: bool ): fsdp_kwargs = { "mixed_precision": mixed_precision, @@ -373,7 +378,9 @@ def __init__(self, fsdp_1, fsdp_2, fsdp_3): def test_reshard_outside_forward_backward_iteration( self, rank0_only, offload_to_cpu, mixed_precision ): - mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + mixed_precision = ( + MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + ) model = FSDP( nn.Sequential( FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision), @@ -437,7 +444,9 @@ def test_reshard_outside_forward_backward_iteration( def test_params_are_unflattenned(self, rank0_only, offload_to_cpu, mixed_precision): layer_shape = (10, 12) model = nn.Linear(*layer_shape, bias=False).cuda(self.rank) - mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + mixed_precision = ( + MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + ) fsdp_model = FSDP(deepcopy(model), mixed_precision=mixed_precision).cuda( self.rank ) @@ -486,7 +495,9 @@ def test_params_count_and_value( offload_to_cpu: bool, mixed_precision: bool, ): - mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + mixed_precision = ( + MixedPrecision(param_dtype=torch.float16) if mixed_precision else None + ) model = NestedWrappedModule.init( self.process_group, FSDPInitMode.NO_FSDP, @@ -624,10 +635,13 @@ def _check_grads( assert torch.count_nonzero(p2.grad) > 0 p2.grad *= WRITEBACK_FACTOR new_fsdp_grads = [ - param.grad for param in fsdp_model.parameters() + param.grad + for param in fsdp_model.parameters() if param.grad is not None ] 
- writeback_persists = writeback or sharding_strategy == ShardingStrategy.NO_SHARD + writeback_persists = ( + writeback or sharding_strategy == ShardingStrategy.NO_SHARD + ) for old_grad, new_grad in zip(old_fsdp_grads, new_fsdp_grads): if writeback_persists: torch.testing.assert_close(old_grad * WRITEBACK_FACTOR, new_grad) @@ -640,14 +654,16 @@ def _check_grads( def _get_error_context(is_supported: bool): return ( - contextlib.suppress() if is_supported + contextlib.suppress() + if is_supported else self.assertRaises(NotImplementedError) ) # some configs not implemented yet def _get_fsdp_grads(fsdp_model: FSDP, is_supported: bool): if is_supported: return [ - param.grad.clone() for param in fsdp_model.parameters() + param.grad.clone() + for param in fsdp_model.parameters() if param.grad is not None ] return None # unused @@ -706,7 +722,7 @@ def test_with_grads_none_grads(self): ShardingStrategy.NO_SHARD, ] }, - self._test_with_grads_none_grads + self._test_with_grads_none_grads, ) def _test_with_grads_none_grads(self, sharding_strategy: ShardingStrategy): diff --git a/test/distributed/fsdp/test_fsdp_traversal.py b/test/distributed/fsdp/test_fsdp_traversal.py index e1b0a77cfe791..b9c7a0aeac9b2 100644 --- a/test/distributed/fsdp/test_fsdp_traversal.py +++ b/test/distributed/fsdp/test_fsdp_traversal.py @@ -11,10 +11,7 @@ FSDPTest, NestedWrappedModule, ) -from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, - run_tests, -) +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) @@ -42,18 +39,20 @@ def test_fsdp_modules(self): ) modules = FSDP.fsdp_modules(nested_wrapped_module) self.assertEquals( - modules, [ + modules, + [ nested_wrapped_module.module.get_submodule("1"), nested_wrapped_module.module.get_submodule("1").get_submodule("0"), nested_wrapped_module.module.get_submodule("2"), - ] + ], ) modules = 
FSDP.fsdp_modules(nested_wrapped_module, root_only=True) self.assertEqual( - modules, [ + modules, + [ nested_wrapped_module.module.get_submodule("1"), nested_wrapped_module.module.get_submodule("2"), - ] + ], ) diff --git a/test/distributed/fsdp/test_fsdp_uneven.py b/test/distributed/fsdp/test_fsdp_uneven.py index 295afbce508bc..6ffeb279b617b 100644 --- a/test/distributed/fsdp/test_fsdp_uneven.py +++ b/test/distributed/fsdp/test_fsdp_uneven.py @@ -8,11 +8,8 @@ from torch.nn import Linear from torch.optim import SGD from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, -) -from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests - +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py index 1091200206135..81657dcfae5e1 100644 --- a/test/distributed/fsdp/test_fsdp_use_orig_params.py +++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py @@ -8,16 +8,14 @@ import torch import torch.nn as nn from torch import distributed as dist -from torch.distributed.fsdp import BackwardPrefetch, CPUOffload -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import ShardingStrategy -from torch.distributed.fsdp.fully_sharded_data_parallel import ( - clean_tensor_name, -) -from torch.distributed.fsdp.wrap import ( - always_wrap_policy, - transformer_auto_wrap_policy, +from torch.distributed.fsdp import ( + BackwardPrefetch, + CPUOffload, + FullyShardedDataParallel as FSDP, + ShardingStrategy, ) +from torch.distributed.fsdp.fully_sharded_data_parallel import clean_tensor_name +from torch.distributed.fsdp.wrap import always_wrap_policy, 
transformer_auto_wrap_policy from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer from torch.nn.parallel.distributed import DistributedDataParallel as DDP from torch.testing._internal.common_distributed import skip_if_lt_x_gpu @@ -28,10 +26,10 @@ TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, parametrize, run_tests, + TEST_WITH_DEV_DBG_ASAN, ) if not dist.is_available(): @@ -361,11 +359,14 @@ def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy): ddp_model = self._get_ddp_transformer(find_unused_params=True) ddp_param_groups = self._get_param_groups(ddp_model) assert len(ddp_param_groups) == 3, f"{len(ddp_param_groups)}" - fsdp_model, _ = self._get_fsdp_transformer_and_optim( # ignore returned optimizer + ( + fsdp_model, + _, + ) = self._get_fsdp_transformer_and_optim( # ignore returned optimizer cuda_init_mode=CUDAInitMode.CUDA_BEFORE, init_optim_before_wrap=False, optim_class=torch.optim.Adam, # ignored - multi_tensor=False, # ignored + multi_tensor=False, # ignored sharding_strategy=sharding_strategy, backward_prefetch=BackwardPrefetch.BACKWARD_PRE, cpu_offload=None, @@ -386,7 +387,9 @@ def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy): ] for optim_ctor, ddp_param_group, fsdp_param_group in zip( - optim_ctors, ddp_param_groups[:2], fsdp_param_groups[:2], + optim_ctors, + ddp_param_groups[:2], + fsdp_param_groups[:2], ): ddp_optims.append(optim_ctor(ddp_param_group["params"])) fsdp_optims.append(optim_ctor(fsdp_param_group["params"])) @@ -406,7 +409,7 @@ def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy): has_weight = True elif "bias" in fqn and param.numel() > 0: has_bias = True - has_both |= (has_weight and has_bias) + has_both |= has_weight and has_bias assert has_both, ( f"Rank {self.rank} does not have a `FlatParameter` with both a " "weight and a bias in its shard, meaning that this 
test is vacuous" @@ -440,7 +443,8 @@ def run_iter(): # Check that FSDP correctly exposes gradients even after forward # (namely, `None` for weights and non-`None` for biases) for (ddp_n, ddp_p), (fsdp_n, fsdp_p) in zip( - ddp_model.module.named_parameters(), fsdp_model.named_parameters(), + ddp_model.module.named_parameters(), + fsdp_model.named_parameters(), ): self.assertEqual(ddp_n, fsdp_n) if fsdp_p.numel() == 0: diff --git a/test/distributed/fsdp/test_utils.py b/test/distributed/fsdp/test_utils.py index 2aa7fa0b6d97e..6ac2f78be7150 100644 --- a/test/distributed/fsdp/test_utils.py +++ b/test/distributed/fsdp/test_utils.py @@ -2,9 +2,10 @@ import random import sys -from typing import List import unittest from collections import OrderedDict +from dataclasses import dataclass +from typing import List import torch import torch.nn as nn @@ -12,14 +13,13 @@ from torch.distributed.fsdp._utils import _apply_to_tensors from torch.distributed.utils import _replace_by_prefix from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, - TestCase, instantiate_parametrized_tests, parametrize, run_tests, subtest, + TEST_WITH_DEV_DBG_ASAN, + TestCase, ) -from dataclasses import dataclass if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) @@ -60,8 +60,6 @@ class SomeDataClass: some_float: float some_tensor: List[torch.Tensor] - - # create a mixed bag of data. 
data = [1, "str"] data.append({"key1": get_a_tensor(), "key2": {1: get_a_tensor()}, "key3": 3}) @@ -100,7 +98,6 @@ def test_replace_by_prefix(self): _replace_by_prefix(state_dict, "module.layer.", "layer.") assert state_dict == original_state_dict - def test_packed_sequence(self): """Test to ensure RNN packed sequences are modified correctly.""" rnn = nn.RNN(5, 5) diff --git a/test/distributed/fsdp/test_wrap.py b/test/distributed/fsdp/test_wrap.py index 98ba324f46f18..cd0d11ba9b4b1 100644 --- a/test/distributed/fsdp/test_wrap.py +++ b/test/distributed/fsdp/test_wrap.py @@ -4,7 +4,7 @@ import os import tempfile import unittest -from enum import Enum, auto +from enum import auto, Enum import torch import torch.nn as nn @@ -12,8 +12,6 @@ from torch.distributed.fsdp.fully_sharded_data_parallel import ( BackwardPrefetch, CPUOffload, -) -from torch.distributed.fsdp.fully_sharded_data_parallel import ( FullyShardedDataParallel as FSDP, ) from torch.distributed.fsdp.wrap import ( @@ -28,20 +26,20 @@ from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( + _maybe_cuda, CUDAInitMode, DummyProcessGroup, FSDPInitMode, FSDPTest, TransformerWithSharedParams, - _maybe_cuda, ) from torch.testing._internal.common_utils import ( FILE_SCHEMA, - TestCase, find_free_port, instantiate_parametrized_tests, parametrize, run_tests, + TestCase, ) @@ -54,6 +52,7 @@ def __init__(self): self.bn3 = nn.BatchNorm3d(10) self.sync_bn = nn.SyncBatchNorm(10) + class WrapMethod(Enum): FSDP_CTOR = auto() # FSDP_CTOR is the supported way forward, but keep WRAP_API in case we miss @@ -61,8 +60,6 @@ class WrapMethod(Enum): WRAP_API = auto() - - class TestFSDPWrap(FSDPTest): """ Tests main API for wrapping FSDP, which is to pass auto_wrap_policy into @@ -144,7 +141,9 @@ def test_error_already_wrapped(self, nested, cuda_init_mode): Test that an error is raised if we attempt 
to wrap when submodules are already FSDP. """ - wrapped_fsdp = self._get_already_wrapped_fsdp(nested=nested, cuda_init_mode=cuda_init_mode) + wrapped_fsdp = self._get_already_wrapped_fsdp( + nested=nested, cuda_init_mode=cuda_init_mode + ) if cuda_init_mode == CUDAInitMode.CUDA_AFTER: wrapped_fsdp = wrapped_fsdp.cuda() @@ -159,9 +158,10 @@ def never_wrap_policy(*args, **kwargs): policy = ( functools.partial( - _or_policy, - policies=[never_wrap_policy, _wrap_batchnorm_individually] - ) if use_or_policy else _wrap_batchnorm_individually + _or_policy, policies=[never_wrap_policy, _wrap_batchnorm_individually] + ) + if use_or_policy + else _wrap_batchnorm_individually ) model = BatchNormNet() fsdp = FSDP(model, auto_wrap_policy=policy) @@ -178,6 +178,7 @@ def test_bn_always_wrapped_individually(self): if the other policy results in a module containing a BN unit being wrapped, the contained BN unit will still be individually wrapped. """ + class MyModule(nn.Module): def __init__(self): super().__init__() @@ -189,8 +190,7 @@ def wrap_bn_container(module, recurse, *args, **kwargs): return isinstance(module, BatchNormNet) my_policy = functools.partial( - _or_policy, - policies=[wrap_bn_container, _wrap_batchnorm_individually] + _or_policy, policies=[wrap_bn_container, _wrap_batchnorm_individually] ) mod = MyModule() fsdp = FSDP(mod, auto_wrap_policy=my_policy) @@ -203,7 +203,7 @@ def wrap_bn_container(module, recurse, *args, **kwargs): fsdp.bn_container.bn1, fsdp.bn_container.bn2, fsdp.bn_container.bn3, - fsdp.bn_container.sync_bn + fsdp.bn_container.sync_bn, ]: self.assertTrue(isinstance(bn, FSDP)) @@ -216,24 +216,21 @@ def wrap_bn_container(module, recurse, *args, **kwargs): fsdp.bn_container.bn1, fsdp.bn_container.bn2, fsdp.bn_container.bn3, - fsdp.bn_container.sync_bn + fsdp.bn_container.sync_bn, ]: self.assertFalse(isinstance(bn, FSDP)) @skip_if_lt_x_gpu(2) @parametrize( "cpu_offload", - [CPUOffload(offload_params=False), CPUOffload(offload_params=True)] + 
[CPUOffload(offload_params=False), CPUOffload(offload_params=True)], ) @parametrize( "backward_prefetch", - [BackwardPrefetch.BACKWARD_POST, BackwardPrefetch.BACKWARD_PRE] + [BackwardPrefetch.BACKWARD_POST, BackwardPrefetch.BACKWARD_PRE], ) @parametrize("forward_prefetch", [False, True]) - @parametrize( - "cuda_init_mode", - [CUDAInitMode.CUDA_AFTER, CUDAInitMode.CUDA_BEFORE] - ) + @parametrize("cuda_init_mode", [CUDAInitMode.CUDA_AFTER, CUDAInitMode.CUDA_BEFORE]) def test_main_wrap_api( self, cpu_offload: CPUOffload, @@ -286,7 +283,7 @@ def forward(self, input): wrapped_model.module.lin3, wrapped_model.module.lin4.module.nested_lin, wrapped_model.module.lin4, - wrapped_model + wrapped_model, ] for module in modules_in_fsdp_graph_order: @@ -322,7 +319,9 @@ def test_wrap(self, wrap_method): layer = FSDP( nn.Linear(5, 5), process_group=self.process_group, - auto_wrap_policy=functools.partial(size_based_auto_wrap_policy, min_num_params=1) + auto_wrap_policy=functools.partial( + size_based_auto_wrap_policy, min_num_params=1 + ), ) self.assertTrue(isinstance(layer, FSDP)) self.assertEqual(layer.rank, self.process_group.rank()) @@ -362,7 +361,9 @@ def test_always_wrap(self): passed into FSDP, all submodules are wrapped. 
""" seq = TestFSDPWrap.NestedSequentialModel.get_model(cuda=True) - model = FSDP(seq, process_group=self.process_group, auto_wrap_policy=always_wrap_policy) + model = FSDP( + seq, process_group=self.process_group, auto_wrap_policy=always_wrap_policy + ) TestFSDPWrap.NestedSequentialModel.verify_model_all_wrapped(self, model) @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") @@ -383,7 +384,11 @@ def test_transformer_auto_wrap_policy(self): encoder_layers = set(fsdp_model.module.transformer.encoder.layers) decoder_layers = set(fsdp_model.module.transformer.decoder.layers) for module in modules: - if module is fsdp_model or module in encoder_layers or module in decoder_layers: + if ( + module is fsdp_model + or module in encoder_layers + or module in decoder_layers + ): self.assertTrue(isinstance(module, FSDP)) else: self.assertFalse(isinstance(module, FSDP)) @@ -401,7 +406,7 @@ def test_auto_wrap_api(self): model = FSDP( sequential, process_group=self.process_group, - auto_wrap_policy=my_auto_wrap_policy + auto_wrap_policy=my_auto_wrap_policy, ) TestFSDPWrap.NestedSequentialModel.verify_model(self, model) @@ -420,7 +425,7 @@ def test_auto_wrap_preset_exclude_wrap(self): model = FSDP( sequential, process_group=self.process_group, - auto_wrap_policy=my_auto_wrap_policy + auto_wrap_policy=my_auto_wrap_policy, ) self.assertTrue(isinstance(model, FSDP)) @@ -437,7 +442,11 @@ def test_auto_wrap_preset_exclude_wrap_include_children(self): my_auto_wrap_policy = functools.partial( size_based_auto_wrap_policy, min_num_params=40 ) - model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) + model = FSDP( + sequential, + process_group=self.process_group, + auto_wrap_policy=my_auto_wrap_policy, + ) self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model[0], FSDP)) @@ -452,7 +461,11 @@ def test_auto_wrap_preset_force_leaf(self): my_auto_wrap_policy = functools.partial( size_based_auto_wrap_policy, 
min_num_params=40 ) - model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) + model = FSDP( + sequential, + process_group=self.process_group, + auto_wrap_policy=my_auto_wrap_policy, + ) self.assertTrue(isinstance(model.module[0], FSDP)) # Assert children of multihead attention are not wrapped self.assertTrue(isinstance(model.module[1], nn.MultiheadAttention)) @@ -473,7 +486,11 @@ def test_auto_wrap_preset_force_leaf_custom(self): sequential = nn.Sequential( nn.Linear(10, 10), nn.ModuleList([nn.Linear(10, 10)]) ) - model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) + model = FSDP( + sequential, + process_group=self.process_group, + auto_wrap_policy=my_auto_wrap_policy, + ) # Model was wrapped in FSDP as no inner modules were wrapped. self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model.module[0], nn.Linear)) @@ -483,14 +500,12 @@ def test_auto_wrap_preset_force_leaf_custom(self): @parametrize("cuda_init_mode", [CUDAInitMode.CUDA_BEFORE, CUDAInitMode.CUDA_AFTER]) @parametrize( "cpu_offload", - [CPUOffload(offload_params=False), CPUOffload(offload_params=True)] + [CPUOffload(offload_params=False), CPUOffload(offload_params=True)], ) @parametrize("use_device_id", [True, False]) def test_auto_wrap_smoke_test(self, cuda_init_mode, cpu_offload, use_device_id): # CPU offload and CUDA after don't work together as expected. - if ( - cpu_offload.offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER - ): + if cpu_offload.offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER: return device = torch.device("cuda") @@ -515,12 +530,17 @@ def test_auto_wrap_smoke_test(self, cuda_init_mode, cpu_offload, use_device_id): # cases where full model cannot be loaded onto GPU, but their shards can. 
cuda_after_init = cuda_init_mode == CUDAInitMode.CUDA_AFTER try: - sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=(not cuda_after_init)) + sequential = TestFSDPWrap.NestedSequentialModel.get_model( + cuda=(not cuda_after_init) + ) my_auto_wrap_policy = functools.partial( size_based_auto_wrap_policy, min_num_params=40 ) model = FSDP( - sequential, cpu_offload=cpu_offload, auto_wrap_policy=my_auto_wrap_policy, device_id=device_id + sequential, + cpu_offload=cpu_offload, + auto_wrap_policy=my_auto_wrap_policy, + device_id=device_id, ) TestFSDPWrap.NestedSequentialModel.verify_model(self, model) if cuda_after_init: @@ -568,7 +588,8 @@ def test_auto_wrap_with_ignored_modules(self, wrap_method: WrapMethod): sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False) ignored_modules = [sequential[1], sequential[2][0]] my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=40, + size_based_auto_wrap_policy, + min_num_params=40, ) fsdp_kwargs = { "process_group": self.process_group, diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py index 7fdbe573ed217..3c159313f0890 100644 --- a/torch/testing/_internal/common_fsdp.py +++ b/torch/testing/_internal/common_fsdp.py @@ -6,15 +6,14 @@ from abc import ABC, abstractmethod from contextlib import suppress from copy import deepcopy -from enum import Enum, auto +from enum import auto, Enum from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union from unittest import mock import torch import torch.distributed as dist import torch.nn as nn -from torch.distributed.fsdp import CPUOffload -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP from torch.distributed.fsdp.fully_sharded_data_parallel import ( BackwardPrefetch, MixedPrecision, @@ -29,10 +28,7 @@ ) from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer 
from torch.nn.parallel.distributed import DistributedDataParallel as DDP -from torch.testing._internal.common_distributed import ( - TEST_SKIPS, - MultiProcessTestCase, -) +from torch.testing._internal.common_distributed import MultiProcessTestCase, TEST_SKIPS from torch.testing._internal.common_utils import FILE_SCHEMA, get_cycles_per_ms @@ -57,6 +53,7 @@ class CUDAInitMode(Enum): class FSDPTestModel(nn.Module, ABC): """This defines the interface expected from all models used commonly for FSDP unit tests.""" + @abstractmethod def get_input(self, device) -> Tuple[torch.Tensor, ...]: """Returns an input for the model as as tuple.""" @@ -87,7 +84,6 @@ def init( ... - def _assert_module_states( model: nn.Module, process_group: dist.ProcessGroup, @@ -116,6 +112,7 @@ def _assert_module_states( for (_, p1), (_, p2) in zip(rank0_states, state): assert_fn(p1, p2) + def _zero_model( model: nn.Module, zero_buffers: bool = False, @@ -130,6 +127,7 @@ def _zero_model( with torch.no_grad(): buffer.zero_() + def _get_state_dict(model, cpu_offload=False, half=False): if not cpu_offload: model = model.cuda() @@ -138,11 +136,13 @@ def _get_state_dict(model, cpu_offload=False, half=False): return model.state_dict() + def subtest_name(test_name_mapping, *args): - return '_'.join( + return "_".join( [test_name_mapping[str(s)] if s is not None else "none" for s in args] ) + def get_full_params(model: nn.Module, recurse: bool = True): """ Returns the full unsharded parameters of ``model``. 
Any FSDP-managed @@ -156,14 +156,14 @@ def get_full_params(model: nn.Module, recurse: bool = True): with FSDP.summon_full_params(model, recurse=recurse): return deepcopy(list(model.parameters())) + def _maybe_cuda(model: nn.Module, move_to_cuda: bool): return model.cuda() if move_to_cuda else model + def _maybe_wrap_fsdp(model: nn.Module, wrap_fsdp: bool, *args, **kwargs): - return ( - model if not wrap_fsdp - else FSDP(model, *args, **kwargs) - ) + return model if not wrap_fsdp else FSDP(model, *args, **kwargs) + class DummyProcessGroup: def __init__(self, rank: int, size: int): @@ -187,13 +187,13 @@ def get_future(): dist_wait.get_future = get_future return dist_wait + class DeterministicModel(torch.nn.Module): def __init__(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)): super().__init__() # keep everything deterministic for model initialization torch.manual_seed(0) - self.inner: Union[torch.nn.Linear, FSDP] = \ - torch.nn.Linear(2, 2).cuda() + self.inner: Union[torch.nn.Linear, FSDP] = torch.nn.Linear(2, 2).cuda() if wrap_fsdp: self.inner = FSDP(self.inner, cpu_offload=cpu_offload) self.outer = torch.nn.Linear(2, 2).cuda() @@ -202,6 +202,7 @@ def forward(self, x): y = self.inner(x) return self.outer(y) + class TransformerWithSharedParams(FSDPTestModel): def __init__( self, @@ -297,7 +298,9 @@ def init( if fsdp_kwargs is None: fsdp_kwargs = {} if fsdp_init_mode == FSDPInitMode.NO_FSDP: - return TransformerWithSharedParams(group, cuda_init_mode, add_bn, deterministic) + return TransformerWithSharedParams( + group, cuda_init_mode, add_bn, deterministic + ) elif fsdp_init_mode == FSDPInitMode.RECURSIVE: # Default to the `transformer_auto_wrap_policy()` if "auto_wrap_policy" not in fsdp_kwargs: @@ -311,7 +314,9 @@ def init( else: auto_wrap_policy = fsdp_kwargs.pop("auto_wrap_policy") fsdp_model = FSDP( - TransformerWithSharedParams(group, cuda_init_mode, add_bn, deterministic), + TransformerWithSharedParams( + group, cuda_init_mode, add_bn, 
deterministic + ), group, auto_wrap_policy=auto_wrap_policy, **fsdp_kwargs, @@ -454,6 +459,7 @@ def init( class ModuleWithDelay(FSDPTestModel): """This class wraps a :class:`FSDPTestModel` to optionally add a delay after computing the loss and/or before the gradient reduction.""" + def __init__( self, module: nn.Module, @@ -519,6 +525,7 @@ def init( delay_before_reduction_ms, ) + class NestedWrappedModuleWithDelay(ModuleWithDelay): @staticmethod def init( @@ -601,7 +608,7 @@ def __init__( _maybe_cuda(nn.Linear(d_input, d_shared), self.move_to_cuda), shared, expert, - _maybe_cuda(nn.Linear(d_shared, d_input), self.move_to_cuda) + _maybe_cuda(nn.Linear(d_shared, d_input), self.move_to_cuda), ) def forward(self, x): @@ -738,7 +745,9 @@ def run_subtests( # Convert the config mapping to a list to have a fixed order subtest_config_items: List[Tuple[str, List[Any]]] = list(subtest_config.items()) subtest_config_keys: List[str] = [item[0] for item in subtest_config_items] - subtest_config_values: List[List[Any]] = [item[1] for item in subtest_config_items] + subtest_config_values: List[List[Any]] = [ + item[1] for item in subtest_config_items + ] for values in itertools.product(*subtest_config_values): # Map keyword to chosen value subtest_kwargs = { @@ -850,7 +859,9 @@ def _train_for_several_steps( model, norm_type, self.rank ) else: - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm, norm_type) + torch.nn.utils.clip_grad_norm_( + model.parameters(), max_norm, norm_type + ) total_norm_after_clip = _collect_total_grad_norm_local( model, norm_type ) @@ -910,7 +921,9 @@ def _test_fsdp_parity( wrapper should provide data parallel semantics. If ``None``, then the callable defaults to the DDP constructor. 
""" - assert fsdp_init_mode != FSDPInitMode.NO_FSDP, "Expects an FSDP init mode that wraps with FSDP" + assert ( + fsdp_init_mode != FSDPInitMode.NO_FSDP + ), "Expects an FSDP init mode that wraps with FSDP" if init_kwargs is None: init_kwargs = {} lr = 1e-2 @@ -977,15 +990,20 @@ def _test_fsdp_parity( # Offloading parameters with `CUDA_AFTER` should raise an error during # lazy initialization due to the parameter devices not being CPU; # otherwise, all parameter devices should be CPU - expects_device_error = offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER - expects_cpu_device = offload_params and cuda_init_mode != CUDAInitMode.CUDA_AFTER + expects_device_error = ( + offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER + ) + expects_cpu_device = ( + offload_params and cuda_init_mode != CUDAInitMode.CUDA_AFTER + ) if expects_cpu_device: cpu_device = torch.device("cpu") for param in fsdp_model.parameters(): self.assertEqual(param.device, cpu_device) context = ( self.assertRaisesRegex(AssertionError, "Expected param to be on CPU") - if expects_device_error else suppress() + if expects_device_error + else suppress() ) with context: fsdp_loss = self._train_for_several_steps( From fb5b1006c72692d61d285bbddde86cc5fe66ff02 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 27 Oct 2022 05:15:16 +0000 Subject: [PATCH 0223/1922] use nv_diag_suppress (#87712) Fixes: ``` /dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/cuda/UnaryFractionKernels.cu(125): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead /dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/cuda/UnaryFractionKernels.cu(125): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead /dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu(73): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead 
/dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu(73): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead ``` cc @ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/87712 Approved by: https://github.com/soumith --- aten/src/ATen/native/cuda/UnaryFractionKernels.cu | 2 +- aten/src/ATen/native/sparse/cuda/SparseMatMul.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu index 87aa784b7d5d3..ae4d4a01aa00d 100644 --- a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu @@ -122,7 +122,7 @@ __host__ __device__ static inline c10::complex nearbyint_wrapper(c10::com } #pragma push -#pragma diag_suppress 177 // Function was declared but never referenced +#pragma nv_diag_suppress 177 // Function was declared but never referenced __host__ __device__ static inline c10::complex nearbyint_wrapper(c10::complex a) { return c10::complex(::nearbyint(static_cast(a.real())), ::nearbyint(static_cast(a.imag()))); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu index 8cc5fc3157c38..33123abccbe93 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu @@ -70,7 +70,7 @@ Tensor _to_csr_int(const Tensor& rowIndices, int64_t dim, int64_t nnz) { #pragma push // NVCC complains that confirm_mult_size is not used, // but it is used in specializations of CusparseMatrixMultiplyOp below -#pragma diag_suppress 177 // Function was declared but never referenced +#pragma nv_diag_suppress 177 // Function was declared but never referenced int confirm_mult_size(const std::vector& mat1_size, const std::vector& mat2_size) { TORCH_CHECK( mat1_size[1] == mat2_size[0], From 85816bbdd570015ec93613f7149607c0e2ec9ff0 Mon Sep 17 
00:00:00 2001 From: Mengwei Liu Date: Thu, 27 Oct 2022 06:04:22 +0000 Subject: [PATCH 0224/1922] [torch] Add torch cpp cpu target for torch/csrc/api/src files (#87327) Summary: Duplicating fbcode target `fbcode//caffe2:torch-cpp-cpu` target in xplat. In D40460749 our user wants to use `torch::kNearest` enum which is defined in `torch/csrc/api/src/enum.cpp`. Adding this target to support it. Test Plan: Rely on CI Differential Revision: D40532087 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87327 Approved by: https://github.com/ezyang --- buckbuild.bzl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/buckbuild.bzl b/buckbuild.bzl index 24302e64c92f1..0003353f1040f 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -22,6 +22,7 @@ load( "jit_core_headers", "jit_core_sources", "libtorch_profiler_sources", + "torch_cpp_srcs", "torch_mobile_tracer_sources", ) load( @@ -1368,6 +1369,19 @@ def define_buck_targets( ], ) + pt_xplat_cxx_library( + name = "torch_cpp_cpu", + srcs = torch_cpp_srcs, + headers = native.glob(["torch/csrc/api/include/**/*.h"]) + ["torch/script.h"], + compiler_flags = get_pt_compiler_flags(), + exported_preprocessor_flags = get_pt_preprocessor_flags(), + visibility = ["PUBLIC"], + exported_deps = [ + ":torch", + ":torch_mobile_deserialize_common", # for torch/csrc/api/src/serialize/input-archive.cpp + ], + ) + pt_xplat_cxx_library( name = "torch_core", srcs = core_sources_full_mobile_no_backend_interface + [ From 1ff7e3125428e9e56c80b2127e7f80132d1bf247 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Wed, 26 Oct 2022 16:56:47 -0700 Subject: [PATCH 0225/1922] [Profiler] Hold weak reference to prevent TensorImpl address reuse during profiling. (#87244) A recurring problem with assigning Tensor IDs is that we want to preserve identity when storage changes but we don't observe TensorImpl destruction so identity assignment is not robust to the ABA problem with respect to TensorImpl*. 
~TensorImpl is far too hot to instrument; even adding a call to a no-op function in a different compilation unit increases overhead by tens of percent. (OSS builds do not have any sort of LTO.) Fortunately there is a solution. A PyTorch Tensor is a `c10::intrusive_ptr<c10::TensorImpl>`, which in turn holds a storage. (Which is a `c10::intrusive_ptr<c10::StorageImpl>`) `c10::intrusive_ptr` has a `c10::weak_intrusive_ptr` class for taking non-owning references to the underlying object. The implementation involves both a strong refcount and weak refcount in `c10::intrusive_ptr`. If the strong refcount of an intrusive_ptr goes to zero and there are no weak references then everything is deleted. However if there is a weak reference then the intrusive_ptr calls `release_resources()` but not delete. This has the effect of freeing the underlying resources (ensuring that program semantics are unchanged) but leaves behind an empty shell of an `intrusive_ptr` that the `weak_intrusive_ptr`s use to check status. And herein lies the solution: as long as we hold a weak reference to a TensorImpl we will block deletion and prevent the `TensorImpl*` from being reused. This PR uses a `c10::weak_intrusive_ptr` to store the address of profiled TensorImpls and then converts it to a raw pointer (or rather, a `TensorImplAddress`) during post processing when we no longer care about blocking address reuse. 
Differential Revision: [D40492848](https://our.internmc.facebook.com/intern/diff/D40492848/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87244 Approved by: https://github.com/slgong-fb, https://github.com/albanD --- test/profiler/test_profiler.py | 17 ++++++ torch/csrc/autograd/profiler_python.cpp | 21 +++++-- torch/csrc/profiler/collection.cpp | 9 ++- torch/csrc/profiler/collection.h | 72 +++++++++------------- torch/csrc/profiler/data_flow.h | 79 +++++++++++++++++++++++++ torch/csrc/profiler/python/init.cpp | 2 +- 6 files changed, 144 insertions(+), 56 deletions(-) create mode 100644 torch/csrc/profiler/data_flow.h diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index 3831b6bd1247d..22db16d1943af 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -1505,6 +1505,23 @@ def test_allocation_ids_with_other_ops(self) -> None: lambda: torch.zeros((1,)).cos() ) + def test_impl_reuse(self) -> None: + repeats = 1_000 + with profile(profile_memory=True, record_shapes=True) as p: + for _ in range(repeats): + torch.ones((1,)) + gc.collect() + + roots = p.profiler.kineto_results.experimental_event_tree() + tensor_impls = tuple( + e.extra_fields.inputs.tensor_metadata[0].impl_ptr + for e in _utils.traverse_dfs(roots) + if e.name == "aten::fill_" + ) + + self.assertEqual(len(tensor_impls), repeats) + self.assertEqual(len(set(tensor_impls)), repeats) + def test_extra_fields(self): with profile(with_stack=True, profile_memory=True) as p: _ = torch.ones((1,)) diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index 308dcdcde49c8..43479c3f15668 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -129,7 +129,7 @@ class CallTypeHelper final { std::index_sequence); template - static void map(T& t, FunctorT& f, Args... args) { + static void map(T& t, FunctorT& f, Args&&... 
args) { f(std::get(t), args...); c10::guts::if_constexpr( [&](auto _) { map(_(t), f, std::forward(args)...); }); @@ -139,7 +139,7 @@ class CallTypeHelper final { using tuple_type = decltype(make_tuple_impl(std::make_index_sequence{})); template - static void map(tuple_type& t, FunctorT& f, Args... args) { + static void map(tuple_type& t, FunctorT& f, Args&&... args) { map<0>(t, f, std::forward(args)...); } }; @@ -281,6 +281,9 @@ using PyOptimizerCallKey = Config::key_t; class ValueCache { public: + ValueCache() = default; + ValueCache(const ValueCache&) = delete; + template void store(const typename Config::key_t&, typename Config::ephemeral_t); @@ -295,6 +298,9 @@ class ValueCache { load(callsite.value_)}; } + c10::optional recordIfTensor(py::handle p); + std::vector> unpackTensorMap( + py::dict tensor_map); void trimPrefixes(); private: @@ -330,18 +336,21 @@ typename Config::cls_t set_class( return cls; } -auto toTensorMetadata(PyObject* self) { +TensorMetadata toTensorMetadata(PyObject* self) { TORCH_INTERNAL_ASSERT(THPVariable_CheckExact(self)); - return TensorMetadata{THPVariable_Unpack(self)}; + const auto& t = THPVariable_Unpack(self); + RawTensorMetadata m{t}; + return TensorMetadata{m}; } -auto recordIfTensor(py::handle p) { +c10::optional ValueCache::recordIfTensor(py::handle p) { return THPVariable_CheckExact(p.ptr()) ? 
c10::optional{toTensorMetadata(p.ptr())} : c10::nullopt; } -auto unpackTensorMap(py::dict tensor_map) { +std::vector> ValueCache::unpackTensorMap( + py::dict tensor_map) { std::vector> out; for (auto& it : tensor_map) { auto* value = it.second.ptr(); diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp index 01b7c4024f269..8bb57fda9cf48 100644 --- a/torch/csrc/profiler/collection.cpp +++ b/torch/csrc/profiler/collection.cpp @@ -30,9 +30,8 @@ using result_ptr_t = std::shared_ptr; using trace_ptr_t = std::unique_ptr; -RawTensorMetadata::RawTensorMetadata(const at::Tensor& t) - : impl_{t.unsafeGetTensorImpl()}, - data_{t.has_storage() ? t.storage().data() : nullptr}, +RawTensorMetadataBase::RawTensorMetadataBase(const at::Tensor& t) + : data_{t.has_storage() ? t.storage().data() : nullptr}, device_type_{t.device().type()}, device_index_{t.device().index()}, dtype_{t.scalar_type()}, @@ -864,10 +863,10 @@ void calculate_unique_tensor_ids(std::vector& sorted_results) { ska::flat_hash_set tensor_set; auto insert_tensor = [&lookup, &tensors, &tensor_set](TensorMetadata& m) { - if (m.impl_ && m.data_) { + if (m.impl() && m.data_) { const auto id = lookup(m.data_); tensor_set.insert(id); - tensors.emplace_back(TensorStoragePair{m.impl_, id, m.id_}); + tensors.emplace_back(TensorStoragePair{m.impl(), id, m.id_}); } }; diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h index 5402e613eb858..096568285a713 100644 --- a/torch/csrc/profiler/collection.h +++ b/torch/csrc/profiler/collection.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -37,48 +38,10 @@ enum class EventType : uint8_t { // ============================================================================ // == Value (Tensor, Scalar) summary ========================================== // ============================================================================ +struct TORCH_API RawTensorMetadataBase { + 
RawTensorMetadataBase() = default; + explicit RawTensorMetadataBase(const at::Tensor& t); -// We use a Tensor's TensorImpl adress and StorageImpl data start to build the -// data flow graph. We do not hold a reference so we wrap them in strong types -// to prevent direct access. -using TensorImplAddress = strong::type< - const c10::TensorImpl*, - struct TensorImplAddress_, - strong::regular, - strong::hashable, - strong::boolean>; - -using StorageImplData = strong::type< - void*, - struct StorageImplData_, - strong::regular, - strong::hashable, - strong::boolean>; - -// Identity is a complex concept in PyTorch. A Tensor might not have a -// an associated storage, multiple Tensors might share the same underlying -// storage, the storage of a Tensor might change over time, etc. -// -// For the purpose of profiling we're mostly interested in data flow -// analysis. As a result, we can take an expansive view of identity: -// Tensors share an ID if they share a TensorImpl or storage data. -// -// This identity equality is transitive; If Tensors T0 and T1 share a storage -// S0 and T1 later points to a different storage S1 then all Tensors which -// point to either S0 or S1 are considered to have the same identity. (Since -// profiler cannot reason beyond that.) -// -// The profiler will handle lifetime analysis to ensure that identities do -// not run afoul of the ABA problem. This does, however, mean that identities -// can only be assigned when memory profiling is enabled. (And we cannot -// handle ABA for TensorImpl as those allocations are not instrumented.) 
-using TensorID = strong::type; - -struct TORCH_API RawTensorMetadata { - RawTensorMetadata() = default; - RawTensorMetadata(const RawTensorMetadata&) = default; - explicit RawTensorMetadata(const at::Tensor& t); - TensorImplAddress impl_; StorageImplData data_; // Device is separated into DeviceType and DeviceIndex as Device @@ -91,13 +54,34 @@ struct TORCH_API RawTensorMetadata { uint32_t dim_; }; -struct TensorMetadata : public RawTensorMetadata { - explicit TensorMetadata(const RawTensorMetadata& r) : RawTensorMetadata(r) {} - explicit TensorMetadata(const at::Tensor& t) : RawTensorMetadata(t) {} +// Collected during profiling. +struct TORCH_API RawTensorMetadata : RawTensorMetadataBase { + RawTensorMetadata() = default; + RawTensorMetadata(const RawTensorMetadata&) = default; + explicit RawTensorMetadata(const at::Tensor& t) + : RawTensorMetadataBase(t), weak_self_{WeakTensor(t)} {}; + + // Wrap in `c10::optional` to make `weak_self_` default constructable. + c10::optional weak_self_; +}; + +// Used during post processing. +struct TensorMetadata : public RawTensorMetadataBase { + explicit TensorMetadata(const RawTensorMetadata& r) + : RawTensorMetadataBase(r), + weak_self_{r.weak_self_.value_or(WeakTensor(at::Tensor()))} { + SOFT_ASSERT(r.weak_self_.has_value()); + } + c10::Device device() const { return {device_type_, device_index_}; } + TensorImplAddress impl() { + return weak_self_.get(); + } + + WeakTensor weak_self_; c10::optional id_; }; diff --git a/torch/csrc/profiler/data_flow.h b/torch/csrc/profiler/data_flow.h new file mode 100644 index 0000000000000..7afd0204d41db --- /dev/null +++ b/torch/csrc/profiler/data_flow.h @@ -0,0 +1,79 @@ +#pragma once + +#include +#include +#include +#include + +namespace torch { +namespace profiler { +namespace impl { + +// Identity is a complex concept in PyTorch. 
A Tensor might not have +// an associated storage, multiple Tensors might share the same underlying +// storage, the storage of a Tensor might change over time, etc. +// +// For the purpose of profiling we're mostly interested in data flow +// analysis. As a result, we can take an expansive view of identity: +// Tensors share an ID if they share a TensorImpl or storage data. +// +// This identity equality is transitive; If Tensors T0 and T1 share a storage +// S0 and T1 later points to a different storage S1 then all Tensors which +// point to either S0 or S1 are considered to have the same identity. (Since +// profiler cannot reason beyond that.) +// +// The profiler will handle lifetime analysis to ensure that identities do +// not run afoul of the ABA problem. This does, however, mean that identities +// can only be assigned when memory profiling is enabled. +using TensorID = strong::type; + +// We use a Tensor's TensorImpl address and StorageImpl data start to build the +// data flow graph. We do not hold an owning reference so we wrap them in strong +// types to prevent direct access. +using TensorImplAddress = strong::type< + const c10::TensorImpl*, + struct TensorImplAddress_, + strong::regular, + strong::hashable, + strong::boolean>; + +using StorageImplData = strong::type< + void*, + struct StorageImplData_, + strong::regular, + strong::hashable, + strong::boolean>; + +// ============================================================================ +// == weak_intrusive_ptr and the ABA problem for TensorImpl* ================== +// ============================================================================ +// Tracking `TensorImpl`s is an important part of identity tracking, because +// a Tensor might change storage; however when it does we want to retain the +// fact that the old and new storage belong to the same logical Tensor. 
We +// cannot take an owning reference to the Tensor because that would change +// program semantics by extending the lifetime of the Tensor. However if we +// store a raw TensorImpl* pointer the TensorImpl might be deleted and a new +// TensorImpl might be created that reuses the address. (ABA problem) +// +// Fortunately, there is a feature of `c10::intrusive_ptr` that we can use to +// prevent address reuse for the duration of profiling: the weak intrusive ptr. +// When a Tensor's refcount reaches zero but there are outstanding weak +// references (`weakcount_ > 0`) it will free the underlying managed resources +// by calling `target_->release_resources()`, but it will not call `delete`. +// (Instead, `delete` is called when the last weak reference is destroyed.) +// This means that we can safely use address identity to track `TensorImpls`. +class WeakTensor { + public: + explicit WeakTensor(const at::Tensor& t) : weak_self_(t.getIntrusivePtr()) {} + + auto get() const { + return TensorImplAddress{weak_self_._unsafe_get_target()}; + } + + private: + c10::weak_intrusive_ptr weak_self_; +}; + +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp index 8a800a3d5f82b..8c3d10af0bd0c 100644 --- a/torch/csrc/profiler/python/init.cpp +++ b/torch/csrc/profiler/python/init.cpp @@ -129,7 +129,7 @@ void initPythonBindings(PyObject* module) { .def_readonly("tensor_metadata", &Inputs::tensor_metadata_); py::class_(m, "_TensorMetadata") - .def_readonly("impl_ptr", &TensorMetadata::impl_) + .def_property_readonly("impl_ptr", &TensorMetadata::impl) .def_readonly("storage_data_ptr", &TensorMetadata::data_) .def_readonly("id", &TensorMetadata::id_) .def_property_readonly( From d3c3665f8d7172ea6632ebc0469c8e5376aa7a37 Mon Sep 17 00:00:00 2001 From: XiaobingSuper Date: Wed, 26 Oct 2022 01:46:46 -0400 Subject: [PATCH 0226/1922] TorchDynamo: Add convolution unary fusion for cpu in 
inference mode (#87063) cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang Pull Request resolved: https://github.com/pytorch/pytorch/pull/87063 Approved by: https://github.com/jgong5, https://github.com/jansel --- test/inductor/test_torchinductor.py | 60 ++++++++++++ torch/_inductor/compile_fx.py | 1 + torch/_inductor/ir.py | 146 ++++++++++++++++++++++++++++ torch/_inductor/lowering.py | 38 ++++++++ torch/_inductor/overrides.py | 144 +++++++++++++++++++++++++++ 5 files changed, 389 insertions(+) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 8e8b371c2780e..dd846e5f405a4 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -3,6 +3,7 @@ import dataclasses import functools import importlib +import itertools import os import random import sys @@ -1292,6 +1293,65 @@ def fn(a, b): check_lowp=False, ) + # For gpu path, there has a accurcy issue, + # see https://github.com/pytorch/pytorch/issues/87745. 
+ @unittest.skipIf(HAS_CUDA, "only support cpu conv2d unary test") + def test_conv2d_unary(self): + def _unary_list(): + unary_list = [ + torch.nn.ReLU(), + torch.nn.Sigmoid(), + torch.nn.Tanh(), + torch.nn.Hardswish(), + torch.nn.LeakyReLU(0.1, inplace=False), + torch.nn.Hardtanh(min_val=-0.5, max_val=4, inplace=False), + torch.nn.GELU(approximate="none"), + torch.nn.GELU(approximate="tanh"), + ] + return unary_list + + test_memory_format = [torch.contiguous_format, torch.channels_last] + options = itertools.product( + _unary_list(), + [True, False], + [1, 3], + [1, 2], + [1, 4], + test_memory_format, + ) + + for ( + unary_fn, + bias, + kernel_size, + dilation, + groups, + memory_format, + ) in options: + oC = 32 * groups + iC = 3 * groups + x_shape = (1, iC, 112, 112) + mod = torch.nn.Sequential( + torch.nn.Conv2d( + iC, + oC, + kernel_size=kernel_size, + dilation=dilation, + groups=groups, + bias=bias, + ), + unary_fn, + ).eval() + + # TODO: add bf16 test for cpu path? + v = torch.randn(x_shape, dtype=torch.float32).to( + memory_format=memory_format + ) + self.common( + mod, + (v,), + ) + def test_gather1(self): def fn(a, b): return ( diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index e6b27420a941a..26770b0671838 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -340,6 +340,7 @@ def compile_fx(model_: torch.fx.GraphModule, example_inputs_: List[torch.Tensor] with overrides.patch_functions(): model_ = normalize_ir(model_, example_inputs_) model_ = overrides.replace_fx(model_) + model_ = overrides.fuse_fx(model_, example_inputs_) num_example_inputs = len(example_inputs_) cudagraphs = BoxedBool(config.triton.cudagraphs and not config.dynamic_shapes) diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 7554dc905e23f..156eeb11bdc7b 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -3295,6 +3295,152 @@ def get_template_tiling(self): ) +def _prepare_convolution_fusion_create( + 
cls, + x: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", + padding_: List[int], + stride_: List[int], + dilation_: List[int], + groups: int, +): + """ + This function is a helper function to prepare inputs, layout and constant args + for convolution post-op fusion's create function, including deciding the output + layout (channels first or channels last), realizing inputs and make them etc. The + function only supports the CPU device since conv post-op fusion kernel is only + supported on CPU right now. + """ + + x = cls.require_stride1(cls.realize_input(x)) + weight = cls.require_stride1(cls.realize_input(weight)) + assert x.get_device().type == "cpu" and weight.get_device().type == "cpu" + inputs = [x, weight] + stride = tuple(stride_) + padding = tuple(padding_) + dilation = tuple(dilation_) + assert isinstance(groups, int) + + weight_shape = [ + sympy.Integer(V.graph.sizevars.guard_static_shape(s)) for s in weight.get_size() + ] + + out_channels, in_channels1, *kernel_size = weight_shape + in_channels1 = in_channels1 * groups + assert len(x.get_size()) == 2 + len(kernel_size) + batch, in_channels2, *input_size = x.get_size() + output_size = [batch] + V.graph.sizevars.guard_equals(in_channels1, in_channels2) + + output_size.append(out_channels) + assert ( + len(stride) + == len(padding) + == len(dilation) + == len(kernel_size) + == len(input_size) + ) + for i in range(len(stride)): + output_size.append( + IndexingDiv( + input_size[i] + + 2 * padding[i] + - dilation[i] * (kernel_size[i] - 1) + - 1 + + stride[i], + stride[i], + ) + ) + output_size[-1] = sympy.Integer( + V.graph.sizevars.guard_static_shape(output_size[-1]) + ) + + output_layout_str = "torch.contiguous_format" + # If x or weight have one channels_last(2d or 3d) format, it will call channels_last path, + # which align with aten.convolutuion path(cpu only support 2d case now). + # TODO: after cpu 3d convolution support channels_last path, the size check can be removed. 
+ if len(x.get_size()) == 4 and ( + x.get_layout().is_channels_last_stride_ordered() + or weight.get_layout().is_channels_last_stride_ordered() + ): + output_layout_str = "torch.channels_last" + + if output_layout_str == "torch.channels_last": + stride_order = [0] + list(reversed(range(1, len(kernel_size) + 1))) + if len(stride_order) < len(output_size): + # add batch dim if it exists + stride_order = [len(stride_order)] + stride_order + else: + stride_order = list(reversed(range(len(output_size)))) + + kernel_layout = FlexibleLayout( + device=inputs[0].get_device(), + dtype=inputs[0].get_dtype(), + size=output_size, + stride_order=stride_order, + ) + constant_args = [padding, stride, dilation, groups] + + if bias is not None: + inputs.append(bias) + else: + constant_args.insert(0, bias) + return inputs, constant_args, kernel_layout + + +class ConvolutionUnary(ExternKernelAlloc): + kernel = "torch.ops.mkldnn._convolution_pointwise" + + def __init__( + self, + layout, + inputs, + constant_args=(), + kernel="torch.ops.mkldnn._convolution_pointwise", + ): + super().__init__(layout, inputs, constant_args) + self.kernel = kernel + + def codegen(self, wrapper): + wrapper.writeline( + f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})" + ) + + @classmethod + def create( + cls, + x: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", + padding_: List[int], + stride_: List[int], + dilation_: List[int], + groups: int, + attr, + scalars, + algorithm, + ): + kernel = "torch.ops.mkldnn._convolution_pointwise" + (inputs, constant_args, kernel_layout,) = _prepare_convolution_fusion_create( + cls, x, weight, bias, padding_, stride_, dilation_, groups + ) + constant_args = constant_args + [attr, scalars, algorithm] + return ConvolutionUnary( + layout=kernel_layout, + inputs=inputs, + constant_args=constant_args, + kernel=kernel, + ) + + def apply_constraint(self): + x = self.inputs[0] + # FixedLayout of input + x = self.require_stride_order(x, 
self.layout.preferred_stride_order) + self.inputs[0] = x + self.freeze_layout_with_stride_order(self.layout.preferred_stride_order) + + @dataclasses.dataclass class MutableBox(IRNode): """ diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index fd94aa9bc5d5a..6b047e843c301 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -886,6 +886,44 @@ def bmm(a: TensorBox, b: TensorBox): return TensorBox.create(ir.BatchMatrixMultiply.create(a, b)) +def register_onednn_fusion_ops(): + if torch._C.has_mkldnn: + + @register_lowering(torch.ops.mkldnn._convolution_pointwise) + def convolution_unary( + x: TensorBox, + weight: TensorBox, + bias: TensorBox, + padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ): + return TensorBox.create( + ir.ConvolutionUnary.create( + x, + weight, + bias, + padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ) + ) + + else: + pass + + +register_onednn_fusion_ops() + + def fallback_handler(kernel): fallbacks.add(kernel) diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py index 85a0e0c1c2459..4078d442e8704 100644 --- a/torch/_inductor/overrides.py +++ b/torch/_inductor/overrides.py @@ -1,10 +1,19 @@ +import copy +import itertools import logging import random import weakref import torch +import torch.nn as nn from torch import _prims +from torch.fx.experimental.optimization import ( + matches_module_pattern, + replace_node_module, +) from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode +from torch.nn import functional as F +from torch.nn.modules.utils import _pair from torch.overrides import TorchFunctionMode log = logging.getLogger(__name__) @@ -37,6 +46,127 @@ def replace_fx(gm: torch.fx.GraphModule): return gm +class UnaryAttr(object): + def __init__(self, op_name: str, scalars_attr=None, algorithm_attr=None): + self.op_name = op_name + self.scalars_attr = scalars_attr if scalars_attr else [] + self.algorithm_attr = 
algorithm_attr if algorithm_attr else "" + super(UnaryAttr, self).__init__() + + def __call__(self, unary_module: nn.Module): + assert all(hasattr(unary_module, item) for item in self.scalars_attr) + scalars = [getattr(unary_module, item) for item in self.scalars_attr] + + algorithm = "" + if self.algorithm_attr: + assert hasattr(unary_module, self.algorithm_attr) + algorithm = getattr(unary_module, self.algorithm_attr) + + return self.op_name, scalars, algorithm + + +class ConvUnary2d(nn.Conv2d): + def __init__( + self, + conv: nn.Module, + unary: nn.Module, + ): + super(ConvUnary2d, self).__init__( + conv.in_channels, + conv.out_channels, + conv.kernel_size, + conv.stride, + conv.padding, + conv.dilation, + conv.groups, + conv.bias is not None, + conv.padding_mode, + conv.weight.device, + conv.weight.dtype, + ) + self._update_module_params(conv, unary) + + def _update_module_params(self, conv, unary): + self.__dict__ = copy.deepcopy(conv.__dict__) + self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__]( + unary + ) + + def _conv_forward(self, input, weight, bias): + if self.padding_mode != "zeros": + return torch.ops.mkldnn._convolution_pointwise( + F.pad( + input, self._reversed_padding_repeated_twice, mode=self.padding_mode + ), + weight, + bias, + _pair(0), + self.stride, + self.dilation, + self.groups, + self.attr, + self.scalars, + self.algorithm, + ) + return torch.ops.mkldnn._convolution_pointwise( + input, + weight, + bias, + self.padding, + self.stride, + self.dilation, + self.groups, + self.attr, + self.scalars, + self.algorithm, + ) + + def forward(self, input): + return self._conv_forward(input, self.weight, self.bias) + + +def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module): + assert not (conv.training), "Fusion only for eval!" 
+ return ConvUnary2d( + conv, + unary, + ) + + +def fuse_fx(gm: torch.fx.GraphModule, example_inputs): + if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()): + return gm + is_cpu = all( + example_input.device == torch.device("cpu") for example_input in example_inputs + ) + if not is_cpu: + return gm + modules = dict(gm.named_modules()) + + for (unary_module, _), (computation_module, fuse_func,) in itertools.product( + unary_modules_map.items(), computation_op_unary_op_fusion_map.items() + ): + pattern = (computation_module, unary_module) + for node in gm.graph.nodes: + if matches_module_pattern(pattern, node, modules): + if ( + len(node.args[0].users) > 1 + ): # Output of computation_node is used by other nodes + continue + conv = modules[node.args[0].target] + unary_node = modules[node.target] + eval_mode = all(not n.training for n in [conv, unary_node]) + if not eval_mode: + continue + fused_conv = fuse_func(conv, unary_node) + replace_node_module(node.args[0], modules, fused_conv) + node.replace_all_uses_with(node.args[0]) + gm.graph.erase_node(node) + gm.graph.lint() + gm.recompile() + return gm + + def _philox_rand_like_meta(input, seed, offset): return _prims.TensorMeta(input) @@ -163,3 +293,17 @@ def rand_like(x, **kwargs): replacements = {torch.nn.functional.dropout: lowmem_dropout, torch.rand_like: rand_like} + + +computation_op_unary_op_fusion_map = {nn.Conv2d: fused_conv_unary_eval} + + +unary_modules_map = { + nn.ReLU: UnaryAttr("relu"), + nn.Sigmoid: UnaryAttr("sigmoid"), + nn.Tanh: UnaryAttr("tanh"), + nn.Hardswish: UnaryAttr("hardswish"), + nn.LeakyReLU: UnaryAttr("leaky_relu", scalars_attr=["negative_slope"]), + nn.Hardtanh: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]), + nn.GELU: UnaryAttr("gelu", algorithm_attr="approximate"), +} From c7c275eb19b3403c168045d37a4fba0a248a86c9 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Thu, 27 Oct 2022 10:46:53 +0000 Subject: [PATCH 0227/1922] [complex] 
conv_transpose2d (#81805) Reference: https://github.com/pytorch/pytorch/issues/71108 Fixes : #86414 Pull Request resolved: https://github.com/pytorch/pytorch/pull/81805 Approved by: https://github.com/anjali411 --- aten/src/ATen/native/Convolution.cpp | 8 +++- .../_internal/common_methods_invocations.py | 40 ++++++++++++++----- torch/testing/_internal/common_modules.py | 21 +++++++++- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 64f6d141b9299..2dd7d515c14f9 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -1066,8 +1066,14 @@ at::Tensor conv_transpose2d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv_transpose2d"); - auto output = at::convolution( + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution( input, weight, bias, stride, padding, dilation, true, output_padding, groups); + } else { + output = at::convolution( + input, weight, bias, stride, padding, dilation, true, output_padding, groups); + } return is_batched ? 
output : output.squeeze(0); } diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 900c0987d2f2c..f04a2cc5465bb 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -3079,8 +3079,12 @@ def conv_transpose_ref(input, weight, bias, stride=1, padding=0, assert fn is not None - grad_fn_map = {torch.nn.functional.conv_transpose1d: torch.nn.grad.conv1d_input} - batched_dim_map = {torch.nn.functional.conv_transpose1d: 3} + grad_fn_map = {torch.nn.functional.conv_transpose1d: torch.nn.grad.conv1d_input, + torch.nn.functional.conv_transpose2d: torch.nn.grad.conv2d_input, + torch.nn.functional.conv_transpose3d: torch.nn.grad.conv3d_input} + batched_dim_map = {torch.nn.functional.conv_transpose1d: 3, + torch.nn.functional.conv_transpose2d: 4, + torch.nn.functional.conv_transpose3d: 5} # Input for `ref` is ndarray. input, weight = torch.from_numpy(input), torch.from_numpy(weight) @@ -3090,7 +3094,10 @@ def conv_transpose_ref(input, weight, bias, stride=1, padding=0, input = input.unsqueeze(0) if bias is not None: - bias = torch.from_numpy(bias).unsqueeze(1) + bias = torch.from_numpy(bias) + unsqueeze_dims = input.ndim - 2 + for _ in range(unsqueeze_dims): + bias = bias.unsqueeze(1) grad_output = input # Get the input shape for grad_fn. 
@@ -3156,9 +3163,8 @@ def sample_inputs_conv_transpose2d(op_info, device, dtype, requires_grad, **kwar {'stride': 2, 'padding': 1, 'output_padding': 1, 'groups': 1, 'dilation': (2, 3)}), ((1, 1, 4, 3), (1, 2, 3, 4), None, {'stride': 2, 'padding': 1, 'output_padding': 1, 'groups': 1}), - ((2, 8, 4, 4), (8, 1, 3, 3), None, {'groups': 4}), - ((1, 4, 5, 5), (4, 8, 3, 3), None, - {}) + ((2, 4, 4, 4), (4, 1, 3, 3), None, {'groups': 4}), + ((1, 2, 5, 5), (2, 4, 3, 3), None, {}) ) for input_shape, weight, bias, kwargs in cases: @@ -10668,10 +10674,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1): OpInfo('nn.functional.conv_transpose2d', aten_name='conv_transpose2d', aliases=('conv_transpose2d',), - dtypes=floating_types_and(torch.int64), - dtypesIfCUDA=floating_types_and(torch.float16, - *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []), + # `ref` for this function is backward of + # corresponding `conv*d` + ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose2d), + dtypes=floating_and_complex_types_and(torch.int64), + dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf, + *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []), sample_inputs_func=sample_inputs_conv_transpose2d, + # Runs very slowly on slow-gradcheck for complex. 
+ gradcheck_fast_mode=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, assert_jit_shape_analysis=True, @@ -10679,7 +10690,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1): decorators=[ DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1.3e-06), }), - 'TestCommon', 'test_variant_consistency_eager', device_type='cuda')], + 'TestCommon', 'test_variant_consistency_eager', device_type='cuda'), + DecorateInfo( + toleranceOverride({torch.float32: tol(atol=2e-05, rtol=5e-05), }), + 'TestCommon', 'test_noncontiguous_samples', device_type='cuda'), + DecorateInfo( + toleranceOverride({torch.complex32: tol(atol=5e-2, rtol=5e-2)}), + "TestCudaFuserOpInfo", "test_nvfuser_correctness"), + DecorateInfo( + toleranceOverride({torch.chalf: tol(atol=5e-2, rtol=5e-2), }), + 'TestCommon', 'test_complex_half_reference_testing')], skips=( # RuntimeError: !lhs.isAliasOf(rhs)INTERNAL ASSERT FAILED at # "../torch/csrc/jit/passes/utils/check_alias_annotation.cpp":104, please report a bug to PyTorch. 
diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py index f214ffbb8b3d2..fed908e14dd03 100644 --- a/torch/testing/_internal/common_modules.py +++ b/torch/testing/_internal/common_modules.py @@ -1143,6 +1143,7 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train module_inputs_func=partial(module_inputs_torch_nn_ConvNd, N=2, lazy=False, transposed=True), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, module_memformat_affects_out=True, + dtypes=floating_and_complex_types_and(torch.chalf), skips=( # channels_last support on cuda requires cudnn >= 7603 DecorateInfo(skipCUDAIfCudnnVersionLessThan(version=7603), 'TestModule', 'test_memory_format'), @@ -1153,7 +1154,25 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train # See https://github.com/pytorch/pytorch/issues/80247 DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cpu'), DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda', - dtypes=[torch.float64]), + dtypes=[torch.float64, torch.complex128]), + # These fail only on ROCm + DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda', + dtypes=[torch.complex32, torch.complex64], active_if=TEST_WITH_ROCM), + # Not implmented for chalf on CPU + DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_forward', + dtypes=(torch.chalf,), device_type='cpu'), + DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_memory_format', + dtypes=(torch.chalf,), device_type='cpu'), + DecorateInfo(unittest.expectedFailure, 'TestModule', + 'test_if_train_and_eval_modes_differ', dtypes=(torch.chalf,), device_type='cpu'), + DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_non_contiguous_tensors', + dtypes=(torch.chalf,), device_type='cpu'), + DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_cpu_gpu_parity', + dtypes=(torch.chalf,), 
device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_multiple_device_transfer', + dtypes=(torch.chalf,), device_type='cuda'), + # Ref: https://github.com/pytorch/pytorch/issues/73502 + DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_pickle', dtypes=(torch.chalf,)), ), decorators=( DecorateInfo(precisionOverride({torch.float32: 1e-04}), 'TestModule', 'test_memory_format'), From a0ca695dc4d44bca115efae7322b58a1015d0eb3 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Thu, 27 Oct 2022 12:29:51 +0000 Subject: [PATCH 0228/1922] [JIT] Fix torch.jit.script for functions with many decorators (#87804) Summary: Python's function parsing from the `ast` module records the line number of the function definition, not the first decorator. So this diff fixes crashes like this: ``` IndexError: vector::_M_range_check: __n (which is 10) >= this->size() (which is 8) ``` Test Plan: New unit test Differential Revision: D40726352 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87804 Approved by: https://github.com/tugsbayasgalan, https://github.com/davidberard98 --- test/jit/test_misc.py | 19 +++++++++++++++++++ torch/jit/frontend.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/jit/test_misc.py b/test/jit/test_misc.py index db37af81993f3..98ec7831d940e 100644 --- a/test/jit/test_misc.py +++ b/test/jit/test_misc.py @@ -361,3 +361,22 @@ def test_parse_ir_single_element_tensor_negative(self): ret = func() self.assertTrue(ret.numel() == 1) self.assertTrue(len(ret.size()) == 1) + + + def test_script_many_decorators(self): + def no_op_decorator(f): + return f + + @no_op_decorator + @no_op_decorator + @no_op_decorator + @no_op_decorator + @no_op_decorator + def foo(x, dim: int): + return x.unsqueeze(dim) + + x = torch.randn(1,) + expected = foo(x, 0) + scripted = torch.jit.script(foo) + actual = scripted(x, 0) + torch.testing.assert_allclose(expected, actual) diff --git a/torch/jit/frontend.py 
b/torch/jit/frontend.py index 62548ba7e2cd6..4b5e3d68f75cd 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -324,7 +324,7 @@ def build_class_def(ctx, py_def, methods, properties, self_name, assigns): def build_def(ctx, py_def, type_line, def_name, self_name=None, pdt_arg_types=None): body = py_def.body - r = ctx.make_range(py_def.lineno + len(py_def.decorator_list), + r = ctx.make_range(py_def.lineno, py_def.col_offset, py_def.col_offset + len("def")) From 8139cdaf7e4b3ed6cccf5caa41eed16e0a518e61 Mon Sep 17 00:00:00 2001 From: jpvillam Date: Thu, 27 Oct 2022 15:11:28 +0000 Subject: [PATCH 0229/1922] [ROCM] Enable Sparse Pickle Test (#82729) Missed stream context for serialization ### Description Missing ROCm stream context on memory operations for serialization ### Testing Ran the sparse pickle test Pull Request resolved: https://github.com/pytorch/pytorch/pull/82729 Approved by: https://github.com/ngimel --- test/test_sparse.py | 1 - torch/csrc/serialization.cpp | 25 +++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/test/test_sparse.py b/test/test_sparse.py index 8ae982c034ae4..125fb6d83b300 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -3019,7 +3019,6 @@ def test_change_tensor_metadata(self, device, dtype): self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) self.assertEqual(list(t.coalesce().values().size()), [1, 3]) - @skipIfRocm @coalescedonoff @dtypes(torch.double) def test_pickle(self, device, dtype, coalesced): diff --git a/torch/csrc/serialization.cpp b/torch/csrc/serialization.cpp index 46f3a04f355b4..385a074b1ccb4 100644 --- a/torch/csrc/serialization.cpp +++ b/torch/csrc/serialization.cpp @@ -233,7 +233,18 @@ void THPStorage_writeFileRaw( int64_t numel = size_bytes / element_size; if (self->device_type() == at::kCPU) { data = self->data(); -#ifdef USE_CUDA +#if defined(USE_CUDA) && defined(TORCH_HIP_VERSION) && \ + (TORCH_HIP_VERSION >= 301) + } else if 
(self->device_type() == at::kCUDA) { + cpu_data = std::unique_ptr(new char[size_bytes]); + data = (uint8_t*)cpu_data.get(); + C10_CUDA_CHECK(hipMemcpyWithStream( + data, + self->data(), + size_bytes, + cudaMemcpyDeviceToHost, + c10::hip::getCurrentHIPStreamMasqueradingAsCUDA())); +#elif defined(USE_CUDA) } else if (self->device_type() == at::kCUDA) { cpu_data = std::unique_ptr(new char[size_bytes]); data = (uint8_t*)cpu_data.get(); @@ -398,7 +409,17 @@ c10::intrusive_ptr THPStorage_readFileRaw( } } -#ifdef USE_CUDA +#if defined(USE_CUDA) && defined(TORCH_HIP_VERSION) && \ + (TORCH_HIP_VERSION >= 301) + if (storage->device_type() == at::kCUDA) { + C10_CUDA_CHECK(hipMemcpyWithStream( + storage->data(), + data, + nbytes, + cudaMemcpyHostToDevice, + c10::hip::getCurrentHIPStreamMasqueradingAsCUDA())); + } +#elif defined(USE_CUDA) if (storage->device_type() == at::kCUDA) { C10_CUDA_CHECK(cudaMemcpy( storage->data(), data, nbytes, cudaMemcpyHostToDevice)); From e602b0ca950f28035397250c3da8f1f9d4bfe871 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 27 Oct 2022 15:38:48 +0000 Subject: [PATCH 0230/1922] [BE] Move remaining workflows off Xenial (#87834) Both BE and prerequisite for moving our CI/CD to C++17 compiler (gcc-5.4 is not fully C++17 compliant) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87834 Approved by: https://github.com/weiwangmeta, https://github.com/kit1980, https://github.com/huydhn --- .circleci/docker/build.sh | 80 ++----------------- .circleci/scripts/build_android_gradle.sh | 2 +- .../workflows/_android-full-build-test.yml | 2 +- .github/workflows/docker-builds.yml | 5 +- .github/workflows/pull.yml | 32 ++++---- .github/workflows/trunk.yml | 8 +- 6 files changed, 28 insertions(+), 101 deletions(-) diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index b38456badc271..ec2dfe8cb60ce 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -33,7 +33,7 @@ function extract_all_from_image_name() { 
if [ "x${name}" = xpy ]; then vername=ANACONDA_PYTHON_VERSION fi - # skip non-conforming fields such as "pytorch", "linux" or "xenial" without version string + # skip non-conforming fields such as "pytorch", "linux" or "bionic" without version string if [ -n "${name}" ]; then extract_version_from_image_name "${name}" "${vername}" fi @@ -46,11 +46,7 @@ if [[ "$image" == *xla* ]]; then exit 0 fi -if [[ "$image" == *-xenial* ]]; then - UBUNTU_VERSION=16.04 -elif [[ "$image" == *-artful* ]]; then - UBUNTU_VERSION=17.10 -elif [[ "$image" == *-bionic* ]]; then +if [[ "$image" == *-bionic* ]]; then UBUNTU_VERSION=18.04 elif [[ "$image" == *-focal* ]]; then UBUNTU_VERSION=20.04 @@ -79,7 +75,7 @@ elif [[ "$image" == *rocm* ]]; then DOCKERFILE="${OS}-rocm/Dockerfile" fi -if [[ "$image" == *xenial* ]] || [[ "$image" == *bionic* ]]; then +if [[ "$image" == *bionic* ]]; then CMAKE_VERSION=3.13.5 fi @@ -91,44 +87,6 @@ _UCC_COMMIT=12944da33f911daf505d9bbc51411233d0ed85e1 # configuration, so we hardcode everything here rather than do it # from scratch case "$image" in - pytorch-linux-xenial-py3.8) - ANACONDA_PYTHON_VERSION=3.8 - GCC_VERSION=7 - # Do not install PROTOBUF, DB, and VISION as a test - ;; - pytorch-linux-xenial-py3.7-gcc7.2) - ANACONDA_PYTHON_VERSION=3.7 - GCC_VERSION=7 - # Do not install PROTOBUF, DB, and VISION as a test - ;; - pytorch-linux-xenial-py3.7-gcc7) - ANACONDA_PYTHON_VERSION=3.7 - GCC_VERSION=7 - PROTOBUF=yes - DB=yes - VISION=yes - ;; - pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7) - CUDA_VERSION=10.2 - CUDNN_VERSION=7 - ANACONDA_PYTHON_VERSION=3.7 - GCC_VERSION=7 - PROTOBUF=yes - DB=yes - VISION=yes - KATEX=yes - ;; - pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7) - CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names - CUDNN_VERSION=8 - TENSORRT_VERSION=8.0.1.6 - ANACONDA_PYTHON_VERSION=3.7 - GCC_VERSION=7 - PROTOBUF=yes - DB=yes - VISION=yes - KATEX=yes - ;; pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9) 
CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names CUDNN_VERSION=8 @@ -167,20 +125,6 @@ case "$image" in UCC_COMMIT=${_UCC_COMMIT} CONDA_CMAKE=yes ;; - pytorch-linux-xenial-py3-clang5-asan) - ANACONDA_PYTHON_VERSION=3.7 - CLANG_VERSION=5.0 - PROTOBUF=yes - DB=yes - VISION=yes - ;; - pytorch-linux-xenial-py3-clang7-asan) - ANACONDA_PYTHON_VERSION=3.7 - CLANG_VERSION=7 - PROTOBUF=yes - DB=yes - VISION=yes - ;; pytorch-linux-focal-py3-clang7-asan) ANACONDA_PYTHON_VERSION=3.7 CLANG_VERSION=7 @@ -189,13 +133,6 @@ case "$image" in VISION=yes CONDA_CMAKE=yes ;; - pytorch-linux-xenial-py3-clang7-onnx) - ANACONDA_PYTHON_VERSION=3.7 - CLANG_VERSION=7 - PROTOBUF=yes - DB=yes - VISION=yes - ;; pytorch-linux-focal-py3-clang10-onnx) ANACONDA_PYTHON_VERSION=3.7 CLANG_VERSION=10 @@ -204,9 +141,9 @@ case "$image" in VISION=yes CONDA_CMAKE=yes ;; - pytorch-linux-xenial-py3-clang5-android-ndk-r19c) + pytorch-linux-focal-py3-clang7-android-ndk-r19c) ANACONDA_PYTHON_VERSION=3.7 - CLANG_VERSION=5.0 + CLANG_VERSION=7 LLVMDEV=yes PROTOBUF=yes ANDROID=yes @@ -214,13 +151,6 @@ case "$image" in GRADLE_VERSION=6.8.3 NINJA_VERSION=1.9.0 ;; - pytorch-linux-xenial-py3.7-clang7) - ANACONDA_PYTHON_VERSION=3.7 - CLANG_VERSION=7 - PROTOBUF=yes - DB=yes - VISION=yes - ;; pytorch-linux-bionic-py3.7-clang9) ANACONDA_PYTHON_VERSION=3.7 CLANG_VERSION=9 diff --git a/.circleci/scripts/build_android_gradle.sh b/.circleci/scripts/build_android_gradle.sh index 598e9cd0a6bd2..2007c91fe395a 100755 --- a/.circleci/scripts/build_android_gradle.sh +++ b/.circleci/scripts/build_android_gradle.sh @@ -24,7 +24,7 @@ export GRADLE_LOCAL_PROPERTIES=~/workspace/android/local.properties rm -f $GRADLE_LOCAL_PROPERTIES echo "sdk.dir=/opt/android/sdk" >> $GRADLE_LOCAL_PROPERTIES echo "ndk.dir=/opt/ndk" >> $GRADLE_LOCAL_PROPERTIES -echo "cmake.dir=/usr/local" >> $GRADLE_LOCAL_PROPERTIES +echo "cmake.dir=/usr" >> $GRADLE_LOCAL_PROPERTIES retry () { $* || (sleep 1 && $*) || (sleep 2 && 
$*) || (sleep 4 && $*) || (sleep 8 && $*) diff --git a/.github/workflows/_android-full-build-test.yml b/.github/workflows/_android-full-build-test.yml index 1680461be78ef..9f110db307aea 100644 --- a/.github/workflows/_android-full-build-test.yml +++ b/.github/workflows/_android-full-build-test.yml @@ -128,7 +128,7 @@ jobs: # run gradle buildRelease (echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \ - -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \ + -e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang7-android-ndk-r19c-gradle-build" \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e AWS_DEFAULT_REGION \ -e PR_NUMBER \ diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index dd59d44e8a9d3..3108f4b926a89 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -41,10 +41,7 @@ jobs: - docker-image-name: pytorch-linux-focal-rocm5.2-py3.8 - docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12 - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12 - - docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c - - docker-image-name: pytorch-linux-xenial-py3-clang5-asan - - docker-image-name: pytorch-linux-xenial-py3-clang7-onnx + - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - docker-image-name: pytorch-linux-focal-py3.7-gcc7 - docker-image-name: pytorch-linux-focal-py3-clang7-asan - docker-image-name: pytorch-linux-focal-py3-clang10-onnx diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 849e70dc9f29d..0f95186141bfb 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -184,12 +184,12 @@ jobs: docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.docker-image }} test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.test-matrix }} - 
linux-xenial-py3-clang5-mobile-build: - name: linux-xenial-py3-clang5-mobile-build + linux-focal-py3-clang7-mobile-build: + name: linux-focal-py3-clang7-mobile-build uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-xenial-py3-clang5-mobile-build - docker-image-name: pytorch-linux-xenial-py3-clang5-asan + build-environment: linux-focal-py3-clang7-mobile-build + docker-image-name: pytorch-linux-focal-py3-clang7-asan build-generates-artifacts: false linux-jammy-cuda-11_6-cudnn8-py3_8-clang12-build: @@ -199,12 +199,12 @@ jobs: build-environment: linux-jammy-cuda11.6-cudnn8-py3.8-clang12 docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12 - linux-xenial-py3-clang5-mobile-custom-build-static: - name: linux-xenial-py3-clang5-mobile-custom-build-static + linux-focal-py3-clang7-mobile-custom-build-static: + name: linux-focal-py3-clang7-mobile-custom-build-static uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-xenial-py3-clang5-mobile-custom-build-static - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + build-environment: linux-focal-py3-clang7-mobile-custom-build-static + docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c build-generates-artifacts: false linux-bionic-py3_7-clang8-xla-build: @@ -275,19 +275,19 @@ jobs: build-environment: linux-bionic-cuda11.6-py3.10-gcc7-bazel-test docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 - linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single: - name: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single + linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single: + name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single uses: ./.github/workflows/_android-build-test.yml with: - build-environment: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + build-environment: 
linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single + docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c - linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit: - name: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit + linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit: + name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit uses: ./.github/workflows/_android-build-test.yml with: - build-environment: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit + docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c linux-focal-py3_7-gcc7-mobile-lightweight-dispatch-build: name: linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 29dc9f3c44d3f..d92c5a079d978 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -118,12 +118,12 @@ jobs: build-environment: linux-bionic-cuda11.7-py3.10-gcc7-no-ops docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7 - pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build: - name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build + pytorch-linux-focal-py3-clang7-android-ndk-r19c-build: + name: pytorch-linux-focal-py3-clang7-android-ndk-r19c-build uses: ./.github/workflows/_android-full-build-test.yml with: - build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + build-environment: pytorch-linux-focal-py3-clang7-android-ndk-r19c-build + docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c linux-bionic-py3_7-clang9-slow-build: name: linux-bionic-py3.7-clang9-slow From 
1a54fa338569de96a43a86300e403a7bf2f80989 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Wed, 26 Oct 2022 14:11:22 -0700 Subject: [PATCH 0231/1922] functionalization: fix detach() (#87750) `.detach()` worked in basic cases previously, but didn't properly preserve view relationships between the base and the output. This wasn't heavily tested, because autograd doesn't normally encounter `FunctionalTensorWrapper` directly, but could become more common if we fuse functionalization and autograd into a single tracing pass. This will also be a bug fix for LTC (and XLA when they use functionalization) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87750 Approved by: https://github.com/ezyang --- aten/src/ATen/FunctionalTensorWrapper.cpp | 4 ++ test/test_functionalization.py | 51 +++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 91136f921b1ad..03630c39bbf8b 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -302,12 +302,16 @@ c10::intrusive_ptr FunctionalTensorWrapper::shallow_copy_and_detach_ return r; } } + auto impl = c10::make_intrusive(value_); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), /*version_counter=*/std::forward(version_counter), /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->level_ = level_; + impl->generation_ = generation_; + impl->view_metas_ = view_metas_; impl->refresh_numel(); impl->refresh_contiguous(); return impl; diff --git a/test/test_functionalization.py b/test/test_functionalization.py index 041e5b84f6945..521cb4e9e0cec 100644 --- a/test/test_functionalization.py +++ b/test/test_functionalization.py @@ -24,6 +24,7 @@ def are_aliased(x, y): def _functionalize(f, *, reapply_views: bool): def wrapped(a): input_functional = torch._to_functional_tensor(a) + input_functional.requires_grad = a.requires_grad 
torch._enable_functionalization(reapply_views=reapply_views) try: out = f(input_functional) @@ -101,6 +102,56 @@ def f(x): return z2 self.assert_functionalization(f, torch.ones(4)) + def test_view_clone_view_inplace(self): + def f(input): + shape = [1, 1024, 128, 128] + input_reshaped = input.view(shape) + out = input_reshaped.clone() + r = out.view(input.shape) + r.relu_() + return r + + def g(x): + loss = f(x).sum() + from functorch._src.aot_autograd import setup_stacktrace_preservation_hooks + import torch.fx.traceback as fx_traceback + setup_stacktrace_preservation_hooks([loss.grad_fn]) + with fx_traceback.override_stack_trace(): + loss.backward() + return x.grad + + with torch.autograd.detect_anomaly(check_nan=False): + logs = self.get_logs(g, torch.ones(16, 64, 128, 128, requires_grad=True)) + self.assertExpectedInline(logs, """\ + + + +def forward(self, a_1): + view_copy = torch.ops.aten.view_copy.default(a_1, [1, 1024, 128, 128]); a_1 = None + clone = torch.ops.aten.clone.default(view_copy); view_copy = None + view_copy_1 = torch.ops.aten.view_copy.default(clone, [16, 64, 128, 128]); clone = None + relu = torch.ops.aten.relu.default(view_copy_1); view_copy_1 = None + sum_1 = torch.ops.aten.sum.default(relu) + ones_like = torch.ops.aten.ones_like.default(sum_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False, memory_format = torch.preserve_format); sum_1 = None + expand_copy = torch.ops.aten.expand_copy.default(ones_like, [16, 64, 128, 128]); ones_like = None + new_zeros = torch.ops.aten.new_zeros.default(expand_copy, [16777216]) + as_strided_copy = torch.ops.aten.as_strided_copy.default(new_zeros, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0) + as_strided_copy_1 = torch.ops.aten.as_strided_copy.default(new_zeros, [1, 1024, 128, 128], [16777216, 16384, 128, 1], 0) + as_strided_scatter = torch.ops.aten.as_strided_scatter.default(new_zeros, expand_copy, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0); 
new_zeros = expand_copy = None + as_strided_copy_2 = torch.ops.aten.as_strided_copy.default(as_strided_scatter, [1, 1024, 128, 128], [16777216, 16384, 128, 1], 0); as_strided_scatter = None + new_empty_strided = torch.ops.aten.new_empty_strided.default(as_strided_copy_2, [1, 1024, 128, 128], [16777216, 16384, 128, 1]) + as_strided_copy_3 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0) + as_strided_copy_4 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0) + clone_1 = torch.ops.aten.clone.default(as_strided_copy_4, memory_format = torch.contiguous_format); as_strided_copy_4 = None + threshold_backward = torch.ops.aten.threshold_backward.default(clone_1, relu, 0); clone_1 = relu = None + _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1]) + detach_copy = torch.ops.aten.detach_copy.default(_reshape_alias_copy); _reshape_alias_copy = None + as_strided_scatter_1 = torch.ops.aten.as_strided_scatter.default(as_strided_copy_2, threshold_backward, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0); as_strided_copy_2 = threshold_backward = None + _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(as_strided_scatter_1, [16, 64, 128, 128], [1048576, 16384, 128, 1]); as_strided_scatter_1 = None + detach_copy_1 = torch.ops.aten.detach_copy.default(_reshape_alias_copy_1); _reshape_alias_copy_1 = None + return detach_copy_1 + """) # noqa: B950 + def test_simple(self): def f(x): # simple test: 1 view op, 1 inplace op From 72ef093855039cbc6b661461288eeb0a8befae19 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Wed, 26 Oct 2022 14:11:22 -0700 Subject: [PATCH 0232/1922] add nesting to TORCH_SHOW_DISPATCH_TRACE (#87751) Added indents to `TORCH_SHOW_DISPATCH_TRACE` so that you can more easily see the call tree from the dispatcher.
Definitely slower, but it's all guarded under the `DEBUG` build. Example output: I know we have the PyDispatcher now, but I still found this helpful for debugging ``` [call] op=[aten::ones], key=[BackendSelect] [redispatch] op=[aten::ones], key=[CPU] [call] op=[aten::empty.memory_format], key=[BackendSelect] [redispatch] op=[aten::empty.memory_format], key=[CPU] [call] op=[aten::fill_.Scalar], key=[CPU] [call] op=[aten::clone], key=[AutogradCPU] [redispatch] op=[aten::clone], key=[CPU] [call] op=[aten::empty_strided], key=[BackendSelect] [redispatch] op=[aten::empty_strided], key=[CPU] [call] op=[aten::copy_], key=[CPU] [call] op=[aten::view], key=[PythonTLSSnapshot] [redispatchBoxed] op=[aten::view], key=[AutogradCPU] [redispatch] op=[aten::view], key=[ADInplaceOrView] [redispatch] op=[aten::view], key=[Functionalize] [call] op=[aten::view], key=[PythonTLSSnapshot] [redispatchBoxed] op=[aten::view], key=[Meta] [call] op=[aten::view], key=[PythonTLSSnapshot] [redispatchBoxed] op=[aten::view], key=[Python] [callBoxed] op=[aten::view], key=[CPU] [call] op=[aten::clone], key=[PythonTLSSnapshot] [redispatchBoxed] op=[aten::clone], key=[AutogradCPU] [redispatch] op=[aten::clone], key=[Functionalize] [callBoxed] op=[aten::clone], key=[PythonTLSSnapshot] [redispatchBoxed] op=[aten::clone], key=[Python] [callBoxed] op=[aten::clone], key=[CPU] [call] op=[aten::empty_strided], key=[BackendSelect] [redispatch] op=[aten::empty_strided], key=[CPU] [call] op=[aten::copy_], key=[CPU] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/87751 Approved by: https://github.com/ezyang, https://github.com/zou3519 --- aten/src/ATen/core/dispatch/Dispatcher.cpp | 6 ++++++ aten/src/ATen/core/dispatch/Dispatcher.h | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 667eefdcc5ab8..45214a3fd20f2 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ 
b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -9,6 +9,12 @@ bool show_dispatch_trace() { return temp != nullptr; } +static thread_local int64_t dispatch_trace_nesting_value_; + +void dispatch_trace_nesting_incr() { ++dispatch_trace_nesting_value_; } +void dispatch_trace_nesting_decr() { --dispatch_trace_nesting_value_; } +int64_t dispatch_trace_nesting_value() { return dispatch_trace_nesting_value_; } + namespace detail { class RegistrationListenerList final { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 1ea677b54ef5a..2f383d589e29f 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -19,6 +19,14 @@ namespace c10 { TORCH_API bool show_dispatch_trace(); +TORCH_API void dispatch_trace_nesting_incr(); +TORCH_API void dispatch_trace_nesting_decr(); +TORCH_API int64_t dispatch_trace_nesting_value(); + +struct DispatchTraceNestingGuard { + DispatchTraceNestingGuard() { dispatch_trace_nesting_incr(); } + ~DispatchTraceNestingGuard() { dispatch_trace_nesting_decr(); } +}; class TORCH_API OperatorHandle; template class TypedOperatorHandle; @@ -583,7 +591,10 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorHandl auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor() .template getDispatchKeySetUnboxed(args...); #ifndef NDEBUG + DispatchTraceNestingGuard debug_guard; if (show_dispatch_trace()) { + auto nesting_value = dispatch_trace_nesting_value(); + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; std::cerr << "[call] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; } #endif @@ -603,7 +614,10 @@ inline Return Dispatcher::redispatch(const TypedOperatorHandle detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 // do not use RecordFunction on redispatch #ifndef NDEBUG + DispatchTraceNestingGuard 
debug_guard; if (show_dispatch_trace()) { + auto nesting_value = dispatch_trace_nesting_value(); + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; std::cerr << "[redispatch] op=[" << op.operator_name() << "], key=[" << toString(currentDispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; } #endif @@ -616,7 +630,10 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const const auto& entry = op.operatorDef_->op; auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); #ifndef NDEBUG + DispatchTraceNestingGuard debug_guard; if (show_dispatch_trace()) { + auto nesting_value = dispatch_trace_nesting_value(); + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; std::cerr << "[callBoxed] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; } #endif @@ -666,7 +683,10 @@ inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet // note: this doesn't need the mutex because write operations on the list keep iterators intact. 
const auto& entry = op.operatorDef_->op; #ifndef NDEBUG + DispatchTraceNestingGuard debug_guard; if (show_dispatch_trace()) { + auto nesting_value = dispatch_trace_nesting_value(); + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; std::cerr << "[redispatchBoxed] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; } #endif From 44d9123d16cf6979d7867eec2a6f3aa6d9038f85 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Thu, 27 Oct 2022 15:53:11 +0000 Subject: [PATCH 0233/1922] Fix type promotion for 2 wrapped scalar args (#87845) Fixes #76801 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87845 Approved by: https://github.com/SherlockNoMad, https://github.com/mruberry --- aten/src/ATen/TensorIterator.cpp | 3 ++- aten/src/ATen/TensorIterator.h | 7 +++++-- test/test_binary_ufuncs.py | 20 +++++++------------ test/test_ops.py | 3 ++- .../_internal/common_methods_invocations.py | 4 ++-- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 7b1442db75ad4..7e86163f1ca4c 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -431,7 +431,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { } // Computes a common dtype, if needed - if (has_different_input_dtypes && config.promote_inputs_to_common_dtype_) { + if ((has_different_input_dtypes || all_ops_are_scalars_) && config.promote_inputs_to_common_dtype_) { common_dtype_ = compute_common_dtype(); } @@ -1237,6 +1237,7 @@ void TensorIteratorBase::compute_shape(const TensorIteratorConfig& config) { shape_ = infer_size_dimvector(shape_, shape); } } + all_ops_are_scalars_ = !has_tensors; } void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) { diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index 59f52d9dbd2ed..31ae65466870a 100644 --- 
a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -659,9 +659,12 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// in operands_). int num_outputs_ = 0; - /// Whether or not all operands have the same shape. Having all the same - /// shape affects whether or not the iterator is eligible for fast setup. + /// Whether or not all operands have the same shape and are 1d+. Having all + /// the same shape affects whether or not the iterator is eligible for fast + /// setup. bool all_ops_same_shape_ = false; + /// Whether or not all operands are 0d, this affects type promotion + bool all_ops_are_scalars_ = false; /// The "computation" dtype of TensorIterator, specifying what the dtype /// we will do the internal computation in TensorIterator. Typically, diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index abcbb493342bf..8ffab2daa6e28 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -491,9 +491,6 @@ def test_type_promotion(self, device, op): make_tensor, (5,), device=device, **op.rhs_make_tensor_kwargs ) - make_lhs_scalar_tensor = partial( - make_tensor, (), device='cpu', **op.lhs_make_tensor_kwargs - ) make_rhs_scalar_tensor = partial( make_tensor, (), device='cpu', **op.rhs_make_tensor_kwargs ) @@ -782,17 +779,14 @@ def _supported(dtypes): ) self.assertEqual(result.dtype, expected_dtype) - # scalar int x scalar float + # scalar x scalar # Note: result dtype is default float type - # TODO: FIXME: re-enable this, scalar x scalar type promotion is currently broken - # https://github.com/pytorch/pytorch/issues/76801 - # if op.supports_two_python_scalars and _supported((torch.long, torch.float32)): - # lhs_i_scalar = 1 - # rhs_f_scalar = 2. 
- - # result = op(lhs_i_scalar, rhs_f_scalar) - # expected_dtype = torch.get_default_dtype() if not op.always_returns_bool else torch.bool - # self.assertEqual(result.dtype, expected_dtype) + if op.supports_two_python_scalars and _supported((torch.long, torch.float32)): + rhs_f_scalar = 2. + for lhs in (1, 1.): + result = op(lhs, rhs_f_scalar) + expected_dtype = torch.get_default_dtype() if not op.always_returns_bool else torch.bool + self.assertEqual(result.dtype, expected_dtype) # TODO: move to error input test @ops(binary_ufuncs, allowed_dtypes=(torch.float32,)) diff --git a/test/test_ops.py b/test/test_ops.py index 1d20151c20e89..fa8812aa5d8ee 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -457,7 +457,8 @@ def test_errors(self, device, op): for ei in error_inputs: si = ei.sample_input with self.assertRaisesRegex(ei.error_type, ei.error_regex): - op(si.input, *si.args, **si.kwargs) + out = op(si.input, *si.args, **si.kwargs) + self.assertFalse(isinstance(out, type(NotImplemented))) @skipMeta @onlyNativeDeviceTypes diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index f04a2cc5465bb..fb4238234a98f 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -12826,7 +12826,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, - supports_two_python_scalars=True, + supports_one_python_scalar=True, skips=( DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'), DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',), @@ -12861,7 +12861,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_out=False, - supports_two_python_scalars=True, + supports_one_python_scalar=True, skips=( 
DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'), DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',), From cfef1a1e8023b673e64dfecdb41e445eb63d4c61 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 26 Oct 2022 10:49:38 -0700 Subject: [PATCH 0234/1922] [dynamo] Error when user nests FX with dynamo (#87797) Today, this doesn't work and dynamo errors out in a very non-obvious way (see: https://gist.github.com/suo/dde04830372ab51a4a34ea760f14200a). Here, we detect the error early and exit with a nicer msg. Also add a config option to just no-op dynamo (which we need to unblock internal enablement). cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87797 Approved by: https://github.com/yf225, https://github.com/soumith, https://github.com/jansel --- test/dynamo/test_misc.py | 14 ++++++++++++++ torch/_dynamo/config.py | 4 ++++ torch/_dynamo/eval_frame.py | 9 +++++++++ 3 files changed, 27 insertions(+) diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index a0f592212f4e1..a63a6d8930c80 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -2732,6 +2732,20 @@ def forward(self, x): dynamo_result = graph(x) self.assertTrue(same(real, dynamo_result)) + def test_error_on_nested_fx_trace(self): + input = torch.rand(2, 3) + + def f(x): + x + x + + real = f(input) + + optimized = torch._dynamo.optimize("eager")(f) + self.assertTrue(same(optimized(input), real)) + + with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"): + gm = torch.fx.symbolic_trace(optimized) + class CustomFunc(torch.autograd.Function): @staticmethod diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index f24eeeae76882..162891d2fd9dc 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -152,6 +152,10 @@ # How to import torchinductor, either torchinductor 
or torch.inductor inductor_import = dynamo_import.replace("dynamo", "inductor") +# If true, error with a better message if we symbolically trace over a +# dynamo-optimized function. If false, silently suppress dynamo. +error_on_nested_fx_trace = True + # root folder of the project if "torch." in dynamo_import: base_dir = dirname(dirname(dirname(abspath(__file__)))) diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index d86653f9973cc..2d0938a83a123 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -14,6 +14,7 @@ import torch import torch.utils._pytree as pytree +from torch.fx._symbolic_trace import is_fx_tracing from torch.fx.experimental.proxy_tensor import make_fx from torch.nn.parallel.distributed import DistributedDataParallel @@ -149,6 +150,14 @@ def __call__(self, *args, **kwargs): @functools.wraps(fn) def _fn(*args, **kwargs): + if is_fx_tracing(): + if config.error_on_nested_fx_trace: + raise RuntimeError( + "Detected that you are using FX to symbolically trace " + "a dynamo-optimized function. This is not supported at the moment." 
+ ) + return fn + on_enter() prior = set_eval_frame(callback) backend_ctx = backend_ctx_ctor() From 5373502038b9a4566656590546d085e3acbf5b51 Mon Sep 17 00:00:00 2001 From: Akshit Khurana Date: Wed, 26 Oct 2022 15:44:00 -0700 Subject: [PATCH 0235/1922] [dynamo] Add ao.nn to skipfiles inline allowlist (#87820) Summary: Allow torch.ao.nn module to be inlined Test Plan: Tested manually for https://github.com/pytorch/torchdynamo/issues/1737 Reviewers: Subscribers: Tasks: Tags: cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx Differential Revision: [D40768679](https://our.internmc.facebook.com/intern/diff/D40768679) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87820 Approved by: https://github.com/jansel --- torch/_dynamo/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index 162891d2fd9dc..2601be8983f2a 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -96,6 +96,7 @@ torch.nn, torch.distributions, torch.testing, + torch.ao.nn, } if HAS_REFS_PRIMS: skipfiles_inline_module_allowlist |= { From 071a18c8cd0f2d1f2cea176545b8e5dd30939682 Mon Sep 17 00:00:00 2001 From: samdow Date: Thu, 27 Oct 2022 17:10:04 +0000 Subject: [PATCH 0236/1922] fix typo in per sample grad test (#87790) Pull Request resolved: https://github.com/pytorch/pytorch/pull/87790 Approved by: https://github.com/zou3519 --- test/functorch/test_eager_transforms.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py index d8cd765706e5a..9361e51454787 100644 --- a/test/functorch/test_eager_transforms.py +++ b/test/functorch/test_eager_transforms.py @@ -3089,8 +3089,10 @@ def test_resnet18_per_sample_grads(self, device): func_model, weights = make_functional(model) def 
compute_loss(weights, image, target): - output = func_model(weights, images) - loss = criterion(output, targets) + image = image.unsqueeze(0) + target = target.unsqueeze(0) + output = func_model(weights, image) + loss = criterion(output, target) return loss batch_size = 3 @@ -3100,7 +3102,7 @@ def compute_loss(weights, image, target): result_grads = vmap(grad(compute_loss), in_dims=(None, 0, 0))(weights, images, targets) expected_grads = [ - torch.autograd.grad(compute_loss(weights, images[i].unsqueeze(0), targets[i].unsqueeze(0)), weights) + torch.autograd.grad(compute_loss(weights, images[i], targets[i]), weights) for i in range(batch_size) ] expected_grads = [torch.stack(shards) for shards in zip(*expected_grads)] From 7a7f46c5c56ab25314455490a21cc3b8cda3e77a Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 27 Oct 2022 19:49:29 +0000 Subject: [PATCH 0237/1922] [dynamo] add inductor runs w/o cudagraphs (#87847) as title cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx Pull Request resolved: https://github.com/pytorch/pytorch/pull/87847 Approved by: https://github.com/jansel --- benchmarks/dynamo/Makefile_dashboard | 5 ++++- benchmarks/dynamo/common.py | 12 ++++++++++-- benchmarks/dynamo/runner.py | 2 ++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/benchmarks/dynamo/Makefile_dashboard b/benchmarks/dynamo/Makefile_dashboard index 729178f538408..1c75d608e7d71 100644 --- a/benchmarks/dynamo/Makefile_dashboard +++ b/benchmarks/dynamo/Makefile_dashboard @@ -5,15 +5,17 @@ PIP ?= python -m pip clone-deps: (cd ../../.. 
\ && (test -e torchvision || git clone --recursive https://github.com/pytorch/vision torchvision) \ + && (test -e torchdata || git clone --recursive https://github.com/pytorch/data.git torchdata) \ && (test -e torchtext || git clone --recursive https://github.com/pytorch/text torchtext) \ && (test -e detectron2 || git clone --recursive https://github.com/facebookresearch/detectron2) \ && (test -e torchbenchmark || git clone --recursive https://github.com/pytorch/benchmark torchbenchmark) \ && (test -e triton || git clone --recursive https://github.com/openai/triton.git) \ ) -pull-deps: +pull-deps: clone-deps echo $(TRITON_VERSION) (cd ../../../torchvision && git pull && git submodule update --init --recursive) + (cd ../../../torchdata && git pull && git submodule update --init --recursive) (cd ../../../torchtext && git pull && git submodule update --init --recursive) (cd ../../../detectron2 && git pull && git submodule update --init --recursive) (cd ../../../torchbenchmark && git pull && git submodule update --init --recursive) @@ -28,6 +30,7 @@ build-deps: clone-deps conda install -y -c pytorch magma-cuda116 conda install -y -c conda-forge librosa (cd ../../../torchvision && python setup.py clean && python setup.py develop) + (cd ../../../torchdata && python setup.py install) (cd ../../../torchtext && python setup.py clean && python setup.py develop) (cd ../../../detectron2 && python setup.py clean && python setup.py develop) (cd ../../../torchbenchmark && python install.py --continue_on_fail) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index 86e6bb62842f6..88de22f326cfe 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -27,6 +27,7 @@ from torch._dynamo.profiler import fx_insert_profiling, Profiler from torch._dynamo.testing import dummy_fx_compile, format_speedup, same from torch._dynamo.utils import clone_inputs +from torch._inductor import config as inductor_config from torch._inductor.utils import 
fresh_triton_cache from torch._subclasses.fake_tensor import FakeTensorMode from torch.utils._pytree import tree_map @@ -1360,6 +1361,11 @@ def parse_args(): action="store_true", help="Use a fresh triton cachedir when running each model, to force cold-start compile.", ) + parser.add_argument( + "--disable-cudagraphs", + action="store_true", + help="Disables cudagraphs for Inductor", + ) group_fuser = parser.add_mutually_exclusive_group() # --nvfuser is now the default, keep the option to not break scripts @@ -1619,8 +1625,6 @@ def main(runner, original_dir=None): experiment = speedup_experiment output_filename = "overheads.csv" elif args.inductor or args.inductor_dynamic: - from torch._inductor import config as inductor_config - inductor_config.debug = args.verbose if args.threads: inductor_config.cpp.threads = args.threads @@ -1705,6 +1709,10 @@ def main(runner, original_dir=None): experiment = coverage_experiment output_filename = "coverage.csv" + if args.inductor or args.backend == "inductor": + if args.disable_cudagraphs: + inductor_config.triton.cudagraphs = False + runner.setup_amp() if args.output: diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py index ce952095bd352..f5ec96e4f500b 100755 --- a/benchmarks/dynamo/runner.py +++ b/benchmarks/dynamo/runner.py @@ -65,6 +65,7 @@ "aot_cudagraphs": "--training --backend=aot_cudagraphs ", "aot_nvfuser": "--training --nvfuser --backend=aot_nvfuser ", "inductor": "--training --inductor ", + "inductor_no_cudagraphs": "--training --inductor --disable-cudagraphs ", }, "inference": { "ts_nnc": "--speedup-ts", @@ -85,6 +86,7 @@ "aot_cudagraphs", "aot_nvfuser", "inductor", + "inductor_no_cudagraphs", ], "inference": ["ts_nvfuser_cudagraphs", "inductor"], "dtypes": [ From 8db1612c3d74951c7c3700d27dd2852706871791 Mon Sep 17 00:00:00 2001 From: Jiewen Tan Date: Thu, 27 Oct 2022 20:39:30 +0000 Subject: [PATCH 0238/1922] [LTC] Remove lazy::View (#87822) Summary: This is the first part to remove the whole view 
and aliasing infrastructure in LTC, which is deprecated in favor of functionalization. It mainly removes things that use lazy::View. Test Plan: CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/87822 Approved by: https://github.com/JackCaoG, https://github.com/antoniojkim, https://github.com/wconstab --- .github/ci_commit_pins/xla.txt | 2 +- build_variables.bzl | 1 - torch/csrc/lazy/core/ir_builder.h | 133 --------- torch/csrc/lazy/core/lazy_graph_executor.cpp | 61 ---- torch/csrc/lazy/core/lazy_graph_executor.h | 5 - torch/csrc/lazy/core/lazy_view.cpp | 262 ------------------ torch/csrc/lazy/core/lazy_view.h | 173 ------------ torch/csrc/lazy/core/tensor.cpp | 156 +---------- torch/csrc/lazy/core/tensor.h | 33 --- torch/csrc/lazy/ts_backend/ir_builder.h | 78 ------ .../csrc/lazy/ts_backend/tensor_aten_ops.cpp | 193 ------------- torch/csrc/lazy/ts_backend/tensor_aten_ops.h | 85 ------ 12 files changed, 14 insertions(+), 1168 deletions(-) delete mode 100644 torch/csrc/lazy/core/lazy_view.cpp delete mode 100644 torch/csrc/lazy/core/lazy_view.h diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 86063843174d2..a3de2aba624ea 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -79131e9d31290744afdf3d85118251863e16ab0e +095ee628212f0235ad0d6908bdd514123639fc86 diff --git a/build_variables.bzl b/build_variables.bzl index f1801b446ed8c..017ed9aef5413 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -414,7 +414,6 @@ lazy_tensor_core_sources = [ "torch/csrc/lazy/core/ir_metadata.cpp", "torch/csrc/lazy/core/ir_util.cpp", "torch/csrc/lazy/core/lazy_graph_executor.cpp", - "torch/csrc/lazy/core/lazy_view.cpp", "torch/csrc/lazy/core/metrics.cpp", "torch/csrc/lazy/core/multi_wait.cpp", "torch/csrc/lazy/core/ops/arithmetic_ir_ops.cpp", diff --git a/torch/csrc/lazy/core/ir_builder.h b/torch/csrc/lazy/core/ir_builder.h index 8e645c485158e..20e4730d50135 100644 --- 
a/torch/csrc/lazy/core/ir_builder.h +++ b/torch/csrc/lazy/core/ir_builder.h @@ -73,59 +73,6 @@ struct IrBuilder { const size_t& num_outputs = 1, const hash_t& hash_seed = static_cast(0x5a2d296e9)) const = 0; - // View op nodes - virtual NodePtr MakeAsStridedViewUpdate( - const Value& input0, - const Value& input1, - const std::vector& size, - const std::vector& stride, - const int64_t& storage_offset) const = 0; - virtual NodePtr MakeAsStrided( - const Value& input0, - const std::vector& size, - const std::vector& stride, - const int64_t& storage_offset) const = 0; - virtual NodePtr MakeDiagonalViewUpdate( - const Value& input0, - const Value& input1, - const int64_t& offset, - const int64_t& dim1, - const int64_t& dim2) const = 0; - virtual NodePtr MakeDiagonal( - const Value& input0, - const int64_t& offset, - const int64_t& dim1, - const int64_t& dim2) const = 0; - virtual NodePtr MakeNarrowViewUpdate( - const Value& input0, - const Value& input1, - const std::vector& base_indices) const = 0; - virtual NodePtr MakeNarrow( - const Value& input0, - const std::vector& base_indices, - const std::vector& sizes) const = 0; - virtual NodePtr MakePermute( - const Value& input0, - const std::vector& dims) const = 0; - virtual NodePtr MakeResize( - const Value& input0, - const std::vector& size) const = 0; - virtual NodePtr MakeSelectViewUpdate( - const Value& input0, - const Value& input1, - const int64_t& dim, - const int64_t& start, - const int64_t& end, - const int64_t& stride) const = 0; - virtual NodePtr MakeSelect( - const Value& input0, - const int64_t& dim, - const int64_t& start, - const int64_t& end, - const int64_t& stride) const = 0; - virtual NodePtr MakeSqueeze(const Value& input0, const int& dim) const = 0; - virtual NodePtr MakeUnsqueeze(const Value& input0, const int& dim) const = 0; - // dynamic ir nodes virtual NodePtr MakeSizeNode(const Value& input, size_t dim) const = 0; virtual NodePtr MakeSizeAdd(const Value& a, const Value& b) const = 0; @@ 
-173,86 +120,6 @@ static inline NodePtr MakeGeneric( op, operands, shape, num_outputs, hash_seed); } -// View op nodes -static inline NodePtr MakeAsStridedViewUpdate( - const Value& input0, - const Value& input1, - const std::vector& size, - const std::vector& stride, - const int64_t& storage_offset) { - return getIrBuilder()->MakeAsStridedViewUpdate( - input0, input1, size, stride, storage_offset); -} -static inline NodePtr MakeAsStrided( - const Value& input0, - const std::vector& size, - const std::vector& stride, - const int64_t& storage_offset) { - return getIrBuilder()->MakeAsStrided(input0, size, stride, storage_offset); -} -static inline NodePtr MakeDiagonalViewUpdate( - const Value& input0, - const Value& input1, - const int64_t& offset, - const int64_t& dim1, - const int64_t& dim2) { - return getIrBuilder()->MakeDiagonalViewUpdate( - input0, input1, offset, dim1, dim2); -} -static inline NodePtr MakeDiagonal( - const Value& input0, - const int64_t& offset, - const int64_t& dim1, - const int64_t& dim2) { - return getIrBuilder()->MakeDiagonal(input0, offset, dim1, dim2); -} -static inline NodePtr MakeNarrowViewUpdate( - const Value& input0, - const Value& input1, - const std::vector& base_indices) { - return getIrBuilder()->MakeNarrowViewUpdate(input0, input1, base_indices); -} -static inline NodePtr MakeNarrow( - const Value& input0, - const std::vector& base_indices, - const std::vector& sizes) { - return getIrBuilder()->MakeNarrow(input0, base_indices, sizes); -} -static inline NodePtr MakePermute( - const Value& input0, - const std::vector& dims) { - return getIrBuilder()->MakePermute(input0, dims); -} -static inline NodePtr MakeResize( - const Value& input0, - const std::vector& size) { - return getIrBuilder()->MakeResize(input0, size); -} -static inline NodePtr MakeSelectViewUpdate( - const Value& input0, - const Value& input1, - const int64_t& dim, - const int64_t& start, - const int64_t& end, - const int64_t& stride) { - return 
getIrBuilder()->MakeSelectViewUpdate( - input0, input1, dim, start, end, stride); -} -static inline NodePtr MakeSelect( - const Value& input0, - const int64_t& dim, - const int64_t& start, - const int64_t& end, - const int64_t& stride) { - return getIrBuilder()->MakeSelect(input0, dim, start, end, stride); -} -static inline NodePtr MakeSqueeze(const Value& input0, const int& dim) { - return getIrBuilder()->MakeSqueeze(input0, dim); -} -static inline NodePtr MakeUnsqueeze(const Value& input0, const int& dim) { - return getIrBuilder()->MakeUnsqueeze(input0, dim); -} - // dynamic ir nodes static inline NodePtr MakeSizeNode(const Value& input, size_t dim) { return getIrBuilder()->MakeSizeNode(input, dim); diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 06b37797d3fa6..4989ce24a0ef1 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -789,33 +789,6 @@ LazyGraphExecutor::CompilationResult LazyGraphExecutor::Compile( Value ir_value = tensors[index]->CurrentIrValue(); lowering_ctx->AddResult(ir_value); } - if (FLAGS_torch_lazy_param_aliasing && coll.config.sync_ltc_data) { - // We can only alias at the step barrier, when force_ltc_data is true. - // Consider the case: - // 1. Tensor A(DEVICE_DATA) - // 2. Tensor B = A + 0.9 - // 3. A += 0.4 - // If we activate aliasing for A's graph, and we do: - // print(A) - // print(A) - // The first print will update DEVICE_DATA' with DEVICE_DATA+0.4, and the - // second print will again update DEVICE_DATA" with DEVICE_DATA'+0.4, which - // will lead to incorrect results. - // We cannot normally turn A's state into DEVICE_DATA, as if any of the - // sources is a view, this will not lead to correct results (as A's value - // taken at different times need to reflect view source changes): - // 1. Tensor A = some_graph_with_view_source(V) - // 2. print(A) - // 3. V += 1 - // 4. 
print(A) - // The second print should reflect the new value due to V's changes. - // Also in the first example, unless we are doing a step barrier and hence - // include all live tensors, if the B value is not part of the graph, it - // will later fetch the new value of A, which is incorrect. - // But, when we issue a step barrier (force_ltc_data == true) we have to - // turn everything into DEVICE_DATA, so we can activate aliasing. - BuildInputOutputAliases(tensors, coll.indices, lowering_ctx.get()); - } ComputationPtr computation = lowering_ctx->Build(); // If force_ltc_data is true it means that we did a proper sync and are @@ -866,40 +839,6 @@ LazyGraphExecutor::ComputationCache::TypePtr LazyGraphExecutor:: typedef SSIZE_T ssize_t; #endif -void LazyGraphExecutor::BuildInputOutputAliases( - const std::vector& tensors, - c10::ArrayRef indices, - LoweringContext* lowering_ctx) { - std::unordered_map output_tensor_id_map; - for (const auto i : c10::irange(indices.size())) { - size_t tensor_index = indices[i]; - int64_t tensor_id = tensors[tensor_index]->GetUniqueId(); - output_tensor_id_map[tensor_id] = i; - } - const std::vector& parameters_data = - lowering_ctx->GetParametersData(); - std::vector alias_map(indices.size(), -1); - for (const auto i : c10::irange(parameters_data.size())) { - DeviceDataInfo* data_info = - dynamic_cast(parameters_data[i]->info()); - if (data_info != nullptr && !data_info->read_only) { - auto it = output_tensor_id_map.find(data_info->tensor_id); - if (it != output_tensor_id_map.end()) { - size_t output_index = it->second; - if (lowering_ctx->CheckResultShape(parameters_data[i], output_index) && - alias_map[output_index] < 0) { - lowering_ctx->SetUpAlias({static_cast(output_index)}, i, {}); - alias_map[output_index] = i; - - VLOG(6) << "Aliased parameter " << i << " with output " - << output_index << ": " << Shape(parameters_data[i]->shape()); - } - } - } - } - TORCH_LAZY_VALUE_METRIC("InputOutputAliasCount", alias_map.size()); -} - 
std::shared_ptr LazyGraphExecutor:: SyncTensorsGraphInternal( std::vector* tensors, diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h index 7a4498d85fc0f..b7e10374fbb76 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.h +++ b/torch/csrc/lazy/core/lazy_graph_executor.h @@ -223,11 +223,6 @@ class TORCH_API LazyGraphExecutor { ComputationCache::TypePtr LookupCachedCompile(const hash_t& hash); - void BuildInputOutputAliases( - const std::vector& tensors, - c10::ArrayRef indices, - LoweringContext* lowering_ctx); - std::shared_ptr SyncTensorsGraphInternal( std::vector* tensors, c10::ArrayRef devices, diff --git a/torch/csrc/lazy/core/lazy_view.cpp b/torch/csrc/lazy/core/lazy_view.cpp deleted file mode 100644 index d52c0f62fb77e..0000000000000 --- a/torch/csrc/lazy/core/lazy_view.cpp +++ /dev/null @@ -1,262 +0,0 @@ -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace torch { -namespace lazy { -namespace { - -Value ApplyViewInfo(Value ir_value, const ViewInfo& view_info) { - switch (view_info.view_type) { - case ViewInfo::Type::kSelect: - return MakeSelect( - ir_value, - view_info.select->dim, - view_info.select->start, - view_info.select->end, - view_info.select->stride); - case ViewInfo::Type::kNarrow: - return MakeNarrow( - ir_value, view_info.indices, view_info.shape.sizes().vec()); - case ViewInfo::Type::kNoOp: - return ir_value; - case ViewInfo::Type::kPermute: - return MakePermute(ir_value, view_info.permutation); - case ViewInfo::Type::kReshape: - return MakeView(ir_value, view_info.shape.sizes().vec()); - case ViewInfo::Type::kResize: - return MakeResize(ir_value, view_info.shape.sizes().vec()); - case ViewInfo::Type::kSqueeze: - return MakeSqueeze(ir_value, view_info.squeeze_index); - case ViewInfo::Type::kUnsqueeze: - return MakeUnsqueeze(ir_value, view_info.squeeze_index); - case ViewInfo::Type::kAsStrided: - return MakeAsStrided( - ir_value, - 
view_info.shape.sizes().vec(), - view_info.as_strided->stride, - view_info.as_strided->offset); - case ViewInfo::Type::kDiagonal: - return MakeDiagonal( - ir_value, - view_info.diagonal->offset, - view_info.diagonal->dim1, - view_info.diagonal->dim2); - default: - TORCH_INTERNAL_ASSERT( - false, "Invalid view type: ", GetEnumValue(view_info.view_type)); - } -} - -// Here we are trying to populate inplace updated values from the latest view -// all the way back to the original tensor. -// For example: -// a = torch.diagonal(b) -// b.add_(1) # a should be updated as well. -// -// Ideally we should all have a *ViewUpdate IR which updates the original -// tensor/view withe current value. See DiagonalViewUpdate and corresponding -// LowerDiagonalViewUpdate in ts_node_lowering.cpp. There are some "edge cases" -// here simply because they can smartly reuse some other ops to undo themselves. -Value ApplyUpdate(Value ir_value, const Alias::UpdateData& update_data) { - // We first bring the source IR value forward, by reshaping and slicing. - std::vector tmp_values({ir_value}); - for (const ViewInfo& view_info : update_data.view_infos) { - tmp_values.push_back(ApplyViewInfo(tmp_values.back(), view_info)); - } - // We then move backward given the source update value, by reshaping and - // slice-updating. 
- Value result = update_data.ir_value; - for (size_t i = update_data.view_infos.size(); i > 0; --i) { - const ViewInfo& view_info = update_data.view_infos[i - 1]; - switch (view_info.view_type) { - case ViewInfo::Type::kSelect: - result = MakeSelectViewUpdate( - tmp_values[i - 1], - result, - view_info.select->dim, - view_info.select->start, - view_info.select->end, - view_info.select->stride); - break; - case ViewInfo::Type::kNarrow: - result = - MakeNarrowViewUpdate(tmp_values[i - 1], result, view_info.indices); - break; - case ViewInfo::Type::kNoOp: - break; - case ViewInfo::Type::kPermute: - result = MakePermute(result, InversePermutation(view_info.permutation)); - break; - case ViewInfo::Type::kReshape: - result = MakeView(result, view_info.source_shape.sizes().vec()); - break; - case ViewInfo::Type::kResize: - result = MakeResize(result, view_info.source_shape.sizes().vec()); - break; - case ViewInfo::Type::kSqueeze: - result = MakeUnsqueeze(ir_value, view_info.squeeze_index); - break; - case ViewInfo::Type::kUnsqueeze: - result = MakeSqueeze(ir_value, view_info.squeeze_index); - break; - case ViewInfo::Type::kAsStrided: - result = MakeAsStridedViewUpdate( - tmp_values[i - 1], - result, - view_info.source_shape.sizes().vec(), - view_info.as_strided->stride, - view_info.as_strided->offset); - break; - case ViewInfo::Type::kDiagonal: - result = MakeDiagonalViewUpdate( - tmp_values[i - 1], - result, - view_info.diagonal->offset, - view_info.diagonal->dim1, - view_info.diagonal->dim2); - break; - default: - TORCH_INTERNAL_ASSERT( - false, "Invalid view type: ", GetEnumValue(view_info.view_type)); - } - } - return result; -} - -} // namespace - -ViewInfo::ViewInfo(Type view_type, Shape shape, Shape source_shape) - : view_type(view_type), - shape(std::move(shape)), - indices(source_shape.dim(), 0), - source_shape(std::move(source_shape)) {} - -ViewInfo::ViewInfo(Type view_type, Shape shape, Shape source_shape, int64_t sqi) - : view_type(view_type), - 
shape(std::move(shape)), - source_shape(std::move(source_shape)), - squeeze_index(sqi) { - TORCH_CHECK(view_type == Type::kSqueeze); -} - -ViewInfo::ViewInfo( - Type view_type, - Shape source_shape, - std::vector permutation) - : view_type(view_type), - shape(MakePermuteShape(source_shape, permutation)), - source_shape(std::move(source_shape)), - permutation(std::move(permutation)) { - TORCH_CHECK(view_type == Type::kPermute); -} - -ViewInfo::ViewInfo(Type view_type, const Shape& source_shape, SelectInfo select) - : view_type(view_type), - shape(MakeSelectShape( - source_shape, - select.dim, - select.start, - select.end, - select.stride)), - source_shape(source_shape), - select(select) { - TORCH_CHECK(view_type == Type::kSelect); -} - -ViewInfo::ViewInfo( - Type view_type, - Shape shape, - Shape source_shape, - AsStridedInfo as_strided) - : view_type(view_type), - shape(std::move(shape)), - source_shape(std::move(source_shape)), - as_strided(std::move(as_strided)) { - TORCH_CHECK(view_type == Type::kAsStrided); -} - -ViewInfo::ViewInfo( - Type view_type, - const Shape& source_shape, - DiagonalInfo diagonal) - : view_type(view_type), - shape(MakeDiagonalShape( - source_shape, - diagonal.offset, - diagonal.dim1, - diagonal.dim2)), - source_shape(source_shape), - diagonal(diagonal) { - TORCH_CHECK(view_type == Type::kDiagonal); -} - -void Alias::Update(Value ir_value, std::vector view_infos) { - if (!updates_.empty() && updates_.back().view_infos == view_infos) { - updates_.back().ir_value = std::move(ir_value); - } else { - updates_.push_back({std::move(ir_value), std::move(view_infos)}); - } - ++generation_; -} - -Value Alias::SyncUpdateOperations() { - for (auto& update_data : updates_) { - root_ir_value_ = ApplyUpdate(root_ir_value_, update_data); - } - updates_.clear(); - return root_ir_value_; -} - -LazyView::LazyView( - Shape shape, - std::shared_ptr alias, - ViewInfo view_info) - : shape_(std::move(shape)), alias_(std::move(alias)) { - 
view_infos_.push_back(std::move(view_info)); -} - -LazyView::LazyView( - Shape shape, - std::shared_ptr alias, - std::vector view_infos) - : view_infos_(std::move(view_infos)), - shape_(std::move(shape)), - alias_(std::move(alias)) {} - -void LazyView::Update(Value ir_value) { - alias_->Update(std::move(ir_value), view_infos_); -} - -std::shared_ptr LazyView::CreateSubView( - Shape shape, - ViewInfo view_info) { - std::vector view_infos(view_infos_); - view_infos.push_back(std::move(view_info)); - return std::make_shared( - std::move(shape), alias_, std::move(view_infos)); -} - -std::tuple LazyView::GetViewIrNode() { - if (IsUpToDate()) { - return std::make_tuple(ir_value_, false); - } - Value update = alias_->SyncUpdateOperations(); - for (auto& view_info : view_infos_) { - update = ApplyViewInfo(update, view_info); - } - ir_value_ = update; - generation_ = alias_->generation(); - return std::make_tuple(ir_value_, true); -} - -} // namespace lazy -} // namespace torch diff --git a/torch/csrc/lazy/core/lazy_view.h b/torch/csrc/lazy/core/lazy_view.h deleted file mode 100644 index 5e1a106494cfb..0000000000000 --- a/torch/csrc/lazy/core/lazy_view.h +++ /dev/null @@ -1,173 +0,0 @@ -#pragma once - -#include -#include -#include - -#include -#include - -namespace torch { -namespace lazy { - -struct TORCH_API SelectInfo { - bool operator==(const SelectInfo& ref) const { - return dim == ref.dim && start == ref.start && end == ref.end && - stride == ref.stride; - } - - int64_t dim = 0; - int64_t start = 0; - int64_t end = 0; - int64_t stride = 0; -}; - -struct TORCH_API AsStridedInfo { - bool operator==(const AsStridedInfo& ref) const { - return offset == ref.offset && stride == ref.stride; - } - - std::vector stride; - int64_t offset = 0; -}; - -struct TORCH_API DiagonalInfo { - bool operator==(const DiagonalInfo& ref) const { - return offset == ref.offset && dim1 == ref.dim1 && dim2 == ref.dim2; - } - - int64_t offset = 0; - int64_t dim1 = 0; - int64_t dim2 = 1; -}; - 
-struct TORCH_API ViewInfo { - enum class Type { - kInvalid, - kNarrow, - kNoOp, - kPermute, - kReshape, - kResize, - kSelect, - kAsStrided, - kDiagonal, - kSqueeze, - kUnsqueeze, - }; - - ViewInfo() = default; - ViewInfo(Type view_type, Shape shape, Shape source_shape); - ViewInfo(Type view_type, Shape shape, Shape source_shape, int64_t sqi); - ViewInfo( - Type view_type, - Shape source_shape, - std::vector permutation); - ViewInfo(Type view_type, const Shape& source_shape, SelectInfo select); - ViewInfo( - Type view_type, - Shape shape, - Shape source_shape, - AsStridedInfo as_strided); - ViewInfo(Type view_type, const Shape& source_shape, DiagonalInfo diagonal); - - bool operator==(const ViewInfo& ref) const { - return view_type == ref.view_type && shape == ref.shape && - indices == ref.indices && source_shape == ref.source_shape && - permutation == ref.permutation && select == ref.select && - as_strided == ref.as_strided && diagonal == ref.diagonal; - } - - Type view_type = Type::kInvalid; - // The shape of the result of a view. In case of narrowing, this represents - // the size of the narrow slice. - Shape shape; - // In case of narrowing, the starting indices from where the narrow slice is - // cut. - std::vector indices; - // The shape of the source of this view. - Shape source_shape; - // The permutation to be used. If empty, this is not a permute operation. - std::vector permutation; - // Information used for sliced views. - c10::optional select; - // Information used for as_strided views. - c10::optional as_strided; - // Information used for diagonal views. - c10::optional diagonal; - // Squeeze/Unsqueeze Index - int64_t squeeze_index; -}; - -// When a "view" (capture by reference) is taken on a node, an Alias object is -// created on the captured node itself, with its current IR Node value. 
-class TORCH_API Alias { - public: - struct UpdateData { - Value ir_value; - std::vector view_infos; - }; - - explicit Alias(Value ir_value) : root_ir_value_(std::move(ir_value)) {} - - size_t generation() const { - return generation_; - } - - // Appends an update to the IR value stored within the alias. The ir_value is - // the value to be written, and view_infos represents the forward path from - // the alias's ir_value to the update ir_value. - void Update(Value ir_value, std::vector view_infos); - - Value SyncUpdateOperations(); - - private: - // The IR value which is the root at which the view was created. - Value root_ir_value_; - // The stacked updates on the view. Orders matter, as most recent updates - // might overwrite older ones. - std::vector updates_; - // Incremented every time an update happens. Used by view to track alias - // changes and regenerate the most current value. - size_t generation_ = 0; -}; - -class TORCH_API LazyView { - public: - LazyView(Shape shape, std::shared_ptr alias, ViewInfo view_info); - LazyView( - Shape shape, - std::shared_ptr alias, - std::vector view_infos); - - void Update(Value ir_value); - - const Shape& shape() const { - return shape_; - } - - const std::shared_ptr& alias() const { - return alias_; - } - - std::shared_ptr CreateSubView(Shape shape, ViewInfo view_info); - - // Extracts the current IrNode out of a view, into a IrNode structure - // where the updated fields tells whether a new IR value has been created, or - // the cached one returned. 
- std::tuple GetViewIrNode(); - - bool IsUpToDate() const { - return ir_value_ && generation_ == alias_->generation(); - } - - private: - std::vector view_infos_; - Shape shape_; - std::shared_ptr alias_; - Value ir_value_; - size_t generation_ = 0; -}; - -} // namespace lazy -} // namespace torch diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp index 0a114d0e71179..734dc5fdbd9ac 100644 --- a/torch/csrc/lazy/core/tensor.cpp +++ b/torch/csrc/lazy/core/tensor.cpp @@ -47,15 +47,6 @@ LazyTensorPtr LazyTensor::Create(Value ir_value, const BackendDevice& device) { return lazy_tensor; } -LazyTensorPtr LazyTensor::Create( - std::shared_ptr view, - const BackendDevice& device) { - LazyTensorPtr lazy_tensor = - c10::make_intrusive(LazyTensor(std::move(view), device)); - LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data_ptr()); - return lazy_tensor; -} - LazyTensorPtr LazyTensor::Create(BackendDataPtr handle) { LazyTensorPtr lazy_tensor = c10::make_intrusive(LazyTensor(std::move(handle))); @@ -78,11 +69,6 @@ LazyTensor::LazyTensor(Value ir_value, const BackendDevice& device) TryLimitGraphSize(); } -LazyTensor::LazyTensor( - std::shared_ptr view, - const BackendDevice& device) - : LazyTensor(std::make_shared(std::move(view), device)) {} - LazyTensor::LazyTensor(std::shared_ptr data) : data_(std::move(data)) {} LazyTensor::Data* LazyTensor::data() const { @@ -102,9 +88,6 @@ at::ScalarType LazyTensor::dtype() const { } MaybeRef LazyTensor::shape() const { - if (data()->view != nullptr) { - return data()->view->shape(); - } if (data()->handle != nullptr) { return Shape(data()->handle->shape()); } @@ -126,45 +109,23 @@ int64_t LazyTensor::GetUniqueId() const { return data()->unique_id; } -std::ptrdiff_t LazyTensor::GetViewAliasId() const { - return data()->view != nullptr - ? 
reinterpret_cast(data()->view->alias().get()) - : 0; -} - BackendDataPtr LazyTensor::GetDataHandle() { - // Data can coexist with a view, but we need to check that the view did - // not receive any updates before calling the current IR valid. - bool up_to_date = true; - Value ir_value; - if (data()->view != nullptr) { - bool updated = false; - std::tie(ir_value, updated) = GetViewUpdate(data()->view); - up_to_date = !updated; - } - if (up_to_date) { - BackendDataPtr handle = CurrentDataHandle(); - if (handle != nullptr) { - TORCH_CHECK( - handle->HasValue(), - "Trying to access data while an async operation is in flight: ", - handle->shape().to_string()); - return handle; - } - } - if (ir_value) { - // The view gave us an updated IR value. We usually do not have a valid IR - // value field together with a view, but to allow code reuse in - // ApplyPendingGraph() we temporarily set it here. The following call to - // ApplyPendingGraph() will clear it. - AssignIrValue(std::move(ir_value)); + BackendDataPtr handle = CurrentDataHandle(); + if (handle != nullptr) { + TORCH_CHECK( + handle->HasValue(), + "Trying to access data while an async operation is in flight: ", + handle->shape().to_string()); + return handle; } + if (data()->ir_value) { ApplyPendingGraph(); } else { TORCH_CHECK(data()->tensor_data); data()->handle = TensorToDataHandle(*data()->tensor_data, GetDevice()); } + return data()->handle; } @@ -179,10 +140,9 @@ void LazyTensor::SetDataHandle(BackendDataPtr handle) { void LazyTensor::SetDataHandle(BackendDataPtr handle, bool sync) { data()->handle = std::move(handle); // Assigning a device data should always clear the IR node, to allow graph - // trimming. A view cannot be reset though, unless we are at a step-end sync. + // trimming. 
AssignIrValue(Value()); if (sync) { - data()->view = nullptr; data()->tensor_data = c10::nullopt; } } @@ -190,16 +150,8 @@ void LazyTensor::SetDataHandle(BackendDataPtr handle, bool sync) { void LazyTensor::SetIrValue(Value ir_value) { data()->handle = nullptr; data()->tensor_data = c10::nullopt; - if (data()->view != nullptr) { - // If we have an active view, and a SetIrValue() happens, it means we are - // within an in-place execution context, and we need to update the view's - // alias as well. - data()->view = UpdateView(data()->view, std::move(ir_value)); - data()->generation += 1; - } else { - AssignIrValue(std::move(ir_value)); - TryLimitGraphSize(); - } + AssignIrValue(std::move(ir_value)); + TryLimitGraphSize(); } void LazyTensor::SetInPlaceIrValue(Value ir_value) { @@ -252,9 +204,6 @@ Value LazyTensor::GetIrValue() const { } Value LazyTensor::CurrentIrValue() const { - if (data()->view != nullptr) { - return std::get<0>(GetViewUpdate(data()->view)); - } return data()->ir_value; } @@ -263,9 +212,6 @@ void LazyTensor::SetTensorData(at::Tensor tensor_data) { } c10::optional LazyTensor::CurrentTensorData() const { - if (data()->view != nullptr && !data()->view->IsUpToDate()) { - return c10::nullopt; - } return data()->tensor_data; } @@ -288,69 +234,6 @@ Value LazyTensor::GetIrValueForTensor( return CreateTensorNode(std::move(data), read_only); } -std::tuple LazyTensor::GetViewUpdate( - const std::shared_ptr& view) const { - auto value_with_update = view->GetViewIrNode(); - if (std::get<1>(value_with_update)) { - data()->handle = nullptr; - data()->tensor_data = c10::nullopt; - } - return value_with_update; -} - -std::shared_ptr LazyTensor::UpdateView( - std::shared_ptr view, - Value ir_value) const { - if (ir_value.shape().sizes() != view->shape().sizes()) { - TORCH_CHECK(ir_value.shape().numel() == view->shape().numel()); - - ViewInfo view_info( - ViewInfo::Type::kReshape, ir_value.shape(), view->shape()); - view = view->CreateSubView(view_info.shape, 
view_info); - } - view->Update(std::move(ir_value)); - return view; -} - -void LazyTensor::SetSubView(ViewInfo view_info) const { - data()->view = data()->view->CreateSubView(view_info.shape, view_info); - data()->generation += 1; -} - -void LazyTensor::ModifyCurrentView(ViewInfo view_info) const { - if (data()->view != nullptr) { - SetSubView(view_info); - return; - } - // This node is not a view. Since this function is meant to modify a view - // in place, we need to turn this existing tensor into a view. - Value ir_value = GetIrValue(); - std::shared_ptr alias = std::make_shared(ir_value); - data()->view = std::make_shared(view_info.shape, alias, view_info); - AssignIrValue(Value()); -} - -std::shared_ptr LazyTensor::CreateView(ViewInfo view_info) const { - if (data()->view != nullptr) { - return data()->view->CreateSubView(view_info.shape, view_info); - } - // This node is not a view, and creating a view forks the current node into - // becoming one itself. This means creating an alias with the current IR - // Node, and using the same alias for the created IR Node. 
- Value ir_value = GetIrValue(); - std::shared_ptr alias = std::make_shared(ir_value); - ViewInfo this_view_info( - ViewInfo::Type::kNoOp, ir_value.shape(), ir_value.shape()); - data()->view = std::make_shared( - ir_value.shape(), alias, std::move(this_view_info)); - AssignIrValue(Value()); - return std::make_shared(view_info.shape, alias, view_info); -} - -LazyTensorPtr LazyTensor::CreateViewTensor(ViewInfo view_info) const { - return Create(CreateView(std::move(view_info)), GetDevice()); -} - at::Tensor LazyTensor::ToTensor(bool detached) { at::Tensor tensor; c10::optional tensor_data = CurrentTensorData(); @@ -367,8 +250,7 @@ at::Tensor LazyTensor::ToTensor(bool detached) { } else { tensor = *tensor_data; if (detached) { - if (data()->ir_value || data()->handle != nullptr || - data()->view != nullptr) { + if (data()->ir_value || data()->handle != nullptr) { // If we have other authoritive sources, just drop our reference and // transfer it to the caller. data()->tensor_data = c10::nullopt; @@ -388,7 +270,6 @@ void LazyTensor::ShallowCopyTo(LazyTensorPtr dest) const { void LazyTensor::SetTensor(at::Tensor tensor) { SetTensorData(tensor); - data()->view = nullptr; data()->handle = nullptr; AssignIrValue(Value()); } @@ -401,25 +282,14 @@ void LazyTensor::UpdateFromTensor(at::Tensor tensor, bool sync) { SetTensorData(tensor); data()->handle = nullptr; AssignIrValue(Value()); - if (data()->view != nullptr) { - Value ir_value = GetIrValueForTensor(tensor, GetDevice()); - data()->view = UpdateView(data()->view, std::move(ir_value)); - } } } void LazyTensor::UpdateFromTensorOut(at::Tensor tensor) { - if (data()->view != nullptr && shape().Get().numel() != tensor.numel()) { - data()->view = nullptr; - } UpdateFromTensor(std::move(tensor), /*sync=*/false); } void LazyTensor::UpdateFromTensorOut(const LazyTensorPtr& tensor) { - if (data()->view != nullptr && - shape().Get().numel() != tensor->shape().Get().numel()) { - data()->view = nullptr; - } 
SetIrValue(tensor->GetIrValue()); } diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h index 052b84b4a60cc..85ea6ab4f4c61 100644 --- a/torch/csrc/lazy/core/tensor.h +++ b/torch/csrc/lazy/core/tensor.h @@ -5,7 +5,6 @@ #include #include #include -#include #include namespace torch { @@ -37,10 +36,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { : ir_value(std::move(ir_value)), device(std::move(device)), unique_id(GetNextTensorId()) {} - Data(std::shared_ptr view, BackendDevice device) - : view(std::move(view)), - device(std::move(device)), - unique_id(GetNextTensorId()) {} Data(at::Tensor tensor_data, BackendDevice device) : tensor_data(std::move(tensor_data)), device(std::move(device)), @@ -50,7 +45,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { BackendDataPtr handle; Value ir_value; - std::shared_ptr view; c10::optional tensor_data; const BackendDevice device; const int64_t unique_id = 0; @@ -76,10 +70,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { return data()->generation; } - LazyTensorPtr alias() const { - return c10::make_intrusive(LazyTensor(data_ptr())); - } - int64_t size(int64_t dim) const; at::Tensor ToTensor(bool detached); @@ -102,10 +92,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { const BackendDevice& GetDevice() const; int64_t GetUniqueId() const; - // Retrieves an opaque ID of the alias object upon which the tensor's view is - // rooted, or 0 if this tensor is not a view. - std::ptrdiff_t GetViewAliasId() const; - // Fetches the data behind the tensor. If the tensor has a graph defining // its current value, executes the graph and fetches the data result. 
BackendDataPtr GetDataHandle(); @@ -129,31 +115,21 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { void SetIrValue(Value ir_value); void SetInPlaceIrValue(Value ir_value); - void SetSubView(ViewInfo view_info) const; - c10::optional CurrentTensorData() const; std::vector MakeOutputTensors(NodePtr node) const; - LazyTensorPtr CreateViewTensor(ViewInfo view_info) const; LazyTensorPtr CopyTensorToDevice(const BackendDevice& device); - void ModifyCurrentView(ViewInfo view_info) const; - // Applies the queue of operations in preparation for using the data. void ApplyPendingGraph(); private: LazyTensor(const at::Tensor& tensor, const BackendDevice& device); LazyTensor(Value ir_value, const BackendDevice& device); - LazyTensor(std::shared_ptr view, const BackendDevice& device); explicit LazyTensor(BackendDataPtr handle); explicit LazyTensor(std::shared_ptr data); - static LazyTensorPtr Create( - std::shared_ptr view, - const BackendDevice& device); - std::shared_ptr data_ptr() const { return data_; } @@ -164,15 +140,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target { Value CreateTensorNode(BackendDataPtr data, bool read_only) const; - std::tuple GetViewUpdate( - const std::shared_ptr& view) const; - - std::shared_ptr UpdateView( - std::shared_ptr view, - Value ir_value) const; - - std::shared_ptr CreateView(ViewInfo view_info) const; - // We build a graph accumulating operations, but at a given point we // need to force a rendering, otherwise the graph can grow without control. 
// Think: diff --git a/torch/csrc/lazy/ts_backend/ir_builder.h b/torch/csrc/lazy/ts_backend/ir_builder.h index 600243b67f622..067efc784ee5a 100644 --- a/torch/csrc/lazy/ts_backend/ir_builder.h +++ b/torch/csrc/lazy/ts_backend/ir_builder.h @@ -55,84 +55,6 @@ struct TorchScriptIrBuilder : IrBuilder { return MakeNode(op, operands, shape, num_outputs, hash_seed); } - // View op nodes - NodePtr MakeAsStridedViewUpdate( - const Value& input0, - const Value& input1, - const std::vector& size, - const std::vector& stride, - const int64_t& storage_offset) const override { - return ReuseOrMakeNode( - input0, input1, size, stride, storage_offset); - } - NodePtr MakeAsStrided( - const Value& input0, - const std::vector& size, - const std::vector& stride, - const int64_t& storage_offset) const override { - return ReuseOrMakeNode(input0, size, stride, storage_offset); - } - NodePtr MakeDiagonalViewUpdate( - const Value& input0, - const Value& input1, - const int64_t& offset, - const int64_t& dim1, - const int64_t& dim2) const override { - return ReuseOrMakeNode( - input0, input1, offset, dim1, dim2); - } - NodePtr MakeDiagonal( - const Value& input0, - const int64_t& offset, - const int64_t& dim1, - const int64_t& dim2) const override { - return ReuseOrMakeNode(input0, offset, dim1, dim2); - } - NodePtr MakeNarrowViewUpdate( - const Value& input0, - const Value& input1, - const std::vector& base_indices) const override { - return ReuseOrMakeNode(input0, input1, base_indices); - } - NodePtr MakeNarrow( - const Value& input0, - const std::vector& base_indices, - const std::vector& sizes) const override { - return ReuseOrMakeNode(input0, base_indices, sizes); - } - NodePtr MakePermute(const Value& input0, const std::vector& dims) - const override { - return ReuseOrMakeNode(input0, dims); - } - NodePtr MakeResize(const Value& input0, const std::vector& size) - const override { - return ReuseOrMakeNode(input0, size); - } - NodePtr MakeSelectViewUpdate( - const Value& input0, - const 
Value& input1, - const int64_t& dim, - const int64_t& start, - const int64_t& end, - const int64_t& stride) const override { - return ReuseOrMakeNode( - input0, input1, dim, start, end, stride); - } - NodePtr MakeSelect( - const Value& input0, - const int64_t& dim, - const int64_t& start, - const int64_t& end, - const int64_t& stride) const override { - return ReuseOrMakeNode