4 changes: 3 additions & 1 deletion .ci/docker/requirements.txt
@@ -11,12 +11,14 @@ sphinx-copybutton==0.5.2
sphinx_sitemap==2.7.1
sphinxcontrib-mermaid==1.0.0
sphinxcontrib.katex==0.9.10
sphinx_tippy==0.4.3
pypandoc==1.15
pandocfilters==1.5.1
markdown==3.8.2

# PyTorch Theme
pytorch_sphinx_theme2==0.2.0
#pytorch_sphinx_theme2==0.2.0
git+https://github.com/pytorch/pytorch_sphinx_theme.git@5b6d2df5660d2ccf4b34cf819b7ab7c69f65f20d#egg=pytorch_sphinx_theme2

# Tutorial dependencies
tqdm==4.66.1
54 changes: 27 additions & 27 deletions advanced_source/cpp_autograd.rst
@@ -15,7 +15,7 @@ Basic autograd operations

(Adapted from `this tutorial <https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#autograd-automatic-differentiation>`_)

Create a tensor and set ``torch::requires_grad()`` to track computation with it
Create a :term:`tensor` and set ``torch::requires_grad()`` to track computation with it

.. code-block:: cpp

@@ -64,7 +64,7 @@ Do more operations on ``y``

auto z = y * y * 3;
auto out = z.mean();

std::cout << z << std::endl;
std::cout << z.grad_fn()->name() << std::endl;
std::cout << out << std::endl;
@@ -90,10 +90,10 @@ Out:
auto a = torch::randn({2, 2});
a = ((a * 3) / (a - 1));
std::cout << a.requires_grad() << std::endl;

a.requires_grad_(true);
std::cout << a.requires_grad() << std::endl;

auto b = (a * a).sum();
std::cout << b.grad_fn()->name() << std::endl;

@@ -106,13 +106,13 @@ Out:
SumBackward0

Let's backprop now. Because ``out`` contains a single scalar, ``out.backward()``
is equivalent to ``out.backward(torch::tensor(1.))``.
is equivalent to ``out.backward(torch::tensor(1.))``. This is part of the :term:`backward pass`.

.. code-block:: cpp

out.backward();

Print gradients d(out)/dx
Print :term:`gradients<gradient>` d(out)/dx

.. code-block:: cpp

@@ -134,12 +134,12 @@ Now let's take a look at an example of vector-Jacobian product:
.. code-block:: cpp

x = torch::randn(3, torch::requires_grad());

y = x * 2;
while (y.norm().item<double>() < 1000) {
y = y * 2;
}

std::cout << y << std::endl;
std::cout << y.grad_fn()->name() << std::endl;

@@ -159,7 +159,7 @@ If we want the vector-Jacobian product, pass the vector to ``backward`` as argum

auto v = torch::tensor({0.1, 1.0, 0.0001}, torch::kFloat);
y.backward(v);

std::cout << x.grad() << std::endl;

Out:
@@ -178,7 +178,7 @@ either by putting ``torch::NoGradGuard`` in a code block

std::cout << x.requires_grad() << std::endl;
std::cout << x.pow(2).requires_grad() << std::endl;

{
torch::NoGradGuard no_grad;
std::cout << x.pow(2).requires_grad() << std::endl;
@@ -218,31 +218,31 @@ please see `the corresponding C++ API docs <https://pytorch.org/cppdocs/api/clas
Computing higher-order gradients in C++
---------------------------------------

One of the applications of higher-order gradients is calculating gradient penalty.
One of the applications of higher-order :term:`gradients<gradient>` is calculating :term:`gradient` penalty.
Let's see an example of it using ``torch::autograd::grad``:

.. code-block:: cpp

#include <torch/torch.h>

auto model = torch::nn::Linear(4, 3);

auto input = torch::randn({3, 4}).requires_grad_(true);
auto output = model(input);

// Calculate loss
auto target = torch::randn({3, 3});
auto loss = torch::nn::MSELoss()(output, target);

// Use norm of gradients as penalty
auto grad_output = torch::ones_like(output);
auto gradient = torch::autograd::grad({output}, {input}, /*grad_outputs=*/{grad_output}, /*create_graph=*/true)[0];
auto gradient_penalty = torch::pow((gradient.norm(2, /*dim=*/1) - 1), 2).mean();

// Add gradient penalty to loss
auto combined_loss = loss + gradient_penalty;
combined_loss.backward();

std::cout << input.grad() << std::endl;

Out:
@@ -277,14 +277,14 @@ Below you can find code for a ``Linear`` function from ``torch::nn``:
.. code-block:: cpp

#include <torch/torch.h>

using namespace torch::autograd;

// Inherit from Function
class LinearFunction : public Function<LinearFunction> {
public:
// Note that both forward and backward are static functions

// bias is an optional argument
static torch::Tensor forward(
AutogradContext *ctx, torch::Tensor input, torch::Tensor weight, torch::Tensor bias = torch::Tensor()) {
@@ -295,21 +295,21 @@ Below you can find code for a ``Linear`` function from ``torch::nn``:
}
return output;
}

static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
auto saved = ctx->get_saved_variables();
auto input = saved[0];
auto weight = saved[1];
auto bias = saved[2];

auto grad_output = grad_outputs[0];
auto grad_input = grad_output.mm(weight);
auto grad_weight = grad_output.t().mm(input);
auto grad_bias = torch::Tensor();
if (bias.defined()) {
grad_bias = grad_output.sum(0);
}

return {grad_input, grad_weight, grad_bias};
}
};
@@ -322,7 +322,7 @@ Then, we can use the ``LinearFunction`` in the following way:
auto weight = torch::randn({4, 3}).requires_grad_();
auto y = LinearFunction::apply(x, weight);
y.sum().backward();

std::cout << x.grad() << std::endl;
std::cout << weight.grad() << std::endl;

@@ -344,9 +344,9 @@ Here, we give an additional example of a function that is parametrized by non-te
.. code-block:: cpp

#include <torch/torch.h>

using namespace torch::autograd;

class MulConstant : public Function<MulConstant> {
public:
static torch::Tensor forward(AutogradContext *ctx, torch::Tensor tensor, double constant) {
@@ -355,7 +355,7 @@ Here, we give an additional example of a function that is parametrized by non-te
ctx->saved_data["constant"] = constant;
return tensor * constant;
}

static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
// We return as many input gradients as there were arguments.
// Gradients of non-tensor arguments to forward must be `torch::Tensor()`.
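The hunk above ends before ``MulConstant::backward`` returns its gradients. For reference, a minimal sketch of such a backward (not taken from this diff; it assumes the constant was stashed in ``ctx->saved_data`` exactly as shown in ``forward``) could look like:

.. code-block:: cpp

   static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs) {
     // d(tensor * constant)/d(tensor) is just the constant, so scale the incoming gradient.
     auto constant = ctx->saved_data["constant"].toDouble();
     // The non-tensor `constant` argument receives an undefined tensor as its "gradient".
     return {grad_outputs[0] * constant, torch::Tensor()};
   }
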
4 changes: 2 additions & 2 deletions advanced_source/cpp_export.rst
@@ -1,3 +1,3 @@
.. warning::
TorchScript is deprecated, please use
Reviewer comment (Contributor): this is a redirect page - why are we adding a term here?

`torch.export <https://docs.pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__ instead.
:term:`TorchScript` is deprecated, please use
`torch.export <https://docs.pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__ instead.
14 changes: 8 additions & 6 deletions advanced_source/cpp_frontend.rst
@@ -36,10 +36,12 @@ This tutorial will walk you through an end-to-end example of training a model
with the C++ frontend. Concretely, we will be training a `DCGAN
<https://arxiv.org/abs/1511.06434>`_ -- a kind of generative model -- to
generate images of MNIST digits. While conceptually a simple example, it should
be enough to give you a whirlwind overview of the PyTorch C++ frontend and wet
your appetite for training more complex models. We will begin with some
motivating words for why you would want to use the C++ frontend to begin with,
and then dive straight into defining and training our model.
be enough to give you a whirlwind overview of the PyTorch C++ frontend and whet
your appetite for training more complex models.

We'll begin with some motivating words for why you would want to use the C++ frontend to begin with,
and then dive straight into defining and training our model. In this tutorial, we'll train
a model on :term:`GPU` for optimal performance.

.. tip::

@@ -960,8 +962,8 @@ Writing the Training Loop

Let's now finish the algorithmic part of our example and implement the delicate
dance between the generator and discriminator. First, we'll create two
optimizers, one for the generator and one for the discriminator. The optimizers
we use implement the `Adam <https://arxiv.org/pdf/1412.6980.pdf>`_ algorithm:
optimizers, one for the generator and one for the discriminator.
The :term:`optimizer`s we use implement the `Adam <https://arxiv.org/pdf/1412.6980.pdf>`_ algorithm:

.. code-block:: cpp

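The code block above is truncated in this view. As a rough sketch only (the learning rate and beta values below are illustrative assumptions, and ``generator`` / ``discriminator`` are the modules defined earlier in that tutorial), constructing the two Adam optimizers with the C++ frontend looks like:

.. code-block:: cpp

   // One optimizer per network; the hyperparameters here are placeholders.
   torch::optim::Adam generator_optimizer(
       generator->parameters(),
       torch::optim::AdamOptions(2e-4).betas(std::make_tuple(0.5, 0.999)));
   torch::optim::Adam discriminator_optimizer(
       discriminator->parameters(),
       torch::optim::AdamOptions(2e-4).betas(std::make_tuple(0.5, 0.999)));
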
2 changes: 1 addition & 1 deletion advanced_source/custom_class_pt2.rst
@@ -247,7 +247,7 @@ After re-compilation, we can export the custom op with:
Why do we need to make a Fake Class?
------------------------------------

Tracing with real custom object has several major downsides:
:term:`Tracing` with real custom object has several major downsides:

1. Operators on real objects can be time consuming e.g. the custom object
might be reading from the network or loading data from the disk.
6 changes: 3 additions & 3 deletions advanced_source/dispatcher.rst
@@ -18,7 +18,7 @@ of another. Here is a sampling of some of the things it handles:
depending on whether or not autograd handling is necessary.
* Applying autocasting when necessary for automatic mixed precision.
* Applying batching rules when an operator is run under a ``vmap`` call.
* Tracing execution of operations, if you are tracing a model for export.
* :term:`Tracing` execution of operations, if you are tracing a model for export.

If in your `custom operator code <torch_script_custom_ops>`_ you find yourself
manually writing if statements to handle these cases, the dispatcher APIs can
@@ -403,8 +403,8 @@ a kernel at the Batched dispatch key.
Tracer
^^^^^^

The Tracer dispatch key implements support for recording invocations of operators
into a trace when you run ``torch.jit.trace``. We intend to provide a
The Tracer dispatch key implements support for recording invocations of :term:`operations<Operation>`
into a trace when you run ``torch.jit.trace`` (:term:`Tracing`).
boxed fallback that will implement tracing for arbitrary operations,
see `issue #41478 <https://github.com/pytorch/pytorch/issues/41478>`_ to track
progress.
8 changes: 5 additions & 3 deletions advanced_source/extend_dispatcher.rst
@@ -3,10 +3,13 @@ Extending dispatcher for a new backend in C++

In this tutorial we will walk through all necessary steps to extend the dispatcher to
add a new device living outside ``pytorch/pytorch`` repo and maintain it to keep in
sync with native PyTorch devices. Here we'll assume that you're familiar with how
with native PyTorch devices. Here we'll assume that you're familiar with how
to `register a dispatched operator in C++ <dispatcher>`_ and how to write a
`custom autograd function <cpp_autograd>`_.

Note: This tutorial covers extending the dispatcher for custom backends that
implement :term:`device kernels<Device Kernel>` for :term:`operations<Operation>`.


.. note::

@@ -295,7 +298,7 @@ JIT support

As we mentioned in `Registering a Dispatched Operator in C++ <dispatcher>`_, kernels registered through `m.impl()` API
support being called in both unboxed and boxed ways. In other words your customized backend can also work with our
JIT tracing/scripting frontend just like the in-tree backends like CPU or CUDA do. You could potentially also write specialized optimization
:term:`JIT` :term:`tracing<Tracing>`/:term:`scripting<Scripting>` frontend just like the in-tree backends like CPU or CUDA do.
passes for your backend on a JIT graph. But we will not discuss it here since we haven't finalized the integration point
in JIT, so the current backend support will focus on the eager frontend for now.

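As a reminder of what the ``m.impl()`` registration referenced above looks like, here is a short sketch (not part of this diff; ``myops``, ``myadd``, and the kernel body are placeholders for a custom backend registered under the ``PrivateUse1`` key):

.. code-block:: cpp

   #include <torch/torch.h>
   #include <torch/library.h>

   // Kernel implementing myops::myadd for the custom backend.
   torch::Tensor myadd_privateuse1(const torch::Tensor& self, const torch::Tensor& other) {
     // ... call into the backend's own implementation here ...
     return self + other;  // placeholder
   }

   // Tell the dispatcher to route myops::myadd to this kernel on the custom backend.
   TORCH_LIBRARY_IMPL(myops, PrivateUse1, m) {
     m.impl("myadd", &myadd_privateuse1);
   }
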
@@ -377,4 +380,3 @@ any feature requests or bug reports, please `file an issue on github <https://gi

If you're interested in helping in any of the future work items above (e.g adding more ``Math``
kernels for PyTorch operators in C++), please reach out to us through Github or Slack!

4 changes: 2 additions & 2 deletions advanced_source/torch-script-parallelism.rst
@@ -1,3 +1,3 @@
.. warning::
TorchScript is deprecated, please use
`torch.export <https://docs.pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__ instead.
:term:`TorchScript` is deprecated, please use
`torch.export <https://docs.pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__ instead.
4 changes: 2 additions & 2 deletions advanced_source/torch_script_custom_ops.rst
@@ -2,5 +2,5 @@
TODO(gmagogsfm): Replace/delete this document by 2.9 release. https://github.com/pytorch/tutorials/issues/3456

.. warning::
TorchScript is deprecated, please use
`torch.export <https://docs.pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__ instead.
:term:`TorchScript` is deprecated, please use
`torch.export <https://docs.pytorch.org/tutorials/intermediate/torch_export_tutorial.html>`__ instead.
2 changes: 1 addition & 1 deletion compilers_index.rst
@@ -10,7 +10,7 @@ control, as well as third-party backend solutions.

.. warning::

TorchScript is no longer in active development.
:term:`TorchScript` is no longer in active development.

.. raw:: html

12 changes: 12 additions & 0 deletions conf.py
@@ -141,8 +141,20 @@ def wrapper(*args, **kwargs):
"sphinx_sitemap",
"sphinx_reredirects",
"sphinxcontrib.mermaid",
"sphinx_tippy",
]

# sphinx-tippy configuration
tippy_props = {
"placement": "auto-start",
"maxWidth": 500,
"interactive": True, # Allow clicking links inside tooltips
"theme": "material",
}

# Skip all URLs except glossary term links (glossary.html#term-*)
tippy_skip_urls = (r"^(?!.*glossary\.html#term-).*$",)

intersphinx_mapping = {
"torch": ("https://docs.pytorch.org/docs/stable/", None),
"tensordict": ("https://docs.pytorch.org/tensordict/stable", None),