From bd4dd4f902aa532f2c1a166155edaee43e9269ea Mon Sep 17 00:00:00 2001
From: DevmateXplatTypoFixes Bot
Date: Mon, 16 Mar 2026 12:01:07 -0700
Subject: [PATCH] [AI Codemod][DevmateXplatTypoFixes] fbsource//xplat/pytorch/audio:src-tree

Reviewed By: franklinho

Differential Revision: D96711074
---
 .../ctc_forced_alignment_api_tutorial.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/examples/tutorials/ctc_forced_alignment_api_tutorial.py b/examples/tutorials/ctc_forced_alignment_api_tutorial.py
index a31b63e345..fb96234dcd 100644
--- a/examples/tutorials/ctc_forced_alignment_api_tutorial.py
+++ b/examples/tutorials/ctc_forced_alignment_api_tutorial.py
@@ -57,7 +57,7 @@ import torchaudio.functional as F
 
 
 ######################################################################
-# First we prepare the speech data and the transcript we area going
+# First we prepare the speech data and the transcript we are going
 # to use.
 #
 
@@ -71,15 +71,15 @@
 # ~~~~~~~~~~~~~~~~~~~~
 #
 # :py:func:`~torchaudio.functional.forced_align` takes emission and
-# token sequences and outputs timestaps of the tokens and their scores.
+# token sequences and outputs timestamps of the tokens and their scores.
 #
-# Emission reperesents the frame-wise probability distribution over
+# Emission represents the frame-wise probability distribution over
 # tokens, and it can be obtained by passing waveform to an acoustic
 # model.
 #
 # Tokens are numerical expression of transcripts. There are many ways to
 # tokenize transcripts, but here, we simply map alphabets into integer,
-# which is how labels were constructed when the acoustice model we are
+# which is how labels were constructed when the acoustic model we are
 # going to use was trained.
 #
 # We will use a pre-trained Wav2Vec2 model,
@@ -161,7 +161,7 @@ def align(emission, tokens):
 #
 # .. note::
 #
-# The alignment is expressed in the frame cordinate of the emission,
+# The alignment is expressed in the frame coordinate of the emission,
 # which is different from the original waveform.
 #
 # It contains blank tokens and repeated tokens. The following is the
@@ -184,7 +184,7 @@ def align(emission, tokens):
 #
 # .. note::
 #
-# When same token occured after blank tokens, it is not treated as
+# When the same token occurs after blank tokens, it is not treated as
 # a repeat, but as a new occurrence.
 #
 # .. code-block::
@@ -200,7 +200,7 @@ def align(emission, tokens):
 # Token-level alignments
 # ~~~~~~~~~~~~~~~~~~~~~~
 #
-# Next step is to resolve the repetation, so that each alignment does
+# The next step is to resolve the repetition, so that each alignment does
 # not depend on previous alignments.
 # :py:func:`torchaudio.functional.merge_tokens` computes the
 # :py:class:`~torchaudio.functional.TokenSpan` object, which represents
@@ -352,7 +352,7 @@ def plot_alignments(waveform, token_spans, emission, transcript, sample_rate=bun
 #
 # When splitting the token-level alignments into words, you will
 # notice that some blank tokens are treated differently, and this makes
-# the interpretation of the result somehwat ambigious.
+# the interpretation of the result somewhat ambiguous.
 #
 # This is easy to see when we plot the scores. The following figure
 # shows word regions and non-word regions, with the frame-level scores
@@ -387,7 +387,7 @@ def plot_scores(word_spans, scores):
 #
 # One reason for this is because the model was trained without a
 # label for the word boundary. The blank tokens are treated not just
-# as repeatation but also as silence between words.
+# as repetition but also as silence between words.
 #
 # But then, a question arises. Should frames immediately after or
 # near the end of a word be silent or repeat?
@@ -400,12 +400,12 @@ def plot_scores(word_spans, scores):
 #
 # Unfortunately, CTC does not provide a comprehensive solution to this.
 # Models trained with CTC are known to exhibit "peaky" response,
-# that is, they tend to spike for an aoccurance of a label, but the
+# that is, they tend to spike for an occurrence of a label, but the
 # spike does not last for the duration of the label.
 # (Note: Pre-trained Wav2Vec2 models tend to spike at the beginning of
-# label occurances, but this not always the case.)
+# label occurrences, but this is not always the case.)
 #
-# :cite:`zeyer2021does` has in-depth alanysis on the peaky behavior of
+# :cite:`zeyer2021does` has an in-depth analysis of the peaky behavior of
 # CTC.
 # We encourage those who are interested understanding more to refer
 # to the paper.
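
As context for the corrected comments, the sketch below shows the alignment flow they describe: obtain frame-wise emissions from a pre-trained Wav2Vec2 model, map the transcript to the model's integer labels, call torchaudio.functional.forced_align for the frame-level alignment, and merge it into token-level TokenSpan objects with torchaudio.functional.merge_tokens. This is a minimal sketch, not part of the patch; it assumes torchaudio >= 2.1 with the MMS_FA pipeline, mono audio, and an illustrative file path and transcript.

import torch
import torchaudio
import torchaudio.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained Wav2Vec2 forced-alignment pipeline (assumption: MMS_FA, torchaudio >= 2.1).
bundle = torchaudio.pipelines.MMS_FA
model = bundle.get_model().to(device)

waveform, sr = torchaudio.load("speech.wav")  # illustrative path, mono audio assumed
waveform = F.resample(waveform, sr, bundle.sample_rate)

with torch.inference_mode():
    emission, _ = model(waveform.to(device))
    # forced_align expects per-frame log-probabilities over the token set.
    log_probs = torch.log_softmax(emission, dim=-1)

# Tokens: the transcript mapped to the integer labels the model was trained with.
dictionary = bundle.get_dict()
transcript = "i had that curiosity beside me at this moment".split()  # illustrative
tokens = [dictionary[c] for word in transcript for c in word]

# Frame-level alignment: one (token, score) pair per emission frame.
targets = torch.tensor([tokens], dtype=torch.int32, device=device)
alignments, scores = F.forced_align(log_probs, targets, blank=0)
alignments, scores = alignments[0], scores[0]  # drop the batch dimension
scores = scores.exp()  # log-probabilities -> probabilities

# Token-level alignment: collapse repeats and drop blanks into TokenSpan objects.
token_spans = F.merge_tokens(alignments, scores)
for span in token_spans[:5]:
    print(span.token, span.start, span.end, span.score)

Word-level spans are then obtained by grouping the TokenSpan list per word, which is the step where the blank/silence ambiguity discussed in the corrected comments shows up.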