From d1edf21c0f8f372871db6d9b6250310f31c83ab3 Mon Sep 17 00:00:00 2001
From: Simon Zuberek <szuberek@nvidia.com>
Date: Mon, 19 May 2025 11:55:25 -0400
Subject: [PATCH] Adds descriptions of audio quality metrics

Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
---
 sdp/processors/tts/metrics.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/sdp/processors/tts/metrics.py b/sdp/processors/tts/metrics.py
index cc8c6283..039ca15e 100644
--- a/sdp/processors/tts/metrics.py
+++ b/sdp/processors/tts/metrics.py
@@ -31,7 +31,18 @@ class TorchSquimObjectiveQualityMetricsProcessor(BaseProcessor):
     """This processor calculates Squim quality metrics for audio files.
 
     It uses a pre-trained Squim model to calculate audio quality metrics like PESQ, STOI
-    and SI-SDR for each audio segment in the manifest.
+    and SI-SDR for each audio segment in the manifest:
+
+        PESQ (Perceptual Evaluation of Speech Quality)
+        A measure of overall speech quality (originally designed to detect codec distortions, but highly correlated with all kinds of distortion).
+
+        STOI (Short-Time Objective Intelligibility)
+        A measure of speech intelligibility; it essentially measures how well the speech envelope is preserved.
+        A STOI value of 1.0 means 100% of the speech being evaluated is intelligible on average.
+
+        SI-SDR (Scale-Invariant Signal-to-Distortion Ratio)
+        A measure of how strong the speech signal is vs. all the distortion present in the audio, in decibels.
+        0 dB means the energies of speech and distortion are the same. A value between 15 and 20 dB is generally considered "clean enough" speech.
 
     Args:
         device (str, Optional): Device to run the model on. Defaults to "cuda".