From d1edf21c0f8f372871db6d9b6250310f31c83ab3 Mon Sep 17 00:00:00 2001 From: Simon Zuberek Date: Mon, 19 May 2025 11:55:25 -0400 Subject: [PATCH] Adds descriptions of audio quality metrics Signed-off-by: Simon Zuberek --- sdp/processors/tts/metrics.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sdp/processors/tts/metrics.py b/sdp/processors/tts/metrics.py index cc8c6283..039ca15e 100644 --- a/sdp/processors/tts/metrics.py +++ b/sdp/processors/tts/metrics.py @@ -31,7 +31,18 @@ class TorchSquimObjectiveQualityMetricsProcessor(BaseProcessor): """This processor calculates Squim quality metrics for audio files. It uses a pre-trained Squim model to calculate audio quality metrics like PESQ, STOI - and SI-SDR for each audio segment in the manifest. + and SI-SDR for each audio segment in the manifest: + + PESQ (Perceptual Evaluation of Speech Quality) + A measure of overall quality for speech (originally designed to detect codec distortions, but highly correlated with all kinds of distortion). + + STOI (Short-Time Objective Intelligibility) + A measure of speech intelligibility; it essentially measures speech envelope integrity. + A STOI value of 1.0 means 100% of the speech being evaluated is intelligible on average. + + SI-SDR (Scale-Invariant Signal-to-Distortion Ratio) + A measure of how strong the speech signal is vs. all the distortion present in the audio, in decibels. + 0 dB means the energies of speech and distortion are the same. A value between 15 and 20 dB is generally considered "clean enough" speech. Args: device (str, Optional): Device to run the model on. Defaults to "cuda".