Skip to content

关于将百度的语音合成改为字节跳动的语音合成 #17

@36dian5hao

Description

@36dian5hao

我想问一下:麦克风这边先使用讯飞的语音转文字,把文字传给大模型;大模型返回文字后,再使用百度的文字合成语音进行播放。我想把语音合成从百度改为字节跳动,于是修改了 bool Audio2::connecttospeech(const char *speech, const char *lang) 这个函数以及 parseHttpResponseHeader 函数,以下是我的修改:
// 更新后的 connecttospeech 函数
bool Audio2::connecttospeech(const char *speech, const char *lang)
{
xSemaphoreTakeRecursive(mutex_audio, portMAX_DELAY);

setDefaults();

// 修改为正确的域名和路径
const char* host = "openspeech.bytedance.com";
const char* path = "/api/v1/tts";

// 构建 HTTP 请求头,包含必要的认证信息
String headers = "Content-Type: application/json\r\n"
                "Accept: audio/mp3\r\n"  // 明确指定接受 MP3 格式
                "Authorization: Bearer your_token_here\r\n"  // 替换为实际的token
                "Connection: keep-alive\r\n";

// 构建 JSON 请求体
StaticJsonDocument<1024> doc;

JsonObject app = doc.createNestedObject("app");
app["appid"] = "7224888888";
app["token"] = "i0_c-jhtJlywGIKfOdUAUkXma5VDfM8d";
app["cluster"] = "volcano_tts";

JsonObject user = doc.createNestedObject("user");
user["uid"] = "uid123";

JsonObject audio = doc.createNestedObject("audio");
audio["voice_type"] = "BV700_streaming";
audio["encoding"] = "mp3";
audio["compression_rate"] = 1;
audio["rate"] = 24000;
audio["speed_ratio"] = 1.0;
audio["volume_ratio"] = 1.0;
audio["pitch_ratio"] = 1.0;
audio["emotion"] = "happy";
audio["language"] = lang;

JsonObject request = doc.createNestedObject("request");
request["reqid"] = "123456";
request["text"] = speech;
request["text_type"] = "plain";
request["operation"] = "query";
request["silence_duration"] = "125";
request["with_frontend"] = "1";
request["frontend_type"] = "unitTson";
request["pure_english_opt"] = "1";

String jsonString;
serializeJson(doc, jsonString);

// 使用 WiFiClientSecure 进行 HTTPS 连接
_client = static_cast<WiFiClient *>(&clientsecure);
if (!_client->connect(host, 443))
{
    log_e("Connection failed");
    xSemaphoreGiveRecursive(mutex_audio);
    return false;
}

// 发送 HTTP POST 请求
String httpRequest = String("POST ") + path + " HTTP/1.1\r\n" +
                    "Host: " + host + "\r\n" +
                    headers +
                    "Content-Length: " + jsonString.length() + "\r\n" +
                    "\r\n" +
                    jsonString;

if (!_client->print(httpRequest))
{
    log_e("Failed to send request");
    _client->stop();
    xSemaphoreGiveRecursive(mutex_audio);
    return false;
}

m_streamType = ST_WEBFILE;
isplaying = 1;
m_f_running = true;
m_f_ssl = true;
m_f_tts = true;
setDatamode(HTTP_RESPONSE_HEADER);

xSemaphoreGiveRecursive(mutex_audio);
return true;

}
// Updated parseHttpResponseHeader function
/**
 * @brief Reads the HTTP response header line by line from `_client` and, if
 *        it announces MP3 audio, switches the player to AUDIO_DATA mode.
 *
 * Does nothing (returns false) unless the data mode is HTTP_RESPONSE_HEADER
 * and the client has bytes available. Header parsing is abandoned after
 * 2500 ms, on a non-200 status, on an unsupported content type, on an
 * x-error-code / x-error-message header, or when the header ends before a
 * supported content type was seen; in all those cases stopSong() is called.
 *
 * @return true when the header was accepted and the decoder was initialized,
 *         false otherwise.
 */
bool Audio2::parseHttpResponseHeader()
{
    if (getDatamode() != HTTP_RESPONSE_HEADER)
        return false;
    if (!_client->available())
        return false;

    char rhl[512] = {0}; // response header line
    bool ct_seen = false; // set once a supported content-type header was parsed
    uint32_t ctime = millis();
    uint32_t timeout = 2500; // ms

    while (true)
    {
        uint16_t pos = 0;
        if ((millis() - ctime) > timeout)
        {
            log_e("Response header timeout");
            goto exit;
        }

        // Collect one header line; control characters are dropped.
        while (_client->available())
        {
            uint8_t b = _client->read();
            if (b == '\n')
            {
                if (!pos)
                {
                    // Empty line: end of the response header.
                    if (ct_seen)
                        goto lastToDo;
                    else
                        goto exit;
                }
                break;
            }
            if (b == '\r')
                rhl[pos] = 0;
            if (b < 0x20)
                continue;
            rhl[pos] = b;
            pos++;
            if (pos >= 511)
            {
                rhl[510] = '\0';
                if (m_f_Log)
                    log_i("Response header line overflow");
                break;
            }
        }

        if (!pos)
        {
            // No data yet: yield briefly and retry until the timeout expires.
            vTaskDelay(3);
            continue;
        }

        // Always terminate at the current length. Without this, a line that
        // ends in a bare '\n' (no '\r'), or one cut short because the socket
        // ran dry, would still carry the tail of an earlier, longer line.
        // pos is at most 511 here, which is inside the 512-byte buffer.
        rhl[pos] = '\0';

        if (m_f_Log)
            log_i("HTTP Response Header: %s", rhl);

        // Lower-case everything before the colon so header names compare
        // case-insensitively.
        int16_t posColon = indexOf(rhl, ":", 0);
        if (posColon >= 0)
        {
            for (int i = 0; i < posColon; i++)
                rhl[i] = toLowerCase(rhl[i]);
        }

        // Check the HTTP status code.
        if (startsWith(rhl, "HTTP/"))
        {
            // In "HTTP/1.1 200 ..." the three status digits sit at 9..11.
            // A shorter line leaves the buffer empty and is treated as an
            // error (status 0) instead of parsing stale bytes.
            char statusCode[4] = {0};
            if (pos >= 12)
            {
                statusCode[0] = rhl[9];
                statusCode[1] = rhl[10];
                statusCode[2] = rhl[11];
            }
            int sc = atoi(statusCode);
            if (sc != 200)
            {
                log_e("HTTP Error: %d", sc);
                if (audio_showstreamtitle)
                    audio_showstreamtitle(rhl);
                goto exit;
            }
        }

        // Handle the content type.
        else if (startsWith(rhl, "content-type:"))
        {
            // Only MP3 audio is accepted.
            if (indexOf(rhl, "audio/mp3") > 0 || indexOf(rhl, "audio/mpeg") > 0)
            {
                ct_seen = true;
                m_codec = CODEC_MP3;
                setBitrate(128000);   // default bitrate
                setSampleRate(24000); // sample rate requested from the TTS service
                setBitsPerSample(16);
                setChannels(2);
            }
            else
            {
                log_e("Unsupported content type: %s", rhl);
                goto exit;
            }
        }

        // Handle error headers.
        else if (startsWith(rhl, "x-error-code:") || startsWith(rhl, "x-error-message:"))
        {
            log_e("TTS Error: %s", rhl);
            goto exit;
        }
    }

lastToDo:
    if (!ct_seen)
    {
        log_e("No content type received");
        goto exit;
    }

    setDatamode(AUDIO_DATA);
    if (!initializeDecoder())
    {
        log_e("Failed to initialize decoder");
        goto exit;
    }

    if (m_f_Log)
        log_i("Switch to DATA mode, metaint is %d", m_metaint);

    if (m_playlistFormat != FORMAT_M3U8 && audio_lasthost)
        audio_lasthost(m_lastHost);

    m_controlCounter = 0;
    m_f_firstCall = true;
    return true;

exit:
    stopSong();
    return false;
}
但是修改后无法正常运行:没有任何报错,也没有语音回应。

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions