我想问一下：麦克风那边使用讯飞的语音转文字，然后把文字传给大模型，大模型再把文字传回来，最后使用百度的文字合成语音进行播放。在 bool Audio2::connecttospeech(const char *speech, const char *lang) 这个函数里，以下是我的修改：
// 更新后的 connecttospeech 函数
bool Audio2::connecttospeech(const char *speech, const char *lang)
{
xSemaphoreTakeRecursive(mutex_audio, portMAX_DELAY);
setDefaults();
// 修改为正确的域名和路径
const char* host = "openspeech.bytedance.com";
const char* path = "/api/v1/tts";
// 构建 HTTP 请求头,包含必要的认证信息
String headers = "Content-Type: application/json\r\n"
"Accept: audio/mp3\r\n" // 明确指定接受 MP3 格式
"Authorization: Bearer your_token_here\r\n" // 替换为实际的token
"Connection: keep-alive\r\n";
// 构建 JSON 请求体
StaticJsonDocument<1024> doc;
JsonObject app = doc.createNestedObject("app");
app["appid"] = "7224888888";
app["token"] = "i0_c-jhtJlywGIKfOdUAUkXma5VDfM8d";
app["cluster"] = "volcano_tts";
JsonObject user = doc.createNestedObject("user");
user["uid"] = "uid123";
JsonObject audio = doc.createNestedObject("audio");
audio["voice_type"] = "BV700_streaming";
audio["encoding"] = "mp3";
audio["compression_rate"] = 1;
audio["rate"] = 24000;
audio["speed_ratio"] = 1.0;
audio["volume_ratio"] = 1.0;
audio["pitch_ratio"] = 1.0;
audio["emotion"] = "happy";
audio["language"] = lang;
JsonObject request = doc.createNestedObject("request");
request["reqid"] = "123456";
request["text"] = speech;
request["text_type"] = "plain";
request["operation"] = "query";
request["silence_duration"] = "125";
request["with_frontend"] = "1";
request["frontend_type"] = "unitTson";
request["pure_english_opt"] = "1";
String jsonString;
serializeJson(doc, jsonString);
// 使用 WiFiClientSecure 进行 HTTPS 连接
_client = static_cast<WiFiClient *>(&clientsecure);
if (!_client->connect(host, 443))
{
log_e("Connection failed");
xSemaphoreGiveRecursive(mutex_audio);
return false;
}
// 发送 HTTP POST 请求
String httpRequest = String("POST ") + path + " HTTP/1.1\r\n" +
"Host: " + host + "\r\n" +
headers +
"Content-Length: " + jsonString.length() + "\r\n" +
"\r\n" +
jsonString;
if (!_client->print(httpRequest))
{
log_e("Failed to send request");
_client->stop();
xSemaphoreGiveRecursive(mutex_audio);
return false;
}
m_streamType = ST_WEBFILE;
isplaying = 1;
m_f_running = true;
m_f_ssl = true;
m_f_tts = true;
setDatamode(HTTP_RESPONSE_HEADER);
xSemaphoreGiveRecursive(mutex_audio);
return true;
}
// Updated parseHttpResponseHeader(): reads the HTTP response header line by
// line and switches to AUDIO_DATA once an MP3 content type has been seen.
/**
 * @brief Parse the HTTP response header of the current connection.
 *
 * Does nothing unless the data mode is HTTP_RESPONSE_HEADER and the client
 * has bytes available. Header lines are accumulated until a '\n' arrives, so
 * a line split across several network reads is parsed only once it is
 * complete. The whole header must arrive within 2500 ms.
 *
 * @return true  when the header ended with an accepted content type
 *               ("audio/mp3" or "audio/mpeg") and the decoder was initialized;
 *         false when not in header mode, no data is available, or on any
 *               error (in which case stopSong() is called).
 *
 * NOTE(review): the ByteDance/Volcengine HTTP TTS endpoint is documented to
 * reply with a JSON body carrying base64-encoded audio. If the server answers
 * with "application/json", the content-type branch below rejects it and no
 * audio is played — confirm against the API documentation.
 */
bool Audio2::parseHttpResponseHeader()
{
    if (getDatamode() != HTTP_RESPONSE_HEADER)
        return false;
    if (!_client->available())
        return false;

    char rhl[512] = {0};         // response header line
    bool ct_seen = false;        // an accepted content type was found
    uint16_t pos = 0;            // write index into rhl, kept across reads
    bool overflowLogged = false; // log an over-long line only once
    const uint32_t ctime = millis();
    const uint32_t timeout = 2500; // ms

    while (true)
    {
        if ((millis() - ctime) > timeout)
        {
            log_e("Response header timeout");
            goto exit;
        }

        // Collect bytes until the end of the current line. pos is NOT reset
        // here, so a partially received line continues on the next pass.
        bool lineComplete = false;
        while (_client->available())
        {
            const uint8_t b = _client->read();
            if (b == '\n')
            {
                lineComplete = true;
                break;
            }
            if (b < 0x20)
                continue; // drop '\r' and other control characters
            if (pos < sizeof(rhl) - 1)
            {
                rhl[pos++] = b;
            }
            else if (!overflowLogged)
            {
                // Line is longer than the buffer: keep the first part and
                // discard the rest up to '\n' instead of parsing the tail as
                // a separate header.
                overflowLogged = true;
                if (m_f_Log)
                    log_i("Response header line overflow");
            }
        }
        if (!lineComplete)
        {
            vTaskDelay(3); // wait for more data
            continue;
        }

        // Always terminate the line, even when the server sent no '\r'.
        rhl[pos] = '\0';
        const bool blankLine = (pos == 0);
        pos = 0;
        overflowLogged = false;

        if (blankLine)
            goto lastToDo; // empty line marks the end of the header

        if (m_f_Log)
            log_i("HTTP Response Header: %s", rhl);

        // Status line: take the number after the first space instead of
        // reading fixed character positions.
        if (startsWith(rhl, "HTTP/"))
        {
            const int16_t posSpace = indexOf(rhl, " ", 0);
            const int sc = (posSpace > 0) ? atoi(rhl + posSpace + 1) : 0;
            if (sc != 200)
            {
                log_e("HTTP Error: %d", sc);
                if (audio_showstreamtitle)
                    audio_showstreamtitle(rhl);
                goto exit;
            }
            continue;
        }

        // Lower-case the header name (everything before the colon).
        const int16_t posColon = indexOf(rhl, ":", 0);
        if (posColon >= 0)
        {
            for (int i = 0; i < posColon; i++)
                rhl[i] = toLowerCase(rhl[i]);
        }

        if (startsWith(rhl, "content-type:"))
        {
            // Only MP3 audio is accepted.
            if (indexOf(rhl, "audio/mp3") > 0 || indexOf(rhl, "audio/mpeg") > 0)
            {
                ct_seen = true;
                m_codec = CODEC_MP3;
                setBitrate(128000);   // default bitrate
                setSampleRate(24000); // sample rate requested from the TTS service
                setBitsPerSample(16);
                setChannels(2);
            }
            else
            {
                log_e("Unsupported content type: %s", rhl);
                goto exit;
            }
        }
        else if (startsWith(rhl, "x-error-code:") || startsWith(rhl, "x-error-message:"))
        {
            // Error reported by the server through response headers.
            log_e("TTS Error: %s", rhl);
            goto exit;
        }
    }

lastToDo:
    if (!ct_seen)
    {
        log_e("No content type received");
        goto exit;
    }
    setDatamode(AUDIO_DATA);
    if (!initializeDecoder())
    {
        log_e("Failed to initialize decoder");
        goto exit;
    }
    if (m_f_Log)
        log_i("Switch to DATA mode, metaint is %d", m_metaint);
    if (m_playlistFormat != FORMAT_M3U8 && audio_lasthost)
        audio_lasthost(m_lastHost);
    m_controlCounter = 0;
    m_f_firstCall = true;
    return true;

exit:
    stopSong();
    return false;
}
可是程序无法正常运行：没有报错，但也没有语音回应。
我想问一下：麦克风那边使用讯飞的语音转文字，然后把文字传给大模型，大模型再把文字传回来，最后使用百度的文字合成语音进行播放。在 bool Audio2::connecttospeech(const char *speech, const char *lang) 这个函数里，以下是我的修改：
// Updated connecttospeech function
// NOTE(review): this looks like an abbreviated repeat of the definition, with
// the body elided. As written it takes mutex_audio (recursive, blocking with
// portMAX_DELAY), never releases it here, and has no return statement even
// though the function is declared to return bool.
bool Audio2::connecttospeech(const char *speech, const char *lang)
{
xSemaphoreTakeRecursive(mutex_audio, portMAX_DELAY);
}
// Updated parseHttpResponseHeader function
// NOTE(review): this looks like an abbreviated repeat of the definition, with
// the header-reading loop elided. ct_seen is used but not declared in this
// fragment, and control falls from lastToDo into exit, so every path past the
// two guards below calls stopSong() and returns false.
bool Audio2::parseHttpResponseHeader()
{
// Only act while the player is waiting for an HTTP response header.
if (getDatamode() != HTTP_RESPONSE_HEADER)
return false;
// Nothing to parse until the client has received data.
if (!_client->available())
return false;
lastToDo:
// Without a content type the response is treated as an error.
if (!ct_seen)
{
log_e("No content type received");
goto exit;
}
exit:
// Error path: stop playback and report failure.
stopSong();
return false;
}
可是程序无法正常运行：没有报错，但也没有语音回应。