From 40be191e0c005aa5d1e77451fb96e268ec2ec301 Mon Sep 17 00:00:00 2001 From: bodharma Date: Sun, 4 Jan 2026 15:15:44 +0000 Subject: [PATCH 1/7] Add audio transcription with per-chat language support Built over a weekend to read voice messages as text instead of listening. Features: - Alt-u to transcribe voice messages - Alt-Shift-u to re-transcribe - Alt-L to set language per chat - 22+ languages (en, ru, uk, es, fr, de, zh, ja, etc.) - Auto-truncate long transcriptions (configurable max lines) - Works with OpenAI API, whisper.cpp, or local Whisper Config in ~/.config/nchat/ui.conf: - audio_transcribe_enabled=1 # turn it on - audio_transcribe_language=auto # or en, ru, uk, etc. - audio_transcribe_max_lines=15 # truncate long ones Database: Added transcriptionLanguage to chats2 table (schema v11) Docs: TRANSCRIPTION.md for usage, TRANSCRIPTION-SETUP.md for setup --- CMakeLists.txt | 5 + doc/TRANSCRIPTION-SETUP.md | 245 +++++++++++++++++ doc/TRANSCRIPTION.md | 180 +++++++++++++ lib/common/src/protocol.h | 1 + lib/ncutil/src/messagecache.cpp | 201 +++++++++++++- lib/ncutil/src/messagecache.h | 23 ++ src/main.cpp | 2 + src/transcribe | 460 ++++++++++++++++++++++++++++++++ src/uiconfig.cpp | 8 + src/uihelpview.cpp | 4 + src/uihistoryview.cpp | 80 ++++++ src/uikeyconfig.cpp | 3 + src/uilanguagelistdialog.cpp | 106 ++++++++ src/uilanguagelistdialog.h | 41 +++ src/uimodel.cpp | 331 +++++++++++++++++++++++ src/uimodel.h | 7 + 16 files changed, 1691 insertions(+), 6 deletions(-) create mode 100644 doc/TRANSCRIPTION-SETUP.md create mode 100644 doc/TRANSCRIPTION.md create mode 100755 src/transcribe create mode 100644 src/uilanguagelistdialog.cpp create mode 100644 src/uilanguagelistdialog.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 69835763..373e9832 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,6 +269,8 @@ add_executable(nchat src/uilistborderview.h src/uilistdialog.cpp src/uilistdialog.h + src/uilanguagelistdialog.cpp + src/uilanguagelistdialog.h 
src/uilistview.cpp src/uilistview.h src/uimessagedialog.cpp @@ -419,6 +421,9 @@ install(FILES src/nchat.1 DESTINATION "${CMAKE_INSTALL_MANDIR}/man1") configure_file(src/compose ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/compose COPYONLY) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/compose DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/nchat) +configure_file(src/transcribe ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/transcribe COPYONLY) +install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/transcribe DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/nchat) + # Uninstall if(HAS_SHARED_LIBS) add_custom_target(uninstall diff --git a/doc/TRANSCRIPTION-SETUP.md b/doc/TRANSCRIPTION-SETUP.md new file mode 100644 index 00000000..d5654590 --- /dev/null +++ b/doc/TRANSCRIPTION-SETUP.md @@ -0,0 +1,245 @@ +# Transcription Setup + +How to set up different transcription backends for nchat. + +## What You Need + +- nchat installed +- Python 3.7+ (`python3 --version`) +- pip (`pip3 --version`) + +--- + +## Option 1: OpenAI API (Easiest) + +Fast (2-3 sec), accurate, easy. Costs $0.006/min. Audio goes to OpenAI. + +**Setup:** + +1. Get API key: https://platform.openai.com/api-keys + +2. Install packages: + ```bash + pip3 install openai requests + ``` + +3. Set the key: + ```bash + export OPENAI_API_KEY='sk-...' + echo 'export OPENAI_API_KEY="sk-..."' >> ~/.bashrc + ``` + + macOS users also need: + ```bash + echo 'export OPENAI_API_KEY="sk-..."' >> ~/.zshenv + ``` + +4. Configure nchat (`~/.config/nchat/ui.conf`): + ```conf + audio_transcribe_enabled=1 + audio_transcribe_cache=1 + ``` + +5. Test: + ```bash + /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg + ``` + +Monitor costs at https://platform.openai.com/usage (set a budget limit!) + +--- + +## Option 2: whisper.cpp (Local Server) + +Free, private, offline. Bit more setup. Fast with GPU. + +**Setup:** + +1. 
Install deps: + ```bash + # macOS + brew install ffmpeg cmake + + # Linux + sudo apt install build-essential ffmpeg cmake git # Debian/Ubuntu + sudo dnf install gcc-c++ ffmpeg cmake git # Fedora + ``` + +2. Build it: + ```bash + mkdir -p ~/whisper.cpp && cd ~/whisper.cpp + git clone https://github.com/ggerganov/whisper.cpp.git . + mkdir build && cd build + cmake .. -DWHISPER_BUILD_SERVER=ON + cmake --build . --config Release + ``` + +3. Download a model: + ```bash + cd ~/whisper.cpp + bash ./models/download-ggml-model.sh base # or tiny/small/medium/large + ``` + +4. Start the server: + ```bash + cd ~/whisper.cpp + ./build/bin/server --model models/ggml-base.bin --host 127.0.0.1 --port 8080 --convert + ``` + + Run in background: + ```bash + nohup ./build/bin/server --model models/ggml-base.bin --host 127.0.0.1 --port 8080 --convert > server.log 2>&1 & + ``` + +5. Install Python package: + ```bash + pip3 install requests + ``` + +6. Configure nchat: + ```conf + audio_transcribe_enabled=1 + audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-cpp + audio_transcribe_cache=1 + ``` + +7. Test: + ```bash + curl http://localhost:8080/health + /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg -s whisper-cpp + ``` + +**Want auto-start on boot?** Set up a systemd service (Linux) or launchd (macOS) - Google it. + +--- + +## Option 3: Whisper Python (Local) + +Free, private, simple. Slower than whisper.cpp, uses more RAM. + +**Setup:** + +1. Install ffmpeg: + ```bash + brew install ffmpeg # macOS + sudo apt install ffmpeg python3-dev # Debian/Ubuntu + sudo dnf install ffmpeg python3-devel # Fedora + ``` + +2. Install Whisper: + ```bash + pip3 install faster-whisper # Faster (recommended) + # or + pip3 install openai-whisper # Original (slower) + ``` + +3. Configure nchat: + ```conf + audio_transcribe_enabled=1 + audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-local -m base + audio_transcribe_cache=1 + ``` + +4. 
Test: + ```bash + /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg -s whisper-local + ``` + +Models download automatically on first use. Pick a size: `tiny` (fast, meh), `base` (balanced), `small`/`medium`/`large` (slower, better). + +Got an NVIDIA GPU? Use `pip3 install faster-whisper[gpu]` after installing CUDA. + +--- + +## Option 4: Groq API (Cheaper Alternative) + +Like OpenAI but cheaper ($0.001/min vs $0.006/min). Still fast. + +**Setup:** + +1. Get API key from https://console.groq.com/ + +2. Install: + ```bash + pip3 install groq requests + ``` + +3. Set key: + ```bash + export GROQ_API_KEY='gsk-...' + echo 'export GROQ_API_KEY="gsk-..."' >> ~/.bashrc + echo 'export GROQ_API_KEY="gsk-..."' >> ~/.zshenv # macOS + ``` + +4. You'll need to hack the transcribe script to add Groq support (it's not built-in yet). Or just use OpenAI. + +--- + +## Testing + +After setup: + +```bash +# Test the script +/usr/local/libexec/nchat/transcribe -f /path/to/test.ogg + +# Check nchat config +grep transcribe ~/.config/nchat/ui.conf +``` + +In nchat: Select a voice message, press `Alt-u`, see if it works. 
+ +--- + +## Troubleshooting + +**"Command not found"** +```bash +ls -l /usr/local/libexec/nchat/transcribe # Check if it exists +``` + +**"No module named 'openai'"** +```bash +pip3 install openai # or whatever package is missing +``` + +**"API key not set"** +```bash +echo $OPENAI_API_KEY # Check it's set +echo 'export OPENAI_API_KEY="sk-..."' >> ~/.bashrc +echo 'export OPENAI_API_KEY="sk-..."' >> ~/.zshenv # macOS +``` + +**whisper.cpp server not responding** +```bash +curl http://localhost:8080/health # Check if running +cd ~/whisper.cpp && ./build/bin/server --model models/ggml-base.bin --host 127.0.0.1 --port 8080 --convert +``` + +**Timeouts** +```conf +audio_transcribe_timeout=60 # Increase in ui.conf +``` + +**Need ffmpeg** +```bash +brew install ffmpeg # macOS +sudo apt install ffmpeg # Linux +``` + +**High CPU/RAM** + +Use a smaller model (`-m tiny`), or switch to whisper.cpp, or just use the API. + +--- + +## Quick Comparison + +| Option | Speed | Privacy | Cost | Setup | +|--------|-------|---------|------|-------| +| OpenAI | Fast | Low | $0.006/min | Easy | +| whisper.cpp | Good | High | Free | Medium | +| Whisper Python | OK | High | Free | Easy | +| Groq | Fast | Low | $0.001/min | Easy | + +Pick what works for you. diff --git a/doc/TRANSCRIPTION.md b/doc/TRANSCRIPTION.md new file mode 100644 index 00000000..f1eeb8e9 --- /dev/null +++ b/doc/TRANSCRIPTION.md @@ -0,0 +1,180 @@ +# Audio Transcription + +nchat can transcribe voice messages to text using Whisper. Press Alt-u on any voice message and boom - you can read it instead of listening. + +Works with Telegram and WhatsApp voice notes. You can use OpenAI's API (fast, costs a few cents) or run it locally for free (slower but private). 
+ +## Quick Start + +**Option 1: OpenAI API (easiest)** + +```bash +# Get an API key from https://platform.openai.com/api-keys +export OPENAI_API_KEY='sk-your-key-here' + +# Enable in nchat config +echo "audio_transcribe_enabled=1" >> ~/.config/nchat/ui.conf +``` + +Costs about $0.006 per minute of audio (so like a penny for a 2-minute voice note). + +**Option 2: Local Whisper (free, private)** + +See [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md) - takes a bit more setup but runs offline. + +**Using it:** + +1. Select a voice message in nchat +2. Press `Alt-u` +3. Wait for the text to appear + +That's it. + +## How to Use + +Press `Alt-u` on any voice message. The text appears below it: + +``` +┌─────────────────────────────────────────────┐ +│ Alice 10:30 AM │ +│ 🎤 Voice message (0:15) │ +│ │ +│ 📝 Hey, can you pick up groceries on your │ +│ way home? We need milk and eggs. │ +│ [Transcribed]│ +└─────────────────────────────────────────────┘ +``` + +Use `Alt-Shift-u` to re-transcribe if you want to ignore the cache (like if the first try messed up). + +Supports: `.ogg`, `.opus`, `.mp3`, `.m4a`, `.wav`, `.flac` + +## Configuration + +Edit `~/.config/nchat/ui.conf`: + +```conf +audio_transcribe_enabled=1 # Turn it on/off +audio_transcribe_cache=1 # Cache results (saves API costs) +audio_transcribe_inline=1 # Show text below message +audio_transcribe_auto=0 # Don't auto-transcribe (costs $$$) +audio_transcribe_timeout=30 # Wait max 30 seconds +``` + +The command that does the work: +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' +``` + +You can add flags to it (see below). 
+ +## Tweaking It + +**Pick a specific service:** +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s openai # OpenAI API +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-cpp # Local server +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-local # Local Python +``` + +**Set the language (better accuracy):** +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -l en # English +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -l es # Spanish +``` + +Supports 90+ languages (en, es, fr, de, it, pt, ru, zh, ja, ko, etc.) + +**Local model sizes:** +```conf +# Pick one based on speed vs accuracy: +... -m tiny # 75 MB - fast but meh +... -m base # 150 MB - good balance +... -m small # 500 MB - better +... -m medium # 1.5 GB - pretty good +... -m large # 3 GB - best but slow +``` + +## Keyboard Shortcuts + +- `Alt-u` - Transcribe message +- `Alt-Shift-u` - Re-transcribe (ignore cache) +- `Ctrl-t` - Toggle visibility + +Change them in `~/.config/nchat/key.conf` if you want (see nchat docs for the escape codes). + +## Troubleshooting + +**"No API key set"** +```bash +export OPENAI_API_KEY='sk-...' # Add to ~/.bashrc or ~/.zshrc +``` + +**"Timeout"** + +Bump the timeout or use a faster service: +```conf +audio_transcribe_timeout=60 +``` + +**"Audio format not supported"** + +Install ffmpeg: +```bash +brew install ffmpeg # macOS +sudo apt install ffmpeg # Linux +``` + +**Wrong language / bad accuracy** + +Specify the language: +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -l en +``` + +Or use a bigger model (local) or switch to OpenAI API. + +**API costs too high** + +Turn off auto-transcribe (`audio_transcribe_auto=0`) and use local Whisper instead (see [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md)). + +## Privacy + +**OpenAI API:** Audio gets sent to their servers. They may keep it for 30 days. 
Don't use for super sensitive stuff. + +**Local Whisper:** Everything stays on your machine. 100% private. + +## Cache Management + +Transcriptions are cached in `~/.config/nchat/db.sqlite`. + +Clear cache if needed: +```bash +sqlite3 ~/.config/nchat/db.sqlite "DELETE FROM transcriptions;" +``` + +## Tips + +- OpenAI API is fastest (2-3 sec/message) +- Keep caching enabled to save money +- Specify language for better accuracy +- Use local for privacy, API for speed +- Don't enable auto-transcribe unless you hate money + +## FAQ + +**Q: Supports video?** +Nope, just audio. + +**Q: Offline?** +Yes with local Whisper. No with API. + +**Q: How accurate?** +Pretty good (~95%) with clear audio. Gets worse with noise/accents. + +**Q: Languages?** +99+ including English, Spanish, French, German, Chinese, Japanese, etc. + +## See Also + +[TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md) - How to set up local Whisper diff --git a/lib/common/src/protocol.h b/lib/common/src/protocol.h index da8ff24b..caff2784 100644 --- a/lib/common/src/protocol.h +++ b/lib/common/src/protocol.h @@ -141,6 +141,7 @@ struct ChatInfo bool isPinned = false; bool isArchived = false; int64_t lastMessageTime = -1; + std::string transcriptionLanguage; // language for audio transcription (e.g., "en", "ru", "auto", or empty for global default) }; enum FileStatus diff --git a/lib/ncutil/src/messagecache.cpp b/lib/ncutil/src/messagecache.cpp index 1c785f99..a37735a0 100644 --- a/lib/ncutil/src/messagecache.cpp +++ b/lib/ncutil/src/messagecache.cpp @@ -440,7 +440,40 @@ void MessageCache::AddProfile(const std::string& p_ProfileId, bool p_CheckSequen "SET schema = ?;" << schemaVersion; } - static const int64_t s_SchemaVersion = 9; + if (schemaVersion == 9) + { + LOG_INFO("update db schema 9 to 10"); + + *m_Dbs[p_ProfileId] << "CREATE TABLE IF NOT EXISTS transcriptions (" + "chatId TEXT NOT NULL," + "msgId TEXT NOT NULL," + "transcription TEXT NOT NULL," + "language TEXT DEFAULT ''," + "service TEXT DEFAULT 
''," + "timestamp INTEGER NOT NULL," + "PRIMARY KEY (chatId, msgId)" + ");"; + + *m_Dbs[p_ProfileId] << "CREATE INDEX IF NOT EXISTS idx_transcriptions_timestamp " + "ON transcriptions(timestamp);"; + + schemaVersion = 10; + *m_Dbs[p_ProfileId] << "UPDATE version " + "SET schema = ?;" << schemaVersion; + } + + if (schemaVersion == 10) + { + LOG_INFO("update db schema 10 to 11"); + + *m_Dbs[p_ProfileId] << "ALTER TABLE chats2 ADD COLUMN transcriptionLanguage TEXT DEFAULT '';"; + + schemaVersion = 11; + *m_Dbs[p_ProfileId] << "UPDATE version " + "SET schema = ?;" << schemaVersion; + } + + static const int64_t s_SchemaVersion = 11; if (schemaVersion > s_SchemaVersion) { LOG_WARNING("cache db schema %d from newer nchat version detected, if cache issues are encountered " @@ -871,6 +904,19 @@ std::vector MessageCache::FetchGroupMembersSync(const std::string& return contactInfos; } +void MessageCache::UpdateTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_TranscriptionLanguage) +{ + if (!m_CacheEnabled) return; + + std::shared_ptr updateTranscriptionLanguageRequest = + std::make_shared(); + updateTranscriptionLanguageRequest->profileId = p_ProfileId; + updateTranscriptionLanguageRequest->chatId = p_ChatId; + updateTranscriptionLanguageRequest->transcriptionLanguage = p_TranscriptionLanguage; + EnqueueRequest(updateTranscriptionLanguageRequest); +} + void MessageCache::Export(const std::string& p_ExportDir) { if (!m_CacheEnabled) @@ -1227,10 +1273,10 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) for (const auto& chatInfo : addChatsRequest->chatInfos) { *m_Dbs[profileId] << "INSERT INTO " + s_TableChats + " " - "(id, isMuted, isPinned, lastMessageTime, isArchived) VALUES " - "(?, ?, ?, ?, ?);" << + "(id, isMuted, isPinned, lastMessageTime, isArchived, transcriptionLanguage) VALUES " + "(?, ?, ?, ?, ?, ?);" << chatInfo.id << chatInfo.isMuted << chatInfo.isPinned << - chatInfo.lastMessageTime << 
chatInfo.isArchived; + chatInfo.lastMessageTime << chatInfo.isArchived << chatInfo.transcriptionLanguage; } *m_Dbs[profileId] << "COMMIT;"; } @@ -1296,14 +1342,16 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) std::map chatIdPinned; std::map chatIdLastMessageTime; std::map chatIdArchived; - *m_Dbs[profileId] << "SELECT id, isMuted, isPinned, lastMessageTime, isArchived FROM " + s_TableChats + ";" >> + std::map chatIdTranscriptionLanguage; + *m_Dbs[profileId] << "SELECT id, isMuted, isPinned, lastMessageTime, isArchived, transcriptionLanguage FROM " + s_TableChats + ";" >> [&](const std::string& chatId, int32_t isMuted, int32_t isPinned, int64_t lastMessageTime, - int32_t isArchived) + int32_t isArchived, const std::string& transcriptionLanguage) { chatIdMuted[chatId] = isMuted; chatIdPinned[chatId] = isPinned; chatIdLastMessageTime[chatId] = lastMessageTime; chatIdArchived[chatId] = isArchived; + chatIdTranscriptionLanguage[chatId] = transcriptionLanguage; }; *m_Dbs[profileId] << "SELECT chatId, MAX(timeSent), isOutgoing, isRead FROM " + s_TableMessages + " " @@ -1319,6 +1367,7 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) chatInfo.isPinned = chatIdPinned[chatId]; chatInfo.isArchived = chatIdArchived[chatId]; chatInfo.lastMessageTime = chatInfo.isPinned ? 
chatIdLastMessageTime[chatId] : timeSent; + chatInfo.transcriptionLanguage = chatIdTranscriptionLanguage[chatId]; chatInfos.push_back(chatInfo); } }; @@ -1940,6 +1989,33 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) } break; + case UpdateTranscriptionLanguageRequestType: + { + std::unique_lock lock(m_DbMutex); + std::shared_ptr updateTranscriptionLanguageRequest = + std::static_pointer_cast(p_Request); + const std::string& profileId = updateTranscriptionLanguageRequest->profileId; + if (!m_Dbs[profileId]) return; + + const std::string& chatId = updateTranscriptionLanguageRequest->chatId; + const std::string& transcriptionLanguage = updateTranscriptionLanguageRequest->transcriptionLanguage; + + try + { + *m_Dbs[profileId] << "INSERT INTO " + s_TableChats + " " + "(id, transcriptionLanguage) VALUES " + "(?, ?) ON CONFLICT(id) DO UPDATE SET transcriptionLanguage=?;" << + chatId << transcriptionLanguage << transcriptionLanguage; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + } + + LOG_DEBUG("cache update transcription language %s %s", chatId.c_str(), transcriptionLanguage.c_str()); + } + break; + default: { LOG_WARNING("cache unknown request type %d", p_Request->GetRequestType()); @@ -2046,3 +2122,116 @@ void MessageCache::CallMessageHandler(std::shared_ptr p_ServiceM LOG_WARNING("message handler not set"); } } + +bool MessageCache::StoreTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId, const std::string& p_Transcription, + const std::string& p_Language, const std::string& p_Service) +{ + if (!m_CacheEnabled) return false; + + std::unique_lock lock(m_DbMutex); + + if (m_Dbs.find(p_ProfileId) == m_Dbs.end()) return false; + + try + { + int64_t timestamp = TimeUtil::GetCurrentTimeMSec() / 1000; + + *m_Dbs[p_ProfileId] << "INSERT OR REPLACE INTO transcriptions " + "(chatId, msgId, transcription, language, service, timestamp) " + "VALUES (?, ?, ?, ?, ?, ?);" + << 
p_ChatId << p_MsgId << p_Transcription << p_Language << p_Service << timestamp; + + return true; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + return false; + } +} + +std::string MessageCache::GetTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId) +{ + if (!m_CacheEnabled) return ""; + + std::unique_lock lock(m_DbMutex); + + if (m_Dbs.find(p_ProfileId) == m_Dbs.end()) return ""; + + try + { + std::string transcription; + + // *INDENT-OFF* + *m_Dbs[p_ProfileId] << "SELECT transcription FROM transcriptions " + "WHERE chatId = ? AND msgId = ?;" + << p_ChatId << p_MsgId >> + [&](const std::string& p_Transcription) + { + transcription = p_Transcription; + }; + // *INDENT-ON* + + return transcription; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + return ""; + } +} + +bool MessageCache::HasTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId) +{ + if (!m_CacheEnabled) return false; + + std::unique_lock lock(m_DbMutex); + + if (m_Dbs.find(p_ProfileId) == m_Dbs.end()) return false; + + try + { + bool hasTranscription = false; + + // *INDENT-OFF* + *m_Dbs[p_ProfileId] << "SELECT 1 FROM transcriptions " + "WHERE chatId = ? AND msgId = ? LIMIT 1;" + << p_ChatId << p_MsgId >> + [&](int) + { + hasTranscription = true; + }; + // *INDENT-ON* + + return hasTranscription; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + return false; + } +} + +void MessageCache::DeleteTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId) +{ + if (!m_CacheEnabled) return; + + std::unique_lock lock(m_DbMutex); + + if (m_Dbs.find(p_ProfileId) == m_Dbs.end()) return; + + try + { + *m_Dbs[p_ProfileId] << "DELETE FROM transcriptions " + "WHERE chatId = ? 
AND msgId = ?;" + << p_ChatId << p_MsgId; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + } +} diff --git a/lib/ncutil/src/messagecache.h b/lib/ncutil/src/messagecache.h index 31956078..2d5e9032 100644 --- a/lib/ncutil/src/messagecache.h +++ b/lib/ncutil/src/messagecache.h @@ -48,6 +48,7 @@ class MessageCache UpdateArchivedRequestType, AddGroupMembersRequestType, FetchGroupMembersRequestType, + UpdateTranscriptionLanguageRequestType, }; class Request @@ -223,6 +224,15 @@ class MessageCache std::string chatId; }; + class UpdateTranscriptionLanguageRequest : public Request + { + public: + virtual RequestType GetRequestType() const { return UpdateTranscriptionLanguageRequestType; } + std::string profileId; + std::string chatId; + std::string transcriptionLanguage; + }; + public: static void Init(); static void Cleanup(); @@ -266,8 +276,21 @@ class MessageCache const std::vector& p_MemberIds); static void FetchGroupMembers(const std::string& p_ProfileId, const std::string& p_ChatId); static std::vector FetchGroupMembersSync(const std::string& p_ProfileId, const std::string& p_ChatId); + static void UpdateTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_TranscriptionLanguage); static void Export(const std::string& p_ExportDir); + // Transcription methods + static bool StoreTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId, const std::string& p_Transcription, + const std::string& p_Language = "", const std::string& p_Service = ""); + static std::string GetTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId); + static bool HasTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId); + static void DeleteTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId); + private: static 
void Process(); static void EnqueueRequest(std::shared_ptr p_Request); diff --git a/src/main.cpp b/src/main.cpp index 1a259aca..01636d0b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -684,6 +684,8 @@ void ShowHelp() " Alt-q jump to quoted/replied message\n" " Alt-r forward selected message\n" " Alt-s add/remove reaction on selected message\n" + " Alt-u transcribe selected audio message\n" + " Alt-U re-transcribe selected audio message\n" " Alt-w external message viewer\n" "\n" "Interactive Commands for Text Input:\n" diff --git a/src/transcribe b/src/transcribe new file mode 100755 index 00000000..a9e95346 --- /dev/null +++ b/src/transcribe @@ -0,0 +1,460 @@ +#!/usr/bin/env python3 +""" +nchat audio transcription script +Supports multiple Whisper backends: OpenAI API, whisper.cpp, Whisper Python +""" + +import argparse +import os +import sys +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional + +# Exit codes +EXIT_SUCCESS = 0 +EXIT_INVALID_ARGS = 1 +EXIT_FILE_ERROR = 2 +EXIT_SERVICE_ERROR = 3 +EXIT_TIMEOUT = 4 +EXIT_UNSUPPORTED_FORMAT = 5 + +# Supported audio formats +SUPPORTED_FORMATS = {'.ogg', '.oga', '.opus', '.mp3', '.m4a', '.wav', '.flac', '.webm'} + + +class TranscriptionService(ABC): + """Base class for transcription services""" + + @abstractmethod + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + RuntimeError: If transcription fails + """ + pass + + +class OpenAIWhisperService(TranscriptionService): + """OpenAI Whisper API service""" + + def __init__(self, api_key: Optional[str] = None): + """ + Initialize OpenAI service. 
+ + Args: + api_key: OpenAI API key (defaults to OPENAI_API_KEY env var) + + Raises: + RuntimeError: If API key is not provided + """ + self.api_key = api_key or os.getenv('OPENAI_API_KEY') + if not self.api_key: + raise RuntimeError("OpenAI API key not set. Please set OPENAI_API_KEY environment variable.") + + try: + from openai import OpenAI + self.client = OpenAI(api_key=self.api_key) + except ImportError: + raise RuntimeError("OpenAI package not installed. Install with: pip install openai") + + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file using OpenAI Whisper API. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + FileNotFoundError: If audio file doesn't exist + RuntimeError: If API request fails + """ + if not os.path.isfile(audio_file): + raise FileNotFoundError(f"Audio file not found: {audio_file}") + + try: + with open(audio_file, "rb") as f: + # OpenAI API: if language is "auto", don't specify language parameter + # This allows Whisper to auto-detect + kwargs = { + "model": "whisper-1", + "file": f, + } + + # Only add language if not auto-detect + if language != "auto": + kwargs["language"] = language + + response = self.client.audio.transcriptions.create(**kwargs) + + return response.text.strip() + + except Exception as e: + # Handle API errors + error_msg = str(e) + if "authentication" in error_msg.lower() or "api_key" in error_msg.lower(): + raise RuntimeError(f"OpenAI API authentication failed: {error_msg}") + elif "rate_limit" in error_msg.lower(): + raise RuntimeError(f"OpenAI API rate limit exceeded: {error_msg}") + elif "invalid" in error_msg.lower() and "audio" in error_msg.lower(): + raise RuntimeError(f"Invalid audio file: {error_msg}") + else: + raise RuntimeError(f"OpenAI API error: {error_msg}") + + +class WhisperCppService(TranscriptionService): + """whisper.cpp HTTP server service""" + + 
def __init__(self, server_url: Optional[str] = None): + """ + Initialize whisper.cpp service. + + Args: + server_url: URL of whisper.cpp server (defaults to WHISPER_CPP_SERVER env var or http://localhost:8080) + """ + self.server_url = server_url or os.getenv('WHISPER_CPP_SERVER', 'http://localhost:8080') + + try: + import requests + self.requests = requests + except ImportError: + raise RuntimeError("requests package not installed. Install with: pip install requests") + + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file using whisper.cpp HTTP server. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + FileNotFoundError: If audio file doesn't exist + RuntimeError: If server request fails + """ + if not os.path.isfile(audio_file): + raise FileNotFoundError(f"Audio file not found: {audio_file}") + + try: + with open(audio_file, 'rb') as f: + files = {'file': f} + data = {} + + # Add language if not auto-detect + if language != "auto": + data['language'] = language + + response = self.requests.post( + f"{self.server_url}/inference", + files=files, + data=data, + timeout=30 + ) + + if response.status_code != 200: + raise RuntimeError(f"Server returned error: {response.status_code} - {response.text}") + + result = response.json() + transcription = result.get('text', '') + + return transcription.strip() + + except self.requests.exceptions.ConnectionError: + raise RuntimeError(f"Could not connect to whisper.cpp server at {self.server_url}") + except self.requests.exceptions.Timeout: + raise RuntimeError(f"Request to whisper.cpp server timed out") + except Exception as e: + raise RuntimeError(f"whisper.cpp error: {str(e)}") + + +class WhisperLocalService(TranscriptionService): + """Local Whisper Python package service""" + + def __init__(self, model_size: str = "base"): + """ + Initialize local Whisper service. 
+ + Args: + model_size: Model size (tiny, base, small, medium, large) + """ + self.model_size = model_size + self.model = None + + try: + import whisper + self.whisper = whisper + except ImportError: + # Try faster-whisper as alternative + try: + from faster_whisper import WhisperModel + self.faster_whisper = WhisperModel + self.use_faster = True + except ImportError: + raise RuntimeError( + "Whisper package not installed. Install with: pip install openai-whisper " + "or pip install faster-whisper" + ) + else: + self.use_faster = False + + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file using local Whisper. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + FileNotFoundError: If audio file doesn't exist + RuntimeError: If transcription fails + """ + if not os.path.isfile(audio_file): + raise FileNotFoundError(f"Audio file not found: {audio_file}") + + try: + # Load model if not already loaded + if self.model is None: + if self.use_faster: + # Use faster-whisper (more efficient) + self.model = self.faster_whisper(self.model_size, device="cpu", compute_type="int8") + else: + # Use standard whisper + self.model = self.whisper.load_model(self.model_size) + + # Perform transcription + if self.use_faster: + # faster-whisper API + segments, info = self.model.transcribe( + audio_file, + language=None if language == "auto" else language + ) + transcription = " ".join([segment.text for segment in segments]) + else: + # standard whisper API + result = self.model.transcribe( + audio_file, + language=None if language == "auto" else language + ) + transcription = result['text'] + + return transcription.strip() + + except Exception as e: + raise RuntimeError(f"Local Whisper error: {str(e)}") + + +def detect_service() -> str: + """ + Auto-detect available transcription service. 
+ + Returns: + Service name (openai, whisper-cpp, or whisper-local) + + Priority: + 1. Environment variable WHISPER_SERVICE + 2. OpenAI API (if OPENAI_API_KEY is set) + 3. whisper.cpp server (if running on localhost:8080) + 4. Local Whisper (if installed) + """ + # Check environment variable override + env_service = os.getenv("WHISPER_SERVICE") + if env_service: + return env_service + + # Check for OpenAI API key + if os.getenv("OPENAI_API_KEY"): + return "openai" + + # Check for whisper.cpp server + try: + import requests + server_url = os.getenv('WHISPER_CPP_SERVER', 'http://localhost:8080') + response = requests.get(f"{server_url}/", timeout=1) + if response.status_code in [200, 404]: # Server is running + return "whisper-cpp" + except Exception: + pass + + # Check for local Whisper + try: + import whisper + return "whisper-local" + except ImportError: + try: + from faster_whisper import WhisperModel + return "whisper-local" + except ImportError: + pass + + # No service available + raise RuntimeError( + "No transcription service available. Please either:\n" + " 1. Set OPENAI_API_KEY environment variable for OpenAI API\n" + " 2. Start whisper.cpp server (see doc/TRANSCRIPTION-SETUP.md)\n" + " 3. Install local Whisper: pip install openai-whisper" + ) + + +def validate_audio_file(file_path: str) -> bool: + """ + Validate audio file exists and has supported format. 
+ + Args: + file_path: Path to audio file + + Returns: + True if valid, False otherwise + + Raises: + FileNotFoundError: If file doesn't exist + ValueError: If format is unsupported + """ + if not os.path.isfile(file_path): + raise FileNotFoundError(f"Audio file not found: {file_path}") + + file_ext = Path(file_path).suffix.lower() + if file_ext not in SUPPORTED_FORMATS: + raise ValueError( + f"Unsupported audio format: {file_ext}\n" + f"Supported formats: {', '.join(sorted(SUPPORTED_FORMATS))}" + ) + + return True + + +def create_service(service_name: str, model: str = "base") -> TranscriptionService: + """ + Factory method to create transcription service. + + Args: + service_name: Service to create (openai, whisper-cpp, whisper-local) + model: Model size for local services (tiny, base, small, medium, large) + + Returns: + Transcription service instance + + Raises: + ValueError: If service name is invalid + RuntimeError: If service initialization fails + """ + if service_name == "openai": + return OpenAIWhisperService() + elif service_name == "whisper-cpp": + return WhisperCppService() + elif service_name == "whisper-local": + return WhisperLocalService(model_size=model) + else: + raise ValueError(f"Unknown service: {service_name}") + + +def main(): + """Main entry point for transcription script""" + + parser = argparse.ArgumentParser( + description="Transcribe audio files using Whisper", + epilog="See doc/TRANSCRIPTION.md for more information" + ) + + parser.add_argument( + "-f", "--file", + required=True, + help="Audio file path (required)" + ) + + parser.add_argument( + "-s", "--service", + default="auto", + choices=["auto", "openai", "whisper-cpp", "whisper-local"], + help="Transcription service to use (default: auto)" + ) + + parser.add_argument( + "-l", "--language", + default="auto", + help="Audio language code (en, es, fr, etc.) 
or 'auto' for detection (default: auto)" + ) + + parser.add_argument( + "-m", "--model", + default="base", + choices=["tiny", "base", "small", "medium", "large"], + help="Model size for local services (default: base)" + ) + + parser.add_argument( + "--timeout", + type=int, + default=30, + help="Request timeout in seconds (default: 30)" + ) + + parser.add_argument( + "--version", + action="version", + version="%(prog)s 1.0" + ) + + args = parser.parse_args() + + try: + # Validate audio file + validate_audio_file(args.file) + + # Auto-detect service if needed + service_name = args.service + if service_name == "auto": + service_name = detect_service() + + # Create service instance + service = create_service(service_name, args.model) + + # Perform transcription + transcription = service.transcribe(args.file, args.language) + + # Output result to stdout + print(transcription) + sys.exit(EXIT_SUCCESS) + + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(EXIT_FILE_ERROR) + + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(EXIT_UNSUPPORTED_FORMAT) + + except RuntimeError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(EXIT_SERVICE_ERROR) + + except KeyboardInterrupt: + print("\nTranscription interrupted", file=sys.stderr) + sys.exit(EXIT_TIMEOUT) + + except Exception as e: + print(f"Unexpected error: {e}", file=sys.stderr) + sys.exit(EXIT_SERVICE_ERROR) + + +if __name__ == "__main__": + main() diff --git a/src/uiconfig.cpp b/src/uiconfig.cpp index 77909bfd..0014d49c 100644 --- a/src/uiconfig.cpp +++ b/src/uiconfig.cpp @@ -20,6 +20,14 @@ void UiConfig::Init() { { "attachment_indicator", "\xF0\x9F\x93\x8E" }, { "attachment_open_command", "" }, + { "audio_transcribe_auto", "0" }, + { "audio_transcribe_cache", "1" }, + { "audio_transcribe_command", "/usr/local/libexec/nchat/transcribe -f '%1'" }, + { "audio_transcribe_enabled", "0" }, + { "audio_transcribe_inline", "1" }, + { "audio_transcribe_language", 
"auto" }, + { "audio_transcribe_max_lines", "15" }, + { "audio_transcribe_timeout", "30" }, { "auto_compose_command", "" }, { "auto_compose_enabled", "0" }, { "auto_compose_history_count", "25" }, diff --git a/src/uihelpview.cpp b/src/uihelpview.cpp index f92d3604..c08f1a5b 100644 --- a/src/uihelpview.cpp +++ b/src/uihelpview.cpp @@ -125,6 +125,10 @@ void UiHelpView::Draw() AppendHelpItem("save", "SaveFile", helpItems); AppendHelpItem("open_link", "OpenLink", helpItems); + AppendHelpItem("transcribe_audio", "Transcribe", helpItems); + AppendHelpItem("retranscribe_audio", "ReTrans", helpItems); + AppendHelpItem("set_transcription_lang", "SetLang", helpItems); + AppendHelpItem("jump_quoted", "JumpQuoted", helpItems); AppendHelpItem("react", "AddReact", helpItems); AppendHelpItem("open_msg", "ExtView", helpItems); diff --git a/src/uihistoryview.cpp b/src/uihistoryview.cpp index 56174621..9833fd80 100644 --- a/src/uihistoryview.cpp +++ b/src/uihistoryview.cpp @@ -17,6 +17,7 @@ #include "uicolorconfig.h" #include "uiconfig.h" #include "uimodel.h" +#include "messagecache.h" UiHistoryView::UiHistoryView(const UiViewParams& p_Params) : UiViewBase(p_Params) @@ -71,6 +72,7 @@ void UiHistoryView::Draw() static std::wstring attachmentIndicator = StrUtil::ToWString(UiConfig::GetStr("attachment_indicator") + " "); static std::wstring quoteIndicator = L"> "; + static std::wstring transcriptionIndicator = L"[Transcribed] "; std::pair& currentChat = m_Model->GetCurrentChatLocked(); const bool emojiEnabled = m_Model->GetEmojiEnabledLocked(); @@ -171,6 +173,9 @@ void UiHistoryView::Draw() wlines.insert(wlines.begin(), quote); } + // Transcription lines counter (needs to be accessible in rendering loop) + int transcriptionLines = 0; + // File attachment if (!msg.fileInfo.empty()) { @@ -222,6 +227,66 @@ void UiHistoryView::Draw() std::wstring fileStr = attachmentIndicator + StrUtil::ToWString(fileName + fileStatus); wlines.insert(wlines.begin(), fileStr); + + // Transcription (if audio 
file and transcription available) + static const bool transcribeInline = UiConfig::GetBool("audio_transcribe_inline"); + if (transcribeInline) + { + std::string ext = FileUtil::GetFileExt(fileInfo.filePath); + + // Remove leading dot if present + if (!ext.empty() && ext[0] == '.') + { + ext = ext.substr(1); + } + + static const std::set audioExtensions = { + "ogg", "opus", "mp3", "m4a", "aac", "wav", "flac", "oga" + }; + + if (audioExtensions.find(ext) != audioExtensions.end()) + { + std::string transcription = MessageCache::GetTranscription(currentChat.first, currentChat.second, msg.id); + if (!transcription.empty()) + { + StrUtil::SanitizeMessageStr(transcription); + if (!emojiEnabled) + { + transcription = StrUtil::Textize(transcription); + } + + std::vector transcriptionWLines = + StrUtil::WordWrap(StrUtil::ToWString(transcription), m_PaddedW - 2, false, false, false, 2); + + // Check if transcription exceeds max lines limit + static const int maxTranscriptionLines = UiConfig::GetNum("audio_transcribe_max_lines"); + const bool needsTruncation = (maxTranscriptionLines > 0) && + (static_cast(transcriptionWLines.size()) > maxTranscriptionLines); + + if (needsTruncation) + { + int hiddenLines = transcriptionWLines.size() - maxTranscriptionLines + 1; // +1 for truncation indicator line + transcriptionWLines.resize(maxTranscriptionLines - 1); // Reserve last line for indicator + std::wstring truncationMsg = L"... 
(" + std::to_wstring(hiddenLines) + L" more lines)"; + transcriptionWLines.push_back(truncationMsg); + } + + // Add transcription indicator on first line + if (!transcriptionWLines.empty()) + { + transcriptionWLines[0] = transcriptionIndicator + transcriptionWLines[0]; + } + + // Insert transcription lines after file attachment + for (auto tline = transcriptionWLines.rbegin(); tline != transcriptionWLines.rend(); ++tline) + { + wlines.insert(wlines.begin() + 1, *tline); + } + + transcriptionLines = transcriptionWLines.size(); + } + } + } } // Reactions @@ -319,6 +384,13 @@ void UiHistoryView::Draw() bool isAttachment = (wline->rfind(attachmentIndicator, 0) == 0); bool isQuote = (wline->rfind(quoteIndicator, 0) == 0); bool isReaction = (reactionLines == 1) && (std::distance(wline, wlines.rbegin()) == 0); + + // Transcription lines are at positions 1 to (1 + transcriptionLines - 1) in forward iteration + // In reverse, calculate the position from the end + size_t posFromEnd = std::distance(wline, wlines.rbegin()); + size_t vectorSize = wlines.size(); + size_t posFromBegin = vectorSize - 1 - posFromEnd; + bool isTranscription = (transcriptionLines > 0) && (posFromBegin >= 1) && (posFromBegin < 1 + static_cast(transcriptionLines)); if (isAttachment) { @@ -332,6 +404,10 @@ void UiHistoryView::Draw() { wattron(m_PaddedWin, attributeTextNormal | colorPairTextReaction); } + else if (isTranscription) + { + wattron(m_PaddedWin, attributeText | colorPairTextQuoted | A_DIM); + } else { wattron(m_PaddedWin, attributeText | colorPairText); @@ -352,6 +428,10 @@ void UiHistoryView::Draw() { wattroff(m_PaddedWin, attributeTextNormal | colorPairTextReaction); } + else if (isTranscription) + { + wattroff(m_PaddedWin, attributeText | colorPairTextQuoted | A_DIM); + } else { wattroff(m_PaddedWin, attributeText | colorPairText); diff --git a/src/uikeyconfig.cpp b/src/uikeyconfig.cpp index a01b3d75..a4dc2e9e 100644 --- a/src/uikeyconfig.cpp +++ b/src/uikeyconfig.cpp @@ -232,6 +232,9 @@ 
void UiKeyConfig::Init(bool p_MapKeys) { "toggle_help", "KEY_CTRLG" }, { "toggle_list", "KEY_CTRLL" }, { "toggle_top", "KEY_CTRLP" }, + { "transcribe_audio", "\\33\\165" }, // alt/opt-u + { "retranscribe_audio", "\\33\\125" }, // alt/opt-U + { "set_transcription_lang", "\\33\\154" }, // alt/opt-l { "next_chat", "KEY_TAB" }, { "prev_chat", "KEY_BTAB" }, { "unread_chat", "KEY_CTRLF" }, diff --git a/src/uilanguagelistdialog.cpp b/src/uilanguagelistdialog.cpp new file mode 100644 index 00000000..23cbad31 --- /dev/null +++ b/src/uilanguagelistdialog.cpp @@ -0,0 +1,106 @@ +// uilanguagelistdialog.cpp +// +// Copyright (c) 2019-2025 Kristofer Berggren +// All rights reserved. +// +// nchat is distributed under the MIT license, see LICENSE for details. + +#include "uilanguagelistdialog.h" + +#include "strutil.h" + +UiLanguageListDialog::UiLanguageListDialog(const UiDialogParams& p_Params, const std::string& p_CurrentLanguage) + : UiListDialog(p_Params, false /*p_ShadeHidden*/) + , m_CurrentLanguage(p_CurrentLanguage) +{ + // Define common languages for transcription + m_Languages = { + {"", "Default (from global settings)"}, + {"auto", "Auto-detect"}, + {"en", "English"}, + {"es", "Spanish"}, + {"fr", "French"}, + {"de", "German"}, + {"it", "Italian"}, + {"pt", "Portuguese"}, + {"ru", "Russian"}, + {"uk", "Ukrainian"}, + {"zh", "Chinese"}, + {"ja", "Japanese"}, + {"ko", "Korean"}, + {"ar", "Arabic"}, + {"hi", "Hindi"}, + {"nl", "Dutch"}, + {"pl", "Polish"}, + {"tr", "Turkish"}, + {"sv", "Swedish"}, + {"no", "Norwegian"}, + {"da", "Danish"}, + {"fi", "Finnish"}, + }; + + UpdateList(); +} + +UiLanguageListDialog::~UiLanguageListDialog() +{ +} + +std::string UiLanguageListDialog::GetSelectedLanguage() +{ + return m_SelectedLanguage; +} + +void UiLanguageListDialog::OnSelect() +{ + if ((m_Index >= 0) && (m_Index < (int)m_FilteredIndices.size())) + { + int languageIndex = m_FilteredIndices[m_Index]; + m_SelectedLanguage = m_Languages[languageIndex].code; + m_Result = true; + 
m_Running = false; + } +} + +void UiLanguageListDialog::OnBack() +{ + m_Result = false; + m_Running = false; +} + +bool UiLanguageListDialog::OnTimer() +{ + return false; +} + +void UiLanguageListDialog::UpdateList() +{ + m_Items.clear(); + m_FilteredIndices.clear(); + + std::wstring filterStrLower = StrUtil::ToLower(m_FilterStr); + + for (size_t i = 0; i < m_Languages.size(); ++i) + { + const LanguageOption& lang = m_Languages[i]; + std::wstring displayName = StrUtil::ToWString(lang.name); + + // Add indicator if this is the current language + if (lang.code == m_CurrentLanguage) + { + displayName = L"* " + displayName; + } + + // Filter by name or code + std::wstring displayNameLower = StrUtil::ToLower(displayName); + std::wstring codeLower = StrUtil::ToLower(StrUtil::ToWString(lang.code)); + + if (filterStrLower.empty() || + (displayNameLower.find(filterStrLower) != std::wstring::npos) || + (codeLower.find(filterStrLower) != std::wstring::npos)) + { + m_Items.push_back(displayName); + m_FilteredIndices.push_back(i); + } + } +} diff --git a/src/uilanguagelistdialog.h b/src/uilanguagelistdialog.h new file mode 100644 index 00000000..e58dd057 --- /dev/null +++ b/src/uilanguagelistdialog.h @@ -0,0 +1,41 @@ +// uilanguagelistdialog.h +// +// Copyright (c) 2019-2025 Kristofer Berggren +// All rights reserved. +// +// nchat is distributed under the MIT license, see LICENSE for details. 
+ +#pragma once + +#include "uilistdialog.h" + +#include +#include + +class UiLanguageListDialog : public UiListDialog +{ +public: + UiLanguageListDialog(const UiDialogParams& p_Params, const std::string& p_CurrentLanguage = ""); + virtual ~UiLanguageListDialog(); + + std::string GetSelectedLanguage(); + +protected: + virtual void OnSelect(); + virtual void OnBack(); + virtual bool OnTimer(); + + void UpdateList(); + +private: + struct LanguageOption + { + std::string code; + std::string name; + }; + + std::vector m_Languages; + std::vector m_FilteredIndices; // Indices into m_Languages after filtering + std::string m_SelectedLanguage; + std::string m_CurrentLanguage; +}; diff --git a/src/uimodel.cpp b/src/uimodel.cpp index 93308003..90202a86 100644 --- a/src/uimodel.cpp +++ b/src/uimodel.cpp @@ -19,6 +19,7 @@ #include "clipboard.h" #include "fileutil.h" #include "log.h" +#include "messagecache.h" #include "numutil.h" #include "protocolutil.h" #include "sethelp.h" @@ -35,6 +36,7 @@ #include "uiemojilistdialog.h" #include "uifilelistdialog.h" #include "uikeyconfig.h" +#include "uilanguagelistdialog.h" #include "uikeyinput.h" #include "uimessagedialog.h" #include "uitextinputdialog.h" @@ -4277,6 +4279,183 @@ bool UiModel::Impl::AutoCompose() return rv; } +bool UiModel::Impl::TranscribeAudio(bool p_ForceRetranscribe) +{ + AnyUserKeyInput(); + + const std::string& profileId = m_CurrentChat.first; + const std::string& chatId = m_CurrentChat.second; + const std::vector& messageVec = m_MessageVec[profileId][chatId]; + const std::unordered_map& messages = m_Messages[profileId][chatId]; + + const int messageOffset = GetSelectMessageActive() ? m_MessageOffset[profileId][chatId] : 0; + const int editOffset = GetEditMessageActive() ? 
1 : 0; + const int offset = messageOffset + editOffset; + + auto it = std::next(messageVec.begin(), offset); + if (it == messageVec.end()) + { + LOG_WARNING("end of message history"); + return false; + } + + auto msgIt = messages.find(*it); + if (msgIt == messages.end()) + { + LOG_WARNING("message not found"); + return false; + } + + const ChatMessage& msg = msgIt->second; + const std::string& msgId = msg.id; + + // Check if message has a file attachment + if (msg.fileInfo.empty()) + { + LOG_DEBUG("message has no file attachment"); + LOG_WARNING("Selected message has no file attachment"); + return false; + } + + FileInfo fileInfo = ProtocolUtil::FileInfoFromHex(msg.fileInfo); + + // Check if file is downloaded + if (!IsAttachmentDownloaded(fileInfo)) + { + LOG_DEBUG("audio file not downloaded"); + LOG_WARNING("File not downloaded yet - select it to download first"); + return false; + } + + // Check if file is an audio file (based on extension) + const std::string filePath = fileInfo.filePath; + std::string ext = FileUtil::GetFileExt(filePath); + + // Remove leading dot if present + if (!ext.empty() && ext[0] == '.') + { + ext = ext.substr(1); + } + + static const std::set audioExtensions = { + "ogg", "opus", "mp3", "m4a", "aac", "wav", "flac", "oga" + }; + + if (audioExtensions.find(ext) == audioExtensions.end()) + { + LOG_DEBUG("file is not an audio file: %s", ext.c_str()); + LOG_WARNING("File is not an audio file (extension: %s). 
Supported: ogg, opus, mp3, m4a, aac, wav, flac, oga", ext.c_str()); + return false; + } + + // Check cache unless forcing retranscription + static const bool useCache = UiConfig::GetBool("audio_transcribe_cache"); + if (!p_ForceRetranscribe && useCache) + { + std::string cachedTranscription = MessageCache::GetTranscription(profileId, chatId, msgId); + if (!cachedTranscription.empty()) + { + LOG_DEBUG("using cached transcription"); + UpdateHistory(); + return true; + } + } + + // Get transcribe command + static const std::string cmdTemplate = []() + { + std::string transcribeCommand = UiConfig::GetStr("audio_transcribe_command"); + if (transcribeCommand.empty()) + { + transcribeCommand = FileUtil::DirName(FileUtil::GetSelfPath()) + + "/../" CMAKE_INSTALL_LIBEXECDIR "/nchat/transcribe -f '%1'"; + } + + return transcribeCommand; + }(); + + // Build and execute command + std::string transcription; + std::string cmd = cmdTemplate; + StrUtil::ReplaceString(cmd, "%1", filePath); + + // Add language parameter if specified + // First check per-chat language setting, then fall back to global setting + std::string language; + if (m_ChatInfos.count(profileId) && m_ChatInfos[profileId].count(chatId)) + { + language = m_ChatInfos[profileId][chatId].transcriptionLanguage; + } + + // Fall back to global setting if per-chat language is not set + if (language.empty()) + { + language = UiConfig::GetStr("audio_transcribe_language"); + } + + // Default to auto if still empty + if (language.empty()) + { + language = "auto"; + } + + if (language != "auto") + { + cmd += " -l " + language; + } + + static const int timeout = UiConfig::GetNum("audio_transcribe_timeout"); + LOG_TRACE("transcribe cmd \"%s\" start", cmd.c_str()); + + const bool rv = RunCommand(cmd, &transcription); + + if (rv && !transcription.empty()) + { + // Store in cache + if (useCache) + { + MessageCache::StoreTranscription(profileId, chatId, msgId, transcription, language); + } + + // Update UI + UpdateHistory(); + 
return true; + } + else + { + LOG_WARNING("transcription failed"); + return false; + } +} + +std::string UiModel::Impl::GetCurrentTranscriptionLanguage(const std::string& p_ProfileId, + const std::string& p_ChatId) +{ + if (m_ChatInfos.count(p_ProfileId) && m_ChatInfos[p_ProfileId].count(p_ChatId)) + { + return m_ChatInfos[p_ProfileId][p_ChatId].transcriptionLanguage; + } + return ""; +} + +void UiModel::Impl::UpdateCurrentTranscriptionLanguage(const std::string& p_ProfileId, + const std::string& p_ChatId, + const std::string& p_Language) +{ + if (m_ChatInfos.count(p_ProfileId) && m_ChatInfos[p_ProfileId].count(p_ChatId)) + { + m_ChatInfos[p_ProfileId][p_ChatId].transcriptionLanguage = p_Language; + } + else + { + // Create chat info if it doesn't exist + ChatInfo chatInfo; + chatInfo.id = p_ChatId; + chatInfo.transcriptionLanguage = p_Language; + m_ChatInfos[p_ProfileId][p_ChatId] = chatInfo; + } +} + // --------------------------------------------------------------------- // UiModel // --------------------------------------------------------------------- @@ -4392,6 +4571,9 @@ void UiModel::KeyHandler(wint_t p_Key) static wint_t keyAutoCompose = UiKeyConfig::GetKey("auto_compose"); static wint_t keySelectMention = UiKeyConfig::GetKey("select_mention"); + static wint_t keyTranscribeAudio = UiKeyConfig::GetKey("transcribe_audio"); + static wint_t keyRetranscribeAudio = UiKeyConfig::GetKey("retranscribe_audio"); + static wint_t keySetTranscriptionLang = UiKeyConfig::GetKey("set_transcription_lang"); if (p_Key == keyTerminalResize) { @@ -4615,6 +4797,21 @@ void UiModel::KeyHandler(wint_t p_Key) { OnKeyAutoCompose(); } + else if (p_Key == keyTranscribeAudio) + { + LOG_DEBUG("transcribe_audio key pressed"); + OnKeyTranscribeAudio(); + } + else if (p_Key == keyRetranscribeAudio) + { + LOG_DEBUG("retranscribe_audio key pressed"); + OnKeyRetranscribeAudio(); + } + else if (p_Key == keySetTranscriptionLang) + { + LOG_DEBUG("set_transcription_lang key pressed"); + 
OnKeySetTranscriptionLang(); + } else { std::unique_lock lock(m_ModelMutex); @@ -5534,6 +5731,140 @@ void UiModel::OnKeyPaste() } } +void UiModel::OnKeyTranscribeAudio() +{ + LOG_DEBUG("OnKeyTranscribeAudio called"); + + // Check if transcription is enabled (check every time, not static) + bool transcribeEnabled = UiConfig::GetBool("audio_transcribe_enabled"); + LOG_DEBUG("audio_transcribe_enabled = %d", transcribeEnabled); + + if (!transcribeEnabled) + { + LOG_DEBUG("transcription not enabled, showing dialog"); + MessageDialog("Warning", "Audio transcription not enabled.", 0.7, 5); + return; + } + + LOG_DEBUG("transcription enabled, checking prerequisites"); + + // Pre-req + { + std::unique_lock lock(m_ModelMutex); + if (!GetImpl().GetSelectMessageActive()) + { + MessageDialog("Info", "Please select a message first (press Up arrow).", 0.7, 5); + return; + } + if (GetImpl().GetEditMessageActive()) + { + MessageDialog("Info", "Cannot transcribe while editing a message.", 0.7, 5); + return; + } + } + + bool rv = false; + { + std::unique_lock lock(m_ModelMutex); + rv = GetImpl().TranscribeAudio(false /* forceRetranscribe */); + } + + if (!rv) + { + MessageDialog("Warning", "Transcription failed.", 0.7, 5); + } +} + +void UiModel::OnKeyRetranscribeAudio() +{ + // Check if transcription is enabled (check every time, not static) + bool transcribeEnabled = UiConfig::GetBool("audio_transcribe_enabled"); + + if (!transcribeEnabled) + { + MessageDialog("Warning", "Audio transcription not enabled.", 0.7, 5); + return; + } + + // Pre-req + { + std::unique_lock lock(m_ModelMutex); + if (!GetImpl().GetSelectMessageActive()) + { + MessageDialog("Info", "Please select a message first (press Up arrow).", 0.7, 5); + return; + } + if (GetImpl().GetEditMessageActive()) + { + MessageDialog("Info", "Cannot transcribe while editing a message.", 0.7, 5); + return; + } + } + + bool rv = false; + { + std::unique_lock lock(m_ModelMutex); + rv = GetImpl().TranscribeAudio(true /* 
forceRetranscribe */); + } + + if (!rv) + { + MessageDialog("Warning", "Re-transcription failed.", 0.7, 5); + } +} + +void UiModel::OnKeySetTranscriptionLang() +{ + // Get current chat info + std::string profileId; + std::string chatId; + std::string currentLanguage; + + { + std::unique_lock lock(m_ModelMutex); + + // Check if we have a valid current chat + if (GetImpl().GetCurrentChat().first.empty() || GetImpl().GetCurrentChat().second.empty()) + { + MessageDialog("Info", "Please select a chat first.", 0.7, 5); + return; + } + + profileId = GetImpl().GetCurrentChat().first; + chatId = GetImpl().GetCurrentChat().second; + + // Get current transcription language for this chat + currentLanguage = GetImpl().GetCurrentTranscriptionLanguage(profileId, chatId); + } + + // Open modal dialog without model mutex held + UiDialogParams params(this, "Set Transcription Language", 0.75, 0.65); + UiLanguageListDialog dialog(params, currentLanguage); + if (dialog.Run()) + { + std::string selectedLanguage = dialog.GetSelectedLanguage(); + + { + std::unique_lock lock(m_ModelMutex); + + // Update in-memory chat info + GetImpl().UpdateCurrentTranscriptionLanguage(profileId, chatId, selectedLanguage); + + // Persist to database + MessageCache::UpdateTranscriptionLanguage(profileId, chatId, selectedLanguage); + + GetImpl().ReinitView(); + } + + MessageDialog("Info", "Transcription language updated successfully.", 0.7, 5); + } + else + { + std::unique_lock lock(m_ModelMutex); + GetImpl().ReinitView(); + } +} + bool UiModel::IsAttachmentDownloaded(const FileInfo& p_FileInfo) { return UiModel::Impl::IsAttachmentDownloaded(p_FileInfo); diff --git a/src/uimodel.h b/src/uimodel.h index 76378666..4fc5eedc 100644 --- a/src/uimodel.h +++ b/src/uimodel.h @@ -190,6 +190,10 @@ class UiModel void HandleProtocolUiControlStart(); void HandleProtocolUiControlEnd(); bool AutoCompose(); + bool TranscribeAudio(bool p_ForceRetranscribe); + std::string GetCurrentTranscriptionLanguage(const std::string& 
p_ProfileId, const std::string& p_ChatId); + void UpdateCurrentTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_Language); static bool IsAttachmentDownloaded(const FileInfo& p_FileInfo); static bool IsAttachmentDownloadable(const FileInfo& p_FileInfo); @@ -415,6 +419,9 @@ class UiModel void OnKeyCut(); void OnKeyCopy(); void OnKeyPaste(); + void OnKeyTranscribeAudio(); + void OnKeyRetranscribeAudio(); + void OnKeySetTranscriptionLang(); private: Impl m_Impl; From fc97a6c2c6a0bc39a3c8f449f7c162ac3e6c9b60 Mon Sep 17 00:00:00 2001 From: bodharma Date: Mon, 27 Apr 2026 15:14:13 +0100 Subject: [PATCH 2/7] refactor: move transcription storage into messages table (schema v12) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add schema 11→12 migration that adds a transcription TEXT column to the messages table and migrates existing data from the transcriptions table. Rewrite StoreTranscription, GetTranscription, HasTranscription, and DeleteTranscription to operate on messages instead of the separate transcriptions table. Remove p_Language and p_Service params from StoreTranscription as they are no longer needed. 
--- lib/ncutil/src/messagecache.cpp | 57 ++++++++++++++++++++------------- lib/ncutil/src/messagecache.h | 3 +- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/lib/ncutil/src/messagecache.cpp b/lib/ncutil/src/messagecache.cpp index a37735a0..84264b25 100644 --- a/lib/ncutil/src/messagecache.cpp +++ b/lib/ncutil/src/messagecache.cpp @@ -473,7 +473,27 @@ void MessageCache::AddProfile(const std::string& p_ProfileId, bool p_CheckSequen "SET schema = ?;" << schemaVersion; } - static const int64_t s_SchemaVersion = 11; + if (schemaVersion == 11) + { + LOG_INFO("update db schema 11 to 12"); + + *m_Dbs[p_ProfileId] << "ALTER TABLE messages ADD COLUMN transcription TEXT DEFAULT '';"; + + // Migrate existing transcriptions into messages table + *m_Dbs[p_ProfileId] << + "UPDATE messages SET transcription = (" + " SELECT t.transcription FROM transcriptions t" + " WHERE t.chatId = messages.chatId AND t.msgId = messages.id" + ") WHERE EXISTS (" + " SELECT 1 FROM transcriptions t" + " WHERE t.chatId = messages.chatId AND t.msgId = messages.id" + ");"; + + schemaVersion = 12; + *m_Dbs[p_ProfileId] << "UPDATE version SET schema = ?;" << schemaVersion; + } + + static const int64_t s_SchemaVersion = 12; if (schemaVersion > s_SchemaVersion) { LOG_WARNING("cache db schema %d from newer nchat version detected, if cache issues are encountered " @@ -2124,8 +2144,7 @@ void MessageCache::CallMessageHandler(std::shared_ptr p_ServiceM } bool MessageCache::StoreTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, - const std::string& p_MsgId, const std::string& p_Transcription, - const std::string& p_Language, const std::string& p_Service) + const std::string& p_MsgId, const std::string& p_Transcription) { if (!m_CacheEnabled) return false; @@ -2135,13 +2154,9 @@ bool MessageCache::StoreTranscription(const std::string& p_ProfileId, const std: try { - int64_t timestamp = TimeUtil::GetCurrentTimeMSec() / 1000; - - *m_Dbs[p_ProfileId] << "INSERT OR REPLACE 
INTO transcriptions " - "(chatId, msgId, transcription, language, service, timestamp) " - "VALUES (?, ?, ?, ?, ?, ?);" - << p_ChatId << p_MsgId << p_Transcription << p_Language << p_Service << timestamp; - + *m_Dbs[p_ProfileId] << + "UPDATE messages SET transcription = ? WHERE chatId = ? AND id = ?;" + << p_Transcription << p_ChatId << p_MsgId; return true; } catch (const sqlite::sqlite_exception& ex) @@ -2152,7 +2167,7 @@ bool MessageCache::StoreTranscription(const std::string& p_ProfileId, const std: } std::string MessageCache::GetTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, - const std::string& p_MsgId) + const std::string& p_MsgId) { if (!m_CacheEnabled) return ""; @@ -2163,17 +2178,15 @@ std::string MessageCache::GetTranscription(const std::string& p_ProfileId, const try { std::string transcription; - // *INDENT-OFF* - *m_Dbs[p_ProfileId] << "SELECT transcription FROM transcriptions " - "WHERE chatId = ? AND msgId = ?;" + *m_Dbs[p_ProfileId] << + "SELECT transcription FROM messages WHERE chatId = ? AND id = ?;" << p_ChatId << p_MsgId >> [&](const std::string& p_Transcription) { transcription = p_Transcription; }; // *INDENT-ON* - return transcription; } catch (const sqlite::sqlite_exception& ex) @@ -2184,7 +2197,7 @@ std::string MessageCache::GetTranscription(const std::string& p_ProfileId, const } bool MessageCache::HasTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, - const std::string& p_MsgId) + const std::string& p_MsgId) { if (!m_CacheEnabled) return false; @@ -2195,17 +2208,15 @@ bool MessageCache::HasTranscription(const std::string& p_ProfileId, const std::s try { bool hasTranscription = false; - // *INDENT-OFF* - *m_Dbs[p_ProfileId] << "SELECT 1 FROM transcriptions " - "WHERE chatId = ? AND msgId = ? LIMIT 1;" + *m_Dbs[p_ProfileId] << + "SELECT 1 FROM messages WHERE chatId = ? AND id = ? 
AND transcription != '' LIMIT 1;" << p_ChatId << p_MsgId >> [&](int) { hasTranscription = true; }; // *INDENT-ON* - return hasTranscription; } catch (const sqlite::sqlite_exception& ex) @@ -2216,7 +2227,7 @@ bool MessageCache::HasTranscription(const std::string& p_ProfileId, const std::s } void MessageCache::DeleteTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, - const std::string& p_MsgId) + const std::string& p_MsgId) { if (!m_CacheEnabled) return; @@ -2226,8 +2237,8 @@ void MessageCache::DeleteTranscription(const std::string& p_ProfileId, const std try { - *m_Dbs[p_ProfileId] << "DELETE FROM transcriptions " - "WHERE chatId = ? AND msgId = ?;" + *m_Dbs[p_ProfileId] << + "UPDATE messages SET transcription = '' WHERE chatId = ? AND id = ?;" << p_ChatId << p_MsgId; } catch (const sqlite::sqlite_exception& ex) diff --git a/lib/ncutil/src/messagecache.h b/lib/ncutil/src/messagecache.h index 2d5e9032..4773bcd7 100644 --- a/lib/ncutil/src/messagecache.h +++ b/lib/ncutil/src/messagecache.h @@ -282,8 +282,7 @@ class MessageCache // Transcription methods static bool StoreTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, - const std::string& p_MsgId, const std::string& p_Transcription, - const std::string& p_Language = "", const std::string& p_Service = ""); + const std::string& p_MsgId, const std::string& p_Transcription); static std::string GetTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, const std::string& p_MsgId); static bool HasTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, From ec6c591f2a85edabcc99428f1ee0f7deb23f4633 Mon Sep 17 00:00:00 2001 From: bodharma Date: Mon, 27 Apr 2026 15:23:24 +0100 Subject: [PATCH 3/7] refactor: collapse retranscribe into alt-u, remove inline/cache config options - Remove retranscribe_audio key binding (alt-shift-U) and its dispatch handler; alt-U now always (re-)transcribes unconditionally - Simplify TranscribeAudio(): drop 
p_ForceRetranscribe param, remove the cache-check block and the useCache/timeout static variables - Fix StoreTranscription call to 4-arg signature (drop language arg) - Remove audio_transcribe_cache and audio_transcribe_inline config defaults - Remove OnKeyRetranscribeAudio() declaration and definition --- src/uiconfig.cpp | 2 -- src/uihelpview.cpp | 1 - src/uikeyconfig.cpp | 1 - src/uimodel.cpp | 68 ++------------------------------------------- src/uimodel.h | 3 +- 5 files changed, 4 insertions(+), 71 deletions(-) diff --git a/src/uiconfig.cpp b/src/uiconfig.cpp index 0014d49c..784661c0 100644 --- a/src/uiconfig.cpp +++ b/src/uiconfig.cpp @@ -21,10 +21,8 @@ void UiConfig::Init() { "attachment_indicator", "\xF0\x9F\x93\x8E" }, { "attachment_open_command", "" }, { "audio_transcribe_auto", "0" }, - { "audio_transcribe_cache", "1" }, { "audio_transcribe_command", "/usr/local/libexec/nchat/transcribe -f '%1'" }, { "audio_transcribe_enabled", "0" }, - { "audio_transcribe_inline", "1" }, { "audio_transcribe_language", "auto" }, { "audio_transcribe_max_lines", "15" }, { "audio_transcribe_timeout", "30" }, diff --git a/src/uihelpview.cpp b/src/uihelpview.cpp index c08f1a5b..8a6cd157 100644 --- a/src/uihelpview.cpp +++ b/src/uihelpview.cpp @@ -126,7 +126,6 @@ void UiHelpView::Draw() AppendHelpItem("open_link", "OpenLink", helpItems); AppendHelpItem("transcribe_audio", "Transcribe", helpItems); - AppendHelpItem("retranscribe_audio", "ReTrans", helpItems); AppendHelpItem("set_transcription_lang", "SetLang", helpItems); AppendHelpItem("jump_quoted", "JumpQuoted", helpItems); diff --git a/src/uikeyconfig.cpp b/src/uikeyconfig.cpp index a4dc2e9e..70e8978d 100644 --- a/src/uikeyconfig.cpp +++ b/src/uikeyconfig.cpp @@ -233,7 +233,6 @@ void UiKeyConfig::Init(bool p_MapKeys) { "toggle_list", "KEY_CTRLL" }, { "toggle_top", "KEY_CTRLP" }, { "transcribe_audio", "\\33\\165" }, // alt/opt-u - { "retranscribe_audio", "\\33\\125" }, // alt/opt-U { "set_transcription_lang", "\\33\\154" }, 
// alt/opt-l { "next_chat", "KEY_TAB" }, { "prev_chat", "KEY_BTAB" }, diff --git a/src/uimodel.cpp b/src/uimodel.cpp index 90202a86..99370265 100644 --- a/src/uimodel.cpp +++ b/src/uimodel.cpp @@ -4279,7 +4279,7 @@ bool UiModel::Impl::AutoCompose() return rv; } -bool UiModel::Impl::TranscribeAudio(bool p_ForceRetranscribe) +bool UiModel::Impl::TranscribeAudio() { AnyUserKeyInput(); @@ -4348,19 +4348,6 @@ bool UiModel::Impl::TranscribeAudio(bool p_ForceRetranscribe) return false; } - // Check cache unless forcing retranscription - static const bool useCache = UiConfig::GetBool("audio_transcribe_cache"); - if (!p_ForceRetranscribe && useCache) - { - std::string cachedTranscription = MessageCache::GetTranscription(profileId, chatId, msgId); - if (!cachedTranscription.empty()) - { - LOG_DEBUG("using cached transcription"); - UpdateHistory(); - return true; - } - } - // Get transcribe command static const std::string cmdTemplate = []() { @@ -4404,18 +4391,13 @@ bool UiModel::Impl::TranscribeAudio(bool p_ForceRetranscribe) cmd += " -l " + language; } - static const int timeout = UiConfig::GetNum("audio_transcribe_timeout"); LOG_TRACE("transcribe cmd \"%s\" start", cmd.c_str()); const bool rv = RunCommand(cmd, &transcription); if (rv && !transcription.empty()) { - // Store in cache - if (useCache) - { - MessageCache::StoreTranscription(profileId, chatId, msgId, transcription, language); - } + MessageCache::StoreTranscription(profileId, chatId, msgId, transcription); // Update UI UpdateHistory(); @@ -4572,7 +4554,6 @@ void UiModel::KeyHandler(wint_t p_Key) static wint_t keyAutoCompose = UiKeyConfig::GetKey("auto_compose"); static wint_t keySelectMention = UiKeyConfig::GetKey("select_mention"); static wint_t keyTranscribeAudio = UiKeyConfig::GetKey("transcribe_audio"); - static wint_t keyRetranscribeAudio = UiKeyConfig::GetKey("retranscribe_audio"); static wint_t keySetTranscriptionLang = UiKeyConfig::GetKey("set_transcription_lang"); if (p_Key == keyTerminalResize) @@ 
-4802,11 +4783,6 @@ void UiModel::KeyHandler(wint_t p_Key) LOG_DEBUG("transcribe_audio key pressed"); OnKeyTranscribeAudio(); } - else if (p_Key == keyRetranscribeAudio) - { - LOG_DEBUG("retranscribe_audio key pressed"); - OnKeyRetranscribeAudio(); - } else if (p_Key == keySetTranscriptionLang) { LOG_DEBUG("set_transcription_lang key pressed"); @@ -5766,7 +5742,7 @@ void UiModel::OnKeyTranscribeAudio() bool rv = false; { std::unique_lock lock(m_ModelMutex); - rv = GetImpl().TranscribeAudio(false /* forceRetranscribe */); + rv = GetImpl().TranscribeAudio(); } if (!rv) @@ -5775,44 +5751,6 @@ void UiModel::OnKeyTranscribeAudio() } } -void UiModel::OnKeyRetranscribeAudio() -{ - // Check if transcription is enabled (check every time, not static) - bool transcribeEnabled = UiConfig::GetBool("audio_transcribe_enabled"); - - if (!transcribeEnabled) - { - MessageDialog("Warning", "Audio transcription not enabled.", 0.7, 5); - return; - } - - // Pre-req - { - std::unique_lock lock(m_ModelMutex); - if (!GetImpl().GetSelectMessageActive()) - { - MessageDialog("Info", "Please select a message first (press Up arrow).", 0.7, 5); - return; - } - if (GetImpl().GetEditMessageActive()) - { - MessageDialog("Info", "Cannot transcribe while editing a message.", 0.7, 5); - return; - } - } - - bool rv = false; - { - std::unique_lock lock(m_ModelMutex); - rv = GetImpl().TranscribeAudio(true /* forceRetranscribe */); - } - - if (!rv) - { - MessageDialog("Warning", "Re-transcription failed.", 0.7, 5); - } -} - void UiModel::OnKeySetTranscriptionLang() { // Get current chat info diff --git a/src/uimodel.h b/src/uimodel.h index 4fc5eedc..7508db6d 100644 --- a/src/uimodel.h +++ b/src/uimodel.h @@ -190,7 +190,7 @@ class UiModel void HandleProtocolUiControlStart(); void HandleProtocolUiControlEnd(); bool AutoCompose(); - bool TranscribeAudio(bool p_ForceRetranscribe); + bool TranscribeAudio(); std::string GetCurrentTranscriptionLanguage(const std::string& p_ProfileId, const std::string& 
p_ChatId); void UpdateCurrentTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId, const std::string& p_Language); @@ -420,7 +420,6 @@ class UiModel void OnKeyCopy(); void OnKeyPaste(); void OnKeyTranscribeAudio(); - void OnKeyRetranscribeAudio(); void OnKeySetTranscriptionLang(); private: From 441a223c7764c46cde72ffa7032d7dcd60f40bae Mon Sep 17 00:00:00 2001 From: bodharma Date: Mon, 27 Apr 2026 15:27:07 +0100 Subject: [PATCH 4/7] refactor: always show transcription inline when enabled --- src/uihistoryview.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uihistoryview.cpp b/src/uihistoryview.cpp index 9833fd80..64eb1b25 100644 --- a/src/uihistoryview.cpp +++ b/src/uihistoryview.cpp @@ -229,8 +229,8 @@ void UiHistoryView::Draw() wlines.insert(wlines.begin(), fileStr); // Transcription (if audio file and transcription available) - static const bool transcribeInline = UiConfig::GetBool("audio_transcribe_inline"); - if (transcribeInline) + static const bool transcribeEnabled = UiConfig::GetBool("audio_transcribe_enabled"); + if (transcribeEnabled) { std::string ext = FileUtil::GetFileExt(fileInfo.filePath); From 3bbc2c5a0fd82dd60f5ff81307507b39d53cb02d Mon Sep 17 00:00:00 2001 From: bodharma Date: Mon, 27 Apr 2026 15:28:54 +0100 Subject: [PATCH 5/7] docs: add real UI example to TRANSCRIPTION.md --- doc/TRANSCRIPTION.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/doc/TRANSCRIPTION.md b/doc/TRANSCRIPTION.md index f1eeb8e9..bec12948 100644 --- a/doc/TRANSCRIPTION.md +++ b/doc/TRANSCRIPTION.md @@ -45,6 +45,25 @@ Press `Alt-u` on any voice message. 
The text appears below it: └─────────────────────────────────────────────┘ ``` +### UI Example + +After pressing `Alt-u` on a voice message, the transcription appears inline with the `[Transcribed]` indicator at the start of each transcribed line: + +``` + ┌───────────────────────────────────────────────────────────┐ + │ Bob [14:22] │ + │ PTT-20250115-WA0012.opus │ + │ [Transcribed] Hey, are you coming to the meeting │ + │ at three? Let me know if you need the │ + │ dial-in link. │ + │ │ + │ Alice [14:23] │ + │ Sure, I'll be there! │ + └───────────────────────────────────────────────────────────┘ +``` + +Long transcriptions are truncated to `audio_transcribe_max_lines` lines (default: 15); the last visible line shows how many lines were hidden. + Use `Alt-Shift-u` to re-transcribe if you want to ignore the cache (like if the first try messed up). Supports: `.ogg`, `.opus`, `.mp3`, `.m4a`, `.wav`, `.flac` From 9c334f943d5481170e208d6155c1a819e2bdbf29 Mon Sep 17 00:00:00 2001 From: bodharma Date: Mon, 27 Apr 2026 15:30:41 +0100 Subject: [PATCH 6/7] docs: reorder backends, replace pip with uv, add whisper-cli bash option --- doc/TRANSCRIPTION-SETUP.md | 241 +++++++++++++++++++------------------ 1 file changed, 125 insertions(+), 116 deletions(-) diff --git a/doc/TRANSCRIPTION-SETUP.md b/doc/TRANSCRIPTION-SETUP.md index d5654590..420b21a3 100644 --- a/doc/TRANSCRIPTION-SETUP.md +++ b/doc/TRANSCRIPTION-SETUP.md @@ -5,173 +5,172 @@ How to set up different transcription backends for nchat. ## What You Need - nchat installed -- Python 3.7+ (`python3 --version`) -- pip (`pip3 --version`) +- FFmpeg (`ffmpeg --version`) --- -## Option 1: OpenAI API (Easiest) +## Option 1: whisper.cpp (Recommended) -Fast (2-3 sec), accurate, easy. Costs $0.006/min. Audio goes to OpenAI. +No Python required. Free, private, offline. `whisper.cpp` is available as a system package on most distributions. -**Setup:** +**Install whisper.cpp:** -1. 
Get API key: https://platform.openai.com/api-keys +- **Arch:** `sudo pacman -S whisper.cpp` or AUR: `yay -S whisper.cpp` +- **Debian/Ubuntu (sid):** `sudo apt install whisper.cpp` +- **Fedora:** `sudo dnf install whisper-cpp` +- **macOS:** `brew install whisper-cpp` -2. Install packages: - ```bash - pip3 install openai requests - ``` +**Download a model** (e.g., `base` for speed, `large-v3` for accuracy): -3. Set the key: - ```bash - export OPENAI_API_KEY='sk-...' - echo 'export OPENAI_API_KEY="sk-..."' >> ~/.bashrc - ``` +```bash +mkdir -p ~/.local/share/whisper +wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin \ + -O ~/.local/share/whisper/ggml-base.bin +``` - macOS users also need: - ```bash - echo 'export OPENAI_API_KEY="sk-..."' >> ~/.zshenv - ``` +(On macOS, use `curl -L` instead of `wget`.) -4. Configure nchat (`~/.config/nchat/ui.conf`): - ```conf - audio_transcribe_enabled=1 - audio_transcribe_cache=1 - ``` +**Create a transcribe script:** -5. Test: - ```bash - /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg - ``` +Create `~/.config/nchat/transcribe.sh`: -Monitor costs at https://platform.openai.com/usage (set a budget limit!) +```bash +#!/usr/bin/env bash +ffmpeg -loglevel quiet -i "$1" -f wav - \ + | whisper-cli -m ~/.local/share/whisper/ggml-base.bin -l auto -otxt -f - +``` ---- +Make it executable: -## Option 2: whisper.cpp (Local Server) +```bash +chmod +x ~/.config/nchat/transcribe.sh +``` -Free, private, offline. Bit more setup. Fast with GPU. +**Configure nchat** (`~/.config/nchat/ui.conf`): -**Setup:** +```conf +audio_transcribe_enabled=1 +audio_transcribe_command=~/.config/nchat/transcribe.sh '%1' +audio_transcribe_cache=1 +``` -1. Install deps: - ```bash - # macOS - brew install ffmpeg cmake +**Test:** - # Linux - sudo apt install build-essential ffmpeg cmake git # Debian/Ubuntu - sudo dnf install gcc-c++ ffmpeg cmake git # Fedora - ``` +```bash +~/.config/nchat/transcribe.sh /path/to/test.ogg +``` -2. 
Build it: - ```bash - mkdir -p ~/whisper.cpp && cd ~/whisper.cpp - git clone https://github.com/ggerganov/whisper.cpp.git . - mkdir build && cd build - cmake .. -DWHISPER_BUILD_SERVER=ON - cmake --build . --config Release - ``` +> **Tip:** The `audio_transcribe_command` is not special — any script that reads an audio file +> path as its first argument and prints the transcription to stdout will work. Write your own! -3. Download a model: - ```bash - cd ~/whisper.cpp - bash ./models/download-ggml-model.sh base # or tiny/small/medium/large - ``` +--- -4. Start the server: - ```bash - cd ~/whisper.cpp - ./build/bin/server --model models/ggml-base.bin --host 127.0.0.1 --port 8080 --convert - ``` +## Option 2: Local Python Whisper (faster-whisper) - Run in background: +Free, private, simple. Uses Python. Slower than whisper.cpp, uses more RAM. + +**Setup:** + +1. Install ffmpeg and Python 3.13 (or earlier; 3.14+ not yet supported): ```bash - nohup ./build/bin/server --model models/ggml-base.bin --host 127.0.0.1 --port 8080 --convert > server.log 2>&1 & + brew install ffmpeg python@3.13 # macOS + sudo apt install ffmpeg python3.13 # Debian/Ubuntu + sudo dnf install ffmpeg python3.13 # Fedora ``` -5. Install Python package: +2. Create a virtual environment with `uv` (recommended; `python -m venv` also works): ```bash - pip3 install requests + mkdir -p ~/.config/nchat + cd ~/.config/nchat + uv venv --python 3.13 + uv pip install faster-whisper ``` -6. Configure nchat: + > **Note:** Most Linux distributions no longer allow system-wide `pip install`. + > Use `uv` (install via `curl -LsSf https://astral.sh/uv/install.sh | sh`) + > or `python -m venv` instead. + +3. Configure nchat: ```conf audio_transcribe_enabled=1 - audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-cpp + audio_transcribe_command=~/.config/nchat/.venv/bin/python /usr/local/libexec/nchat/transcribe -f '%1' -s whisper-local -m base audio_transcribe_cache=1 ``` -7. Test: +4. 
Test: ```bash - curl http://localhost:8080/health - /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg -s whisper-cpp + /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg -s whisper-local ``` -**Want auto-start on boot?** Set up a systemd service (Linux) or launchd (macOS) - Google it. +Models download automatically on first use. Pick a size: `tiny` (fast, meh), `base` (balanced), `small`/`medium`/`large` (slower, better). + +Got an NVIDIA GPU? Install CUDA first, then: +```bash +uv pip install faster-whisper[gpu] +``` --- -## Option 3: Whisper Python (Local) +## Option 3: Groq API (Faster than OpenAI, Cheaper) -Free, private, simple. Slower than whisper.cpp, uses more RAM. +Cloud-based, free tier available. Fast ($0.001/min vs OpenAI's $0.006/min). **Setup:** -1. Install ffmpeg: - ```bash - brew install ffmpeg # macOS - sudo apt install ffmpeg python3-dev # Debian/Ubuntu - sudo dnf install ffmpeg python3-devel # Fedora - ``` +1. Get API key from https://console.groq.com/ -2. Install Whisper: +2. Install with uv or system pip: ```bash - pip3 install faster-whisper # Faster (recommended) - # or - pip3 install openai-whisper # Original (slower) + uv pip install groq requests ``` -3. Configure nchat: - ```conf - audio_transcribe_enabled=1 - audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-local -m base - audio_transcribe_cache=1 - ``` - -4. Test: +3. Set the key: ```bash - /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg -s whisper-local + export GROQ_API_KEY='gsk-...' + echo 'export GROQ_API_KEY="gsk-..."' >> ~/.bashrc + echo 'export GROQ_API_KEY="gsk-..."' >> ~/.zshenv # macOS ``` -Models download automatically on first use. Pick a size: `tiny` (fast, meh), `base` (balanced), `small`/`medium`/`large` (slower, better). - -Got an NVIDIA GPU? Use `pip3 install faster-whisper[gpu]` after installing CUDA. +4. You'll need to hack the transcribe script to add Groq support (it's not built-in yet). Or use OpenAI. 
--- -## Option 4: Groq API (Cheaper Alternative) +## Option 4: OpenAI API -Like OpenAI but cheaper ($0.001/min vs $0.006/min). Still fast. +Cloud-based, easiest. Fast (2-3 sec), accurate. Costs $0.006/min. Audio goes to OpenAI. **Setup:** -1. Get API key from https://console.groq.com/ +1. Get API key: https://platform.openai.com/api-keys + +2. Install packages: + ```bash + uv pip install openai requests + ``` -2. Install: +3. Set the key: ```bash - pip3 install groq requests + export OPENAI_API_KEY='sk-...' + echo 'export OPENAI_API_KEY="sk-..."' >> ~/.bashrc ``` -3. Set key: + macOS users also need: ```bash - export GROQ_API_KEY='gsk-...' - echo 'export GROQ_API_KEY="gsk-..."' >> ~/.bashrc - echo 'export GROQ_API_KEY="gsk-..."' >> ~/.zshenv # macOS + echo 'export OPENAI_API_KEY="sk-..."' >> ~/.zshenv ``` -4. You'll need to hack the transcribe script to add Groq support (it's not built-in yet). Or just use OpenAI. +4. Configure nchat (`~/.config/nchat/ui.conf`): + ```conf + audio_transcribe_enabled=1 + audio_transcribe_cache=1 + ``` + +5. Test: + ```bash + /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg + ``` + +Monitor costs at https://platform.openai.com/usage (set a budget limit!) --- @@ -180,7 +179,10 @@ Like OpenAI but cheaper ($0.001/min vs $0.006/min). Still fast. After setup: ```bash -# Test the script +# Test your transcription command +~/.config/nchat/transcribe.sh /path/to/test.ogg + +# Or for API-based: /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg # Check nchat config @@ -196,24 +198,30 @@ In nchat: Select a voice message, press `Alt-u`, see if it works. 
**"Command not found"** ```bash ls -l /usr/local/libexec/nchat/transcribe # Check if it exists +ls -l ~/.config/nchat/transcribe.sh # Check if script exists ``` -**"No module named 'openai'"** +**"No module named 'openai'" or other Python imports** ```bash -pip3 install openai # or whatever package is missing +# If using uv: +uv pip install openai # or whatever package is missing + +# If using venv: +~/.config/nchat/.venv/bin/pip install openai ``` **"API key not set"** ```bash echo $OPENAI_API_KEY # Check it's set +echo $GROQ_API_KEY # Check it's set echo 'export OPENAI_API_KEY="sk-..."' >> ~/.bashrc echo 'export OPENAI_API_KEY="sk-..."' >> ~/.zshenv # macOS ``` -**whisper.cpp server not responding** +**whisper-cli not found** ```bash -curl http://localhost:8080/health # Check if running -cd ~/whisper.cpp && ./build/bin/server --model models/ggml-base.bin --host 127.0.0.1 --port 8080 --convert +which whisper-cli # Check if installed +# If missing, install via your package manager (pacman, apt, dnf, brew) ``` **Timeouts** @@ -224,7 +232,8 @@ audio_transcribe_timeout=60 # Increase in ui.conf **Need ffmpeg** ```bash brew install ffmpeg # macOS -sudo apt install ffmpeg # Linux +sudo apt install ffmpeg # Debian/Ubuntu +sudo dnf install ffmpeg # Fedora ``` **High CPU/RAM** @@ -235,11 +244,11 @@ Use a smaller model (`-m tiny`), or switch to whisper.cpp, or just use the API. 
## Quick Comparison -| Option | Speed | Privacy | Cost | Setup | -|--------|-------|---------|------|-------| -| OpenAI | Fast | Low | $0.006/min | Easy | -| whisper.cpp | Good | High | Free | Medium | -| Whisper Python | OK | High | Free | Easy | -| Groq | Fast | Low | $0.001/min | Easy | +| Option | Speed | Privacy | Cost | Setup | Needs Python | +|--------|-------|---------|------|-------|--------------| +| whisper.cpp | Good | High | Free | Easy | No | +| Whisper Python | OK | High | Free | Easy | Yes | +| Groq | Fast | Low | $0.001/min | Medium | Yes | +| OpenAI | Fast | Low | $0.006/min | Easy | Yes | Pick what works for you. From 7ce7b3514637da8effbdf3a66cd530d6c51c346a Mon Sep 17 00:00:00 2001 From: bodharma Date: Tue, 28 Apr 2026 11:39:22 +0100 Subject: [PATCH 7/7] fix: address PR review issues in transcription feature - Fix shell injection: escape single quotes in filePath and validate language against safe character whitelist before shell substitution - Align audio extension lists: replace aac with webm in both uimodel and uihistoryview to match the transcribe script SUPPORTED_FORMATS - Fix isTranscription dim color: reverse std::distance args so posFromEnd increases correctly in reverse iteration (was wrapping to huge size_t) - Clamp WordWrap width to minimum 1 to prevent unsigned underflow on very narrow terminals - Check StoreTranscription return and log warning on failure - Replace INSERT ON CONFLICT with UPDATE-only for transcriptionLanguage to prevent partial chat rows if chatId is missing - Drop legacy transcriptions table and index in schema 12 migration - Remove unimplemented config defaults: audio_transcribe_auto, audio_transcribe_timeout, and hardcoded audio_transcribe_command path (runtime fallback uses CMake install prefix instead) - Update TRANSCRIPTION.md: remove dead config keys, fix keyboard shortcuts, fix cache path/SQL, add webm to supported formats --- doc/TRANSCRIPTION.md | 30 ++++++++++------------------ 
lib/ncutil/src/messagecache.cpp | 11 +++++++---- src/uiconfig.cpp | 4 +--- src/uihistoryview.cpp | 7 ++++--- src/uimodel.cpp | 35 ++++++++++++++++++++++++++------- 5 files changed, 50 insertions(+), 37 deletions(-) diff --git a/doc/TRANSCRIPTION.md b/doc/TRANSCRIPTION.md index bec12948..609a6b0f 100644 --- a/doc/TRANSCRIPTION.md +++ b/doc/TRANSCRIPTION.md @@ -64,9 +64,9 @@ After pressing `Alt-u` on a voice message, the transcription appears inline with Long transcriptions are truncated to `audio_transcribe_max_lines` lines (default: 15); the last visible line shows how many lines were hidden. -Use `Alt-Shift-u` to re-transcribe if you want to ignore the cache (like if the first try messed up). +Press `Alt-u` again on the same message to re-transcribe. -Supports: `.ogg`, `.opus`, `.mp3`, `.m4a`, `.wav`, `.flac` +Supports: `.ogg`, `.opus`, `.mp3`, `.m4a`, `.wav`, `.flac`, `.webm` ## Configuration @@ -74,13 +74,9 @@ Edit `~/.config/nchat/ui.conf`: ```conf audio_transcribe_enabled=1 # Turn it on/off -audio_transcribe_cache=1 # Cache results (saves API costs) -audio_transcribe_inline=1 # Show text below message -audio_transcribe_auto=0 # Don't auto-transcribe (costs $$$) -audio_transcribe_timeout=30 # Wait max 30 seconds ``` -The command that does the work: +The command that does the work (defaults to the bundled `transcribe` script): ```conf audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' ``` @@ -116,9 +112,8 @@ Supports 90+ languages (en, es, fr, de, it, pt, ru, zh, ja, ko, etc.) ## Keyboard Shortcuts -- `Alt-u` - Transcribe message -- `Alt-Shift-u` - Re-transcribe (ignore cache) -- `Ctrl-t` - Toggle visibility +- `Alt-u` - Transcribe (or re-transcribe) message +- `Alt-l` - Set per-chat transcription language Change them in `~/.config/nchat/key.conf` if you want (see nchat docs for the escape codes). @@ -131,10 +126,7 @@ export OPENAI_API_KEY='sk-...' 
# Add to ~/.bashrc or ~/.zshrc **"Timeout"** -Bump the timeout or use a faster service: -```conf -audio_transcribe_timeout=60 -``` +Switch to a faster service or use a smaller local model. See [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md). **"Audio format not supported"** @@ -155,7 +147,7 @@ Or use a bigger model (local) or switch to OpenAI API. **API costs too high** -Turn off auto-transcribe (`audio_transcribe_auto=0`) and use local Whisper instead (see [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md)). +Use local Whisper instead (see [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md)). ## Privacy @@ -165,20 +157,18 @@ Turn off auto-transcribe (`audio_transcribe_auto=0`) and use local Whisper inste ## Cache Management -Transcriptions are cached in `~/.config/nchat/db.sqlite`. +Transcriptions are stored per-profile in `~/.config/nchat/history/<profile>/db.sqlite`. -Clear cache if needed: +Clear cached transcriptions if needed: ```bash -sqlite3 ~/.config/nchat/db.sqlite "DELETE FROM transcriptions;" +sqlite3 ~/.config/nchat/history/<profile>/db.sqlite "UPDATE messages SET transcription = '' WHERE transcription != '';" ``` ## Tips - OpenAI API is fastest (2-3 sec/message) -- Keep caching enabled to save money - Specify language for better accuracy - Use local for privacy, API for speed -- Don't enable auto-transcribe unless you hate money ## FAQ diff --git a/lib/ncutil/src/messagecache.cpp index 84264b25..a99ad60f 100644 --- a/lib/ncutil/src/messagecache.cpp +++ b/lib/ncutil/src/messagecache.cpp @@ -489,6 +489,10 @@ void MessageCache::AddProfile(const std::string& p_ProfileId, bool p_CheckSequen " WHERE t.chatId = messages.chatId AND t.msgId = messages.id" ");"; + // Drop the now-unused transcriptions table + *m_Dbs[p_ProfileId] << "DROP INDEX IF EXISTS idx_transcriptions_timestamp;"; + *m_Dbs[p_ProfileId] << "DROP TABLE IF EXISTS transcriptions;"; + schemaVersion = 12; *m_Dbs[p_ProfileId] << "UPDATE version SET schema = ?;" << schemaVersion; } @@ 
-2022,10 +2026,9 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) try { - *m_Dbs[profileId] << "INSERT INTO " + s_TableChats + " " - "(id, transcriptionLanguage) VALUES " - "(?, ?) ON CONFLICT(id) DO UPDATE SET transcriptionLanguage=?;" << - chatId << transcriptionLanguage << transcriptionLanguage; + *m_Dbs[profileId] << "UPDATE " + s_TableChats + " " - "SET transcriptionLanguage = ? WHERE id = ?;" << + "SET transcriptionLanguage = ? WHERE id = ?;" << + transcriptionLanguage << chatId; } catch (const sqlite::sqlite_exception& ex) { diff --git a/src/uiconfig.cpp index 784661c0..ffdf2a47 100644 --- a/src/uiconfig.cpp +++ b/src/uiconfig.cpp @@ -20,12 +20,10 @@ void UiConfig::Init() { { "attachment_indicator", "\xF0\x9F\x93\x8E" }, { "attachment_open_command", "" }, - { "audio_transcribe_auto", "0" }, - { "audio_transcribe_command", "/usr/local/libexec/nchat/transcribe -f '%1'" }, + { "audio_transcribe_command", "" }, { "audio_transcribe_enabled", "0" }, { "audio_transcribe_language", "auto" }, { "audio_transcribe_max_lines", "15" }, - { "audio_transcribe_timeout", "30" }, { "auto_compose_command", "" }, { "auto_compose_enabled", "0" }, { "auto_compose_history_count", "25" }, diff --git a/src/uihistoryview.cpp index 64eb1b25..446e1a56 100644 --- a/src/uihistoryview.cpp +++ b/src/uihistoryview.cpp @@ -241,7 +241,7 @@ } static const std::set<std::string> audioExtensions = { - "ogg", "opus", "mp3", "m4a", "aac", "wav", "flac", "oga" + "ogg", "opus", "mp3", "m4a", "wav", "flac", "oga", "webm" }; if (audioExtensions.find(ext) != audioExtensions.end()) @@ -255,8 +255,9 @@ void UiHistoryView::Draw() } + const unsigned transcriptionWrapWidth = (m_PaddedW > 2) ? 
static_cast<unsigned>(m_PaddedW - 2) : 1u; std::vector<std::wstring> transcriptionWLines = - StrUtil::WordWrap(StrUtil::ToWString(transcription), m_PaddedW - 2, false, false, false, 2); + StrUtil::WordWrap(StrUtil::ToWString(transcription), transcriptionWrapWidth, false, false, false, 2); // Check if transcription exceeds max lines limit static const int maxTranscriptionLines = UiConfig::GetNum("audio_transcribe_max_lines"); @@ -387,7 +388,7 @@ void UiHistoryView::Draw() // Transcription lines are at positions 1 to (1 + transcriptionLines - 1) in forward iteration // In reverse, calculate the position from the end - size_t posFromEnd = std::distance(wline, wlines.rbegin()); + size_t posFromEnd = std::distance(wlines.rbegin(), wline); size_t vectorSize = wlines.size(); size_t posFromBegin = vectorSize - 1 - posFromEnd; bool isTranscription = (transcriptionLines > 0) && (posFromBegin >= 1) && (posFromBegin < 1 + static_cast<size_t>(transcriptionLines)); diff --git a/src/uimodel.cpp index 99370265..f705f464 100644 --- a/src/uimodel.cpp +++ b/src/uimodel.cpp @@ -4338,13 +4338,13 @@ bool UiModel::Impl::TranscribeAudio() } static const std::set<std::string> audioExtensions = { - "ogg", "opus", "mp3", "m4a", "aac", "wav", "flac", "oga" + "ogg", "opus", "mp3", "m4a", "wav", "flac", "oga", "webm" }; if (audioExtensions.find(ext) == audioExtensions.end()) { LOG_DEBUG("file is not an audio file: %s", ext.c_str()); - LOG_WARNING("File is not an audio file (extension: %s). Supported: ogg, opus, mp3, m4a, aac, wav, flac, oga", ext.c_str()); + LOG_WARNING("File is not an audio file (extension: %s). 
Supported: ogg, opus, mp3, m4a, wav, flac, oga, webm", ext.c_str()); return false; } @@ -4364,9 +4364,17 @@ bool UiModel::Impl::TranscribeAudio() // Build and execute command std::string transcription; std::string cmd = cmdTemplate; - StrUtil::ReplaceString(cmd, "%1", filePath); - // Add language parameter if specified + // Escape single quotes in filePath before substituting into shell command + std::string escapedFilePath; + escapedFilePath.reserve(filePath.size() + 4); + for (char c : filePath) + { + if (c == '\'') { escapedFilePath += "'\\''"; } + else { escapedFilePath += c; } + } + StrUtil::ReplaceString(cmd, "%1", escapedFilePath); + // First check per-chat language setting, then fall back to global setting std::string language; if (m_ChatInfos.count(profileId) && m_ChatInfos[profileId].count(chatId)) @@ -4374,18 +4382,28 @@ bool UiModel::Impl::TranscribeAudio() language = m_ChatInfos[profileId][chatId].transcriptionLanguage; } - // Fall back to global setting if per-chat language is not set if (language.empty()) { language = UiConfig::GetStr("audio_transcribe_language"); } - // Default to auto if still empty if (language.empty()) { language = "auto"; } + // Validate language: only allow ISO 639 codes, "auto", or safe variants (no shell metacharacters) + const bool languageSafe = language.size() <= 20 && + std::all_of(language.begin(), language.end(), [](char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '-' || c == '_'; + }); + if (!languageSafe) + { + LOG_WARNING("invalid transcription language '%s', ignoring", language.c_str()); + language = "auto"; + } + if (language != "auto") { cmd += " -l " + language; @@ -4397,7 +4415,10 @@ bool UiModel::Impl::TranscribeAudio() if (rv && !transcription.empty()) { - MessageCache::StoreTranscription(profileId, chatId, msgId, transcription); + if (!MessageCache::StoreTranscription(profileId, chatId, msgId, transcription)) + { + LOG_WARNING("failed to store 
transcription for msg %s (cache disabled?)", msgId.c_str()); + } // Update UI UpdateHistory();