diff --git a/CMakeLists.txt b/CMakeLists.txt index 69835763..373e9832 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,6 +269,8 @@ add_executable(nchat src/uilistborderview.h src/uilistdialog.cpp src/uilistdialog.h + src/uilanguagelistdialog.cpp + src/uilanguagelistdialog.h src/uilistview.cpp src/uilistview.h src/uimessagedialog.cpp @@ -419,6 +421,9 @@ install(FILES src/nchat.1 DESTINATION "${CMAKE_INSTALL_MANDIR}/man1") configure_file(src/compose ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/compose COPYONLY) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/compose DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/nchat) +configure_file(src/transcribe ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/transcribe COPYONLY) +install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBEXECDIR}/nchat/transcribe DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/nchat) + # Uninstall if(HAS_SHARED_LIBS) add_custom_target(uninstall diff --git a/doc/TRANSCRIPTION-SETUP.md b/doc/TRANSCRIPTION-SETUP.md new file mode 100644 index 00000000..420b21a3 --- /dev/null +++ b/doc/TRANSCRIPTION-SETUP.md @@ -0,0 +1,254 @@ +# Transcription Setup + +How to set up different transcription backends for nchat. + +## What You Need + +- nchat installed +- FFmpeg (`ffmpeg --version`) + +--- + +## Option 1: whisper.cpp (Recommended) + +No Python required. Free, private, offline. `whisper.cpp` is available as a system package on most distributions. 
+ +**Install whisper.cpp:** + +- **Arch:** `sudo pacman -S whisper.cpp` or AUR: `yay -S whisper.cpp` +- **Debian/Ubuntu (sid):** `sudo apt install whisper.cpp` +- **Fedora:** `sudo dnf install whisper-cpp` +- **macOS:** `brew install whisper-cpp` + +**Download a model** (e.g., `base` for speed, `large-v3` for accuracy): + +```bash +mkdir -p ~/.local/share/whisper +wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin \ + -O ~/.local/share/whisper/ggml-base.bin +``` + +(On macOS, use `curl -L` instead of `wget`.) + +**Create a transcribe script:** + +Create `~/.config/nchat/transcribe.sh`: + +```bash +#!/usr/bin/env bash +ffmpeg -loglevel quiet -i "$1" -f wav - \ + | whisper-cli -m ~/.local/share/whisper/ggml-base.bin -l auto -otxt -f - +``` + +Make it executable: + +```bash +chmod +x ~/.config/nchat/transcribe.sh +``` + +**Configure nchat** (`~/.config/nchat/ui.conf`): + +```conf +audio_transcribe_enabled=1 +audio_transcribe_command=~/.config/nchat/transcribe.sh '%1' +audio_transcribe_cache=1 +``` + +**Test:** + +```bash +~/.config/nchat/transcribe.sh /path/to/test.ogg +``` + +> **Tip:** The `audio_transcribe_command` is not special — any script that reads an audio file +> path as its first argument and prints the transcription to stdout will work. Write your own! + +--- + +## Option 2: Local Python Whisper (faster-whisper) + +Free, private, simple. Uses Python. Slower than whisper.cpp, uses more RAM. + +**Setup:** + +1. Install ffmpeg and Python 3.13 (or earlier; 3.14+ not yet supported): + ```bash + brew install ffmpeg python@3.13 # macOS + sudo apt install ffmpeg python3.13 # Debian/Ubuntu + sudo dnf install ffmpeg python3.13 # Fedora + ``` + +2. Create a virtual environment with `uv` (recommended; `python -m venv` also works): + ```bash + mkdir -p ~/.config/nchat + cd ~/.config/nchat + uv venv --python 3.13 + uv pip install faster-whisper + ``` + + > **Note:** Most Linux distributions no longer allow system-wide `pip install`. 
+ > Use `uv` (install via `curl -LsSf https://astral.sh/uv/install.sh | sh`) + > or `python -m venv` instead. + +3. Configure nchat: + ```conf + audio_transcribe_enabled=1 + audio_transcribe_command=~/.config/nchat/.venv/bin/python /usr/local/libexec/nchat/transcribe -f '%1' -s whisper-local -m base + audio_transcribe_cache=1 + ``` + +4. Test (run the script with the virtual environment's Python, exactly as in the configured command, so that `faster-whisper` is found): + ```bash + ~/.config/nchat/.venv/bin/python /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg -s whisper-local + ``` + +Models download automatically on first use. Pick a size: `tiny` (fast, meh), `base` (balanced), `small`/`medium`/`large` (slower, better). + +Got an NVIDIA GPU? Install CUDA first, then: +```bash +uv pip install faster-whisper[gpu] +``` + +--- + +## Option 3: Groq API (Faster than OpenAI, Cheaper) + +Cloud-based, free tier available. Fast and cheap ($0.001/min vs OpenAI's $0.006/min). + +**Setup:** + +1. Get API key from https://console.groq.com/ + +2. Install with uv or system pip: + ```bash + uv pip install groq requests + ``` + +3. Set the key: + ```bash + export GROQ_API_KEY='gsk-...' + echo 'export GROQ_API_KEY="gsk-..."' >> ~/.bashrc + echo 'export GROQ_API_KEY="gsk-..."' >> ~/.zshenv # macOS + ``` + +4. The bundled `transcribe` script has no Groq backend yet (its `-s` choices are `auto`, `openai`, `whisper-cpp` and `whisper-local`), so you need to add Groq support to the script yourself or point `audio_transcribe_command` at your own script. Otherwise, use OpenAI. + +--- + +## Option 4: OpenAI API + +Cloud-based, easiest. Fast (2-3 sec), accurate. Costs $0.006/min. Audio goes to OpenAI. + +**Setup:** + +1. Get API key: https://platform.openai.com/api-keys + +2. Install packages: + ```bash + uv pip install openai requests + ``` + +3. Set the key: + ```bash + export OPENAI_API_KEY='sk-...' + echo 'export OPENAI_API_KEY="sk-..."' >> ~/.bashrc + ``` + + macOS users also need: + ```bash + echo 'export OPENAI_API_KEY="sk-..."' >> ~/.zshenv + ``` + +4. Configure nchat (`~/.config/nchat/ui.conf`): + ```conf + audio_transcribe_enabled=1 + audio_transcribe_cache=1 + ``` + +5. 
Test: + ```bash + /usr/local/libexec/nchat/transcribe -f /path/to/test.ogg + ``` + +Monitor costs at https://platform.openai.com/usage (set a budget limit!) + +--- + +## Testing + +After setup: + +```bash +# Test your transcription command +~/.config/nchat/transcribe.sh /path/to/test.ogg + +# Or for API-based: +/usr/local/libexec/nchat/transcribe -f /path/to/test.ogg + +# Check nchat config +grep transcribe ~/.config/nchat/ui.conf +``` + +In nchat: Select a voice message, press `Alt-u`, see if it works. + +--- + +## Troubleshooting + +**"Command not found"** +```bash +ls -l /usr/local/libexec/nchat/transcribe # Check if it exists +ls -l ~/.config/nchat/transcribe.sh # Check if script exists +``` + +**"No module named 'openai'" or other Python imports** +```bash +# If using uv: +uv pip install openai # or whatever package is missing + +# If using venv: +~/.config/nchat/.venv/bin/pip install openai +``` + +**"API key not set"** +```bash +echo $OPENAI_API_KEY # Check it's set +echo $GROQ_API_KEY # Check it's set +echo 'export OPENAI_API_KEY="sk-..."' >> ~/.bashrc +echo 'export OPENAI_API_KEY="sk-..."' >> ~/.zshenv # macOS +``` + +**whisper-cli not found** +```bash +which whisper-cli # Check if installed +# If missing, install via your package manager (pacman, apt, dnf, brew) +``` + +**Timeouts** +```conf +audio_transcribe_timeout=60 # Increase in ui.conf +``` + +**Need ffmpeg** +```bash +brew install ffmpeg # macOS +sudo apt install ffmpeg # Debian/Ubuntu +sudo dnf install ffmpeg # Fedora +``` + +**High CPU/RAM** + +Use a smaller model (`-m tiny`), or switch to whisper.cpp, or just use the API. 
+ +--- + +## Quick Comparison + +| Option | Speed | Privacy | Cost | Setup | Needs Python | +|--------|-------|---------|------|-------|--------------| +| whisper.cpp | Good | High | Free | Easy | No | +| Whisper Python | OK | High | Free | Easy | Yes | +| Groq | Fast | Low | $0.001/min | Medium | Yes | +| OpenAI | Fast | Low | $0.006/min | Easy | Yes | + +Pick what works for you. diff --git a/doc/TRANSCRIPTION.md b/doc/TRANSCRIPTION.md new file mode 100644 index 00000000..609a6b0f --- /dev/null +++ b/doc/TRANSCRIPTION.md @@ -0,0 +1,189 @@ +# Audio Transcription + +nchat can transcribe voice messages to text using Whisper. Press Alt-u on any voice message and boom - you can read it instead of listening. + +Works with Telegram and WhatsApp voice notes. You can use OpenAI's API (fast, costs a few cents) or run it locally for free (slower but private). + +## Quick Start + +**Option 1: OpenAI API (easiest)** + +```bash +# Get an API key from https://platform.openai.com/api-keys +export OPENAI_API_KEY='sk-your-key-here' + +# Enable in nchat config +echo "audio_transcribe_enabled=1" >> ~/.config/nchat/ui.conf +``` + +Costs about $0.006 per minute of audio (so like a penny for a 2-minute voice note). + +**Option 2: Local Whisper (free, private)** + +See [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md) - takes a bit more setup but runs offline. + +**Using it:** + +1. Select a voice message in nchat +2. Press `Alt-u` +3. Wait for the text to appear + +That's it. + +## How to Use + +Press `Alt-u` on any voice message. The text appears below it: + +``` +┌─────────────────────────────────────────────┐ +│ Alice 10:30 AM │ +│ 🎤 Voice message (0:15) │ +│ │ +│ 📝 Hey, can you pick up groceries on your │ +│ way home? We need milk and eggs. 
│ +│ [Transcribed]│ +└─────────────────────────────────────────────┘ +``` + +### UI Example + +After pressing `Alt-u` on a voice message, the transcription appears inline with the `[Transcribed]` indicator at the start of each transcribed line: + +``` + ┌───────────────────────────────────────────────────────────┐ + │ Bob [14:22] │ + │ PTT-20250115-WA0012.opus │ + │ [Transcribed] Hey, are you coming to the meeting │ + │ at three? Let me know if you need the │ + │ dial-in link. │ + │ │ + │ Alice [14:23] │ + │ Sure, I'll be there! │ + └───────────────────────────────────────────────────────────┘ +``` + +Long transcriptions are truncated to `audio_transcribe_max_lines` lines (default: 15); the last visible line shows how many lines were hidden. + +Press `Alt-u` again on the same message to re-transcribe. + +Supports: `.ogg`, `.oga`, `.opus`, `.mp3`, `.m4a`, `.wav`, `.flac`, `.webm` + +## Configuration + +Edit `~/.config/nchat/ui.conf`: + +```conf +audio_transcribe_enabled=1 # Turn it on/off +``` + +The command that does the work (defaults to the bundled `transcribe` script): +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' +``` + +You can add flags to it (see below). + +## Tweaking It + +**Pick a specific service:** +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s openai # OpenAI API +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-cpp # Local server +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -s whisper-local # Local Python +``` + +**Set the language (better accuracy):** +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -l en # English +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -l es # Spanish +``` + +Supports 90+ languages (en, es, fr, de, it, pt, ru, zh, ja, ko, etc.) + +**Local model sizes:** +```conf +# Pick one based on speed vs accuracy: +... -m tiny # 75 MB - fast but meh +... 
-m base # 150 MB - good balance +... -m small # 500 MB - better +... -m medium # 1.5 GB - pretty good +... -m large # 3 GB - best but slow +``` + +## Keyboard Shortcuts + +- `Alt-u` - Transcribe (or re-transcribe) message +- `Alt-l` - Set per-chat transcription language + +Change them in `~/.config/nchat/key.conf` if you want (see nchat docs for the escape codes). + +## Troubleshooting + +**"No API key set"** +```bash +export OPENAI_API_KEY='sk-...' # Add to ~/.bashrc or ~/.zshrc +``` + +**"Timeout"** + +Switch to a faster service or use a smaller local model. See [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md). + +**"Audio format not supported"** + +Install ffmpeg: +```bash +brew install ffmpeg # macOS +sudo apt install ffmpeg # Linux +``` + +**Wrong language / bad accuracy** + +Specify the language: +```conf +audio_transcribe_command=/usr/local/libexec/nchat/transcribe -f '%1' -l en +``` + +Or use a bigger model (local) or switch to OpenAI API. + +**API costs too high** + +Use local Whisper instead (see [TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md)). + +## Privacy + +**OpenAI API:** Audio gets sent to their servers. They may keep it for 30 days. Don't use for super sensitive stuff. + +**Local Whisper:** Everything stays on your machine. 100% private. + +## Cache Management + +Transcriptions are stored per-profile in `~/.config/nchat/history/<profile>/db.sqlite`. + +Clear cached transcriptions if needed (replace `<profile>` with the name of your profile directory): +```bash +sqlite3 ~/.config/nchat/history/<profile>/db.sqlite "UPDATE messages SET transcription = '' WHERE transcription != '';" +``` + +## Tips + +- OpenAI API is fastest (2-3 sec/message) +- Specify language for better accuracy +- Use local for privacy, API for speed + +## FAQ + +**Q: Supports video?** +Nope, just audio. + +**Q: Offline?** +Yes with local Whisper. No with API. + +**Q: How accurate?** +Pretty good (~95%) with clear audio. Gets worse with noise/accents. + +**Q: Languages?** +90+ including English, Spanish, French, German, Chinese, Japanese, etc. 
+ +## See Also + +[TRANSCRIPTION-SETUP.md](TRANSCRIPTION-SETUP.md) - How to set up local Whisper diff --git a/lib/common/src/protocol.h b/lib/common/src/protocol.h index da8ff24b..caff2784 100644 --- a/lib/common/src/protocol.h +++ b/lib/common/src/protocol.h @@ -141,6 +141,7 @@ struct ChatInfo bool isPinned = false; bool isArchived = false; int64_t lastMessageTime = -1; + std::string transcriptionLanguage; // language for audio transcription (e.g., "en", "ru", "auto", or empty for global default) }; enum FileStatus diff --git a/lib/ncutil/src/messagecache.cpp b/lib/ncutil/src/messagecache.cpp index 1c785f99..a99ad60f 100644 --- a/lib/ncutil/src/messagecache.cpp +++ b/lib/ncutil/src/messagecache.cpp @@ -440,7 +440,64 @@ void MessageCache::AddProfile(const std::string& p_ProfileId, bool p_CheckSequen "SET schema = ?;" << schemaVersion; } - static const int64_t s_SchemaVersion = 9; + if (schemaVersion == 9) + { + LOG_INFO("update db schema 9 to 10"); + + *m_Dbs[p_ProfileId] << "CREATE TABLE IF NOT EXISTS transcriptions (" + "chatId TEXT NOT NULL," + "msgId TEXT NOT NULL," + "transcription TEXT NOT NULL," + "language TEXT DEFAULT ''," + "service TEXT DEFAULT ''," + "timestamp INTEGER NOT NULL," + "PRIMARY KEY (chatId, msgId)" + ");"; + + *m_Dbs[p_ProfileId] << "CREATE INDEX IF NOT EXISTS idx_transcriptions_timestamp " + "ON transcriptions(timestamp);"; + + schemaVersion = 10; + *m_Dbs[p_ProfileId] << "UPDATE version " + "SET schema = ?;" << schemaVersion; + } + + if (schemaVersion == 10) + { + LOG_INFO("update db schema 10 to 11"); + + *m_Dbs[p_ProfileId] << "ALTER TABLE chats2 ADD COLUMN transcriptionLanguage TEXT DEFAULT '';"; + + schemaVersion = 11; + *m_Dbs[p_ProfileId] << "UPDATE version " + "SET schema = ?;" << schemaVersion; + } + + if (schemaVersion == 11) + { + LOG_INFO("update db schema 11 to 12"); + + *m_Dbs[p_ProfileId] << "ALTER TABLE messages ADD COLUMN transcription TEXT DEFAULT '';"; + + // Migrate existing transcriptions into messages table + 
*m_Dbs[p_ProfileId] << + "UPDATE messages SET transcription = (" + " SELECT t.transcription FROM transcriptions t" + " WHERE t.chatId = messages.chatId AND t.msgId = messages.id" + ") WHERE EXISTS (" + " SELECT 1 FROM transcriptions t" + " WHERE t.chatId = messages.chatId AND t.msgId = messages.id" + ");"; + + // Drop the now-unused transcriptions table + *m_Dbs[p_ProfileId] << "DROP INDEX IF EXISTS idx_transcriptions_timestamp;"; + *m_Dbs[p_ProfileId] << "DROP TABLE IF EXISTS transcriptions;"; + + schemaVersion = 12; + *m_Dbs[p_ProfileId] << "UPDATE version SET schema = ?;" << schemaVersion; + } + + static const int64_t s_SchemaVersion = 12; if (schemaVersion > s_SchemaVersion) { LOG_WARNING("cache db schema %d from newer nchat version detected, if cache issues are encountered " @@ -871,6 +928,19 @@ std::vector MessageCache::FetchGroupMembersSync(const std::string& return contactInfos; } +void MessageCache::UpdateTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_TranscriptionLanguage) +{ + if (!m_CacheEnabled) return; + + std::shared_ptr updateTranscriptionLanguageRequest = + std::make_shared(); + updateTranscriptionLanguageRequest->profileId = p_ProfileId; + updateTranscriptionLanguageRequest->chatId = p_ChatId; + updateTranscriptionLanguageRequest->transcriptionLanguage = p_TranscriptionLanguage; + EnqueueRequest(updateTranscriptionLanguageRequest); +} + void MessageCache::Export(const std::string& p_ExportDir) { if (!m_CacheEnabled) @@ -1227,10 +1297,10 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) for (const auto& chatInfo : addChatsRequest->chatInfos) { *m_Dbs[profileId] << "INSERT INTO " + s_TableChats + " " - "(id, isMuted, isPinned, lastMessageTime, isArchived) VALUES " - "(?, ?, ?, ?, ?);" << + "(id, isMuted, isPinned, lastMessageTime, isArchived, transcriptionLanguage) VALUES " + "(?, ?, ?, ?, ?, ?);" << chatInfo.id << chatInfo.isMuted << chatInfo.isPinned << - 
chatInfo.lastMessageTime << chatInfo.isArchived; + chatInfo.lastMessageTime << chatInfo.isArchived << chatInfo.transcriptionLanguage; } *m_Dbs[profileId] << "COMMIT;"; } @@ -1296,14 +1366,16 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) std::map chatIdPinned; std::map chatIdLastMessageTime; std::map chatIdArchived; - *m_Dbs[profileId] << "SELECT id, isMuted, isPinned, lastMessageTime, isArchived FROM " + s_TableChats + ";" >> + std::map chatIdTranscriptionLanguage; + *m_Dbs[profileId] << "SELECT id, isMuted, isPinned, lastMessageTime, isArchived, transcriptionLanguage FROM " + s_TableChats + ";" >> [&](const std::string& chatId, int32_t isMuted, int32_t isPinned, int64_t lastMessageTime, - int32_t isArchived) + int32_t isArchived, const std::string& transcriptionLanguage) { chatIdMuted[chatId] = isMuted; chatIdPinned[chatId] = isPinned; chatIdLastMessageTime[chatId] = lastMessageTime; chatIdArchived[chatId] = isArchived; + chatIdTranscriptionLanguage[chatId] = transcriptionLanguage; }; *m_Dbs[profileId] << "SELECT chatId, MAX(timeSent), isOutgoing, isRead FROM " + s_TableMessages + " " @@ -1319,6 +1391,7 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) chatInfo.isPinned = chatIdPinned[chatId]; chatInfo.isArchived = chatIdArchived[chatId]; chatInfo.lastMessageTime = chatInfo.isPinned ? 
chatIdLastMessageTime[chatId] : timeSent; + chatInfo.transcriptionLanguage = chatIdTranscriptionLanguage[chatId]; chatInfos.push_back(chatInfo); } }; @@ -1940,6 +2013,32 @@ void MessageCache::PerformRequest(std::shared_ptr p_Request) } break; + case UpdateTranscriptionLanguageRequestType: + { + std::unique_lock lock(m_DbMutex); + std::shared_ptr<UpdateTranscriptionLanguageRequest> updateTranscriptionLanguageRequest = + std::static_pointer_cast<UpdateTranscriptionLanguageRequest>(p_Request); + const std::string& profileId = updateTranscriptionLanguageRequest->profileId; + if (!m_Dbs[profileId]) return; + + const std::string& chatId = updateTranscriptionLanguageRequest->chatId; + const std::string& transcriptionLanguage = updateTranscriptionLanguageRequest->transcriptionLanguage; + + try + { + *m_Dbs[profileId] << "UPDATE " + s_TableChats + " " + "SET transcriptionLanguage = ? WHERE id = ?;" << + transcriptionLanguage << chatId; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + } + + LOG_DEBUG("cache update transcription language %s %s", chatId.c_str(), transcriptionLanguage.c_str()); + } + break; + default: { LOG_WARNING("cache unknown request type %d", p_Request->GetRequestType()); @@ -2046,3 +2145,107 @@ void MessageCache::CallMessageHandler(std::shared_ptr p_ServiceM LOG_WARNING("message handler not set"); } } + +bool MessageCache::StoreTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId, const std::string& p_Transcription) +{ + if (!m_CacheEnabled) return false; + + std::unique_lock lock(m_DbMutex); + + if (!m_Dbs[p_ProfileId]) return false; // also rejects null entries left behind by m_Dbs[...] lookups + + try + { + *m_Dbs[p_ProfileId] << + "UPDATE messages SET transcription = ? WHERE chatId = ? 
AND id = ?;" + << p_Transcription << p_ChatId << p_MsgId; + return true; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + return false; + } +} + +std::string MessageCache::GetTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId) +{ + if (!m_CacheEnabled) return ""; + + std::unique_lock lock(m_DbMutex); + + if (!m_Dbs[p_ProfileId]) return ""; // also rejects null entries left behind by m_Dbs[...] lookups + + try + { + std::string transcription; + // *INDENT-OFF* + *m_Dbs[p_ProfileId] << + "SELECT transcription FROM messages WHERE chatId = ? AND id = ?;" + << p_ChatId << p_MsgId >> + [&](const std::string& p_Transcription) + { + transcription = p_Transcription; + }; + // *INDENT-ON* + return transcription; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + return ""; + } +} + +bool MessageCache::HasTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId) +{ + if (!m_CacheEnabled) return false; + + std::unique_lock lock(m_DbMutex); + + if (!m_Dbs[p_ProfileId]) return false; // also rejects null entries left behind by m_Dbs[...] lookups + + try + { + bool hasTranscription = false; + // *INDENT-OFF* + *m_Dbs[p_ProfileId] << + "SELECT 1 FROM messages WHERE chatId = ? AND id = ? AND transcription != '' LIMIT 1;" + << p_ChatId << p_MsgId >> + [&](int) + { + hasTranscription = true; + }; + // *INDENT-ON* + return hasTranscription; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + return false; + } +} + +void MessageCache::DeleteTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId) +{ + if (!m_CacheEnabled) return; + + std::unique_lock lock(m_DbMutex); + + if (!m_Dbs[p_ProfileId]) return; // also rejects null entries left behind by m_Dbs[...] lookups + + try + { + *m_Dbs[p_ProfileId] << + "UPDATE messages SET transcription = '' WHERE chatId = ? 
AND id = ?;" + << p_ChatId << p_MsgId; + } + catch (const sqlite::sqlite_exception& ex) + { + HANDLE_SQLITE_EXCEPTION(ex); + } +} diff --git a/lib/ncutil/src/messagecache.h b/lib/ncutil/src/messagecache.h index 31956078..4773bcd7 100644 --- a/lib/ncutil/src/messagecache.h +++ b/lib/ncutil/src/messagecache.h @@ -48,6 +48,7 @@ class MessageCache UpdateArchivedRequestType, AddGroupMembersRequestType, FetchGroupMembersRequestType, + UpdateTranscriptionLanguageRequestType, }; class Request @@ -223,6 +224,15 @@ class MessageCache std::string chatId; }; + class UpdateTranscriptionLanguageRequest : public Request + { + public: + virtual RequestType GetRequestType() const { return UpdateTranscriptionLanguageRequestType; } + std::string profileId; + std::string chatId; + std::string transcriptionLanguage; + }; + public: static void Init(); static void Cleanup(); @@ -266,8 +276,20 @@ class MessageCache const std::vector& p_MemberIds); static void FetchGroupMembers(const std::string& p_ProfileId, const std::string& p_ChatId); static std::vector FetchGroupMembersSync(const std::string& p_ProfileId, const std::string& p_ChatId); + static void UpdateTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_TranscriptionLanguage); static void Export(const std::string& p_ExportDir); + // Transcription methods + static bool StoreTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId, const std::string& p_Transcription); + static std::string GetTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId); + static bool HasTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId); + static void DeleteTranscription(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_MsgId); + private: static void Process(); static void EnqueueRequest(std::shared_ptr p_Request); diff 
--git a/src/main.cpp b/src/main.cpp index 1a259aca..01636d0b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -684,6 +684,8 @@ void ShowHelp() " Alt-q jump to quoted/replied message\n" " Alt-r forward selected message\n" " Alt-s add/remove reaction on selected message\n" + " Alt-u transcribe selected audio message\n" + " Alt-U re-transcribe selected audio message\n" " Alt-w external message viewer\n" "\n" "Interactive Commands for Text Input:\n" diff --git a/src/transcribe b/src/transcribe new file mode 100755 index 00000000..a9e95346 --- /dev/null +++ b/src/transcribe @@ -0,0 +1,460 @@ +#!/usr/bin/env python3 +""" +nchat audio transcription script +Supports multiple Whisper backends: OpenAI API, whisper.cpp, Whisper Python +""" + +import argparse +import os +import sys +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional + +# Exit codes +EXIT_SUCCESS = 0 +EXIT_INVALID_ARGS = 1 +EXIT_FILE_ERROR = 2 +EXIT_SERVICE_ERROR = 3 +EXIT_TIMEOUT = 4 +EXIT_UNSUPPORTED_FORMAT = 5 + +# Supported audio formats +SUPPORTED_FORMATS = {'.ogg', '.oga', '.opus', '.mp3', '.m4a', '.wav', '.flac', '.webm'} + + +class TranscriptionService(ABC): + """Base class for transcription services""" + + @abstractmethod + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + RuntimeError: If transcription fails + """ + pass + + +class OpenAIWhisperService(TranscriptionService): + """OpenAI Whisper API service""" + + def __init__(self, api_key: Optional[str] = None): + """ + Initialize OpenAI service. + + Args: + api_key: OpenAI API key (defaults to OPENAI_API_KEY env var) + + Raises: + RuntimeError: If API key is not provided + """ + self.api_key = api_key or os.getenv('OPENAI_API_KEY') + if not self.api_key: + raise RuntimeError("OpenAI API key not set. 
Please set OPENAI_API_KEY environment variable.") + + try: + from openai import OpenAI + self.client = OpenAI(api_key=self.api_key) + except ImportError: + raise RuntimeError("OpenAI package not installed. Install with: pip install openai") + + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file using OpenAI Whisper API. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + FileNotFoundError: If audio file doesn't exist + RuntimeError: If API request fails + """ + if not os.path.isfile(audio_file): + raise FileNotFoundError(f"Audio file not found: {audio_file}") + + try: + with open(audio_file, "rb") as f: + # OpenAI API: if language is "auto", don't specify language parameter + # This allows Whisper to auto-detect + kwargs = { + "model": "whisper-1", + "file": f, + } + + # Only add language if not auto-detect + if language != "auto": + kwargs["language"] = language + + response = self.client.audio.transcriptions.create(**kwargs) + + return response.text.strip() + + except Exception as e: + # Handle API errors + error_msg = str(e) + if "authentication" in error_msg.lower() or "api_key" in error_msg.lower(): + raise RuntimeError(f"OpenAI API authentication failed: {error_msg}") + elif "rate_limit" in error_msg.lower(): + raise RuntimeError(f"OpenAI API rate limit exceeded: {error_msg}") + elif "invalid" in error_msg.lower() and "audio" in error_msg.lower(): + raise RuntimeError(f"Invalid audio file: {error_msg}") + else: + raise RuntimeError(f"OpenAI API error: {error_msg}") + + +class WhisperCppService(TranscriptionService): + """whisper.cpp HTTP server service""" + + def __init__(self, server_url: Optional[str] = None): + """ + Initialize whisper.cpp service. 
+ + Args: + server_url: URL of whisper.cpp server (defaults to WHISPER_CPP_SERVER env var or http://localhost:8080) + """ + self.server_url = server_url or os.getenv('WHISPER_CPP_SERVER', 'http://localhost:8080') + + try: + import requests + self.requests = requests + except ImportError: + raise RuntimeError("requests package not installed. Install with: pip install requests") + + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file using whisper.cpp HTTP server. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + FileNotFoundError: If audio file doesn't exist + RuntimeError: If server request fails + """ + if not os.path.isfile(audio_file): + raise FileNotFoundError(f"Audio file not found: {audio_file}") + + try: + with open(audio_file, 'rb') as f: + files = {'file': f} + data = {} + + # Add language if not auto-detect + if language != "auto": + data['language'] = language + + response = self.requests.post( + f"{self.server_url}/inference", + files=files, + data=data, + timeout=30 + ) + + if response.status_code != 200: + raise RuntimeError(f"Server returned error: {response.status_code} - {response.text}") + + result = response.json() + transcription = result.get('text', '') + + return transcription.strip() + + except self.requests.exceptions.ConnectionError: + raise RuntimeError(f"Could not connect to whisper.cpp server at {self.server_url}") + except self.requests.exceptions.Timeout: + raise RuntimeError(f"Request to whisper.cpp server timed out") + except Exception as e: + raise RuntimeError(f"whisper.cpp error: {str(e)}") + + +class WhisperLocalService(TranscriptionService): + """Local Whisper Python package service""" + + def __init__(self, model_size: str = "base"): + """ + Initialize local Whisper service. 
+ + Args: + model_size: Model size (tiny, base, small, medium, large) + """ + self.model_size = model_size + self.model = None + + try: + import whisper + self.whisper = whisper + except ImportError: + # Try faster-whisper as alternative + try: + from faster_whisper import WhisperModel + self.faster_whisper = WhisperModel + self.use_faster = True + except ImportError: + raise RuntimeError( + "Whisper package not installed. Install with: pip install openai-whisper " + "or pip install faster-whisper" + ) + else: + self.use_faster = False + + def transcribe(self, audio_file: str, language: str = "auto") -> str: + """ + Transcribe audio file using local Whisper. + + Args: + audio_file: Path to audio file + language: Language code or "auto" for detection + + Returns: + Transcribed text as string + + Raises: + FileNotFoundError: If audio file doesn't exist + RuntimeError: If transcription fails + """ + if not os.path.isfile(audio_file): + raise FileNotFoundError(f"Audio file not found: {audio_file}") + + try: + # Load model if not already loaded + if self.model is None: + if self.use_faster: + # Use faster-whisper (more efficient) + self.model = self.faster_whisper(self.model_size, device="cpu", compute_type="int8") + else: + # Use standard whisper + self.model = self.whisper.load_model(self.model_size) + + # Perform transcription + if self.use_faster: + # faster-whisper API + segments, info = self.model.transcribe( + audio_file, + language=None if language == "auto" else language + ) + transcription = " ".join([segment.text for segment in segments]) + else: + # standard whisper API + result = self.model.transcribe( + audio_file, + language=None if language == "auto" else language + ) + transcription = result['text'] + + return transcription.strip() + + except Exception as e: + raise RuntimeError(f"Local Whisper error: {str(e)}") + + +def detect_service() -> str: + """ + Auto-detect available transcription service. 
+ + Returns: + Service name (openai, whisper-cpp, or whisper-local) + + Priority: + 1. Environment variable WHISPER_SERVICE + 2. OpenAI API (if OPENAI_API_KEY is set) + 3. whisper.cpp server (if running on localhost:8080) + 4. Local Whisper (if installed) + """ + # Check environment variable override + env_service = os.getenv("WHISPER_SERVICE") + if env_service: + return env_service + + # Check for OpenAI API key + if os.getenv("OPENAI_API_KEY"): + return "openai" + + # Check for whisper.cpp server + try: + import requests + server_url = os.getenv('WHISPER_CPP_SERVER', 'http://localhost:8080') + response = requests.get(f"{server_url}/", timeout=1) + if response.status_code in [200, 404]: # Server is running + return "whisper-cpp" + except Exception: + pass + + # Check for local Whisper + try: + import whisper + return "whisper-local" + except ImportError: + try: + from faster_whisper import WhisperModel + return "whisper-local" + except ImportError: + pass + + # No service available + raise RuntimeError( + "No transcription service available. Please either:\n" + " 1. Set OPENAI_API_KEY environment variable for OpenAI API\n" + " 2. Start whisper.cpp server (see doc/TRANSCRIPTION-SETUP.md)\n" + " 3. Install local Whisper: pip install openai-whisper" + ) + + +def validate_audio_file(file_path: str) -> bool: + """ + Validate audio file exists and has supported format. 
def create_service(service_name: str, model: str = "base") -> TranscriptionService:
    """
    Build the transcription service object for a given service name.

    Args:
        service_name: One of "openai", "whisper-cpp" or "whisper-local"
        model: Model size, only passed to the local service
            (tiny, base, small, medium, large)

    Returns:
        Transcription service instance

    Raises:
        ValueError: If service name is invalid
        RuntimeError: If service initialization fails
    """
    # Name -> zero-argument constructor; only the local service takes the model.
    constructors = (
        ("openai", lambda: OpenAIWhisperService()),
        ("whisper-cpp", lambda: WhisperCppService()),
        ("whisper-local", lambda: WhisperLocalService(model_size=model)),
    )

    for known_name, construct in constructors:
        if service_name == known_name:
            return construct()

    raise ValueError(f"Unknown service: {service_name}")
def main():
    """
    Main entry point for transcription script.

    Parses the command line, validates the audio file, selects a service
    (auto-detecting when requested), prints the transcription to stdout and
    exits with one of the EXIT_* codes. Errors are printed to stderr.
    """

    parser = argparse.ArgumentParser(
        description="Transcribe audio files using Whisper",
        epilog="See doc/TRANSCRIPTION-SETUP.md for more information"
    )

    parser.add_argument(
        "-f", "--file",
        required=True,
        help="Audio file path (required)"
    )

    parser.add_argument(
        "-s", "--service",
        default="auto",
        choices=["auto", "openai", "whisper-cpp", "whisper-local"],
        help="Transcription service to use (default: auto)"
    )

    parser.add_argument(
        "-l", "--language",
        default="auto",
        help="Audio language code (en, es, fr, etc.) or 'auto' for detection (default: auto)"
    )

    parser.add_argument(
        "-m", "--model",
        default="base",
        choices=["tiny", "base", "small", "medium", "large"],
        help="Model size for local services (default: base)"
    )

    # NOTE(review): --timeout is accepted but not forwarded to any service in
    # this function; confirm whether the services should receive it.
    parser.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="Request timeout in seconds (default: 30)"
    )

    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s 1.0"
    )

    args = parser.parse_args()

    try:
        # Validate audio file. Only a ValueError raised here means
        # "unsupported format"; handle it locally so a ValueError from
        # service creation is not reported with the wrong exit code.
        try:
            validate_audio_file(args.file)
        except ValueError as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(EXIT_UNSUPPORTED_FORMAT)

        # Auto-detect service if needed
        service_name = args.service
        if service_name == "auto":
            service_name = detect_service()

        # Create service instance
        service = create_service(service_name, args.model)

        # Perform transcription
        transcription = service.transcribe(args.file, args.language)

        # Output result to stdout
        print(transcription)
        sys.exit(EXIT_SUCCESS)

    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(EXIT_FILE_ERROR)

    except ValueError as e:
        # Past file validation, a ValueError comes from service selection
        # (e.g. an unknown service name), so report it as a service error.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(EXIT_SERVICE_ERROR)

    except RuntimeError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(EXIT_SERVICE_ERROR)

    except KeyboardInterrupt:
        print("\nTranscription interrupted", file=sys.stderr)
        sys.exit(EXIT_TIMEOUT)

    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(EXIT_SERVICE_ERROR)
diff --git a/src/uihelpview.cpp b/src/uihelpview.cpp index f92d3604..8a6cd157 100644 --- a/src/uihelpview.cpp +++ b/src/uihelpview.cpp @@ -125,6 +125,9 @@ void UiHelpView::Draw() AppendHelpItem("save", "SaveFile", helpItems); AppendHelpItem("open_link", "OpenLink", helpItems); + AppendHelpItem("transcribe_audio", "Transcribe", helpItems); + AppendHelpItem("set_transcription_lang", "SetLang", helpItems); + AppendHelpItem("jump_quoted", "JumpQuoted", helpItems); AppendHelpItem("react", "AddReact", helpItems); AppendHelpItem("open_msg", "ExtView", helpItems); diff --git a/src/uihistoryview.cpp b/src/uihistoryview.cpp index 56174621..446e1a56 100644 --- a/src/uihistoryview.cpp +++ b/src/uihistoryview.cpp @@ -17,6 +17,7 @@ #include "uicolorconfig.h" #include "uiconfig.h" #include "uimodel.h" +#include "messagecache.h" UiHistoryView::UiHistoryView(const UiViewParams& p_Params) : UiViewBase(p_Params) @@ -71,6 +72,7 @@ void UiHistoryView::Draw() static std::wstring attachmentIndicator = StrUtil::ToWString(UiConfig::GetStr("attachment_indicator") + " "); static std::wstring quoteIndicator = L"> "; + static std::wstring transcriptionIndicator = L"[Transcribed] "; std::pair& currentChat = m_Model->GetCurrentChatLocked(); const bool emojiEnabled = m_Model->GetEmojiEnabledLocked(); @@ -171,6 +173,9 @@ void UiHistoryView::Draw() wlines.insert(wlines.begin(), quote); } + // Transcription lines counter (needs to be accessible in rendering loop) + int transcriptionLines = 0; + // File attachment if (!msg.fileInfo.empty()) { @@ -222,6 +227,67 @@ void UiHistoryView::Draw() std::wstring fileStr = attachmentIndicator + StrUtil::ToWString(fileName + fileStatus); wlines.insert(wlines.begin(), fileStr); + + // Transcription (if audio file and transcription available) + static const bool transcribeEnabled = UiConfig::GetBool("audio_transcribe_enabled"); + if (transcribeEnabled) + { + std::string ext = FileUtil::GetFileExt(fileInfo.filePath); + + // Remove leading dot if present + if 
(!ext.empty() && ext[0] == '.') + { + ext = ext.substr(1); + } + + static const std::set audioExtensions = { + "ogg", "opus", "mp3", "m4a", "wav", "flac", "oga", "webm" + }; + + if (audioExtensions.find(ext) != audioExtensions.end()) + { + std::string transcription = MessageCache::GetTranscription(currentChat.first, currentChat.second, msg.id); + if (!transcription.empty()) + { + StrUtil::SanitizeMessageStr(transcription); + if (!emojiEnabled) + { + transcription = StrUtil::Textize(transcription); + } + + const unsigned transcriptionWrapWidth = (m_PaddedW > 2) ? static_cast(m_PaddedW - 2) : 1u; + std::vector transcriptionWLines = + StrUtil::WordWrap(StrUtil::ToWString(transcription), transcriptionWrapWidth, false, false, false, 2); + + // Check if transcription exceeds max lines limit + static const int maxTranscriptionLines = UiConfig::GetNum("audio_transcribe_max_lines"); + const bool needsTruncation = (maxTranscriptionLines > 0) && + (static_cast(transcriptionWLines.size()) > maxTranscriptionLines); + + if (needsTruncation) + { + int hiddenLines = transcriptionWLines.size() - maxTranscriptionLines + 1; // +1 for truncation indicator line + transcriptionWLines.resize(maxTranscriptionLines - 1); // Reserve last line for indicator + std::wstring truncationMsg = L"... 
(" + std::to_wstring(hiddenLines) + L" more lines)"; + transcriptionWLines.push_back(truncationMsg); + } + + // Add transcription indicator on first line + if (!transcriptionWLines.empty()) + { + transcriptionWLines[0] = transcriptionIndicator + transcriptionWLines[0]; + } + + // Insert transcription lines after file attachment + for (auto tline = transcriptionWLines.rbegin(); tline != transcriptionWLines.rend(); ++tline) + { + wlines.insert(wlines.begin() + 1, *tline); + } + + transcriptionLines = transcriptionWLines.size(); + } + } + } } // Reactions @@ -319,6 +385,13 @@ void UiHistoryView::Draw() bool isAttachment = (wline->rfind(attachmentIndicator, 0) == 0); bool isQuote = (wline->rfind(quoteIndicator, 0) == 0); bool isReaction = (reactionLines == 1) && (std::distance(wline, wlines.rbegin()) == 0); + + // Transcription lines are at positions 1 to (1 + transcriptionLines - 1) in forward iteration + // In reverse, calculate the position from the end + size_t posFromEnd = std::distance(wlines.rbegin(), wline); + size_t vectorSize = wlines.size(); + size_t posFromBegin = vectorSize - 1 - posFromEnd; + bool isTranscription = (transcriptionLines > 0) && (posFromBegin >= 1) && (posFromBegin < 1 + static_cast(transcriptionLines)); if (isAttachment) { @@ -332,6 +405,10 @@ void UiHistoryView::Draw() { wattron(m_PaddedWin, attributeTextNormal | colorPairTextReaction); } + else if (isTranscription) + { + wattron(m_PaddedWin, attributeText | colorPairTextQuoted | A_DIM); + } else { wattron(m_PaddedWin, attributeText | colorPairText); @@ -352,6 +429,10 @@ void UiHistoryView::Draw() { wattroff(m_PaddedWin, attributeTextNormal | colorPairTextReaction); } + else if (isTranscription) + { + wattroff(m_PaddedWin, attributeText | colorPairTextQuoted | A_DIM); + } else { wattroff(m_PaddedWin, attributeText | colorPairText); diff --git a/src/uikeyconfig.cpp b/src/uikeyconfig.cpp index a01b3d75..70e8978d 100644 --- a/src/uikeyconfig.cpp +++ b/src/uikeyconfig.cpp @@ -232,6 +232,8 @@ 
void UiKeyConfig::Init(bool p_MapKeys) { "toggle_help", "KEY_CTRLG" }, { "toggle_list", "KEY_CTRLL" }, { "toggle_top", "KEY_CTRLP" }, + { "transcribe_audio", "\\33\\165" }, // alt/opt-u + { "set_transcription_lang", "\\33\\154" }, // alt/opt-l { "next_chat", "KEY_TAB" }, { "prev_chat", "KEY_BTAB" }, { "unread_chat", "KEY_CTRLF" }, diff --git a/src/uilanguagelistdialog.cpp b/src/uilanguagelistdialog.cpp new file mode 100644 index 00000000..23cbad31 --- /dev/null +++ b/src/uilanguagelistdialog.cpp @@ -0,0 +1,106 @@ +// uilanguagelistdialog.cpp +// +// Copyright (c) 2019-2025 Kristofer Berggren +// All rights reserved. +// +// nchat is distributed under the MIT license, see LICENSE for details. + +#include "uilanguagelistdialog.h" + +#include "strutil.h" + +UiLanguageListDialog::UiLanguageListDialog(const UiDialogParams& p_Params, const std::string& p_CurrentLanguage) + : UiListDialog(p_Params, false /*p_ShadeHidden*/) + , m_CurrentLanguage(p_CurrentLanguage) +{ + // Define common languages for transcription + m_Languages = { + {"", "Default (from global settings)"}, + {"auto", "Auto-detect"}, + {"en", "English"}, + {"es", "Spanish"}, + {"fr", "French"}, + {"de", "German"}, + {"it", "Italian"}, + {"pt", "Portuguese"}, + {"ru", "Russian"}, + {"uk", "Ukrainian"}, + {"zh", "Chinese"}, + {"ja", "Japanese"}, + {"ko", "Korean"}, + {"ar", "Arabic"}, + {"hi", "Hindi"}, + {"nl", "Dutch"}, + {"pl", "Polish"}, + {"tr", "Turkish"}, + {"sv", "Swedish"}, + {"no", "Norwegian"}, + {"da", "Danish"}, + {"fi", "Finnish"}, + }; + + UpdateList(); +} + +UiLanguageListDialog::~UiLanguageListDialog() +{ +} + +std::string UiLanguageListDialog::GetSelectedLanguage() +{ + return m_SelectedLanguage; +} + +void UiLanguageListDialog::OnSelect() +{ + if ((m_Index >= 0) && (m_Index < (int)m_FilteredIndices.size())) + { + int languageIndex = m_FilteredIndices[m_Index]; + m_SelectedLanguage = m_Languages[languageIndex].code; + m_Result = true; + m_Running = false; + } +} + +void 
UiLanguageListDialog::OnBack() +{ + m_Result = false; + m_Running = false; +} + +bool UiLanguageListDialog::OnTimer() +{ + return false; +} + +void UiLanguageListDialog::UpdateList() +{ + m_Items.clear(); + m_FilteredIndices.clear(); + + std::wstring filterStrLower = StrUtil::ToLower(m_FilterStr); + + for (size_t i = 0; i < m_Languages.size(); ++i) + { + const LanguageOption& lang = m_Languages[i]; + std::wstring displayName = StrUtil::ToWString(lang.name); + + // Add indicator if this is the current language + if (lang.code == m_CurrentLanguage) + { + displayName = L"* " + displayName; + } + + // Filter by name or code + std::wstring displayNameLower = StrUtil::ToLower(displayName); + std::wstring codeLower = StrUtil::ToLower(StrUtil::ToWString(lang.code)); + + if (filterStrLower.empty() || + (displayNameLower.find(filterStrLower) != std::wstring::npos) || + (codeLower.find(filterStrLower) != std::wstring::npos)) + { + m_Items.push_back(displayName); + m_FilteredIndices.push_back(i); + } + } +} diff --git a/src/uilanguagelistdialog.h b/src/uilanguagelistdialog.h new file mode 100644 index 00000000..e58dd057 --- /dev/null +++ b/src/uilanguagelistdialog.h @@ -0,0 +1,41 @@ +// uilanguagelistdialog.h +// +// Copyright (c) 2019-2025 Kristofer Berggren +// All rights reserved. +// +// nchat is distributed under the MIT license, see LICENSE for details. 
+ +#pragma once + +#include "uilistdialog.h" + +#include +#include + +class UiLanguageListDialog : public UiListDialog +{ +public: + UiLanguageListDialog(const UiDialogParams& p_Params, const std::string& p_CurrentLanguage = ""); + virtual ~UiLanguageListDialog(); + + std::string GetSelectedLanguage(); + +protected: + virtual void OnSelect(); + virtual void OnBack(); + virtual bool OnTimer(); + + void UpdateList(); + +private: + struct LanguageOption + { + std::string code; + std::string name; + }; + + std::vector m_Languages; + std::vector m_FilteredIndices; // Indices into m_Languages after filtering + std::string m_SelectedLanguage; + std::string m_CurrentLanguage; +}; diff --git a/src/uimodel.cpp b/src/uimodel.cpp index 93308003..f705f464 100644 --- a/src/uimodel.cpp +++ b/src/uimodel.cpp @@ -19,6 +19,7 @@ #include "clipboard.h" #include "fileutil.h" #include "log.h" +#include "messagecache.h" #include "numutil.h" #include "protocolutil.h" #include "sethelp.h" @@ -35,6 +36,7 @@ #include "uiemojilistdialog.h" #include "uifilelistdialog.h" #include "uikeyconfig.h" +#include "uilanguagelistdialog.h" #include "uikeyinput.h" #include "uimessagedialog.h" #include "uitextinputdialog.h" @@ -4277,6 +4279,186 @@ bool UiModel::Impl::AutoCompose() return rv; } +bool UiModel::Impl::TranscribeAudio() +{ + AnyUserKeyInput(); + + const std::string& profileId = m_CurrentChat.first; + const std::string& chatId = m_CurrentChat.second; + const std::vector& messageVec = m_MessageVec[profileId][chatId]; + const std::unordered_map& messages = m_Messages[profileId][chatId]; + + const int messageOffset = GetSelectMessageActive() ? m_MessageOffset[profileId][chatId] : 0; + const int editOffset = GetEditMessageActive() ? 
1 : 0; + const int offset = messageOffset + editOffset; + + auto it = std::next(messageVec.begin(), offset); + if (it == messageVec.end()) + { + LOG_WARNING("end of message history"); + return false; + } + + auto msgIt = messages.find(*it); + if (msgIt == messages.end()) + { + LOG_WARNING("message not found"); + return false; + } + + const ChatMessage& msg = msgIt->second; + const std::string& msgId = msg.id; + + // Check if message has a file attachment + if (msg.fileInfo.empty()) + { + LOG_DEBUG("message has no file attachment"); + LOG_WARNING("Selected message has no file attachment"); + return false; + } + + FileInfo fileInfo = ProtocolUtil::FileInfoFromHex(msg.fileInfo); + + // Check if file is downloaded + if (!IsAttachmentDownloaded(fileInfo)) + { + LOG_DEBUG("audio file not downloaded"); + LOG_WARNING("File not downloaded yet - select it to download first"); + return false; + } + + // Check if file is an audio file (based on extension) + const std::string filePath = fileInfo.filePath; + std::string ext = FileUtil::GetFileExt(filePath); + + // Remove leading dot if present + if (!ext.empty() && ext[0] == '.') + { + ext = ext.substr(1); + } + + static const std::set audioExtensions = { + "ogg", "opus", "mp3", "m4a", "wav", "flac", "oga", "webm" + }; + + if (audioExtensions.find(ext) == audioExtensions.end()) + { + LOG_DEBUG("file is not an audio file: %s", ext.c_str()); + LOG_WARNING("File is not an audio file (extension: %s). 
Supported: ogg, opus, mp3, m4a, wav, flac, oga, webm", ext.c_str()); + return false; + } + + // Get transcribe command + static const std::string cmdTemplate = []() + { + std::string transcribeCommand = UiConfig::GetStr("audio_transcribe_command"); + if (transcribeCommand.empty()) + { + transcribeCommand = FileUtil::DirName(FileUtil::GetSelfPath()) + + "/../" CMAKE_INSTALL_LIBEXECDIR "/nchat/transcribe -f '%1'"; + } + + return transcribeCommand; + }(); + + // Build and execute command + std::string transcription; + std::string cmd = cmdTemplate; + + // Escape single quotes in filePath before substituting into shell command + std::string escapedFilePath; + escapedFilePath.reserve(filePath.size() + 4); + for (char c : filePath) + { + if (c == '\'') { escapedFilePath += "'\\''"; } + else { escapedFilePath += c; } + } + StrUtil::ReplaceString(cmd, "%1", escapedFilePath); + + // First check per-chat language setting, then fall back to global setting + std::string language; + if (m_ChatInfos.count(profileId) && m_ChatInfos[profileId].count(chatId)) + { + language = m_ChatInfos[profileId][chatId].transcriptionLanguage; + } + + if (language.empty()) + { + language = UiConfig::GetStr("audio_transcribe_language"); + } + + if (language.empty()) + { + language = "auto"; + } + + // Validate language: only allow ISO 639 codes, "auto", or safe variants (no shell metacharacters) + const bool languageSafe = language.size() <= 20 && + std::all_of(language.begin(), language.end(), [](char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '-' || c == '_'; + }); + if (!languageSafe) + { + LOG_WARNING("invalid transcription language '%s', ignoring", language.c_str()); + language = "auto"; + } + + if (language != "auto") + { + cmd += " -l " + language; + } + + LOG_TRACE("transcribe cmd \"%s\" start", cmd.c_str()); + + const bool rv = RunCommand(cmd, &transcription); + + if (rv && !transcription.empty()) + { + if 
(!MessageCache::StoreTranscription(profileId, chatId, msgId, transcription)) + { + LOG_WARNING("failed to store transcription for msg %s (cache disabled?)", msgId.c_str()); + } + + // Update UI + UpdateHistory(); + return true; + } + else + { + LOG_WARNING("transcription failed"); + return false; + } +} + +std::string UiModel::Impl::GetCurrentTranscriptionLanguage(const std::string& p_ProfileId, + const std::string& p_ChatId) +{ + if (m_ChatInfos.count(p_ProfileId) && m_ChatInfos[p_ProfileId].count(p_ChatId)) + { + return m_ChatInfos[p_ProfileId][p_ChatId].transcriptionLanguage; + } + return ""; +} + +void UiModel::Impl::UpdateCurrentTranscriptionLanguage(const std::string& p_ProfileId, + const std::string& p_ChatId, + const std::string& p_Language) +{ + if (m_ChatInfos.count(p_ProfileId) && m_ChatInfos[p_ProfileId].count(p_ChatId)) + { + m_ChatInfos[p_ProfileId][p_ChatId].transcriptionLanguage = p_Language; + } + else + { + // Create chat info if it doesn't exist + ChatInfo chatInfo; + chatInfo.id = p_ChatId; + chatInfo.transcriptionLanguage = p_Language; + m_ChatInfos[p_ProfileId][p_ChatId] = chatInfo; + } +} + // --------------------------------------------------------------------- // UiModel // --------------------------------------------------------------------- @@ -4392,6 +4574,8 @@ void UiModel::KeyHandler(wint_t p_Key) static wint_t keyAutoCompose = UiKeyConfig::GetKey("auto_compose"); static wint_t keySelectMention = UiKeyConfig::GetKey("select_mention"); + static wint_t keyTranscribeAudio = UiKeyConfig::GetKey("transcribe_audio"); + static wint_t keySetTranscriptionLang = UiKeyConfig::GetKey("set_transcription_lang"); if (p_Key == keyTerminalResize) { @@ -4615,6 +4799,16 @@ void UiModel::KeyHandler(wint_t p_Key) { OnKeyAutoCompose(); } + else if (p_Key == keyTranscribeAudio) + { + LOG_DEBUG("transcribe_audio key pressed"); + OnKeyTranscribeAudio(); + } + else if (p_Key == keySetTranscriptionLang) + { + LOG_DEBUG("set_transcription_lang key pressed"); + 
OnKeySetTranscriptionLang(); + } else { std::unique_lock lock(m_ModelMutex); @@ -5534,6 +5728,102 @@ void UiModel::OnKeyPaste() } } +void UiModel::OnKeyTranscribeAudio() +{ + LOG_DEBUG("OnKeyTranscribeAudio called"); + + // Check if transcription is enabled (check every time, not static) + bool transcribeEnabled = UiConfig::GetBool("audio_transcribe_enabled"); + LOG_DEBUG("audio_transcribe_enabled = %d", transcribeEnabled); + + if (!transcribeEnabled) + { + LOG_DEBUG("transcription not enabled, showing dialog"); + MessageDialog("Warning", "Audio transcription not enabled.", 0.7, 5); + return; + } + + LOG_DEBUG("transcription enabled, checking prerequisites"); + + // Pre-req + { + std::unique_lock lock(m_ModelMutex); + if (!GetImpl().GetSelectMessageActive()) + { + MessageDialog("Info", "Please select a message first (press Up arrow).", 0.7, 5); + return; + } + if (GetImpl().GetEditMessageActive()) + { + MessageDialog("Info", "Cannot transcribe while editing a message.", 0.7, 5); + return; + } + } + + bool rv = false; + { + std::unique_lock lock(m_ModelMutex); + rv = GetImpl().TranscribeAudio(); + } + + if (!rv) + { + MessageDialog("Warning", "Transcription failed.", 0.7, 5); + } +} + +void UiModel::OnKeySetTranscriptionLang() +{ + // Get current chat info + std::string profileId; + std::string chatId; + std::string currentLanguage; + + { + std::unique_lock lock(m_ModelMutex); + + // Check if we have a valid current chat + if (GetImpl().GetCurrentChat().first.empty() || GetImpl().GetCurrentChat().second.empty()) + { + MessageDialog("Info", "Please select a chat first.", 0.7, 5); + return; + } + + profileId = GetImpl().GetCurrentChat().first; + chatId = GetImpl().GetCurrentChat().second; + + // Get current transcription language for this chat + currentLanguage = GetImpl().GetCurrentTranscriptionLanguage(profileId, chatId); + } + + // Open modal dialog without model mutex held + UiDialogParams params(this, "Set Transcription Language", 0.75, 0.65); + 
UiLanguageListDialog dialog(params, currentLanguage); + if (dialog.Run()) + { + std::string selectedLanguage = dialog.GetSelectedLanguage(); + + { + std::unique_lock lock(m_ModelMutex); + + // Update in-memory chat info + GetImpl().UpdateCurrentTranscriptionLanguage(profileId, chatId, selectedLanguage); + + // Persist to database + MessageCache::UpdateTranscriptionLanguage(profileId, chatId, selectedLanguage); + + GetImpl().ReinitView(); + } + + MessageDialog("Info", "Transcription language updated successfully.", 0.7, 5); + } + else + { + std::unique_lock lock(m_ModelMutex); + GetImpl().ReinitView(); + } +} + bool UiModel::IsAttachmentDownloaded(const FileInfo& p_FileInfo) { return UiModel::Impl::IsAttachmentDownloaded(p_FileInfo); diff --git a/src/uimodel.h b/src/uimodel.h index 76378666..7508db6d 100644 --- a/src/uimodel.h +++ b/src/uimodel.h @@ -190,6 +190,10 @@ class UiModel void HandleProtocolUiControlStart(); void HandleProtocolUiControlEnd(); bool AutoCompose(); + bool TranscribeAudio(); + std::string GetCurrentTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId); + void UpdateCurrentTranscriptionLanguage(const std::string& p_ProfileId, const std::string& p_ChatId, + const std::string& p_Language); static bool IsAttachmentDownloaded(const FileInfo& p_FileInfo); static bool IsAttachmentDownloadable(const FileInfo& p_FileInfo); @@ -415,6 +419,8 @@ class UiModel void OnKeyCut(); void OnKeyCopy(); void OnKeyPaste(); + void OnKeyTranscribeAudio(); + void OnKeySetTranscriptionLang(); private: Impl m_Impl;