From db0f8e599019b35fd5d190f19b50cb0a7dc8e302 Mon Sep 17 00:00:00 2001 From: StillHammer Date: Tue, 2 Dec 2025 09:44:06 +0800 Subject: [PATCH] refactor: Improve VAD trailing silence detection and update docs - Replace hang time logic with consecutive silence frame counter for more precise speech end detection - Update Whisper prompt to utilize previous context for better transcription coherence - Expand README with comprehensive feature list, architecture details, debugging status, and session logging structure - Add troubleshooting section for real-world testing conditions and known issues --- README.md | 334 +++++++++++++++++++++++++++++++------ config.json | 2 +- src/audio/AudioCapture.cpp | 29 ++-- src/audio/AudioCapture.h | 5 +- 4 files changed, 306 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 5b2f96a..83622ae 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,50 @@ Real-time Chinese to French translation system for live meetings. ## Overview -SecondVoice captures audio, transcribes Chinese speech using OpenAI's Whisper API, and translates it to French using Claude AI in real-time. Perfect for understanding Chinese meetings on the fly. +SecondVoice captures audio, transcribes Chinese speech using OpenAI's Whisper API (gpt-4o-mini-transcribe), and translates it to French using Claude AI in real-time. Designed for understanding Chinese meetings, calls, and conversations on the fly. + +### Why This Project? + +Built to solve a real need: understanding Chinese meetings in real-time without constant reliance on bilingual support. Perfect for: +- Business meetings with Chinese speakers +- Family/administrative calls +- Professional conferences +- Any live Chinese conversation where real-time comprehension is needed + +**Status**: MVP complete, actively being debugged and improved based on real-world usage. 
+ +## Quick Start + +### Windows (MinGW) - Recommended + +```batch +# First-time setup +.\setup_mingw.bat + +# Build +.\build_mingw.bat + +# Run +cd build\mingw-Release +SecondVoice.exe +``` + +**Requirements**: `.env` file with `OPENAI_API_KEY` and `ANTHROPIC_API_KEY`, plus a working microphone. + +See full setup instructions below for other platforms. ## Features -- ๐ŸŽค Real-time audio capture -- ๐Ÿ—ฃ๏ธ Chinese speech-to-text (Whisper API) -- ๐ŸŒ Chinese to French translation (Claude API) -- ๐Ÿ–ฅ๏ธ Clean ImGui interface -- ๐Ÿ’พ Full recording saved to disk -- โš™๏ธ Configurable chunk sizes and settings +- ๐ŸŽค **Real-time audio capture** with Voice Activity Detection (VAD) +- ๐Ÿ”‡ **Noise reduction** using RNNoise neural network +- ๐Ÿ—ฃ๏ธ **Chinese speech-to-text** via Whisper API (gpt-4o-mini-transcribe) +- ๐Ÿง  **Hallucination filtering** - removes known Whisper artifacts +- ๐ŸŒ **Chinese to French translation** via Claude AI (claude-haiku-4-20250514) +- ๐Ÿ–ฅ๏ธ **Clean ImGui interface** with adjustable VAD thresholds +- ๐Ÿ’พ **Full session recording** with structured logging +- ๐Ÿ“Š **Session archival** - audio, transcripts, translations, and metadata +- โšก **Opus compression** - 46x bandwidth reduction (16kHz PCM โ†’ 24kbps Opus) +- โš™๏ธ **Configurable settings** via config.json ## Requirements @@ -116,20 +150,138 @@ The application will: ## Architecture ``` -Audio Capture (PortAudio) +Audio Input (16kHz mono) โ†“ -Whisper API (Speech-to-Text) +Voice Activity Detection (VAD) - RMS + Peak thresholds โ†“ -Claude API (Translation) +Noise Reduction (RNNoise) - 16โ†’48โ†’16 kHz resampling โ†“ -ImGui UI (Display) +Opus Encoding (24kbps OGG) - 46x compression + โ†“ +Whisper API (gpt-4o-mini-transcribe) - Chinese STT + โ†“ +Hallucination Filter - Remove known artifacts + โ†“ +Claude API (claude-haiku-4) - Chinese โ†’ French translation + โ†“ +ImGui UI Display + Session Logging ``` -### Threading Model +### Threading Model (3 threads) -- **Thread 1**: Audio 
capture (PortAudio callback) -- **Thread 2**: AI processing (Whisper + Claude API calls) -- **Thread 3**: UI rendering (ImGui + OpenGL) +1. **Audio Thread** (`Pipeline::audioThread`) + - PortAudio callback captures 16kHz mono audio + - Applies VAD (Voice Activity Detection) using RMS + Peak thresholds + - Pushes speech chunks to processing queue + +2. **Processing Thread** (`Pipeline::processingThread`) + - Consumes audio chunks from queue + - Applies RNNoise denoising (upsampled to 48kHz โ†’ denoised โ†’ downsampled to 16kHz) + - Encodes to Opus/OGG for bandwidth efficiency + - Calls Whisper API for Chinese transcription + - Filters known hallucinations (YouTube phrases, music markers, etc.) + - Calls Claude API for French translation + - Logs to session files + +3. **UI Thread** (main) + - GLFW/ImGui rendering loop (must run on main thread) + - Displays real-time transcription and translation + - Allows runtime VAD threshold adjustment + - Handles user controls (stop recording, etc.) + +### Core Components + +**Audio Processing**: +- `AudioCapture.cpp` - PortAudio wrapper with VAD-based segmentation +- `AudioBuffer.cpp` - Accumulates samples, exports WAV/Opus +- `NoiseReducer.cpp` - RNNoise denoising with resampling + +**API Clients**: +- `WhisperClient.cpp` - OpenAI Whisper API (multipart/form-data) +- `ClaudeClient.cpp` - Anthropic Claude API (JSON) +- `WinHttpClient.cpp` - Native Windows HTTP client (replaced libcurl) + +**Core Logic**: +- `Pipeline.cpp` - Orchestrates audio โ†’ transcription โ†’ translation flow +- `TranslationUI.cpp` - ImGui interface with VAD controls + +**Utilities**: +- `Config.cpp` - Loads config.json + .env +- `ThreadSafeQueue.h` - Lock-free queue for audio chunks + +## Known Issues & Active Debugging + +**Status**: Real-world testing has identified issues with degraded audio conditions (see `PLAN_DEBUG.md` for details). + +### Current Problems + +Based on transcript analysis from actual meetings (November 2025): + +1. 
**VAD cutting speech too early** + - Voice Activity Detection triggers end-of-segment prematurely + - Results in fragmented phrases ("ๆˆ‘ๅพˆใ€‚" โ†’ "Je suis.") + - **Hypothesis**: Silence threshold too aggressive for multi-speaker scenarios + +2. **Segments too short for context** + - Whisper receives insufficient audio context for accurate Chinese transcription + - Single-word or two-word segments lack conversational context + - **Impact**: Lower accuracy, especially with homonyms + +3. **Ambient noise interpreted as speech** + - Background sounds trigger false VAD positives + - Test transcript shows "ๅคชๅคšๅฃฐ้Ÿณไบ†" (too much noise) being captured + - **Mitigation**: RNNoise helps but not sufficient for very noisy environments + +4. **Loss of inter-segment context** + - Each audio chunk processed independently + - Whisper cannot use previous context for better transcription + - **Potential solution**: Pass previous 2-3 transcriptions in prompt + +### Test Conditions + +Testing has been performed under **deliberately degraded conditions** to ensure robustness: +- Multiple simultaneous speakers +- Variable microphone distance +- Variable volume levels +- Fast-paced conversations +- Low-quality microphone + +These conditions are intentionally harsh to validate real-world meeting scenarios. + +### Debug Plan + +See `PLAN_DEBUG.md` for: +- Detailed session logging implementation (JSON per segment + metadata) +- Improved Whisper prompt engineering +- VAD threshold tuning recommendations +- Context propagation strategies + +## Session Logging + +### Structure + +``` +sessions/ +โ””โ”€โ”€ YYYY-MM-DD_HHMMSS/ + โ”œโ”€โ”€ session.json # Session metadata + โ”œโ”€โ”€ segments/ + โ”‚ โ”œโ”€โ”€ 001.json # Segment: Chinese + French + metadata + โ”‚ โ”œโ”€โ”€ 002.json + โ”‚ โ””โ”€โ”€ ... 
+ โ””โ”€โ”€ transcript.txt # Final export +``` + +### Segment Format + +```json +{ + "id": 1, + "chinese": "ไธคไธช่€้ผ ๆฑ‚ๆˆ‘", + "french": "Deux souris me supplient" +} +``` + +**Future enhancements**: Audio duration, RMS levels, timestamps, Whisper/Claude latencies per segment. ## Configuration @@ -143,8 +295,9 @@ ImGui UI (Display) "chunk_duration_seconds": 10 }, "whisper": { - "model": "whisper-1", - "language": "zh" + "model": "gpt-4o-mini-transcribe", + "language": "zh", + "prompt": "Transcription d'une rรฉunion en chinois mandarin. Plusieurs interlocuteurs. Ne transcris PAS : musique, silence, bruits de fond. Si l'audio est inaudible, renvoie une chaรฎne vide. Noms possibles: Tingting, Alexis." }, "claude": { "model": "claude-haiku-4-20250514", @@ -166,23 +319,33 @@ ANTHROPIC_API_KEY=sk-ant-... - **Claude Haiku**: ~$0.03-0.05/hour - **Total**: ~$0.40/hour of recording -## Project Structure +## Advanced Features -``` -secondvoice/ -โ”œโ”€โ”€ src/ -โ”‚ โ”œโ”€โ”€ main.cpp # Entry point -โ”‚ โ”œโ”€โ”€ audio/ # Audio capture & buffer -โ”‚ โ”œโ”€โ”€ api/ # Whisper & Claude clients -โ”‚ โ”œโ”€โ”€ ui/ # ImGui interface -โ”‚ โ”œโ”€โ”€ utils/ # Config & thread-safe queue -โ”‚ โ””โ”€โ”€ core/ # Pipeline orchestration -โ”œโ”€โ”€ docs/ # Documentation -โ”œโ”€โ”€ recordings/ # Output recordings -โ”œโ”€โ”€ config.json # Runtime configuration -โ”œโ”€โ”€ .env # API keys (not committed) -โ””โ”€โ”€ CMakeLists.txt # Build configuration -``` +### GPU Forcing (Hybrid Graphics Systems) + +`main.cpp` exports symbols to force dedicated GPU on Optimus/PowerXpress systems: +- `NvOptimusEnablement` - Forces NVIDIA GPU +- `AmdPowerXpressRequestHighPerformance` - Forces AMD GPU + +Critical for laptops with both integrated and dedicated GPUs. 
+ +### Hallucination Filtering + +`Pipeline.cpp` maintains an extensive list (~65 patterns) of known Whisper hallucinations: +- YouTube phrases: "Thank you for watching", "Subscribe", "Like and comment" +- Chinese video endings: "่ฐข่ฐข่ง‚็œ‹", "ๅ†่ง", "่ฎข้˜…ๆˆ‘็š„้ข‘้“" +- Music symbols: "โ™ชโ™ช", "๐ŸŽต" +- Silence markers: "...", "silence", "inaudible" + +These are automatically filtered before translation to avoid wasting API calls. + +### Console-Only Build + +A `SecondVoice_Console` target exists for headless testing: +- Uses `main_console.cpp` +- No ImGui/GLFW dependencies +- Outputs transcriptions to stdout +- Useful for debugging and automated testing ## Development @@ -219,30 +382,101 @@ cmake --build build - Check all system dependencies are installed - Try `cmake --build build --clean-first` +## Project Structure + +``` +secondvoice/ +โ”œโ”€โ”€ src/ +โ”‚ โ”œโ”€โ”€ main.cpp # Entry point, forces NVIDIA GPU +โ”‚ โ”œโ”€โ”€ core/ +โ”‚ โ”‚ โ””โ”€โ”€ Pipeline.cpp # Audioโ†’Transcriptionโ†’Translation orchestration +โ”‚ โ”œโ”€โ”€ audio/ +โ”‚ โ”‚ โ”œโ”€โ”€ AudioCapture.cpp # PortAudio + VAD segmentation +โ”‚ โ”‚ โ”œโ”€โ”€ AudioBuffer.cpp # Sample accumulation, WAV/Opus export +โ”‚ โ”‚ โ””โ”€โ”€ NoiseReducer.cpp # RNNoise (16โ†’48โ†’16 kHz) +โ”‚ โ”œโ”€โ”€ api/ +โ”‚ โ”‚ โ”œโ”€โ”€ WhisperClient.cpp # OpenAI Whisper (multipart/form-data) +โ”‚ โ”‚ โ”œโ”€โ”€ ClaudeClient.cpp # Anthropic Claude (JSON) +โ”‚ โ”‚ โ””โ”€โ”€ WinHttpClient.cpp # Native Windows HTTP +โ”‚ โ”œโ”€โ”€ ui/ +โ”‚ โ”‚ โ””โ”€โ”€ TranslationUI.cpp # ImGui interface + VAD controls +โ”‚ โ””โ”€โ”€ utils/ +โ”‚ โ”œโ”€โ”€ Config.cpp # config.json + .env loader +โ”‚ โ””โ”€โ”€ ThreadSafeQueue.h # Lock-free audio queue +โ”œโ”€โ”€ docs/ # Build guides +โ”œโ”€โ”€ sessions/ # Session recordings + logs +โ”œโ”€โ”€ recordings/ # Legacy recordings directory +โ”œโ”€โ”€ denoised/ # Denoised audio outputs +โ”œโ”€โ”€ config.json # Runtime configuration +โ”œโ”€โ”€ .env # API keys (not committed) +โ”œโ”€โ”€ CLAUDE.md # 
Development guide for Claude Code +โ”œโ”€โ”€ PLAN_DEBUG.md # Active debugging plan +โ””โ”€โ”€ CMakeLists.txt # Build configuration +``` + +### External Dependencies + +**Fetched via CMake FetchContent**: +- ImGui v1.90.1 - UI framework +- Opus v1.5.2 - Audio encoding +- Ogg v1.3.6 - Container format +- RNNoise v0.1.1 - Neural network noise reduction + +**vcpkg Dependencies** (x64-mingw-static triplet): +- portaudio - Cross-platform audio I/O +- nlohmann_json - JSON parsing +- glfw3 - Windowing/input +- glad - OpenGL loader + ## Roadmap -### Phase 1 - MVP (Current) -- โœ… Audio capture -- โœ… Whisper integration -- โœ… Claude integration -- โœ… ImGui UI -- โœ… Stop button +### Phase 1 - MVP โœ… (Complete) +- โœ… Audio capture with VAD +- โœ… Noise reduction (RNNoise) +- โœ… Whisper API integration +- โœ… Claude API integration +- โœ… ImGui UI with runtime VAD adjustment +- โœ… Opus compression +- โœ… Hallucination filtering +- โœ… Session recording -### Phase 2 - Enhancement -- โฌœ Auto-summary post-meeting -- โฌœ Export transcripts -- โฌœ Search functionality +### Phase 2 - Debugging ๐Ÿ”„ (Current) +- ๐Ÿ”„ Session logging (JSON per segment) +- ๐Ÿ”„ Improved Whisper prompt engineering +- ๐Ÿ”„ VAD threshold optimization +- ๐Ÿ”„ Context propagation between segments +- โฌœ Automated testing with sample audio + +### Phase 3 - Enhancement +- โฌœ Auto-summary post-meeting (Claude analysis) +- โฌœ Full-text search (SQLite FTS5) +- โฌœ Semantic search (embeddings) - โฌœ Speaker diarization -- โฌœ Replay mode +- โฌœ Replay mode with synced transcripts +- โฌœ Multi-language support extension + +## Development Documentation + +- **CLAUDE.md** - Development guide for Claude Code AI assistant +- **PLAN_DEBUG.md** - Active debugging plan with identified issues and solutions +- **WINDOWS_BUILD.md** - Detailed Windows build instructions +- **WINDOWS_MINGW.md** - MinGW-specific build guide +- **WINDOWS_QUICK_START.md** - Quick start for Windows users + +## Contributing + +This is a 
personal project built to solve a real need. Bug reports and suggestions welcome: + +**Known issues**: See `PLAN_DEBUG.md` for current debugging efforts +**Architecture**: See `CLAUDE.md` for detailed system design ## License See LICENSE file. -## Contributing +## Acknowledgments -This is a personal project, but suggestions and bug reports are welcome via issues. - -## Contact - -See docs/SecondVoice.md for project context and motivation. +- OpenAI Whisper for excellent Chinese transcription +- Anthropic Claude for context-aware translation +- RNNoise for neural network-based noise reduction +- ImGui for clean, immediate-mode UI diff --git a/config.json b/config.json index b9115f1..6a706d2 100644 --- a/config.json +++ b/config.json @@ -10,7 +10,7 @@ "model": "gpt-4o-mini-transcribe", "language": "zh", "temperature": 0.0, - "prompt": "Transcription en direct d'une conversation en chinois mandarin. Plusieurs interlocuteurs parlent, parfois en mรชme temps. RรˆGLES STRICTES: (1) Ne transcris QUE les paroles audibles en chinois. (2) Si l'audio est inaudible, du bruit, ou du silence, renvoie une chaรฎne vide. (3) NE Gร‰NรˆRE JAMAIS ces phrases: ่ฐข่ฐข่ง‚็œ‹, ๆ„Ÿ่ฐขๆ”ถ็œ‹, ่ฎข้˜…, ่ฏท่ฎข้˜…, ไธ‹ๆœŸๅ†่ง, Thank you, Subscribe, ๅญ—ๅน•. (4) Ignore: musique, applaudissements, rires, bruits de fond, respirations.", + "prompt": "Transcription en direct d'une conversation en chinois mandarin. Plusieurs interlocuteurs parlent, parfois en mรชme temps. Si un contexte de phrases prรฉcรฉdentes est fourni, utilise-le pour maintenir la cohรฉrence (noms propres, sujets, terminologie). RรˆGLES STRICTES: (1) Ne transcris QUE les paroles audibles en chinois. (2) Si l'audio est inaudible, du bruit, ou du silence, renvoie une chaรฎne vide. (3) NE Gร‰NรˆRE JAMAIS ces phrases: ่ฐข่ฐข่ง‚็œ‹, ๆ„Ÿ่ฐขๆ”ถ็œ‹, ่ฎข้˜…, ่ฏท่ฎข้˜…, ไธ‹ๆœŸๅ†่ง, Thank you, Subscribe, ๅญ—ๅน•. 
(4) Ignore: musique, applaudissements, rires, bruits de fond, respirations.", "stream": false, "response_format": "text" }, diff --git a/src/audio/AudioCapture.cpp b/src/audio/AudioCapture.cpp index b384df7..8d2c62c 100644 --- a/src/audio/AudioCapture.cpp +++ b/src/audio/AudioCapture.cpp @@ -135,16 +135,12 @@ int AudioCapture::audioCallback(const void* input, void* output, // Speech = energy OK AND (ZCR OK or very high energy) bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f); - // Hang time logic: don't immediately cut on silence + // Reset trailing silence counter when speech detected if (frame_has_speech) { - self->hang_frames_ = self->hang_frames_threshold_; // Reset hang counter - } else if (self->hang_frames_ > 0) { - self->hang_frames_--; - frame_has_speech = true; // Keep "speaking" during hang time + self->consecutive_silence_frames_ = 0; } // Calculate durations in samples - int silence_samples_threshold = (self->silence_duration_ms_ * self->sample_rate_ * self->channels_) / 1000; int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000; int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000; @@ -183,16 +179,17 @@ int AudioCapture::audioCallback(const void* input, void* output, } self->speech_buffer_.clear(); self->speech_samples_count_ = 0; + self->consecutive_silence_frames_ = 0; // Reset after forced flush // Reset stream for next segment if (self->noise_reducer_) { self->noise_reducer_->resetStream(); } } } else { - // True silence (after hang time expired) + // Silence detected self->silence_samples_count_ += sample_count; - // If we were speaking and now have enough silence, flush + // If we were speaking and now have silence, track consecutive silence frames if (self->speech_buffer_.size() > 0) { // Add trailing silence (denoised) if (!denoised_samples.empty()) { @@ -204,7 +201,16 @@ int 
AudioCapture::audioCallback(const void* input, void* output, } } - if (self->silence_samples_count_ >= silence_samples_threshold) { + // Increment consecutive silence frame counter + self->consecutive_silence_frames_++; + + // Calculate threshold in frames (callbacks) + // frames_per_buffer = frame_count from callback + int frames_per_buffer = static_cast<int>(frame_count); + int silence_threshold_frames = (self->silence_duration_ms_ * self->sample_rate_) / (1000 * frames_per_buffer); + + // Flush when consecutive silence exceeds threshold + if (self->consecutive_silence_frames_ >= silence_threshold_frames) { self->is_speech_active_.store(false, std::memory_order_relaxed); // Flush if we have enough speech @@ -220,7 +226,9 @@ int AudioCapture::audioCallback(const void* input, void* output, float duration = static_cast<float>(self->speech_buffer_.size()) / (self->sample_rate_ * self->channels_); - std::cout << "[VAD] Speech ended (noise_floor=" << self->noise_floor_ + std::cout << "[VAD] Speech ended (trailing silence detected, " + << self->consecutive_silence_frames_ << " frames, " + << "noise_floor=" << self->noise_floor_ << "), flushing " << duration << "s (denoised)" << std::endl; if (self->callback_) { @@ -233,6 +241,7 @@ int AudioCapture::audioCallback(const void* input, void* output, self->speech_buffer_.clear(); self->speech_samples_count_ = 0; + self->consecutive_silence_frames_ = 0; // Reset after flush // Reset stream for next segment if (self->noise_reducer_) { self->noise_reducer_->resetStream(); diff --git a/src/audio/AudioCapture.h b/src/audio/AudioCapture.h index 5a004e9..ff1eba8 100644 --- a/src/audio/AudioCapture.h +++ b/src/audio/AudioCapture.h @@ -77,9 +77,8 @@ private: float noise_floor_ = 0.005f; // Estimated background noise level float noise_floor_alpha_ = 0.001f; // Slower adaptation - // Hang time - wait before cutting to avoid mid-sentence cuts - int hang_frames_ = 0; - int hang_frames_threshold_ = 35; // ~350ms tolerance for pauses (was 20) + // 
Trailing silence detection - count consecutive silence frames after speech + int consecutive_silence_frames_ = 0; // Zero-crossing rate for speech vs noise discrimination float last_zcr_ = 0.0f;