From 5b60acaa738daa4fa3df6ea8f197902277d70e89 Mon Sep 17 00:00:00 2001 From: StillHammer Date: Thu, 20 Nov 2025 03:08:03 +0800 Subject: [PATCH] feat: Implement complete MVP architecture for SecondVoice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete implementation of the real-time Chinese-to-French translation system: Architecture: - 3-threaded pipeline: Audio capture → AI processing → UI rendering - Thread-safe queues for inter-thread communication - Configurable audio chunk sizes for latency tuning Core Features: - Audio capture with PortAudio (configurable sample rate/channels) - Whisper API integration for Chinese speech-to-text - Claude API integration for Chinese-to-French translation - ImGui real-time display with stop button - Full recording saved to WAV on stop Modules Implemented: - audio/: AudioCapture (PortAudio wrapper) + AudioBuffer (WAV export) - api/: WhisperClient + ClaudeClient (HTTP API wrappers) - ui/: TranslationUI (ImGui interface) - core/: Pipeline (orchestrates all threads) - utils/: Config (JSON/.env loader) + ThreadSafeQueue (template) Build System: - CMake with vcpkg for dependency management - vcpkg.json manifest for reproducible builds - build.sh helper script Configuration: - config.json: Audio settings, API parameters, UI config - .env: API keys (OpenAI + Anthropic) Documentation: - README.md: Setup instructions, usage, architecture - docs/implementation_plan.md: Technical design document - docs/SecondVoice.md: Project vision and motivation Next Steps: - Test build with vcpkg dependencies - Test audio capture on real hardware - Validate API integrations - Tune chunk size for optimal latency 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .env.example | 5 + .gitignore | 45 ++++ CMakeLists.txt | 67 +++++ README.md | 223 ++++++++++++++++ build.sh | 50 ++++ config.json | 29 +++ docs/implementation_plan.md | 494 ++++++++++++++++++++++++++++++++++++ recordings/.gitkeep | 0 src/api/ClaudeClient.cpp | 79 ++++++ src/api/ClaudeClient.h | 30 +++ src/api/WhisperClient.cpp | 85 +++++++ src/api/WhisperClient.h | 30 +++ src/audio/AudioBuffer.cpp | 76 ++++++ src/audio/AudioBuffer.h | 27 ++ src/audio/AudioCapture.cpp | 110 ++++++++ src/audio/AudioCapture.h | 39 +++ src/core/Pipeline.cpp | 214 ++++++++++++++++ src/core/Pipeline.h | 59 +++++ src/main.cpp | 56 ++++ src/ui/TranslationUI.cpp | 160 ++++++++++++ src/ui/TranslationUI.h | 47 ++++ src/utils/Config.cpp | 106 ++++++++ src/utils/Config.h | 69 +++++ src/utils/ThreadSafeQueue.h | 65 +++++ vcpkg.json | 15 ++ 25 files changed, 2180 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 README.md create mode 100644 build.sh create mode 100644 config.json create mode 100644 docs/implementation_plan.md create mode 100644 recordings/.gitkeep create mode 100644 src/api/ClaudeClient.cpp create mode 100644 src/api/ClaudeClient.h create mode 100644 src/api/WhisperClient.cpp create mode 100644 src/api/WhisperClient.h create mode 100644 src/audio/AudioBuffer.cpp create mode 100644 src/audio/AudioBuffer.h create mode 100644 src/audio/AudioCapture.cpp create mode 100644 src/audio/AudioCapture.h create mode 100644 src/core/Pipeline.cpp create mode 100644 src/core/Pipeline.h create mode 100644 src/main.cpp create mode 100644 src/ui/TranslationUI.cpp create mode 100644 src/ui/TranslationUI.h create mode 100644 src/utils/Config.cpp create mode 100644 src/utils/Config.h create mode 100644 src/utils/ThreadSafeQueue.h create mode 100644 vcpkg.json diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..4ad1b5c --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +# OpenAI API Key (for Whisper) +OPENAI_API_KEY=sk-... + +# Anthropic API Key (for Claude) +ANTHROPIC_API_KEY=sk-ant-... diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8be1a01 --- /dev/null +++ b/.gitignore @@ -0,0 +1,45 @@ +# Build directories +build/ +cmake-build-*/ +out/ + +# vcpkg +vcpkg_installed/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Compiled files +*.o +*.a +*.so +*.exe + +# Environment and secrets +.env + +# Recordings (generated at runtime) +recordings/*.wav +recordings/*.mp3 +recordings/*.txt +recordings/*.md +recordings/*.json +!recordings/.gitkeep + +# Logs +*.log + +# OS +.DS_Store +Thumbs.db + +# CMake +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake +Makefile +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..966f7ac --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,67 @@ +cmake_minimum_required(VERSION 3.20) +project(SecondVoice VERSION 0.1.0 LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Find packages +find_package(portaudio CONFIG REQUIRED) +find_package(httplib CONFIG REQUIRED) +find_package(nlohmann_json CONFIG REQUIRED) +find_package(imgui CONFIG REQUIRED) +find_package(glfw3 CONFIG REQUIRED) +find_package(OpenGL REQUIRED) + +# Source files +set(SOURCES + src/main.cpp + # Audio module + src/audio/AudioCapture.cpp + src/audio/AudioBuffer.cpp + # API clients + src/api/WhisperClient.cpp + src/api/ClaudeClient.cpp + # UI + src/ui/TranslationUI.cpp + # Utils + src/utils/Config.cpp + # Core + src/core/Pipeline.cpp +) + +# Executable +add_executable(${PROJECT_NAME} ${SOURCES}) + +# Include directories +target_include_directories(${PROJECT_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src +) + +# Link libraries +target_link_libraries(${PROJECT_NAME} PRIVATE + portaudio + httplib::httplib + nlohmann_json::nlohmann_json + imgui::imgui + glfw + OpenGL::GL +) + +# Compiler options +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + target_compile_options(${PROJECT_NAME} PRIVATE + -Wall + -Wextra + -Wpedantic + -Werror + ) +endif() + +# Copy config files to build directory +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.json + ${CMAKE_CURRENT_BINARY_DIR}/config.json COPYONLY) + +# Install target +install(TARGETS ${PROJECT_NAME} DESTINATION bin) +install(FILES config.json DESTINATION bin) diff --git a/README.md b/README.md new file mode 100644 index 0000000..a9aa367 --- /dev/null +++ b/README.md @@ -0,0 +1,223 @@ +# SecondVoice + +Real-time Chinese to French translation system for live meetings. + +## Overview + +SecondVoice captures audio, transcribes Chinese speech using OpenAI's Whisper API, and translates it to French using Claude AI in real-time. Perfect for understanding Chinese meetings on the fly. + +## Features + +- 🎤 Real-time audio capture +- 🗣️ Chinese speech-to-text (Whisper API) +- 🌐 Chinese to French translation (Claude API) +- 🖥️ Clean ImGui interface +- 💾 Full recording saved to disk +- ⚙️ Configurable chunk sizes and settings + +## Requirements + +### System Dependencies (Linux) + +```bash +# PortAudio +sudo apt install libasound2-dev + +# OpenGL +sudo apt install libgl1-mesa-dev libglu1-mesa-dev +``` + +### vcpkg + +Install vcpkg if not already installed: + +```bash +git clone https://github.com/microsoft/vcpkg.git +cd vcpkg +./bootstrap-vcpkg.sh +export VCPKG_ROOT=$(pwd) +``` + +## Setup + +1. **Clone the repository** + +```bash +git clone +cd secondvoice +``` + +2. **Create `.env` file** (copy from `.env.example`) + +```bash +cp .env.example .env +# Edit .env and add your API keys: +# OPENAI_API_KEY=sk-... +# ANTHROPIC_API_KEY=sk-ant-... +``` + +3. **Configure settings** (optional) + +Edit `config.json` to customize: +- Audio chunk duration (default: 10s) +- Sample rate (default: 16kHz) +- UI window size +- Output directory + +4. **Build the project** + +```bash +# Configure with vcpkg +cmake -B build -DCMAKE_TOOLCHAIN_FILE=$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake + +# Build +cmake --build build -j$(nproc) +``` + +## Usage + +```bash +cd build +./SecondVoice +``` + +The application will: +1. Open an ImGui window +2. Start capturing audio from your microphone +3. Display Chinese transcriptions and French translations in real-time +4. Click **STOP RECORDING** button to finish +5. Save the full audio recording to `recordings/recording_YYYYMMDD_HHMMSS.wav` + +## Architecture + +``` +Audio Capture (PortAudio) + ↓ +Whisper API (Speech-to-Text) + ↓ +Claude API (Translation) + ↓ +ImGui UI (Display) +``` + +### Threading Model + +- **Thread 1**: Audio capture (PortAudio callback) +- **Thread 2**: AI processing (Whisper + Claude API calls) +- **Thread 3**: UI rendering (ImGui + OpenGL) + +## Configuration + +### config.json + +```json +{ + "audio": { + "sample_rate": 16000, + "channels": 1, + "chunk_duration_seconds": 10 + }, + "whisper": { + "model": "whisper-1", + "language": "zh" + }, + "claude": { + "model": "claude-haiku-4-20250514", + "max_tokens": 1024 + } +} +``` + +### .env + +```env +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... +``` + +## Cost Estimation + +- **Whisper**: ~$0.006/minute (~$0.36/hour) +- **Claude Haiku**: ~$0.03-0.05/hour +- **Total**: ~$0.40/hour of recording + +## Project Structure + +``` +secondvoice/ +├── src/ +│ ├── main.cpp # Entry point +│ ├── audio/ # Audio capture & buffer +│ ├── api/ # Whisper & Claude clients +│ ├── ui/ # ImGui interface +│ ├── utils/ # Config & thread-safe queue +│ └── core/ # Pipeline orchestration +├── docs/ # Documentation +├── recordings/ # Output recordings +├── config.json # Runtime configuration +├── .env # API keys (not committed) +└── CMakeLists.txt # Build configuration +``` + +## Development + +### Building in Debug Mode + +```bash +cmake -B build -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake +cmake --build build +``` + +### Running Tests + +```bash +# TODO: Add tests +``` + +## Troubleshooting + +### No audio capture + +- Check microphone permissions +- Verify PortAudio is properly installed: `pa_devs` (if available) +- Try different audio device in code + +### API errors + +- Verify API keys in `.env` are correct +- Check internet connection +- Monitor API rate limits + +### Build errors + +- Ensure vcpkg is properly set up +- Check all system dependencies are installed +- Try `cmake --build build --clean-first` + +## Roadmap + +### Phase 1 - MVP (Current) +- ✅ Audio capture +- ✅ Whisper integration +- ✅ Claude integration +- ✅ ImGui UI +- ✅ Stop button + +### Phase 2 - Enhancement +- ⬜ Auto-summary post-meeting +- ⬜ Export transcripts +- ⬜ Search functionality +- ⬜ Speaker diarization +- ⬜ Replay mode + +## License + +See LICENSE file. + +## Contributing + +This is a personal project, but suggestions and bug reports are welcome via issues. + +## Contact + +See docs/SecondVoice.md for project context and motivation. diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..f66fb90 --- /dev/null +++ b/build.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Build script for SecondVoice + +set -e + +# Colors +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +echo "SecondVoice Build Script" +echo "=======================" +echo "" + +# Check if vcpkg is set +if [ -z "$VCPKG_ROOT" ]; then + echo -e "${RED}Error: VCPKG_ROOT not set${NC}" + echo "Please install vcpkg and set VCPKG_ROOT environment variable:" + echo " git clone https://github.com/microsoft/vcpkg.git" + echo " cd vcpkg && ./bootstrap-vcpkg.sh" + echo " export VCPKG_ROOT=\$(pwd)" + exit 1 +fi + +echo -e "${GREEN}vcpkg found at: $VCPKG_ROOT${NC}" +echo "" + +# Check if .env exists +if [ ! -f ".env" ]; then + echo -e "${RED}Warning: .env file not found${NC}" + echo "Please create .env from .env.example and add your API keys" + echo " cp .env.example .env" + echo "" +fi + +# Configure +echo "Configuring CMake..." +cmake -B build -DCMAKE_TOOLCHAIN_FILE=$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake + +# Build +echo "" +echo "Building..." +cmake --build build -j$(nproc) + +echo "" +echo -e "${GREEN}Build successful!${NC}" +echo "" +echo "To run the application:" +echo " cd build" +echo " ./SecondVoice" diff --git a/config.json b/config.json new file mode 100644 index 0000000..ce1da5d --- /dev/null +++ b/config.json @@ -0,0 +1,29 @@ +{ + "audio": { + "sample_rate": 16000, + "channels": 1, + "chunk_duration_seconds": 10, + "format": "wav" + }, + "whisper": { + "model": "whisper-1", + "language": "zh", + "temperature": 0.0 + }, + "claude": { + "model": "claude-haiku-4-20250514", + "max_tokens": 1024, + "temperature": 0.3, + "system_prompt": "Tu es un traducteur professionnel chinois-français. Traduis le texte suivant de manière naturelle et contextuelle." + }, + "ui": { + "window_width": 800, + "window_height": 600, + "font_size": 16, + "max_display_lines": 50 + }, + "recording": { + "save_audio": true, + "output_directory": "./recordings" + } +} diff --git a/docs/implementation_plan.md b/docs/implementation_plan.md new file mode 100644 index 0000000..653c5af --- /dev/null +++ b/docs/implementation_plan.md @@ -0,0 +1,494 @@ +# SecondVoice - Plan d'Implémentation MVP + +**Date**: 20 novembre 2025 +**Target**: MVP minimal fonctionnel +**Platform**: Linux +**Package Manager**: vcpkg + +--- + +## 🎯 Objectif MVP Minimal + +Application desktop qui: +1. Capture audio microphone en continu +2. Transcrit chinois → texte (Whisper API) +3. Traduit texte → français (Claude API) +4. Affiche traduction temps réel (ImGui) +5. Bouton Stop pour arrêter (pas de résumé MVP) + +--- + +## 🏗️ Architecture Technique + +### Pipeline +``` +Audio Capture (PortAudio) + ↓ (chunks audio configurables) +Whisper API (STT) + ↓ (texte chinois) +Claude API (traduction) + ↓ (texte français) +ImGui UI (display temps réel + bouton Stop) +``` + +### Threading Model +``` +Thread 1 - Audio Capture: + - PortAudio callback capture audio + - Accumule chunks (taille configurable) + - Push dans queue thread-safe + - Save WAV backup en background + +Thread 2 - AI Processing: + - Pop chunk depuis audio queue + - POST Whisper API → transcription chinoise + - POST Claude API → traduction française + - Push résultat dans UI queue + +Thread 3 - Main UI (ImGui): + - Render window ImGui + - Display traductions depuis queue + - Handle bouton Stop + - Update status/duration +``` + +--- + +## 📁 Structure Projet + +``` +secondvoice/ +├── .env # API keys (OPENAI_API_KEY, ANTHROPIC_API_KEY) +├── .gitignore +├── CMakeLists.txt # Build configuration +├── vcpkg.json # Dependencies manifest +├── config.json # Runtime config (audio chunk size, etc) +├── README.md +├── docs/ +│ ├── SecondVoice.md # Vision document +│ └── implementation_plan.md # Ce document +├── src/ +│ ├── main.cpp # Entry point + ImGui main loop +│ ├── audio/ +│ │ ├── AudioCapture.h +│ │ ├── AudioCapture.cpp # PortAudio wrapper +│ │ ├── AudioBuffer.h +│ │ └── AudioBuffer.cpp # Thread-safe ring buffer +│ ├── api/ +│ │ ├── WhisperClient.h +│ │ ├── WhisperClient.cpp # Whisper API client +│ │ ├── ClaudeClient.h +│ │ └── ClaudeClient.cpp # Claude API client +│ ├── ui/ +│ │ ├── TranslationUI.h +│ │ └── TranslationUI.cpp # ImGui interface +│ ├── utils/ +│ │ ├── Config.h +│ │ ├── Config.cpp # Load .env + config.json +│ │ ├── ThreadSafeQueue.h # Template queue thread-safe +│ │ └── Logger.h # Simple logging +│ └── core/ +│ ├── Pipeline.h +│ └── Pipeline.cpp # Orchestrate threads +├── recordings/ # Output audio files +│ └── .gitkeep +└── build/ # CMake build output (ignored) +``` + +--- + +## 🔧 Dépendances + +### vcpkg.json +```json +{ + "name": "secondvoice", + "version": "0.1.0", + "dependencies": [ + "portaudio", + "cpp-httplib", + "nlohmann-json", + "imgui[glfw-binding,opengl3-binding]", + "glfw3", + "opengl" + ] +} +``` + +### System Requirements (Linux) +```bash +# PortAudio dependencies +sudo apt install libasound2-dev + +# OpenGL dependencies +sudo apt install libgl1-mesa-dev libglu1-mesa-dev +``` + +--- + +## ⚙️ Configuration + +### .env (racine projet) +```env +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... +``` + +### config.json (racine projet) +```json +{ + "audio": { + "sample_rate": 16000, + "channels": 1, + "chunk_duration_seconds": 10, + "format": "wav" + }, + "whisper": { + "model": "whisper-1", + "language": "zh", + "temperature": 0.0 + }, + "claude": { + "model": "claude-haiku-4-20250514", + "max_tokens": 1024, + "temperature": 0.3, + "system_prompt": "Tu es un traducteur professionnel chinois-français. Traduis le texte suivant de manière naturelle et contextuelle." + }, + "ui": { + "window_width": 800, + "window_height": 600, + "font_size": 16, + "max_display_lines": 50 + }, + "recording": { + "save_audio": true, + "output_directory": "./recordings" + } +} +``` + +--- + +## 🔌 API Clients + +### Whisper API +```cpp +// POST https://api.openai.com/v1/audio/transcriptions +// Content-Type: multipart/form-data + +Request: +- file: audio.wav (binary) +- model: whisper-1 +- language: zh +- temperature: 0.0 + +Response: +{ + "text": "你好,今天我们讨论项目进度..." +} +``` + +### Claude API +```cpp +// POST https://api.anthropic.com/v1/messages +// Content-Type: application/json +// x-api-key: {ANTHROPIC_API_KEY} +// anthropic-version: 2023-06-01 + +Request: +{ + "model": "claude-haiku-4-20250514", + "max_tokens": 1024, + "messages": [{ + "role": "user", + "content": "Traduis en français: 你好,今天我们讨论项目进度..." + }] +} + +Response: +{ + "content": [{ + "type": "text", + "text": "Bonjour, aujourd'hui nous discutons de l'avancement du projet..." + }], + "model": "claude-haiku-4-20250514", + "usage": {...} +} +``` + +--- + +## 🎨 Interface ImGui + +### Layout Minimaliste +``` +┌────────────────────────────────────────────┐ +│ SecondVoice - Live Translation │ +├────────────────────────────────────────────┤ +│ │ +│ [●] Recording... Duration: 00:05:23 │ +│ │ +│ ┌────────────────────────────────────────┐ │ +│ │ 中文: 你好,今天我们讨论项目进度... │ │ +│ │ FR: Bonjour, aujourd'hui nous │ │ +│ │ discutons de l'avancement... │ │ +│ │ │ │ +│ │ 中文: 关于预算的问题... │ │ +│ │ FR: Concernant la question du budget.. │ │ +│ │ │ │ +│ │ [Auto-scroll enabled] │ │ +│ │ │ │ +│ └────────────────────────────────────────┘ │ +│ │ +│ [ STOP RECORDING ] │ +│ │ +│ Status: Processing chunk 12/12 │ +│ Audio: 16kHz mono, chunk size: 10s │ +└────────────────────────────────────────────┘ +``` + +### Features UI +- **Scrollable text area**: Auto-scroll, peut désactiver pour review +- **Color coding**: Chinois (couleur 1), Français (couleur 2) +- **Status bar**: Duration, chunk count, processing status +- **Stop button**: Arrête capture + processing, sauvegarde audio +- **Window resizable**: Layout adaptatif + +--- + +## 🚀 Ordre d'Implémentation + +### Phase 1 - Setup Infrastructure (Jour 1) +**Todo**: +1. ✅ Créer structure projet +2. ✅ Setup CMakeLists.txt avec vcpkg +3. ✅ Créer .gitignore (.env, build/, recordings/) +4. ✅ Créer config.json template +5. ✅ Setup .env (API keys) +6. ✅ Test build minimal (hello world) + +**Validation**: `cmake -B build && cmake --build build` compile sans erreurs + +--- + +### Phase 2 - Audio Capture (Jour 1-2) +**Todo**: +1. Implémenter `AudioCapture.h/cpp`: + - Init PortAudio + - Callback capture audio + - Accumulation chunks (configurable duration) + - Push dans ThreadSafeQueue +2. Implémenter `AudioBuffer.h/cpp`: + - Ring buffer pour audio raw + - Thread-safe operations +3. Test standalone: Capture 30s audio → save WAV + +**Validation**: Audio WAV lisible, durée correcte, qualité OK + +--- + +### Phase 3 - Whisper Client (Jour 2) +**Todo**: +1. Implémenter `WhisperClient.h/cpp`: + - Load API key depuis .env + - POST multipart/form-data (cpp-httplib) + - Encode audio WAV en memory + - Parse JSON response + - Error handling (retry, timeout) +2. Test standalone: Audio file → Whisper → texte chinois + +**Validation**: Transcription chinoise correcte sur sample audio + +--- + +### Phase 4 - Claude Client (Jour 2-3) +**Todo**: +1. Implémenter `ClaudeClient.h/cpp`: + - Load API key depuis .env + - POST JSON request (cpp-httplib) + - System prompt configurable + - Parse response (extract text) + - Error handling +2. Test standalone: Texte chinois → Claude → texte français + +**Validation**: Traduction française naturelle et correcte + +--- + +### Phase 5 - ImGui UI (Jour 3) +**Todo**: +1. Setup ImGui + GLFW + OpenGL: + - Window creation + - Render loop + - Input handling +2. Implémenter `TranslationUI.h/cpp`: + - Scrollable text area + - Display messages (CN + FR) + - Button Stop + - Status bar (duration, chunk count) +3. Test standalone: Afficher mock data + +**Validation**: UI responsive, affichage texte OK, bouton fonctionne + +--- + +### Phase 6 - Pipeline Integration (Jour 4) +**Todo**: +1. Implémenter `Pipeline.h/cpp`: + - Thread 1: AudioCapture loop + - Thread 2: Processing loop (Whisper → Claude) + - Thread 3: UI loop (ImGui) + - ThreadSafeQueue entre threads + - Synchronisation (start/stop) +2. Implémenter `Config.h/cpp`: + - Load .env (API keys) + - Load config.json (settings) +3. Implémenter `main.cpp`: + - Init all components + - Start pipeline + - Handle graceful shutdown + +**Validation**: Pipeline complet fonctionne bout-à-bout + +--- + +### Phase 7 - Testing & Tuning (Jour 5) +**Todo**: +1. Test avec audio réel chinois: + - Sample conversations + - Different audio qualities + - Different chunk sizes (5s, 10s, 30s) +2. Measure latence: + - Audio → Whisper: X secondes + - Whisper → Claude: Y secondes + - Total: Z secondes +3. Debug & fix bugs: + - Memory leaks + - Thread safety issues + - API errors handling +4. Optimize: + - Chunk size optimal (tradeoff latency vs accuracy) + - API timeout values + - UI refresh rate + +**Validation**: +- Latence totale < 10s acceptable +- Pas de crash sur 30min recording +- Transcription + traduction compréhensibles + +--- + +## 🧪 Test Plan + +### Unit Tests (Phase 2+) +- `AudioCapture`: Capture audio, format correct +- `WhisperClient`: API call mock, parsing JSON +- `ClaudeClient`: API call mock, parsing JSON +- `ThreadSafeQueue`: Thread safety, no data loss + +### Integration Tests +- Audio → Whisper: Audio file → texte chinois correct +- Whisper → Claude: Texte chinois → traduction française correcte +- Pipeline: Audio → UI display complet + +### End-to-End Test +- Recording 5min conversation chinoise réelle +- Vérifier transcription accuracy (>85%) +- Vérifier traduction compréhensible +- Vérifier UI responsive +- Vérifier audio sauvegardé correctement + +--- + +## 📊 Metrics à Tracker + +### Performance +- **Latence Whisper**: Temps API call (target: <3s pour 10s audio) +- **Latence Claude**: Temps API call (target: <2s pour 200 tokens) +- **Latence totale**: Audio → Display (target: <10s) +- **Memory usage**: Stable sur longue durée (no leaks) +- **CPU usage**: Acceptable (<50% sur laptop) + +### Qualité +- **Whisper accuracy**: % mots corrects (target: >85%) +- **Claude quality**: Traduction naturelle (subjective) +- **Crash rate**: 0 crash sur 1h recording + +### Cost +- **Whisper**: $0.006/min audio +- **Claude**: ~$0.03-0.05/h (depends on text volume) +- **Total**: ~$0.40/h meeting + +--- + +## ⚠️ Risks & Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| **Whisper API timeout** | Bloquant | Retry logic, timeout 30s, fallback queue | +| **Claude API rate limit** | Moyen | Exponential backoff, queue requests | +| **Audio buffer overflow** | Moyen | Ring buffer size adequate, drop old chunks if needed | +| **Thread deadlock** | Bloquant | Use std::lock_guard, avoid nested locks | +| **Memory leak** | Moyen | Use smart pointers, valgrind tests | +| **Network interruption** | Moyen | Retry logic, cache audio locally | + +--- + +## 🎯 Success Criteria MVP + +✅ **MVP validé si**: +1. Capture audio microphone fonctionne +2. Transcription chinoise >85% précise +3. Traduction française compréhensible +4. UI affiche traductions temps réel +5. Bouton Stop arrête proprement +6. Audio sauvegardé correctement +7. Pas de crash sur 30min recording +8. Latence totale <10s acceptable + +--- + +## 📝 Notes Implémentation + +### Thread Safety +- Utiliser `std::mutex` + `std::lock_guard` pour queues +- Pas de shared state sans protection +- Use `std::atomic` pour flags (running, stopping) + +### Error Handling +- Try/catch sur API calls +- Log errors (spdlog ou simple cout) +- Retry logic (max 3 attempts) +- Graceful degradation (skip chunk si error persistant) + +### Audio Format +- **Sample rate**: 16kHz (optimal pour Whisper) +- **Channels**: Mono (sufficient, réduit bandwidth) +- **Format**: 16-bit PCM WAV +- **Chunk size**: Configurable (default 10s) + +### API Best Practices +- **Timeout**: 30s pour Whisper, 15s pour Claude +- **Retry**: Exponential backoff (1s, 2s, 4s) +- **Rate limiting**: Respect API limits (monitor 429 errors) +- **Headers**: Always set User-Agent, API version + +--- + +## 🔄 Post-MVP (Phase 2) + +**Not included in MVP, but planned**: +- ❌ Résumé auto post-meeting (Claude summary) +- ❌ Export structuré (transcripts + audio) +- ❌ Système de recherche (backlog) +- ❌ Diarization (qui parle) +- ❌ Replay mode +- ❌ GUI élaborée (settings, etc) + +**Focus MVP**: Pipeline fonctionnel bout-à-bout, validation concept, usage réel premier meeting. + +--- + +*Document créé: 20 novembre 2025* +*Status: Ready to implement* +*Estimated effort: 5 jours développement + 2 jours tests* diff --git a/recordings/.gitkeep b/recordings/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/api/ClaudeClient.cpp b/src/api/ClaudeClient.cpp new file mode 100644 index 0000000..0753260 --- /dev/null +++ b/src/api/ClaudeClient.cpp @@ -0,0 +1,79 @@ +#include "ClaudeClient.h" +#include +#include +#include + +using json = nlohmann::json; + +namespace secondvoice { + +ClaudeClient::ClaudeClient(const std::string& api_key) + : api_key_(api_key) { +} + +std::optional ClaudeClient::translate( + const std::string& chinese_text, + const std::string& system_prompt, + int max_tokens, + float temperature) { + + // Build request JSON + json request_json = { + {"model", MODEL}, + {"max_tokens", max_tokens}, + {"temperature", temperature}, + {"messages", json::array({ + { + {"role", "user"}, + {"content", "Traduis en français: " + chinese_text} + } + })} + }; + + if (!system_prompt.empty()) { + request_json["system"] = system_prompt; + } + + std::string request_body = request_json.dump(); + + // Make HTTP request + httplib::Client client("https://api.anthropic.com"); + client.set_read_timeout(15, 0); // 15 seconds timeout + + httplib::Headers headers = { + {"x-api-key", api_key_}, + {"anthropic-version", API_VERSION}, + {"content-type", "application/json"} + }; + + auto res = client.Post("/v1/messages", headers, request_body, "application/json"); + + if (!res) { + std::cerr << "Claude API request failed: " << httplib::to_string(res.error()) << std::endl; + return std::nullopt; + } + + if (res->status != 200) { + std::cerr << "Claude API error " << res->status << ": " << res->body << std::endl; + return std::nullopt; + } + + // Parse response + try { + json response_json = json::parse(res->body); + + if (!response_json.contains("content") || !response_json["content"].is_array()) { + std::cerr << "Invalid Claude API response format" << std::endl; + return std::nullopt; + } + + ClaudeResponse response; + response.text = response_json["content"][0]["text"].get(); + return response; + } catch (const json::exception& e) { + std::cerr << "Failed to parse Claude response: " << e.what() << std::endl; + return std::nullopt; + } +} + +} // namespace secondvoice diff --git a/src/api/ClaudeClient.h b/src/api/ClaudeClient.h new file mode 100644 index 0000000..08c0d31 --- /dev/null +++ b/src/api/ClaudeClient.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +namespace secondvoice { + +struct ClaudeResponse { + std::string text; +}; + +class ClaudeClient { +public: + ClaudeClient(const std::string& api_key); + + std::optional translate( + const std::string& chinese_text, + const std::string& system_prompt = "", + int max_tokens = 1024, + float temperature = 0.3f + ); + +private: + std::string api_key_; + static constexpr const char* API_URL = "https://api.anthropic.com/v1/messages"; + static constexpr const char* MODEL = "claude-haiku-4-20250514"; + static constexpr const char* API_VERSION = "2023-06-01"; +}; + +} // namespace secondvoice diff --git a/src/api/WhisperClient.cpp b/src/api/WhisperClient.cpp new file mode 100644 index 0000000..8cac3f3 --- /dev/null +++ b/src/api/WhisperClient.cpp @@ -0,0 +1,85 @@ +#include "WhisperClient.h" +#include "../audio/AudioBuffer.h" +#include +#include +#include +#include +#include + +using json = nlohmann::json; + +namespace secondvoice { + +WhisperClient::WhisperClient(const std::string& api_key) + : api_key_(api_key) { +} + +std::optional WhisperClient::transcribe( + const std::vector& audio_data, + int sample_rate, + int channels, + const std::string& language, + float temperature) { + + // Save audio to temporary WAV file + AudioBuffer buffer(sample_rate, channels); + buffer.addSamples(audio_data); + + std::string temp_file = "/tmp/secondvoice_temp.wav"; + if (!buffer.saveToWav(temp_file)) { + std::cerr << "Failed to save temporary WAV file" << std::endl; + return std::nullopt; + } + + // Read WAV file + std::ifstream file(temp_file, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open temporary WAV file" << std::endl; + return std::nullopt; + } + + std::ostringstream wav_stream; + wav_stream << file.rdbuf(); + std::string wav_data = wav_stream.str(); + file.close(); + + // Make HTTP request + httplib::Client client("https://api.openai.com"); + client.set_read_timeout(30, 0); // 30 seconds timeout + + httplib::MultipartFormDataItems items = { + {"file", wav_data, "audio.wav", "audio/wav"}, + {"model", "whisper-1", "", ""}, + {"language", language, "", ""}, + {"temperature", std::to_string(temperature), "", ""} + }; + + httplib::Headers headers = { + {"Authorization", "Bearer " + api_key_} + }; + + auto res = client.Post("/v1/audio/transcriptions", headers, items); + + if (!res) { + std::cerr << "Whisper API request failed: " << httplib::to_string(res.error()) << std::endl; + return std::nullopt; + } + + if (res->status != 200) { + std::cerr << "Whisper API error " << res->status << ": " << res->body << std::endl; + return std::nullopt; + } + + // Parse response + try { + json response_json = json::parse(res->body); + WhisperResponse response; + response.text = response_json["text"].get(); + return response; + } catch (const json::exception& e) { + std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl; + return std::nullopt; + } +} + +} // namespace secondvoice diff --git a/src/api/WhisperClient.h b/src/api/WhisperClient.h new file mode 100644 index 0000000..3885b70 --- /dev/null +++ b/src/api/WhisperClient.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include + +namespace secondvoice { + +struct WhisperResponse { + std::string text; +}; + +class WhisperClient { +public: + WhisperClient(const std::string& api_key); + + std::optional transcribe( + const std::vector& audio_data, + int sample_rate, + int channels, + const std::string& language = "zh", + float temperature = 0.0f + ); + +private: + std::string api_key_; + static constexpr const char* API_URL = "https://api.openai.com/v1/audio/transcriptions"; +}; + +} // namespace secondvoice diff --git a/src/audio/AudioBuffer.cpp b/src/audio/AudioBuffer.cpp new file mode 100644 index 0000000..9e8f5d2 --- /dev/null +++ b/src/audio/AudioBuffer.cpp @@ -0,0 +1,76 @@ +#include "AudioBuffer.h" +#include +#include +#include + +namespace secondvoice { + +AudioBuffer::AudioBuffer(int sample_rate, int channels) + : sample_rate_(sample_rate) + , channels_(channels) { +} + +void AudioBuffer::addSamples(const std::vector& samples) { + samples_.insert(samples_.end(), samples.begin(), samples.end()); +} + +std::vector AudioBuffer::getSamples() const { + return samples_; +} + +void AudioBuffer::clear() { + samples_.clear(); +} + +bool AudioBuffer::saveToWav(const std::string& filename) const { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + return false; + } + + // WAV header + struct WavHeader { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t file_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels; + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; + }; + + WavHeader header; + header.num_channels = channels_; + header.sample_rate = sample_rate_; + header.byte_rate = sample_rate_ * channels_ * 2; // 16-bit = 2 bytes + header.block_align = channels_ * 2; + + // Convert float samples to 16-bit PCM + std::vector pcm_samples; + pcm_samples.reserve(samples_.size()); + for (float sample : samples_) { + // Clamp to [-1.0, 1.0] and convert to 16-bit + float clamped = std::max(-1.0f, std::min(1.0f, sample)); + pcm_samples.push_back(static_cast(clamped * 32767.0f)); + } + + header.data_size = pcm_samples.size() * sizeof(int16_t); + header.file_size = sizeof(WavHeader) - 8 + header.data_size; + + // Write header + file.write(reinterpret_cast(&header), sizeof(header)); + + // Write PCM data + file.write(reinterpret_cast(pcm_samples.data()), header.data_size); + + file.close(); + return true; +} + +} // namespace secondvoice diff --git a/src/audio/AudioBuffer.h b/src/audio/AudioBuffer.h new file mode 100644 index 0000000..03a5da8 --- /dev/null +++ b/src/audio/AudioBuffer.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +namespace secondvoice { + +class AudioBuffer { +public: + AudioBuffer(int sample_rate, int channels); + + void addSamples(const std::vector& samples); + std::vector getSamples() const; + void clear(); + + bool saveToWav(const std::string& filename) const; + + size_t size() const { return samples_.size(); } + bool empty() const { return samples_.empty(); } + +private: + int sample_rate_; + int channels_; + std::vector samples_; +}; + +} // namespace secondvoice diff --git a/src/audio/AudioCapture.cpp b/src/audio/AudioCapture.cpp new file mode 100644 index 0000000..e8bd095 --- /dev/null +++ b/src/audio/AudioCapture.cpp @@ -0,0 +1,110 @@ +#include "AudioCapture.h" +#include + +namespace secondvoice { + +AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds) + : sample_rate_(sample_rate) + , channels_(channels) + , chunk_duration_seconds_(chunk_duration_seconds) { +} + +AudioCapture::~AudioCapture() { + stop(); + if (stream_) { + Pa_CloseStream(stream_); + } + Pa_Terminate(); +} + +bool AudioCapture::initialize() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + std::cerr << "PortAudio init error: " << Pa_GetErrorText(err) << std::endl; + return false; + } + return true; +} + +int AudioCapture::audioCallback(const void* input, void* output, + unsigned long frame_count, + const PaStreamCallbackTimeInfo* time_info, + PaStreamCallbackFlags status_flags, + void* user_data) { + AudioCapture* self = static_cast(user_data); + const float* in = static_cast(input); + + // Accumulate audio data + for (unsigned long i = 0; i < frame_count * self->channels_; ++i) { + self->buffer_.push_back(in[i]); + } + + // Check if we have accumulated enough data for a chunk + size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_; + if (self->buffer_.size() >= chunk_samples) { + // Call the callback with the chunk + if (self->callback_) { + self->callback_(self->buffer_); + } + self->buffer_.clear(); + } + + return paContinue; +} + +bool AudioCapture::start(AudioCallback callback) { + if (is_recording_) { + return false; + } + + callback_ = callback; + buffer_.clear(); + + PaStreamParameters input_params; + input_params.device = Pa_GetDefaultInputDevice(); + if (input_params.device == paNoDevice) { + std::cerr << "No default input device" << std::endl; + return false; + } + + input_params.channelCount = channels_; + input_params.sampleFormat = paFloat32; + input_params.suggestedLatency = Pa_GetDeviceInfo(input_params.device)->defaultLowInputLatency; + input_params.hostApiSpecificStreamInfo = nullptr; + + PaError err = Pa_OpenStream( + &stream_, + &input_params, + nullptr, // no output + sample_rate_, + paFramesPerBufferUnspecified, + paClipOff, + &AudioCapture::audioCallback, + this + ); + + if (err != paNoError) { + std::cerr << "PortAudio open stream error: " << Pa_GetErrorText(err) << std::endl; + return false; + } + + err = Pa_StartStream(stream_); + if (err != paNoError) { + std::cerr << "PortAudio start stream error: " << Pa_GetErrorText(err) << std::endl; + return false; + } + + is_recording_ = true; + return true; +} + +void AudioCapture::stop() { + if (!is_recording_ || !stream_) { + return; + } + + Pa_StopStream(stream_); + is_recording_ = false; +} + +} // namespace secondvoice diff --git a/src/audio/AudioCapture.h b/src/audio/AudioCapture.h new file mode 100644 index 0000000..5bf7264 --- /dev/null +++ b/src/audio/AudioCapture.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include + +namespace secondvoice { + +class AudioCapture { +public: + using AudioCallback = std::function&)>; + + AudioCapture(int sample_rate, int channels, int chunk_duration_seconds); + ~AudioCapture(); + + bool initialize(); + bool start(AudioCallback callback); + void stop(); + bool isRecording() const { return is_recording_; } + +private: + static int audioCallback(const void* input, void* output, + unsigned long frame_count, + const PaStreamCallbackTimeInfo* time_info, + PaStreamCallbackFlags status_flags, + void* user_data); + + int sample_rate_; + int channels_; + int chunk_duration_seconds_; + bool is_recording_ = false; + + PaStream* stream_ = nullptr; + AudioCallback callback_; + std::vector buffer_; +}; + +} // namespace secondvoice diff --git a/src/core/Pipeline.cpp b/src/core/Pipeline.cpp new file mode 100644 index 0000000..7c0c51a --- /dev/null +++ b/src/core/Pipeline.cpp @@ -0,0 +1,214 @@ +#include "Pipeline.h" +#include "../audio/AudioCapture.h" +#include "../audio/AudioBuffer.h" +#include "../api/WhisperClient.h" +#include "../api/ClaudeClient.h" +#include "../ui/TranslationUI.h" +#include "../utils/Config.h" +#include +#include +#include + +namespace secondvoice { + +Pipeline::Pipeline() = default; + +Pipeline::~Pipeline() { + stop(); +} + +bool Pipeline::initialize() { + auto& config = Config::getInstance(); + + // Initialize audio capture + audio_capture_ = std::make_unique( + config.getAudioConfig().sample_rate, + config.getAudioConfig().channels, + config.getAudioConfig().chunk_duration_seconds + ); + + if (!audio_capture_->initialize()) { + std::cerr << "Failed to initialize audio capture" << std::endl; + return false; + } + + // Initialize API clients + whisper_client_ = std::make_unique(config.getOpenAIKey()); + claude_client_ = std::make_unique(config.getAnthropicKey()); + + // Initialize UI + ui_ = std::make_unique( + config.getUIConfig().window_width, + config.getUIConfig().window_height + ); + + if (!ui_->initialize()) { + std::cerr << "Failed to initialize UI" << std::endl; + return false; + } + + // Initialize full recording buffer + full_recording_ = std::make_unique( + config.getAudioConfig().sample_rate, + config.getAudioConfig().channels + ); + + // Create recordings directory if it doesn't exist + std::filesystem::create_directories(config.getRecordingConfig().output_directory); + + return true; +} + +bool Pipeline::start() { + if (running_) { + return false; + } + + running_ = true; + + // Start threads + audio_thread_ = std::thread(&Pipeline::audioThread, this); + processing_thread_ = std::thread(&Pipeline::processingThread, this); + ui_thread_ = std::thread(&Pipeline::uiThread, this); + + return true; +} + +void Pipeline::stop() { + if (!running_) { + return; + } + + running_ = false; + + // Stop audio capture + if (audio_capture_) { + audio_capture_->stop(); + } + + // Shutdown queues + audio_queue_.shutdown(); + transcription_queue_.shutdown(); + + // Wait for threads + if (audio_thread_.joinable()) { + audio_thread_.join(); + } + if (processing_thread_.joinable()) { + processing_thread_.join(); + } + if (ui_thread_.joinable()) { + ui_thread_.join(); + } + + // Save full recording + auto& config = Config::getInstance(); + if (config.getRecordingConfig().save_audio && full_recording_) { + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::stringstream ss; + ss << config.getRecordingConfig().output_directory << "/" + << "recording_" << std::put_time(std::localtime(&time_t), "%Y%m%d_%H%M%S") + << ".wav"; + + if (full_recording_->saveToWav(ss.str())) { + std::cout << "Recording saved to: " << ss.str() << std::endl; + } else { + std::cerr << "Failed to save recording" << std::endl; + } + } +} + +void Pipeline::audioThread() { + auto& config = Config::getInstance(); + + audio_capture_->start([this, &config](const std::vector& audio_data) { + if (!running_) return; + + // Add to full recording + full_recording_->addSamples(audio_data); + + // Push to processing queue + AudioChunk chunk; + chunk.data = audio_data; + chunk.sample_rate = config.getAudioConfig().sample_rate; + chunk.channels = config.getAudioConfig().channels; + + audio_queue_.push(std::move(chunk)); + }); + + // Keep thread alive while recording + auto start_time = std::chrono::steady_clock::now(); + while (running_ && audio_capture_->isRecording()) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + + // Update duration + auto now = std::chrono::steady_clock::now(); + recording_duration_ = std::chrono::duration_cast(now - start_time).count(); + } +} + +void Pipeline::processingThread() { + auto& config = Config::getInstance(); + + while (running_) { + auto chunk_opt = audio_queue_.wait_and_pop(); + if (!chunk_opt.has_value()) { + break; // Queue shutdown + } + + auto& chunk = chunk_opt.value(); + + // Transcribe with Whisper + auto whisper_result = whisper_client_->transcribe( + chunk.data, + chunk.sample_rate, + chunk.channels, + config.getWhisperConfig().language, + config.getWhisperConfig().temperature + ); + + if (!whisper_result.has_value()) { + std::cerr << "Whisper transcription failed" << std::endl; + continue; + } + + // Translate with Claude + auto claude_result = claude_client_->translate( + whisper_result->text, + config.getClaudeConfig().system_prompt, + config.getClaudeConfig().max_tokens, + config.getClaudeConfig().temperature + ); + + if (!claude_result.has_value()) { + std::cerr << "Claude translation failed" << std::endl; + continue; + } + + // Add to UI + ui_->addTranslation(whisper_result->text, claude_result->text); + + std::cout << "CN: " << whisper_result->text << std::endl; + std::cout << "FR: " << claude_result->text << std::endl; + std::cout << "---" << std::endl; + } +} + +void Pipeline::uiThread() { + while (running_ && !ui_->shouldClose()) { + ui_->setRecordingDuration(recording_duration_); + ui_->setProcessingStatus("Processing..."); + ui_->render(); + + // Check if stop was requested + if (ui_->isStopRequested()) { + running_ = false; + break; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(16)); // ~60 FPS + } +} + +} // namespace secondvoice diff --git a/src/core/Pipeline.h b/src/core/Pipeline.h new file mode 100644 index 0000000..5fe5104 --- /dev/null +++ b/src/core/Pipeline.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include "../utils/ThreadSafeQueue.h" + +namespace secondvoice { + +class AudioCapture; +class WhisperClient; +class ClaudeClient; +class TranslationUI; +class AudioBuffer; + +struct AudioChunk { + std::vector data; + int sample_rate; + int channels; +}; + +struct TranscriptionResult { + std::string chinese_text; +}; + +class Pipeline { +public: + Pipeline(); + ~Pipeline(); + + bool initialize(); + bool start(); + void stop(); + + bool isRunning() const { return running_; } + +private: + void audioThread(); + void processingThread(); + void uiThread(); + + std::unique_ptr audio_capture_; + std::unique_ptr whisper_client_; + std::unique_ptr claude_client_; + std::unique_ptr ui_; + std::unique_ptr full_recording_; + + ThreadSafeQueue audio_queue_; + ThreadSafeQueue transcription_queue_; + + std::thread audio_thread_; + std::thread processing_thread_; + std::thread ui_thread_; + + std::atomic running_{false}; + std::atomic recording_duration_{0}; +}; + +} // namespace secondvoice diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..9a19f4a --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,56 @@ +#include +#include "utils/Config.h" +#include "core/Pipeline.h" + +int main(int argc, char** argv) { + std::cout << "SecondVoice - Real-time Translation System" << std::endl; + std::cout << "===========================================" << std::endl; + + // Load configuration + secondvoice::Config& config = secondvoice::Config::getInstance(); + if (!config.load("config.json", ".env")) { + std::cerr << "Failed to load configuration" << std::endl; + return 1; + } + + std::cout << "Configuration loaded successfully" << std::endl; + std::cout << "Audio: " << config.getAudioConfig().sample_rate << "Hz, " + << config.getAudioConfig().channels << " channel(s), " + << config.getAudioConfig().chunk_duration_seconds << "s chunks" << std::endl; + std::cout << "Whisper: " << config.getWhisperConfig().model + << " (language: " << config.getWhisperConfig().language << ")" << std::endl; + std::cout << "Claude: " << config.getClaudeConfig().model << std::endl; + std::cout << std::endl; + + // Create and initialize pipeline + secondvoice::Pipeline pipeline; + if (!pipeline.initialize()) { + std::cerr << "Failed to initialize pipeline" << std::endl; + return 1; + } + + std::cout << "Pipeline initialized successfully" << std::endl; + std::cout << "Starting recording and translation..." << std::endl; + std::cout << std::endl; + + // Start pipeline + if (!pipeline.start()) { + std::cerr << "Failed to start pipeline" << std::endl; + return 1; + } + + // Wait for pipeline to finish (user clicks Stop button) + while (pipeline.isRunning()) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + std::cout << std::endl; + std::cout << "Recording stopped" << std::endl; + std::cout << "Saving audio..." << std::endl; + + pipeline.stop(); + + std::cout << "Done!" << std::endl; + + return 0; +} diff --git a/src/ui/TranslationUI.cpp b/src/ui/TranslationUI.cpp new file mode 100644 index 0000000..59cbcc6 --- /dev/null +++ b/src/ui/TranslationUI.cpp @@ -0,0 +1,160 @@ +#include "TranslationUI.h" +#include +#include +#include +#include + +namespace secondvoice { + +TranslationUI::TranslationUI(int width, int height) + : width_(width) + , height_(height) { +} + +TranslationUI::~TranslationUI() { + if (window_) { + ImGui_ImplOpenGL3_Shutdown(); + ImGui_ImplGlfw_Shutdown(); + ImGui::DestroyContext(); + glfwDestroyWindow(window_); + glfwTerminate(); + } +} + +bool TranslationUI::initialize() { + // Initialize GLFW + if (!glfwInit()) { + std::cerr << "Failed to initialize GLFW" << std::endl; + return false; + } + + // OpenGL 3.3 + GLSL 330 + glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3); + glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3); + glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); + + // Create window + window_ = glfwCreateWindow(width_, height_, "SecondVoice - Live Translation", nullptr, nullptr); + if (!window_) { + std::cerr << "Failed to create GLFW window" << std::endl; + glfwTerminate(); + return false; + } + + glfwMakeContextCurrent(window_); + glfwSwapInterval(1); // Enable vsync + + // Initialize ImGui + IMGUI_CHECKVERSION(); + ImGui::CreateContext(); + ImGuiIO& io = ImGui::GetIO(); + io.ConfigFlags |= ImGuiConfigFlags_NavEnableKeyboard; + + ImGui::StyleColorsDark(); + + ImGui_ImplGlfw_InitForOpenGL(window_, true); + ImGui_ImplOpenGL3_Init("#version 330"); + + return true; +} + +void TranslationUI::render() { + glfwPollEvents(); + + // Start ImGui frame + ImGui_ImplOpenGL3_NewFrame(); + ImGui_ImplGlfw_NewFrame(); + ImGui::NewFrame(); + + // Main window (full viewport) + ImGui::SetNextWindowPos(ImVec2(0, 0)); + ImGui::SetNextWindowSize(ImGui::GetIO().DisplaySize); + ImGui::Begin("SecondVoice", nullptr, + ImGuiWindowFlags_NoTitleBar | + ImGuiWindowFlags_NoResize | + ImGuiWindowFlags_NoMove | + ImGuiWindowFlags_NoCollapse); + + renderTranslations(); + renderControls(); + renderStatus(); + + ImGui::End(); + + // Rendering + ImGui::Render(); + int display_w, display_h; + glfwGetFramebufferSize(window_, &display_w, &display_h); + glViewport(0, 0, display_w, display_h); + glClearColor(0.1f, 0.1f, 0.1f, 1.0f); + glClear(GL_COLOR_BUFFER_BIT); + ImGui_ImplOpenGL3_RenderDrawData(ImGui::GetDrawData()); + + glfwSwapBuffers(window_); +} + +bool TranslationUI::shouldClose() const { + return glfwWindowShouldClose(window_); +} + +void TranslationUI::addTranslation(const std::string& chinese, const std::string& french) { + messages_.push_back({chinese, french}); +} + +void TranslationUI::renderTranslations() { + ImGui::Text("SecondVoice - Live Translation"); + ImGui::Separator(); + + ImGui::BeginChild("Translations", ImVec2(0, -120), true); + + for (const auto& msg : messages_) { + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f)); + ImGui::TextWrapped("中文: %s", msg.chinese.c_str()); + ImGui::PopStyleColor(); + + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 1.0f, 0.5f, 1.0f)); + ImGui::TextWrapped("FR: %s", msg.french.c_str()); + ImGui::PopStyleColor(); + + ImGui::Spacing(); + ImGui::Separator(); + ImGui::Spacing(); + } + + if (auto_scroll_ && ImGui::GetScrollY() >= ImGui::GetScrollMaxY()) { + ImGui::SetScrollHereY(1.0f); + } + + ImGui::EndChild(); +} + +void TranslationUI::renderControls() { + ImGui::Spacing(); + + // Center the stop button + float button_width = 200.0f; + float window_width = ImGui::GetWindowWidth(); + ImGui::SetCursorPosX((window_width - button_width) * 0.5f); + + if (ImGui::Button("STOP RECORDING", ImVec2(button_width, 40))) { + stop_requested_ = true; + } + + ImGui::Spacing(); +} + +void TranslationUI::renderStatus() { + ImGui::Separator(); + + // Format duration as MM:SS + int minutes = recording_duration_ / 60; + int seconds = recording_duration_ % 60; + ImGui::Text("Recording... Duration: %02d:%02d", minutes, seconds); + + if (!processing_status_.empty()) { + ImGui::SameLine(); + ImGui::Text(" | Status: %s", processing_status_.c_str()); + } +} + +} // namespace secondvoice diff --git a/src/ui/TranslationUI.h b/src/ui/TranslationUI.h new file mode 100644 index 0000000..eaa3e83 --- /dev/null +++ b/src/ui/TranslationUI.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include +#include + +namespace secondvoice { + +struct TranslationMessage { + std::string chinese; + std::string french; +}; + +class TranslationUI { +public: + TranslationUI(int width, int height); + ~TranslationUI(); + + bool initialize(); + void render(); + bool shouldClose() const; + + void addTranslation(const std::string& chinese, const std::string& french); + bool isStopRequested() const { return stop_requested_; } + void resetStopRequest() { stop_requested_ = false; } + + void setRecordingDuration(int seconds) { recording_duration_ = seconds; } + void setProcessingStatus(const std::string& status) { processing_status_ = status; } + +private: + void renderTranslations(); + void renderControls(); + void renderStatus(); + + int width_; + int height_; + GLFWwindow* window_ = nullptr; + + std::vector messages_; + bool stop_requested_ = false; + bool auto_scroll_ = true; + + int recording_duration_ = 0; + std::string processing_status_; +}; + +} // namespace secondvoice diff --git a/src/utils/Config.cpp b/src/utils/Config.cpp new file mode 100644 index 0000000..614086e --- /dev/null +++ b/src/utils/Config.cpp @@ -0,0 +1,106 @@ +#include "Config.h" +#include +#include +#include +#include + +using json = nlohmann::json; + +namespace secondvoice { + +Config& Config::getInstance() { + static Config instance; + return instance; +} + +bool Config::load(const std::string& config_path, const std::string& env_path) { + // Load .env file + std::ifstream env_file(env_path); + if (env_file.is_open()) { + std::string line; + while (std::getline(env_file, line)) { + if (line.empty() || line[0] == '#') continue; + + auto pos = line.find('='); + if (pos != std::string::npos) { + std::string key = line.substr(0, pos); + std::string value = line.substr(pos + 1); + + // Remove quotes if present + if (!value.empty() && value.front() == '"' && value.back() == '"') { + value = value.substr(1, value.length() - 2); + } + + if (key == "OPENAI_API_KEY") { + openai_key_ = value; + } else if (key == "ANTHROPIC_API_KEY") { + anthropic_key_ = value; + } + } + } + env_file.close(); + } else { + std::cerr << "Warning: Could not open .env file: " << env_path << std::endl; + } + + // Load config.json + std::ifstream config_file(config_path); + if (!config_file.is_open()) { + std::cerr << "Error: Could not open config file: " << config_path << std::endl; + return false; + } + + json config_json; + try { + config_file >> config_json; + } catch (const json::parse_error& e) { + std::cerr << "Error parsing config.json: " << e.what() << std::endl; + return false; + } + + // Parse audio config + if (config_json.contains("audio")) { + auto& audio = config_json["audio"]; + audio_config_.sample_rate = audio.value("sample_rate", 16000); + audio_config_.channels = audio.value("channels", 1); + audio_config_.chunk_duration_seconds = audio.value("chunk_duration_seconds", 10); + audio_config_.format = audio.value("format", "wav"); + } + + // Parse whisper config + if (config_json.contains("whisper")) { + auto& whisper = config_json["whisper"]; + whisper_config_.model = whisper.value("model", "whisper-1"); + whisper_config_.language = whisper.value("language", "zh"); + whisper_config_.temperature = whisper.value("temperature", 0.0f); + } + + // Parse claude config + if (config_json.contains("claude")) { + auto& claude = config_json["claude"]; + claude_config_.model = claude.value("model", "claude-haiku-4-20250514"); + claude_config_.max_tokens = claude.value("max_tokens", 1024); + claude_config_.temperature = claude.value("temperature", 0.3f); + claude_config_.system_prompt = claude.value("system_prompt", ""); + } + + // Parse UI config + if (config_json.contains("ui")) { + auto& ui = config_json["ui"]; + ui_config_.window_width = ui.value("window_width", 800); + ui_config_.window_height = ui.value("window_height", 600); + ui_config_.font_size = ui.value("font_size", 16); + ui_config_.max_display_lines = ui.value("max_display_lines", 50); + } + + // Parse recording config + if (config_json.contains("recording")) { + auto& recording = config_json["recording"]; + recording_config_.save_audio = recording.value("save_audio", true); + recording_config_.output_directory = recording.value("output_directory", "./recordings"); + } + + return true; +} + +} // namespace secondvoice diff --git a/src/utils/Config.h b/src/utils/Config.h new file mode 100644 index 0000000..962a135 --- /dev/null +++ b/src/utils/Config.h @@ -0,0 +1,69 @@ +#pragma once + +#include + +namespace secondvoice { + +struct AudioConfig { + int sample_rate; + int channels; + int chunk_duration_seconds; + std::string format; +}; + +struct WhisperConfig { + std::string model; + std::string language; + float temperature; +}; + +struct ClaudeConfig { + std::string model; + int max_tokens; + float temperature; + std::string system_prompt; +}; + +struct UIConfig { + int window_width; + int window_height; + int font_size; + int max_display_lines; +}; + +struct RecordingConfig { + bool save_audio; + std::string output_directory; +}; + +class Config { +public: + static Config& getInstance(); + + bool load(const std::string& config_path, const std::string& env_path); + + const AudioConfig& getAudioConfig() const { return audio_config_; } + const WhisperConfig& getWhisperConfig() const { return whisper_config_; } + const ClaudeConfig& getClaudeConfig() const { return claude_config_; } + const UIConfig& getUIConfig() const { return ui_config_; } + const RecordingConfig& getRecordingConfig() const { return recording_config_; } + + const std::string& getOpenAIKey() const { return openai_key_; } + const std::string& getAnthropicKey() const { return anthropic_key_; } + +private: + Config() = default; + Config(const Config&) = delete; + Config& operator=(const Config&) = delete; + + AudioConfig audio_config_; + WhisperConfig whisper_config_; + ClaudeConfig claude_config_; + UIConfig ui_config_; + RecordingConfig recording_config_; + + std::string openai_key_; + std::string anthropic_key_; +}; + +} // namespace secondvoice diff --git a/src/utils/ThreadSafeQueue.h b/src/utils/ThreadSafeQueue.h new file mode 100644 index 0000000..f37893d --- /dev/null +++ b/src/utils/ThreadSafeQueue.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include +#include +#include + +namespace secondvoice { + +template +class ThreadSafeQueue { +public: + void push(T value) { + std::lock_guard lock(mutex_); + queue_.push(std::move(value)); + cv_.notify_one(); + } + + std::optional pop() { + std::unique_lock lock(mutex_); + if (queue_.empty()) { + return std::nullopt; + } + T value = std::move(queue_.front()); + queue_.pop(); + return value; + } + + std::optional wait_and_pop() { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { return !queue_.empty() || shutdown_; }); + + if (shutdown_ && queue_.empty()) { + return std::nullopt; + } + + T value = std::move(queue_.front()); + queue_.pop(); + return value; + } + + bool empty() const { + std::lock_guard lock(mutex_); + return queue_.empty(); + } + + size_t size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + + void shutdown() { + std::lock_guard lock(mutex_); + shutdown_ = true; + cv_.notify_all(); + } + +private: + mutable std::mutex mutex_; + std::condition_variable cv_; + std::queue queue_; + bool shutdown_ = false; +}; + +} // namespace secondvoice diff --git a/vcpkg.json b/vcpkg.json new file mode 100644 index 0000000..c777e35 --- /dev/null +++ b/vcpkg.json @@ -0,0 +1,15 @@ +{ + "name": "secondvoice", + "version": "0.1.0", + "dependencies": [ + "portaudio", + "cpp-httplib", + "nlohmann-json", + { + "name": "imgui", + "features": ["glfw-binding", "opengl3-binding"] + }, + "glfw3", + "opengl" + ] +}