feat: Add RNNoise denoising + transient suppressor + VAD improvements

- Add RNNoise neural network audio denoising (16kHz↔48kHz resampling)
- Add transient suppressor to filter claps/clicks/pops before RNNoise
- VAD now works on FILTERED audio (not raw) to avoid false triggers
- Real-time denoised audio level display in UI
- Save denoised audio previews in Opus format (.ogg)
- Add extensive Whisper hallucination filter (Tingting, music, etc.)
- Add "Clear" button to reset accumulated translations
- Double VAD thresholds (0.02/0.08) for less sensitivity
- Update Claude prompt to handle offensive content gracefully

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Trouve Alexis 2025-11-23 16:46:38 +08:00
parent fa8ea2907b
commit 741ca09663
10 changed files with 996 additions and 150 deletions

View File

@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
project(SecondVoice VERSION 0.1.0 LANGUAGES CXX) project(SecondVoice VERSION 0.1.0 LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
@ -38,6 +38,35 @@ FetchContent_Declare(
FetchContent_MakeAvailable(ogg opus) FetchContent_MakeAvailable(ogg opus)
# Fetch RNNoise for noise reduction (using older stable version)
FetchContent_Declare(
    rnnoise
    GIT_REPOSITORY https://github.com/xiph/rnnoise.git
    GIT_TAG v0.1.1
)
# NOTE(review): assumes the v0.1.1 tag ships no CMakeLists.txt, so
# MakeAvailable only downloads/populates the sources and the library target
# must be declared by hand below — confirm against the fetched tree.
FetchContent_MakeAvailable(rnnoise)
# Build RNNoise as a static library (v0.1.1 structure)
add_library(rnnoise STATIC
    ${rnnoise_SOURCE_DIR}/src/denoise.c
    ${rnnoise_SOURCE_DIR}/src/rnn.c
    ${rnnoise_SOURCE_DIR}/src/rnn_data.c
    ${rnnoise_SOURCE_DIR}/src/pitch.c
    ${rnnoise_SOURCE_DIR}/src/kiss_fft.c
    ${rnnoise_SOURCE_DIR}/src/celt_lpc.c
)
# Public include/ (rnnoise.h for consumers) plus src/ for internal headers
# the C sources include.
target_include_directories(rnnoise PUBLIC
    ${rnnoise_SOURCE_DIR}/include
    ${rnnoise_SOURCE_DIR}/src
)
# RNNoise needs math library
if(UNIX)
    target_link_libraries(rnnoise PRIVATE m)
endif()
# Create ImGui library with OpenGL/GLFW backends # Create ImGui library with OpenGL/GLFW backends
add_library(imgui_backends add_library(imgui_backends
${imgui_SOURCE_DIR}/imgui.cpp ${imgui_SOURCE_DIR}/imgui.cpp
@ -70,6 +99,7 @@ set(SOURCES_UI
# Audio module # Audio module
src/audio/AudioCapture.cpp src/audio/AudioCapture.cpp
src/audio/AudioBuffer.cpp src/audio/AudioBuffer.cpp
src/audio/NoiseReducer.cpp
# API clients # API clients
src/api/WhisperClient.cpp src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp src/api/ClaudeClient.cpp
@ -88,6 +118,7 @@ set(SOURCES_CONSOLE
# Audio module # Audio module
src/audio/AudioCapture.cpp src/audio/AudioCapture.cpp
src/audio/AudioBuffer.cpp src/audio/AudioBuffer.cpp
src/audio/NoiseReducer.cpp
# API clients # API clients
src/api/WhisperClient.cpp src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp src/api/ClaudeClient.cpp
@ -107,9 +138,11 @@ add_executable(${PROJECT_NAME}_Console ${SOURCES_CONSOLE})
# Include directories # Include directories
target_include_directories(${PROJECT_NAME} PRIVATE target_include_directories(${PROJECT_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src
${rnnoise_SOURCE_DIR}/include
) )
target_include_directories(${PROJECT_NAME}_Console PRIVATE target_include_directories(${PROJECT_NAME}_Console PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src
${rnnoise_SOURCE_DIR}/include
) )
# Link libraries # Link libraries
@ -124,6 +157,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
OpenGL::GL OpenGL::GL
opus opus
ogg ogg
rnnoise
# Windows system libraries # Windows system libraries
winmm winmm
setupapi setupapi
@ -136,6 +170,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
nlohmann_json::nlohmann_json nlohmann_json::nlohmann_json
opus opus
ogg ogg
rnnoise
# Windows system libraries # Windows system libraries
winmm winmm
setupapi setupapi
@ -152,6 +187,7 @@ else()
OpenGL::GL OpenGL::GL
opus opus
ogg ogg
rnnoise
) )
# Console version - NO UI libs # Console version - NO UI libs
target_link_libraries(${PROJECT_NAME}_Console PRIVATE target_link_libraries(${PROJECT_NAME}_Console PRIVATE
@ -159,6 +195,7 @@ else()
nlohmann_json::nlohmann_json nlohmann_json::nlohmann_json
opus opus
ogg ogg
rnnoise
) )
endif() endif()

View File

@ -18,10 +18,10 @@
"model": "claude-3-5-haiku-20241022", "model": "claude-3-5-haiku-20241022",
"max_tokens": 1024, "max_tokens": 1024,
"temperature": 0.3, "temperature": 0.3,
"system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre." "system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre. Si du contenu offensant est présent, remplace-le par [contenu offensant] mais traduis impérativement le reste de la phrase."
}, },
"ui": { "ui": {
"window_width": 1200, "window_width": 1500,
"window_height": 800, "window_height": 800,
"font_size": 24, "font_size": 24,
"max_display_lines": 50 "max_display_lines": 50

View File

@ -1,14 +1,14 @@
#include "AudioCapture.h" #include "AudioCapture.h"
#include "NoiseReducer.h"
#include <iostream> #include <iostream>
namespace secondvoice { namespace secondvoice {
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds) AudioCapture::AudioCapture(int sample_rate, int channels)
: sample_rate_(sample_rate) : sample_rate_(sample_rate)
, channels_(channels) , channels_(channels)
, chunk_duration_seconds_(chunk_duration_seconds) , noise_reducer_(std::make_unique<NoiseReducer>()) {
, chunk_step_seconds_(chunk_step_seconds > 0 ? chunk_step_seconds : chunk_duration_seconds) { std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
// If step not specified, use duration (no overlap)
} }
AudioCapture::~AudioCapture() { AudioCapture::~AudioCapture() {
@ -35,31 +35,212 @@ int AudioCapture::audioCallback(const void* input, void* output,
const PaStreamCallbackTimeInfo* time_info, const PaStreamCallbackTimeInfo* time_info,
PaStreamCallbackFlags status_flags, PaStreamCallbackFlags status_flags,
void* user_data) { void* user_data) {
(void)output; // Unused (void)output;
(void)time_info; // Unused (void)time_info;
(void)status_flags; // Unused (void)status_flags;
AudioCapture* self = static_cast<AudioCapture*>(user_data); AudioCapture* self = static_cast<AudioCapture*>(user_data);
const float* in = static_cast<const float*>(input); const float* in = static_cast<const float*>(input);
unsigned long sample_count = frame_count * self->channels_;
// Accumulate audio data // === REAL-TIME DENOISING ===
for (unsigned long i = 0; i < frame_count * self->channels_; ++i) { // Process audio through RNNoise in real-time for meter display
self->buffer_.push_back(in[i]); std::vector<float> denoised_samples;
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
denoised_samples = self->noise_reducer_->processRealtime(in, sample_count);
} }
// Check if we have accumulated enough data for a chunk // Calculate RMS and Peak from RAW audio (for VAD detection)
size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_; float raw_sum_squared = 0.0f;
size_t step_samples = self->sample_rate_ * self->channels_ * self->chunk_step_seconds_; float raw_max_amp = 0.0f;
for (unsigned long i = 0; i < sample_count; ++i) {
if (self->buffer_.size() >= chunk_samples) { float sample = in[i];
// Call the callback with the full chunk raw_sum_squared += sample * sample;
if (self->callback_) { if (std::abs(sample) > raw_max_amp) {
// Send exactly chunk_samples (the window) raw_max_amp = std::abs(sample);
std::vector<float> chunk(self->buffer_.begin(), self->buffer_.begin() + chunk_samples); }
self->callback_(chunk); }
float instant_rms = std::sqrt(raw_sum_squared / sample_count);
float instant_peak = raw_max_amp;
// Calculate RMS and Peak from DENOISED audio (for UI meter)
float denoised_rms = 0.0f;
float denoised_peak = 0.0f;
if (!denoised_samples.empty()) {
float sum_squared = 0.0f;
float max_amp = 0.0f;
for (const float& sample : denoised_samples) {
sum_squared += sample * sample;
if (std::abs(sample) > max_amp) {
max_amp = std::abs(sample);
}
}
denoised_rms = std::sqrt(sum_squared / denoised_samples.size());
denoised_peak = max_amp;
} else {
// Fallback to raw if no denoised output yet
denoised_rms = instant_rms;
denoised_peak = instant_peak;
}
// Smooth RMS/Peak for display (fast attack, slow decay) - USE DENOISED VALUES
constexpr float alpha_rise = 0.1f;
constexpr float alpha_fall = 0.02f;
float old_rms = self->current_rms_.load(std::memory_order_relaxed);
float old_peak = self->current_peak_.load(std::memory_order_relaxed);
float alpha_rms = (denoised_rms > old_rms) ? alpha_rise : alpha_fall;
float alpha_peak = (denoised_peak > old_peak) ? alpha_rise : alpha_fall;
self->current_rms_.store(alpha_rms * denoised_rms + (1.0f - alpha_rms) * old_rms, std::memory_order_relaxed);
self->current_peak_.store(alpha_peak * denoised_peak + (1.0f - alpha_peak) * old_peak, std::memory_order_relaxed);
// === VAD NOW WORKS ON FILTERED AUDIO ===
// This way, filtered noise/claps won't trigger VAD!
// Calculate Zero-Crossing Rate on DENOISED audio
float zcr = 0.0f;
if (!denoised_samples.empty() && denoised_samples.size() > 1) {
int zero_crossings = 0;
for (size_t i = 1; i < denoised_samples.size(); ++i) {
if ((denoised_samples[i] >= 0 && denoised_samples[i-1] < 0) ||
(denoised_samples[i] < 0 && denoised_samples[i-1] >= 0)) {
zero_crossings++;
}
}
zcr = static_cast<float>(zero_crossings) / denoised_samples.size();
}
self->last_zcr_ = zcr;
// Get user-adjustable thresholds
float rms_thresh = self->vad_rms_threshold_.load(std::memory_order_relaxed);
float peak_thresh = self->vad_peak_threshold_.load(std::memory_order_relaxed);
// Adaptive noise floor: track minimum DENOISED energy level
if (denoised_rms < self->noise_floor_ * 2.0f) {
self->noise_floor_ = self->noise_floor_ * (1.0f - self->noise_floor_alpha_) +
denoised_rms * self->noise_floor_alpha_;
}
// Dynamic threshold: noise floor + user threshold as margin
float adaptive_rms_thresh = self->noise_floor_ + rms_thresh;
// Speech detection on DENOISED audio:
// 1. Energy above adaptive threshold
// 2. ZCR in speech range (not too high = noise, not too low = silence)
bool energy_ok = (denoised_rms >= adaptive_rms_thresh) || (denoised_peak >= peak_thresh);
bool zcr_ok = (zcr >= self->speech_zcr_min_) && (zcr <= self->speech_zcr_max_);
// Speech = energy OK AND (ZCR OK or very high energy)
bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f);
// Hang time logic: don't immediately cut on silence
if (frame_has_speech) {
self->hang_frames_ = self->hang_frames_threshold_; // Reset hang counter
} else if (self->hang_frames_ > 0) {
self->hang_frames_--;
frame_has_speech = true; // Keep "speaking" during hang time
}
// Calculate durations in samples
int silence_samples_threshold = (self->silence_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
if (frame_has_speech) {
// Speech detected
self->is_speech_active_.store(true, std::memory_order_relaxed);
self->silence_samples_count_ = 0;
// Add DENOISED audio to buffer (already processed by processRealtime above)
if (!denoised_samples.empty()) {
self->speech_buffer_.insert(self->speech_buffer_.end(),
denoised_samples.begin(), denoised_samples.end());
} else {
// Fallback to raw if denoising disabled
for (unsigned long i = 0; i < sample_count; ++i) {
self->speech_buffer_.push_back(in[i]);
}
}
self->speech_samples_count_ += sample_count;
// Force flush if too long
if (self->speech_samples_count_ >= max_speech_samples) {
std::cout << "[VAD] Max duration reached, forcing flush ("
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
// Flush any remaining samples from the denoiser
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
auto remaining = self->noise_reducer_->flushStream();
self->speech_buffer_.insert(self->speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
self->noise_reducer_->savePreview(self->speech_buffer_);
}
self->callback_(self->speech_buffer_);
}
self->speech_buffer_.clear();
self->speech_samples_count_ = 0;
// Reset stream for next segment
if (self->noise_reducer_) {
self->noise_reducer_->resetStream();
}
}
} else {
// True silence (after hang time expired)
self->silence_samples_count_ += sample_count;
// If we were speaking and now have enough silence, flush
if (self->speech_buffer_.size() > 0) {
// Add trailing silence (denoised)
if (!denoised_samples.empty()) {
self->speech_buffer_.insert(self->speech_buffer_.end(),
denoised_samples.begin(), denoised_samples.end());
} else {
for (unsigned long i = 0; i < sample_count; ++i) {
self->speech_buffer_.push_back(in[i]);
}
}
if (self->silence_samples_count_ >= silence_samples_threshold) {
self->is_speech_active_.store(false, std::memory_order_relaxed);
// Flush if we have enough speech
if (self->speech_samples_count_ >= min_speech_samples) {
// Flush any remaining samples from the denoiser
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
auto remaining = self->noise_reducer_->flushStream();
self->speech_buffer_.insert(self->speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
self->noise_reducer_->savePreview(self->speech_buffer_);
}
float duration = static_cast<float>(self->speech_buffer_.size()) /
(self->sample_rate_ * self->channels_);
std::cout << "[VAD] Speech ended (noise_floor=" << self->noise_floor_
<< "), flushing " << duration << "s (denoised)" << std::endl;
if (self->callback_) {
self->callback_(self->speech_buffer_);
}
} else {
std::cout << "[VAD] Speech too short (" << self->speech_samples_count_
<< " samples), discarding" << std::endl;
}
self->speech_buffer_.clear();
self->speech_samples_count_ = 0;
// Reset stream for next segment
if (self->noise_reducer_) {
self->noise_reducer_->resetStream();
}
}
} else {
self->is_speech_active_.store(false, std::memory_order_relaxed);
} }
// Sliding window: remove only step_samples, keep overlap for next chunk
self->buffer_.erase(self->buffer_.begin(), self->buffer_.begin() + step_samples);
} }
return paContinue; return paContinue;
@ -71,7 +252,9 @@ bool AudioCapture::start(AudioCallback callback) {
} }
callback_ = callback; callback_ = callback;
buffer_.clear(); speech_buffer_.clear();
silence_samples_count_ = 0;
speech_samples_count_ = 0;
PaStreamParameters input_params; PaStreamParameters input_params;
input_params.device = Pa_GetDefaultInputDevice(); input_params.device = Pa_GetDefaultInputDevice();
@ -88,7 +271,7 @@ bool AudioCapture::start(AudioCallback callback) {
PaError err = Pa_OpenStream( PaError err = Pa_OpenStream(
&stream_, &stream_,
&input_params, &input_params,
nullptr, // no output nullptr,
sample_rate_, sample_rate_,
paFramesPerBufferUnspecified, paFramesPerBufferUnspecified,
paClipOff, paClipOff,
@ -108,6 +291,9 @@ bool AudioCapture::start(AudioCallback callback) {
} }
is_recording_ = true; is_recording_ = true;
std::cout << "[VAD] Started with RMS threshold=" << vad_rms_threshold_
<< ", Peak threshold=" << vad_peak_threshold_
<< ", Silence duration=" << silence_duration_ms_ << "ms" << std::endl;
return true; return true;
} }
@ -116,8 +302,42 @@ void AudioCapture::stop() {
return; return;
} }
// Flush any remaining audio
if (speech_buffer_.size() > 0 && callback_) {
int min_speech_samples = (min_speech_duration_ms_ * sample_rate_ * channels_) / 1000;
if (speech_samples_count_ >= min_speech_samples) {
std::cout << "[VAD] Flushing remaining audio on stop (denoised)" << std::endl;
// Flush any remaining samples from the denoiser
if (noise_reducer_ && noise_reducer_->isEnabled()) {
auto remaining = noise_reducer_->flushStream();
speech_buffer_.insert(speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
noise_reducer_->savePreview(speech_buffer_);
}
callback_(speech_buffer_);
}
}
// Reset the stream for next session
if (noise_reducer_) {
noise_reducer_->resetStream();
}
Pa_StopStream(stream_); Pa_StopStream(stream_);
is_recording_ = false; is_recording_ = false;
} }
void AudioCapture::setDenoiseEnabled(bool enabled) {
if (noise_reducer_) {
noise_reducer_->setEnabled(enabled);
}
}
bool AudioCapture::isDenoiseEnabled() const {
return noise_reducer_ && noise_reducer_->isEnabled();
}
} // namespace secondvoice } // namespace secondvoice

View File

@ -3,16 +3,20 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <functional> #include <functional>
#include <atomic>
#include <cmath>
#include <memory>
#include <portaudio.h> #include <portaudio.h>
namespace secondvoice { namespace secondvoice {
class NoiseReducer;
class AudioCapture { class AudioCapture {
public: public:
using AudioCallback = std::function<void(const std::vector<float>&)>; using AudioCallback = std::function<void(const std::vector<float>&)>;
// chunk_duration = window size, chunk_step = how often to send (overlap = duration - step) AudioCapture(int sample_rate, int channels);
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds = 0);
~AudioCapture(); ~AudioCapture();
bool initialize(); bool initialize();
@ -20,6 +24,26 @@ public:
void stop(); void stop();
bool isRecording() const { return is_recording_; } bool isRecording() const { return is_recording_; }
// Real-time audio levels (updated every callback, ~10ms)
float getCurrentRMS() const { return current_rms_; }
float getCurrentPeak() const { return current_peak_; }
// VAD settings (can be adjusted in real-time from UI)
void setVadThresholds(float rms_threshold, float peak_threshold) {
vad_rms_threshold_ = rms_threshold;
vad_peak_threshold_ = peak_threshold;
}
void setSilenceDuration(int ms) { silence_duration_ms_ = ms; }
void setMinSpeechDuration(int ms) { min_speech_duration_ms_ = ms; }
void setMaxSpeechDuration(int ms) { max_speech_duration_ms_ = ms; }
// VAD state (for UI display)
bool isSpeechDetected() const { return is_speech_active_; }
// Noise reduction
void setDenoiseEnabled(bool enabled);
bool isDenoiseEnabled() const;
private: private:
static int audioCallback(const void* input, void* output, static int audioCallback(const void* input, void* output,
unsigned long frame_count, unsigned long frame_count,
@ -29,13 +53,45 @@ private:
int sample_rate_; int sample_rate_;
int channels_; int channels_;
int chunk_duration_seconds_;
int chunk_step_seconds_; // How often to emit chunks (0 = same as duration, no overlap)
bool is_recording_ = false; bool is_recording_ = false;
PaStream* stream_ = nullptr; PaStream* stream_ = nullptr;
AudioCallback callback_; AudioCallback callback_;
std::vector<float> buffer_;
// Speech accumulation buffer
std::vector<float> speech_buffer_;
// VAD state
std::atomic<bool> is_speech_active_{false};
int silence_samples_count_ = 0; // How many samples of silence
int speech_samples_count_ = 0; // How many samples of speech accumulated
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
int silence_duration_ms_ = 400; // Wait 400ms of silence before cutting
int min_speech_duration_ms_ = 300; // Minimum speech to send
int max_speech_duration_ms_ = 25000; // 25s max before forced flush
// Adaptive noise floor
float noise_floor_ = 0.005f; // Estimated background noise level
float noise_floor_alpha_ = 0.001f; // Slower adaptation
// Hang time - wait before cutting to avoid mid-sentence cuts
int hang_frames_ = 0;
int hang_frames_threshold_ = 20; // ~200ms tolerance for pauses
// Zero-crossing rate for speech vs noise discrimination
float last_zcr_ = 0.0f;
float speech_zcr_min_ = 0.01f; // More permissive lower bound
float speech_zcr_max_ = 0.25f; // More permissive upper bound
// Real-time audio levels (atomic for thread safety)
std::atomic<float> current_rms_{0.0f};
std::atomic<float> current_peak_{0.0f};
// Noise reduction
std::unique_ptr<NoiseReducer> noise_reducer_;
}; };
} // namespace secondvoice } // namespace secondvoice

249
src/audio/NoiseReducer.cpp Normal file
View File

@ -0,0 +1,249 @@
#include "NoiseReducer.h"
#include "AudioBuffer.h"
#include <rnnoise.h>
#include <cstring>
#include <algorithm>
#include <iostream>
#include <filesystem>
namespace secondvoice {
// Construct the denoiser: allocate the RNNoise state and size the
// fixed scratch buffers used by the resampling chain.
NoiseReducer::NoiseReducer() {
    // NOTE(review): the model argument to rnnoise_create() was added after
    // v0.1.1; with the v0.1.1 tag pinned in CMake the function is declared
    // with no parameters — confirm this call compiles against the fetched
    // header.
    state_ = rnnoise_create(nullptr);
    // 48 kHz scratch frame fed to RNNoise (480 samples = 10 ms).
    upsample_buffer_.resize(RNNOISE_FRAME_SIZE);
    // NOTE(review): downsample_buffer_ is sized here but the processing
    // paths visible in this file write downsampled output directly into
    // caller/output buffers — possibly unused; verify before relying on it.
    downsample_buffer_.resize(INPUT_FRAME_SIZE);
}
// Tear down the denoiser, releasing the RNNoise state if one was created.
NoiseReducer::~NoiseReducer() {
    if (state_ != nullptr) {
        rnnoise_destroy(state_);
    }
}
// Upsample 16 kHz -> 48 kHz (3x) by linear interpolation: every input
// sample expands into three output samples interpolated toward its
// successor (the last sample is simply held).
void NoiseReducer::upsample(const float* in, float* out, int in_samples) {
    for (int src = 0; src < in_samples; ++src) {
        const float a = in[src];
        const float b = (src + 1 < in_samples) ? in[src + 1] : a;
        const float step = b - a;
        float* dst = out + src * 3;
        dst[0] = a;
        dst[1] = a + step * 0.333f;
        dst[2] = a + step * 0.667f;
    }
}
// Downsample 48 kHz -> 16 kHz (3x): each output sample is the mean of
// three consecutive input samples (a cheap low-pass before decimation).
void NoiseReducer::downsample(const float* in, float* out, int out_samples) {
    for (int dst = 0; dst < out_samples; ++dst) {
        const float* src = in + dst * 3;
        out[dst] = (src[0] + src[1] + src[2]) / 3.0f;
    }
}
// Transient suppressor: attenuates sudden spikes (claps, clicks, pops)
// while preserving speech, which has smoother amplitude changes.
// Mutates `samples` in place. Relies on the persistent envelope_ member,
// so frames must be fed in stream order.
void NoiseReducer::suppressTransients(float* samples, size_t count) {
    int transients_detected = 0;
    for (size_t i = 0; i < count; ++i) {
        float abs_sample = std::abs(samples[i]);
        // IMPORTANT: Use envelope BEFORE updating for transient detection
        // Otherwise the envelope rises with the clap and we miss it!
        // The 0.001f floor also guards the division below.
        float safe_envelope = std::max(envelope_, 0.001f);
        float ratio = abs_sample / safe_envelope;
        bool is_transient = (ratio > transient_threshold_);
        if (is_transient) {
            // This is a transient - attenuate it heavily, down to
            // transient_ratio_ of the current envelope. abs_sample > 0 here
            // (ratio exceeded the threshold), so the division is safe.
            float target_amplitude = safe_envelope * transient_ratio_;
            float attenuation = target_amplitude / abs_sample;
            samples[i] *= attenuation;
            transients_detected++;
            // DON'T update envelope from transients - keep it stable
            // This prevents the envelope from "learning" the clap
        } else {
            // Only update envelope from non-transient samples:
            // fast attack when the signal rises, slow release as it falls.
            if (abs_sample > envelope_) {
                envelope_ = envelope_ * (1.0f - envelope_attack_) + abs_sample * envelope_attack_;
            } else {
                envelope_ = envelope_ * (1.0f - envelope_release_) + abs_sample * envelope_release_;
            }
        }
    }
    // Only log sustained bursts (>10 attenuated samples) to avoid log spam.
    if (transients_detected > 10) {
        std::cout << "[RNNoise] Suppressed " << transients_detected << " transient samples (clap/click)" << std::endl;
    }
}
// Batch denoiser: rewrites `samples` in place, one 10 ms frame at a time.
// Unlike processRealtime(), batch mode keeps no carry-over state: any
// samples beyond the last complete frame are left untouched (NOT denoised)
// and are not revisited by a later call.
void NoiseReducer::process(std::vector<float>& samples) {
    if (!enabled_ || !state_ || samples.empty()) {
        return;
    }
    // Process in chunks of INPUT_FRAME_SIZE (160 samples = 10ms at 16kHz)
    size_t num_frames = samples.size() / INPUT_FRAME_SIZE;
    float duration_ms = static_cast<float>(samples.size()) / 16.0f; // 16 samples/ms at 16kHz
    std::cout << "[RNNoise] Denoising " << duration_ms << "ms of audio (" << num_frames << " frames)" << std::endl;
    for (size_t frame = 0; frame < num_frames; ++frame) {
        float* frame_ptr = samples.data() + frame * INPUT_FRAME_SIZE;
        // Upsample 16kHz -> 48kHz (RNNoise operates on 48 kHz frames)
        upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);
        // RNNoise expects float input in range [-32768, 32768] (like int16)
        // Our input is [-1, 1], so scale up
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] *= 32768.0f;
        }
        // Apply RNNoise (in-place: input and output share the buffer)
        rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());
        // Scale back to [-1, 1]
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] /= 32768.0f;
            // Clamp to prevent any overflow
            upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
        }
        // Downsample 48kHz -> 16kHz back into the caller's buffer
        downsample(upsample_buffer_.data(), frame_ptr, INPUT_FRAME_SIZE);
    }
    // Trailing partial frame (< INPUT_FRAME_SIZE samples) is intentionally
    // left as-is; batch mode has no stream buffer to carry it over with.
}
// Denoise a copy of `samples`, leaving the caller's buffer untouched.
// Returns the denoised audio (identical contract to process(), out-of-place).
std::vector<float> NoiseReducer::processCopy(const std::vector<float>& samples) {
    std::vector<float> denoised(samples);
    process(denoised);
    return denoised;
}
// Streaming denoiser: buffers incoming samples and emits denoised audio in
// whole 10 ms frames (INPUT_FRAME_SIZE samples at 16 kHz). Output may lag
// input by up to one frame; the partial tail stays in stream_input_buffer_
// until the next call or flushStream().
//   samples/count : float samples in [-1, 1]
//   returns       : denoised samples (pass-through when disabled)
std::vector<float> NoiseReducer::processRealtime(const float* samples, size_t count) {
    std::vector<float> output;
    if (!enabled_ || !state_ || count == 0) {
        // Denoising off (or no state): pass the input through unchanged.
        output.assign(samples, samples + count);
        return output;
    }

    // Append the new samples to the pending stream buffer.
    stream_input_buffer_.insert(stream_input_buffer_.end(), samples, samples + count);

    const size_t frame_size = static_cast<size_t>(INPUT_FRAME_SIZE);
    size_t processed = 0;  // samples consumed from the front of the buffer

    // Process every complete frame in place, tracking an offset instead of
    // erasing from the vector front per frame (the old per-frame erase was
    // O(n^2) in the audio path) and writing straight into `output` instead
    // of going through a per-frame temporary vector.
    while (stream_input_buffer_.size() - processed >= frame_size) {
        float* frame_ptr = stream_input_buffer_.data() + processed;

        // Step 1: Suppress transients (claps, clicks, pops) BEFORE RNNoise
        suppressTransients(frame_ptr, INPUT_FRAME_SIZE);

        // Step 2: Upsample 16kHz -> 48kHz (RNNoise operates at 48 kHz)
        upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);

        // RNNoise expects int16-range floats; our samples are in [-1, 1].
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] *= 32768.0f;
        }

        // Apply RNNoise (in-place)
        rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());

        // Scale back to [-1, 1], clamped to guard against overshoot.
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] /= 32768.0f;
            upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
        }

        // Step 3: Downsample 48kHz -> 16kHz directly into the output.
        const size_t out_pos = output.size();
        output.resize(out_pos + frame_size);
        downsample(upsample_buffer_.data(), output.data() + out_pos, INPUT_FRAME_SIZE);

        processed += frame_size;
    }

    // Drop all consumed samples with a single erase; the partial tail stays.
    stream_input_buffer_.erase(stream_input_buffer_.begin(),
                               stream_input_buffer_.begin() + processed);
    return output;
}
// Drain whatever remains in the streaming buffer. The tail is zero-padded
// up to a whole frame so RNNoise can process it; stream_input_buffer_ is
// empty on return. When denoising is disabled (or there is nothing
// pending) the leftover raw samples are returned unmodified.
std::vector<float> NoiseReducer::flushStream() {
    std::vector<float> output;
    if (!enabled_ || !state_ || stream_input_buffer_.empty()) {
        // Return any remaining samples as-is.
        output = std::move(stream_input_buffer_);
        stream_input_buffer_.clear();  // moved-from: reset to a known-empty state
        return output;
    }

    const size_t frame_size = static_cast<size_t>(INPUT_FRAME_SIZE);
    // Zero-pad once so the buffer holds an exact number of frames, instead
    // of pushing zeros one sample at a time inside the processing loop.
    const size_t padded_size =
        ((stream_input_buffer_.size() + frame_size - 1) / frame_size) * frame_size;
    stream_input_buffer_.resize(padded_size, 0.0f);

    output.reserve(padded_size);
    // Same chain as processRealtime: transient suppression, upsample,
    // RNNoise at int16 scale, rescale/clamp, downsample. Frames are walked
    // by offset and the buffer cleared once at the end (no per-frame erase).
    for (size_t offset = 0; offset < padded_size; offset += frame_size) {
        float* frame_ptr = stream_input_buffer_.data() + offset;

        suppressTransients(frame_ptr, INPUT_FRAME_SIZE);
        upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] *= 32768.0f;
        }
        rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] /= 32768.0f;
            upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
        }
        const size_t out_pos = output.size();
        output.resize(out_pos + frame_size);
        downsample(upsample_buffer_.data(), output.data() + out_pos, INPUT_FRAME_SIZE);
    }

    stream_input_buffer_.clear();
    return output;
}
// Write the denoised segment to ./denoised/denoised_<n>.ogg so the result
// can be auditioned. No-op when previews are disabled or there is no audio.
// preview_count_ provides a monotonically increasing filename suffix.
void NoiseReducer::savePreview(const std::vector<float>& samples) {
    if (!save_preview_ || samples.empty()) {
        return;
    }
    std::filesystem::create_directories("./denoised");
    std::string filename = "./denoised/denoised_" + std::to_string(preview_count_++) + ".ogg";
    // AudioBuffer handles the Opus/Ogg encoding; 16 kHz mono matches the
    // pipeline's working format.
    AudioBuffer buffer(16000, 1); // 16kHz mono
    buffer.addSamples(samples);
    if (buffer.saveToOpus(filename)) {
        std::cout << "[RNNoise] Saved denoised preview: " << filename << std::endl;
    }
    // NOTE(review): this does directory creation and file I/O, and the
    // call sites in AudioCapture appear to be on the PortAudio callback
    // path — confirm that latency/RT-safety is acceptable there.
}
} // namespace secondvoice

86
src/audio/NoiseReducer.h Normal file
View File

@ -0,0 +1,86 @@
#pragma once
#include <vector>
#include <memory>
// Forward declaration
struct DenoiseState;
namespace secondvoice {
/**
 * NoiseReducer - Wrapper around RNNoise for real-time audio denoising.
 *
 * RNNoise expects 48kHz audio, but the pipeline works at 16kHz; this
 * wrapper handles the 3x up/down resampling transparently. It also runs a
 * transient suppressor (claps/clicks/pops) ahead of RNNoise. Not
 * thread-safe: all processing calls share the streaming/envelope state.
 */
class NoiseReducer {
public:
    NoiseReducer();
    ~NoiseReducer();
    // Process audio samples in-place (batch mode - for final output).
    // Input/output: float samples normalized to [-1, 1]. A trailing partial
    // frame is left untouched (batch mode keeps no carry-over state).
    void process(std::vector<float>& samples);
    // Process a buffer, returns denoised copy (out-of-place variant).
    std::vector<float> processCopy(const std::vector<float>& samples);
    // Real-time streaming: process samples as they come in.
    // Returns denoised samples (may be slightly delayed due to frame buffering).
    std::vector<float> processRealtime(const float* samples, size_t count);
    // Flush any remaining samples in the stream buffer (zero-padded if needed).
    std::vector<float> flushStream();
    // Clear the stream buffer (call when starting a new segment).
    void resetStream() { stream_input_buffer_.clear(); }
    // Enable/disable denoising (disabled = pass-through).
    void setEnabled(bool enabled) { enabled_ = enabled; }
    bool isEnabled() const { return enabled_; }
    // Toggle saving of denoised audio previews to disk.
    void setSavePreview(bool save) { save_preview_ = save; }
    bool isSavePreviewEnabled() const { return save_preview_; }
    // Save audio samples to Opus file (for preview/listening).
    void savePreview(const std::vector<float>& samples);
private:
    DenoiseState* state_ = nullptr;     // RNNoise handle (null if create failed)
    bool enabled_ = true;               // pass-through when false
    bool save_preview_ = true;          // save denoised audio for listening
    int preview_count_ = 0;             // suffix for preview filenames
    // RNNoise frame size (480 samples at 48kHz = 10ms)
    static constexpr int RNNOISE_FRAME_SIZE = 480;
    // Our frame size (160 samples at 16kHz = 10ms)
    static constexpr int INPUT_FRAME_SIZE = 160;
    // Resampling buffers (48 kHz scratch frame, 16 kHz scratch frame)
    std::vector<float> upsample_buffer_;
    std::vector<float> downsample_buffer_;
    // Streaming input buffer (for real-time processing; holds partial frames)
    std::vector<float> stream_input_buffer_;
    // Transient suppressor state
    float envelope_ = 0.0f;             // smoothed signal envelope
    float envelope_attack_ = 0.01f;     // fast attack to track rising signal
    float envelope_release_ = 0.0005f;  // slow release to maintain level
    float transient_threshold_ = 4.0f;  // spike must be 4x envelope to count as transient
    float transient_ratio_ = 0.1f;      // transients attenuated to 10% of envelope (90% reduction)
    // Suppress transient sounds (claps, clicks, pops); mutates in place.
    void suppressTransients(float* samples, size_t count);
    // Upsample 16kHz to 48kHz (3x linear interpolation).
    void upsample(const float* in, float* out, int in_samples);
    // Downsample 48kHz to 16kHz (3x, averaging triples).
    void downsample(const float* in, float* out, int out_samples);
};
} // namespace secondvoice

View File

@ -24,18 +24,13 @@ Pipeline::~Pipeline() {
bool Pipeline::initialize() { bool Pipeline::initialize() {
auto& config = Config::getInstance(); auto& config = Config::getInstance();
// Initialize audio capture with sliding window (overlap) // Initialize audio capture with VAD-based segmentation
audio_capture_ = std::make_unique<AudioCapture>( audio_capture_ = std::make_unique<AudioCapture>(
config.getAudioConfig().sample_rate, config.getAudioConfig().sample_rate,
config.getAudioConfig().channels, config.getAudioConfig().channels
config.getAudioConfig().chunk_duration_seconds,
config.getAudioConfig().chunk_step_seconds
); );
std::cout << "[Pipeline] Audio: " << config.getAudioConfig().chunk_duration_seconds std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;
<< "s window, " << config.getAudioConfig().chunk_step_seconds
<< "s step (overlap: " << (config.getAudioConfig().chunk_duration_seconds - config.getAudioConfig().chunk_step_seconds)
<< "s)" << std::endl;
if (!audio_capture_->initialize()) { if (!audio_capture_->initialize()) {
std::cerr << "Failed to initialize audio capture" << std::endl; std::cerr << "Failed to initialize audio capture" << std::endl;
@ -76,7 +71,7 @@ bool Pipeline::start() {
running_ = true; running_ = true;
// Start background threads (NOT UI - that runs in main thread) // Start background threads
audio_thread_ = std::thread(&Pipeline::audioThread, this); audio_thread_ = std::thread(&Pipeline::audioThread, this);
processing_thread_ = std::thread(&Pipeline::processingThread, this); processing_thread_ = std::thread(&Pipeline::processingThread, this);
@ -95,9 +90,8 @@ void Pipeline::stop() {
audio_capture_->stop(); audio_capture_->stop();
} }
// Shutdown queues // Shutdown queue
audio_queue_.shutdown(); audio_queue_.shutdown();
transcription_queue_.shutdown();
// Wait for background threads // Wait for background threads
if (audio_thread_.joinable()) { if (audio_thread_.joinable()) {
@ -146,7 +140,7 @@ void Pipeline::audioThread() {
// Keep thread alive while recording // Keep thread alive while recording
auto start_time = std::chrono::steady_clock::now(); auto start_time = std::chrono::steady_clock::now();
while (running_ && audio_capture_->isRecording()) { while (running_ && audio_capture_->isRecording()) {
std::this_thread::sleep_for(std::chrono::seconds(1)); std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Update duration // Update duration
auto now = std::chrono::steady_clock::now(); auto now = std::chrono::steady_clock::now();
@ -157,11 +151,6 @@ void Pipeline::audioThread() {
void Pipeline::processingThread() { void Pipeline::processingThread() {
auto& config = Config::getInstance(); auto& config = Config::getInstance();
// VAD threshold - audio RMS must exceed this to be considered speech
// Higher values = more aggressive noise filtering
constexpr float VAD_THRESHOLD = 0.02f; // RMS energy threshold
constexpr float VAD_MIN_PEAK = 0.08f; // Minimum peak amplitude
while (running_) { while (running_) {
auto chunk_opt = audio_queue_.wait_and_pop(); auto chunk_opt = audio_queue_.wait_and_pop();
if (!chunk_opt.has_value()) { if (!chunk_opt.has_value()) {
@ -169,22 +158,8 @@ void Pipeline::processingThread() {
} }
auto& chunk = chunk_opt.value(); auto& chunk = chunk_opt.value();
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
// Voice Activity Detection - skip silent/noise chunks std::cout << "[Processing] Speech segment: " << duration << "s" << std::endl;
float sum_squared = 0.0f;
float max_amplitude = 0.0f;
for (const float sample : chunk.data) {
sum_squared += sample * sample;
max_amplitude = std::max(max_amplitude, std::abs(sample));
}
float rms = std::sqrt(sum_squared / chunk.data.size());
if (rms < VAD_THRESHOLD || max_amplitude < VAD_MIN_PEAK) {
std::cout << "[Skip] Silent chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
continue;
}
std::cout << "[Audio] Processing chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
// Transcribe with Whisper // Transcribe with Whisper
auto whisper_result = whisper_client_->transcribe( auto whisper_result = whisper_client_->transcribe(
@ -203,8 +178,9 @@ void Pipeline::processingThread() {
continue; continue;
} }
// Filter out Whisper hallucinations (common when audio is silent/unclear) // Filter out Whisper hallucinations
std::string text = whisper_result->text; std::string text = whisper_result->text;
// Trim whitespace // Trim whitespace
size_t start = text.find_first_not_of(" \t\n\r"); size_t start = text.find_first_not_of(" \t\n\r");
size_t end = text.find_last_not_of(" \t\n\r"); size_t end = text.find_last_not_of(" \t\n\r");
@ -215,28 +191,61 @@ void Pipeline::processingThread() {
text = text.substr(start, end - start + 1); text = text.substr(start, end - start + 1);
// Skip known hallucinations and garbage // Skip known hallucinations and garbage
// Whisper hallucinates these when given silence/noise
bool is_garbage = false; bool is_garbage = false;
// Too short to be meaningful
if (text.length() < 4) { if (text.length() < 4) {
is_garbage = true; is_garbage = true;
} } else if (
// Known Whisper hallucinations // Common Whisper hallucinations
else if (text.find("Amara.org") != std::string::npos || text.find("Tingting") != std::string::npos ||
text.find("amara.org") != std::string::npos || text.find("tingting") != std::string::npos ||
text.find("字幕") != std::string::npos || text.find("婷婷") != std::string::npos ||
text.find("subtitle") != std::string::npos || text.find("Thank you") != std::string::npos ||
text.find("Subtitle") != std::string::npos || text.find("thanks for watching") != std::string::npos ||
text.find("Thank you") != std::string::npos || text.find("Thanks for watching") != std::string::npos ||
text.find("thanks for watching") != std::string::npos || text.find("Subscribe") != std::string::npos ||
text.find("Subscribe") != std::string::npos || text.find("subscribe") != std::string::npos ||
text.find("谢谢观看") != std::string::npos || text.find("請訂閱") != std::string::npos ||
text.find("订阅") != std::string::npos || text.find("请订阅") != std::string::npos ||
text.find("...") == 0) { // Starts with ellipsis text.find("訂閱") != std::string::npos ||
text.find("订阅") != std::string::npos ||
text.find("Amara.org") != std::string::npos ||
text.find("amara.org") != std::string::npos ||
text.find("字幕") != std::string::npos ||
text.find("subtitle") != std::string::npos ||
text.find("Subtitle") != std::string::npos ||
text.find("谢谢观看") != std::string::npos ||
text.find("謝謝觀看") != std::string::npos ||
text.find("感谢收看") != std::string::npos ||
text.find("感謝收看") != std::string::npos ||
text.find("下期再见") != std::string::npos ||
text.find("下期再見") != std::string::npos ||
text.find("再见") != std::string::npos ||
text.find("再見") != std::string::npos ||
text.find("music") != std::string::npos ||
text.find("Music") != std::string::npos ||
text.find("") != std::string::npos ||
text.find("silenc") != std::string::npos || // silence, silencieux
text.find("Silenc") != std::string::npos ||
text.find("...") == 0 ||
// Repetitive single words/sounds
text == "Bonjour" ||
text == "Hello" ||
text == "Hi" ||
text == "你好" ||
text == "" ||
text == "" ||
text == "" ||
text == "" ||
text == "呵呵" ||
text == "哈哈" ||
text == "嘻嘻" ||
text == "Hmm" ||
text == "Huh" ||
text == "Um" ||
text == "Uh") {
is_garbage = true; is_garbage = true;
} } else {
// Only punctuation or whitespace
else {
bool has_content = false; bool has_content = false;
for (char c : text) { for (char c : text) {
if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) { if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) {
@ -244,9 +253,7 @@ void Pipeline::processingThread() {
break; break;
} }
} }
if (!has_content) { if (!has_content) is_garbage = true;
is_garbage = true;
}
} }
if (is_garbage) { if (is_garbage) {
@ -254,20 +261,14 @@ void Pipeline::processingThread() {
continue; continue;
} }
// Remove overlap with previous transcription // Track audio cost
std::string deduplicated = removeOverlap(text, last_transcription_); if (ui_) {
last_transcription_ = text; // Store full text for next comparison ui_->addAudioCost(duration);
if (deduplicated.empty() || deduplicated.length() < 2) {
std::cout << "[Skip] Fully overlapping transcription" << std::endl;
continue;
} }
std::cout << "[New content] " << deduplicated << std::endl; // Translate with Claude
// Translate with Claude (only the new, deduplicated content)
auto claude_result = claude_client_->translate( auto claude_result = claude_client_->translate(
deduplicated, text,
config.getClaudeConfig().system_prompt, config.getClaudeConfig().system_prompt,
config.getClaudeConfig().max_tokens, config.getClaudeConfig().max_tokens,
config.getClaudeConfig().temperature config.getClaudeConfig().temperature
@ -278,10 +279,27 @@ void Pipeline::processingThread() {
continue; continue;
} }
// Add to UI // Track Claude cost
ui_->addTranslation(deduplicated, claude_result->text); if (ui_) {
ui_->addClaudeCost();
}
std::cout << "CN: " << deduplicated << std::endl; // Simple accumulation
if (!accumulated_chinese_.empty()) {
accumulated_chinese_ += " ";
accumulated_french_ += " ";
}
accumulated_chinese_ += text;
accumulated_french_ += claude_result->text;
std::cout << "[Accumulated CN] " << accumulated_chinese_ << std::endl;
std::cout << "[Accumulated FR] " << accumulated_french_ << std::endl;
// Update UI
ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_);
ui_->addTranslation(text, claude_result->text);
std::cout << "CN: " << text << std::endl;
std::cout << "FR: " << claude_result->text << std::endl; std::cout << "FR: " << claude_result->text << std::endl;
std::cout << "---" << std::endl; std::cout << "---" << std::endl;
} }
@ -290,58 +308,45 @@ void Pipeline::processingThread() {
void Pipeline::update() { void Pipeline::update() {
if (!ui_) return; if (!ui_) return;
// Sync VAD thresholds from UI to AudioCapture
if (audio_capture_) {
audio_capture_->setVadThresholds(
ui_->getVadThreshold(),
ui_->getVadPeakThreshold()
);
// Update UI with audio levels
ui_->setCurrentRMS(audio_capture_->getCurrentRMS());
ui_->setCurrentPeak(audio_capture_->getCurrentPeak());
}
ui_->setRecordingDuration(recording_duration_); ui_->setRecordingDuration(recording_duration_);
ui_->setProcessingStatus("Processing..."); ui_->setProcessingStatus(audio_capture_ && audio_capture_->isSpeechDetected() ? "Recording speech..." : "Listening...");
ui_->render(); ui_->render();
// Check if stop was requested // Check if stop was requested
if (ui_->isStopRequested()) { if (ui_->isStopRequested()) {
running_ = false; running_ = false;
} }
// Check if clear was requested
if (ui_->isClearRequested()) {
clearAccumulated();
ui_->resetClearRequest();
}
} }
bool Pipeline::shouldClose() const { bool Pipeline::shouldClose() const {
return ui_ ? ui_->shouldClose() : true; return ui_ ? ui_->shouldClose() : true;
} }
std::string Pipeline::removeOverlap(const std::string& current, const std::string& previous) { void Pipeline::clearAccumulated() {
if (previous.empty()) { accumulated_chinese_.clear();
return current; accumulated_french_.clear();
if (ui_) {
ui_->setAccumulatedText("", "");
} }
std::cout << "[Pipeline] Cleared accumulated text" << std::endl;
// Try to find overlap: check if current starts with end of previous
// Start from half the previous text (since we have 50% overlap)
size_t min_overlap = 4; // Minimum characters to consider as overlap
size_t search_start = previous.length() / 3; // Start looking from 1/3 into previous
for (size_t i = search_start; i < previous.length() - min_overlap; ++i) {
std::string suffix = previous.substr(i);
if (current.find(suffix) == 0) {
// Found overlap - return only the new part
std::string new_part = current.substr(suffix.length());
if (!new_part.empty()) {
std::cout << "[Overlap] Removed: \"" << suffix << "\"" << std::endl;
return new_part;
}
}
}
// No overlap found - check for partial word overlap at start
// This handles cases where the same content appears but tokenized differently
size_t max_check = std::min(current.length(), previous.length()) / 2;
for (size_t len = max_check; len >= min_overlap; --len) {
std::string end_prev = previous.substr(previous.length() - len);
std::string start_curr = current.substr(0, len);
if (end_prev == start_curr) {
std::string new_part = current.substr(len);
if (!new_part.empty()) {
std::cout << "[Overlap] Partial match removed: \"" << end_prev << "\"" << std::endl;
return new_part;
}
}
}
return current; // No overlap detected
} }
} // namespace secondvoice } // namespace secondvoice

View File

@ -21,10 +21,6 @@ struct AudioChunk {
int channels; int channels;
}; };
struct TranscriptionResult {
std::string chinese_text;
};
class Pipeline { class Pipeline {
public: public:
Pipeline(); Pipeline();
@ -40,6 +36,9 @@ public:
bool isRunning() const { return running_; } bool isRunning() const { return running_; }
bool shouldClose() const; bool shouldClose() const;
// Clear accumulated translations
void clearAccumulated();
private: private:
void audioThread(); void audioThread();
void processingThread(); void processingThread();
@ -51,7 +50,6 @@ private:
std::unique_ptr<AudioBuffer> full_recording_; std::unique_ptr<AudioBuffer> full_recording_;
ThreadSafeQueue<AudioChunk> audio_queue_; ThreadSafeQueue<AudioChunk> audio_queue_;
ThreadSafeQueue<TranscriptionResult> transcription_queue_;
std::thread audio_thread_; std::thread audio_thread_;
std::thread processing_thread_; std::thread processing_thread_;
@ -59,9 +57,9 @@ private:
std::atomic<bool> running_{false}; std::atomic<bool> running_{false};
std::atomic<int> recording_duration_{0}; std::atomic<int> recording_duration_{0};
// For overlap deduplication // Simple accumulation
std::string last_transcription_; std::string accumulated_chinese_;
std::string removeOverlap(const std::string& current, const std::string& previous); std::string accumulated_french_;
}; };
} // namespace secondvoice } // namespace secondvoice

View File

@ -199,10 +199,31 @@ void TranslationUI::render() {
ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoMove |
ImGuiWindowFlags_NoCollapse); ImGuiWindowFlags_NoCollapse);
// Two-column layout: main content on left, audio panel on right
float audio_panel_width = 280.0f;
float main_width = ImGui::GetWindowWidth() - audio_panel_width - 20.0f;
// Left column - main content
ImGui::BeginChild("MainContent", ImVec2(main_width, -1), false);
ImGui::Text("SecondVoice - Live Translation");
ImGui::Separator();
ImGui::Spacing();
renderAccumulated();
renderTranslations(); renderTranslations();
renderControls(); renderControls();
renderStatus(); renderStatus();
ImGui::EndChild();
ImGui::SameLine();
// Right column - audio panel
ImGui::BeginChild("AudioPanel", ImVec2(audio_panel_width, -1), true);
renderAudioPanel();
ImGui::EndChild();
ImGui::End(); ImGui::End();
// Rendering // Rendering
@ -225,11 +246,49 @@ void TranslationUI::addTranslation(const std::string& chinese, const std::string
messages_.push_back({chinese, french}); messages_.push_back({chinese, french});
} }
void TranslationUI::renderTranslations() { void TranslationUI::setAccumulatedText(const std::string& chinese, const std::string& french) {
ImGui::Text("SecondVoice - Live Translation"); accumulated_chinese_ = chinese;
accumulated_french_ = french;
}
void TranslationUI::renderAccumulated() {
if (accumulated_chinese_.empty() && accumulated_french_.empty()) {
return;
}
ImGui::Text("Texte Recomposé");
ImGui::SameLine();
if (ImGui::SmallButton("Clear")) {
clear_requested_ = true;
}
ImGui::Separator(); ImGui::Separator();
ImGui::BeginChild("Translations", ImVec2(0, -120), true); ImGui::BeginChild("Accumulated", ImVec2(0, 150), true);
// Chinese accumulated text
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.9f, 0.5f, 1.0f)); // Yellow
ImGui::TextWrapped("%s", accumulated_chinese_.c_str());
ImGui::PopStyleColor();
ImGui::Spacing();
// French accumulated text
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 1.0f, 0.8f, 1.0f)); // Cyan
ImGui::TextWrapped("%s", accumulated_french_.c_str());
ImGui::PopStyleColor();
ImGui::EndChild();
ImGui::Spacing();
}
void TranslationUI::renderTranslations() {
ImGui::Text("Segments (détail)");
ImGui::Separator();
// Height depends on whether accumulated section is shown
float height = (accumulated_chinese_.empty() && accumulated_french_.empty()) ? -120 : -120;
ImGui::BeginChild("Translations", ImVec2(0, height), true);
for (const auto& msg : messages_) { for (const auto& msg : messages_) {
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f)); ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f));
@ -281,4 +340,101 @@ void TranslationUI::renderStatus() {
} }
} }
void TranslationUI::renderAudioPanel() {
ImGui::Text("Audio Monitor");
ImGui::Separator();
ImGui::Spacing();
// Current volume display
ImGui::Text("Volume Level");
// RMS meter (horizontal bar)
float rms_normalized = std::min(current_rms_ * 10.0f, 1.0f); // Scale for visibility
ImVec4 rms_color = rms_normalized > vad_threshold_ * 10.0f ?
ImVec4(0.2f, 0.8f, 0.2f, 1.0f) : // Green if above threshold
ImVec4(0.5f, 0.5f, 0.5f, 1.0f); // Gray if below
ImGui::PushStyleColor(ImGuiCol_PlotHistogram, rms_color);
ImGui::ProgressBar(rms_normalized, ImVec2(-1, 20), "");
ImGui::PopStyleColor();
ImGui::Text("RMS: %.4f", current_rms_);
ImGui::Spacing();
// Peak meter
ImGui::Text("Peak Level");
float peak_normalized = std::min(current_peak_, 1.0f);
ImVec4 peak_color = peak_normalized > vad_peak_threshold_ ?
ImVec4(0.2f, 0.8f, 0.2f, 1.0f) :
ImVec4(0.5f, 0.5f, 0.5f, 1.0f);
ImGui::PushStyleColor(ImGuiCol_PlotHistogram, peak_color);
ImGui::ProgressBar(peak_normalized, ImVec2(-1, 20), "");
ImGui::PopStyleColor();
ImGui::Text("Peak: %.4f", current_peak_);
ImGui::Spacing();
ImGui::Separator();
ImGui::Spacing();
// VAD Threshold sliders
ImGui::Text("VAD Settings");
ImGui::Spacing();
ImGui::Text("RMS Threshold");
ImGui::SliderFloat("##rms_thresh", &vad_threshold_, 0.001f, 0.1f, "%.3f");
// Draw threshold line on the RMS meter area
ImGui::Spacing();
ImGui::Text("Peak Threshold");
ImGui::SliderFloat("##peak_thresh", &vad_peak_threshold_, 0.01f, 0.3f, "%.3f");
ImGui::Spacing();
ImGui::Separator();
ImGui::Spacing();
// Status indicator
bool is_active = (current_rms_ >= vad_threshold_) && (current_peak_ >= vad_peak_threshold_);
if (is_active) {
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.2f, 1.0f, 0.2f, 1.0f));
ImGui::Text(">> VOICE DETECTED <<");
ImGui::PopStyleColor();
} else {
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f));
ImGui::Text(" (silence)");
ImGui::PopStyleColor();
}
ImGui::Spacing();
ImGui::Separator();
ImGui::Spacing();
// Cost tracking
ImGui::Text("API Usage");
ImGui::Spacing();
// Audio sent to Whisper
int minutes = static_cast<int>(total_audio_seconds_) / 60;
int seconds = static_cast<int>(total_audio_seconds_) % 60;
ImGui::Text("Audio: %d:%02d", minutes, seconds);
ImGui::Text("Whisper calls: %d", whisper_calls_);
ImGui::Text("Claude calls: %d", claude_calls_);
ImGui::Spacing();
// Estimated cost
float cost = getEstimatedCost();
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.8f, 0.3f, 1.0f)); // Gold
ImGui::Text("Est. cost: $%.4f", cost);
ImGui::PopStyleColor();
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f));
ImGui::Text("(Whisper $0.006/min)");
ImGui::Text("(Haiku ~$0.001/call)");
ImGui::PopStyleColor();
}
} // namespace secondvoice } // namespace secondvoice

View File

@ -23,27 +23,66 @@ public:
bool shouldClose() const; bool shouldClose() const;
void addTranslation(const std::string& chinese, const std::string& french); void addTranslation(const std::string& chinese, const std::string& french);
void setAccumulatedText(const std::string& chinese, const std::string& french);
bool isStopRequested() const { return stop_requested_; } bool isStopRequested() const { return stop_requested_; }
void resetStopRequest() { stop_requested_ = false; } void resetStopRequest() { stop_requested_ = false; }
bool isClearRequested() const { return clear_requested_; }
void resetClearRequest() { clear_requested_ = false; }
void setRecordingDuration(int seconds) { recording_duration_ = seconds; } void setRecordingDuration(int seconds) { recording_duration_ = seconds; }
void setProcessingStatus(const std::string& status) { processing_status_ = status; } void setProcessingStatus(const std::string& status) { processing_status_ = status; }
// Audio level monitoring
void setCurrentRMS(float rms) { current_rms_ = rms; }
void setCurrentPeak(float peak) { current_peak_ = peak; }
float getVadThreshold() const { return vad_threshold_; }
float getVadPeakThreshold() const { return vad_peak_threshold_; }
private: private:
void renderAccumulated();
void renderTranslations(); void renderTranslations();
void renderControls(); void renderControls();
void renderStatus(); void renderStatus();
void renderAudioPanel();
int width_; int width_;
int height_; int height_;
GLFWwindow* window_ = nullptr; GLFWwindow* window_ = nullptr;
std::vector<TranslationMessage> messages_; std::vector<TranslationMessage> messages_;
std::string accumulated_chinese_;
std::string accumulated_french_;
bool stop_requested_ = false; bool stop_requested_ = false;
bool clear_requested_ = false;
bool auto_scroll_ = true; bool auto_scroll_ = true;
int recording_duration_ = 0; int recording_duration_ = 0;
std::string processing_status_; std::string processing_status_;
// Audio monitoring
float current_rms_ = 0.0f;
float current_peak_ = 0.0f;
float vad_threshold_ = 0.02f; // 2x higher to avoid false triggers
float vad_peak_threshold_ = 0.08f; // 2x higher
// Cost tracking
float total_audio_seconds_ = 0.0f;
int whisper_calls_ = 0;
int claude_calls_ = 0;
public:
void addAudioCost(float seconds) {
total_audio_seconds_ += seconds;
whisper_calls_++;
}
void addClaudeCost() { claude_calls_++; }
float getTotalAudioSeconds() const { return total_audio_seconds_; }
float getEstimatedCost() const {
// gpt-4o-mini-transcribe: $0.006/min = $0.0001/sec
// Haiku: ~$0.001 per short translation
return (total_audio_seconds_ * 0.0001f) + (claude_calls_ * 0.001f);
}
}; };
} // namespace secondvoice } // namespace secondvoice