diff --git a/CMakeLists.txt b/CMakeLists.txt index 35d045f..071193d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.20) -project(SecondVoice VERSION 0.1.0 LANGUAGES CXX) +project(SecondVoice VERSION 0.1.0 LANGUAGES C CXX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -38,6 +38,35 @@ FetchContent_Declare( FetchContent_MakeAvailable(ogg opus) +# Fetch RNNoise for noise reduction (using older stable version) +FetchContent_Declare( + rnnoise + GIT_REPOSITORY https://github.com/xiph/rnnoise.git + GIT_TAG v0.1.1 +) + +FetchContent_MakeAvailable(rnnoise) + +# Build RNNoise as a static library (v0.1.1 structure) +add_library(rnnoise STATIC + ${rnnoise_SOURCE_DIR}/src/denoise.c + ${rnnoise_SOURCE_DIR}/src/rnn.c + ${rnnoise_SOURCE_DIR}/src/rnn_data.c + ${rnnoise_SOURCE_DIR}/src/pitch.c + ${rnnoise_SOURCE_DIR}/src/kiss_fft.c + ${rnnoise_SOURCE_DIR}/src/celt_lpc.c +) + +target_include_directories(rnnoise PUBLIC + ${rnnoise_SOURCE_DIR}/include + ${rnnoise_SOURCE_DIR}/src +) + +# RNNoise needs math library +if(UNIX) + target_link_libraries(rnnoise PRIVATE m) +endif() + # Create ImGui library with OpenGL/GLFW backends add_library(imgui_backends ${imgui_SOURCE_DIR}/imgui.cpp @@ -70,6 +99,7 @@ set(SOURCES_UI # Audio module src/audio/AudioCapture.cpp src/audio/AudioBuffer.cpp + src/audio/NoiseReducer.cpp # API clients src/api/WhisperClient.cpp src/api/ClaudeClient.cpp @@ -88,6 +118,7 @@ set(SOURCES_CONSOLE # Audio module src/audio/AudioCapture.cpp src/audio/AudioBuffer.cpp + src/audio/NoiseReducer.cpp # API clients src/api/WhisperClient.cpp src/api/ClaudeClient.cpp @@ -107,9 +138,11 @@ add_executable(${PROJECT_NAME}_Console ${SOURCES_CONSOLE}) # Include directories target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src + ${rnnoise_SOURCE_DIR}/include ) target_include_directories(${PROJECT_NAME}_Console PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src + ${rnnoise_SOURCE_DIR}/include ) # Link 
libraries @@ -124,6 +157,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS) OpenGL::GL opus ogg + rnnoise # Windows system libraries winmm setupapi @@ -136,6 +170,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS) nlohmann_json::nlohmann_json opus ogg + rnnoise # Windows system libraries winmm setupapi @@ -152,6 +187,7 @@ else() OpenGL::GL opus ogg + rnnoise ) # Console version - NO UI libs target_link_libraries(${PROJECT_NAME}_Console PRIVATE @@ -159,6 +195,7 @@ else() nlohmann_json::nlohmann_json opus ogg + rnnoise ) endif() diff --git a/config.json b/config.json index 54be627..c0edd1a 100644 --- a/config.json +++ b/config.json @@ -18,10 +18,10 @@ "model": "claude-3-5-haiku-20241022", "max_tokens": 1024, "temperature": 0.3, - "system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre." + "system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre. Si du contenu offensant est présent, remplace-le par [contenu offensant] mais traduis impérativement le reste de la phrase." }, "ui": { - "window_width": 1200, + "window_width": 1500, "window_height": 800, "font_size": 24, "max_display_lines": 50 diff --git a/src/audio/AudioCapture.cpp b/src/audio/AudioCapture.cpp index 4542083..b384df7 100644 --- a/src/audio/AudioCapture.cpp +++ b/src/audio/AudioCapture.cpp @@ -1,14 +1,14 @@ #include "AudioCapture.h" +#include "NoiseReducer.h" #include namespace secondvoice { -AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds) +AudioCapture::AudioCapture(int sample_rate, int channels) : sample_rate_(sample_rate) , channels_(channels) - , chunk_duration_seconds_(chunk_duration_seconds) - , chunk_step_seconds_(chunk_step_seconds > 0 ? 
chunk_step_seconds : chunk_duration_seconds) { - // If step not specified, use duration (no overlap) + , noise_reducer_(std::make_unique()) { + std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl; } AudioCapture::~AudioCapture() { @@ -35,31 +35,212 @@ int AudioCapture::audioCallback(const void* input, void* output, const PaStreamCallbackTimeInfo* time_info, PaStreamCallbackFlags status_flags, void* user_data) { - (void)output; // Unused - (void)time_info; // Unused - (void)status_flags; // Unused + (void)output; + (void)time_info; + (void)status_flags; AudioCapture* self = static_cast(user_data); const float* in = static_cast(input); + unsigned long sample_count = frame_count * self->channels_; - // Accumulate audio data - for (unsigned long i = 0; i < frame_count * self->channels_; ++i) { - self->buffer_.push_back(in[i]); + // === REAL-TIME DENOISING === + // Process audio through RNNoise in real-time for meter display + std::vector denoised_samples; + if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) { + denoised_samples = self->noise_reducer_->processRealtime(in, sample_count); } - // Check if we have accumulated enough data for a chunk - size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_; - size_t step_samples = self->sample_rate_ * self->channels_ * self->chunk_step_seconds_; - - if (self->buffer_.size() >= chunk_samples) { - // Call the callback with the full chunk - if (self->callback_) { - // Send exactly chunk_samples (the window) - std::vector chunk(self->buffer_.begin(), self->buffer_.begin() + chunk_samples); - self->callback_(chunk); + // Calculate RMS and Peak from RAW audio (for VAD detection) + float raw_sum_squared = 0.0f; + float raw_max_amp = 0.0f; + for (unsigned long i = 0; i < sample_count; ++i) { + float sample = in[i]; + raw_sum_squared += sample * sample; + if (std::abs(sample) > raw_max_amp) { + raw_max_amp = std::abs(sample); + } + } + float instant_rms = 
std::sqrt(raw_sum_squared / sample_count); + float instant_peak = raw_max_amp; + + // Calculate RMS and Peak from DENOISED audio (for UI meter) + float denoised_rms = 0.0f; + float denoised_peak = 0.0f; + if (!denoised_samples.empty()) { + float sum_squared = 0.0f; + float max_amp = 0.0f; + for (const float& sample : denoised_samples) { + sum_squared += sample * sample; + if (std::abs(sample) > max_amp) { + max_amp = std::abs(sample); + } + } + denoised_rms = std::sqrt(sum_squared / denoised_samples.size()); + denoised_peak = max_amp; + } else { + // Fallback to raw if no denoised output yet + denoised_rms = instant_rms; + denoised_peak = instant_peak; + } + + // Smooth RMS/Peak for display (fast attack, slow decay) - USE DENOISED VALUES + constexpr float alpha_rise = 0.1f; + constexpr float alpha_fall = 0.02f; + + float old_rms = self->current_rms_.load(std::memory_order_relaxed); + float old_peak = self->current_peak_.load(std::memory_order_relaxed); + + float alpha_rms = (denoised_rms > old_rms) ? alpha_rise : alpha_fall; + float alpha_peak = (denoised_peak > old_peak) ? alpha_rise : alpha_fall; + + self->current_rms_.store(alpha_rms * denoised_rms + (1.0f - alpha_rms) * old_rms, std::memory_order_relaxed); + self->current_peak_.store(alpha_peak * denoised_peak + (1.0f - alpha_peak) * old_peak, std::memory_order_relaxed); + + // === VAD NOW WORKS ON FILTERED AUDIO === + // This way, filtered noise/claps won't trigger VAD! 
+ + // Calculate Zero-Crossing Rate on DENOISED audio + float zcr = 0.0f; + if (!denoised_samples.empty() && denoised_samples.size() > 1) { + int zero_crossings = 0; + for (size_t i = 1; i < denoised_samples.size(); ++i) { + if ((denoised_samples[i] >= 0 && denoised_samples[i-1] < 0) || + (denoised_samples[i] < 0 && denoised_samples[i-1] >= 0)) { + zero_crossings++; + } + } + zcr = static_cast(zero_crossings) / denoised_samples.size(); + } + self->last_zcr_ = zcr; + + // Get user-adjustable thresholds + float rms_thresh = self->vad_rms_threshold_.load(std::memory_order_relaxed); + float peak_thresh = self->vad_peak_threshold_.load(std::memory_order_relaxed); + + // Adaptive noise floor: track minimum DENOISED energy level + if (denoised_rms < self->noise_floor_ * 2.0f) { + self->noise_floor_ = self->noise_floor_ * (1.0f - self->noise_floor_alpha_) + + denoised_rms * self->noise_floor_alpha_; + } + + // Dynamic threshold: noise floor + user threshold as margin + float adaptive_rms_thresh = self->noise_floor_ + rms_thresh; + + // Speech detection on DENOISED audio: + // 1. Energy above adaptive threshold + // 2. 
ZCR in speech range (not too high = noise, not too low = silence) + bool energy_ok = (denoised_rms >= adaptive_rms_thresh) || (denoised_peak >= peak_thresh); + bool zcr_ok = (zcr >= self->speech_zcr_min_) && (zcr <= self->speech_zcr_max_); + + // Speech = energy OK AND (ZCR OK or very high energy) + bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f); + + // Hang time logic: don't immediately cut on silence + if (frame_has_speech) { + self->hang_frames_ = self->hang_frames_threshold_; // Reset hang counter + } else if (self->hang_frames_ > 0) { + self->hang_frames_--; + frame_has_speech = true; // Keep "speaking" during hang time + } + + // Calculate durations in samples + int silence_samples_threshold = (self->silence_duration_ms_ * self->sample_rate_ * self->channels_) / 1000; + int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000; + int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000; + + if (frame_has_speech) { + // Speech detected + self->is_speech_active_.store(true, std::memory_order_relaxed); + self->silence_samples_count_ = 0; + + // Add DENOISED audio to buffer (already processed by processRealtime above) + if (!denoised_samples.empty()) { + self->speech_buffer_.insert(self->speech_buffer_.end(), + denoised_samples.begin(), denoised_samples.end()); + } else { + // Fallback to raw if denoising disabled + for (unsigned long i = 0; i < sample_count; ++i) { + self->speech_buffer_.push_back(in[i]); + } + } + self->speech_samples_count_ += sample_count; + + // Force flush if too long + if (self->speech_samples_count_ >= max_speech_samples) { + std::cout << "[VAD] Max duration reached, forcing flush (" + << self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl; + + if (self->callback_ && self->speech_buffer_.size() >= static_cast(min_speech_samples)) { + // Flush any remaining samples from 
the denoiser + if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) { + auto remaining = self->noise_reducer_->flushStream(); + self->speech_buffer_.insert(self->speech_buffer_.end(), + remaining.begin(), remaining.end()); + // Save preview for listening + self->noise_reducer_->savePreview(self->speech_buffer_); + } + self->callback_(self->speech_buffer_); + } + self->speech_buffer_.clear(); + self->speech_samples_count_ = 0; + // Reset stream for next segment + if (self->noise_reducer_) { + self->noise_reducer_->resetStream(); + } + } + } else { + // True silence (after hang time expired) + self->silence_samples_count_ += sample_count; + + // If we were speaking and now have enough silence, flush + if (self->speech_buffer_.size() > 0) { + // Add trailing silence (denoised) + if (!denoised_samples.empty()) { + self->speech_buffer_.insert(self->speech_buffer_.end(), + denoised_samples.begin(), denoised_samples.end()); + } else { + for (unsigned long i = 0; i < sample_count; ++i) { + self->speech_buffer_.push_back(in[i]); + } + } + + if (self->silence_samples_count_ >= silence_samples_threshold) { + self->is_speech_active_.store(false, std::memory_order_relaxed); + + // Flush if we have enough speech + if (self->speech_samples_count_ >= min_speech_samples) { + // Flush any remaining samples from the denoiser + if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) { + auto remaining = self->noise_reducer_->flushStream(); + self->speech_buffer_.insert(self->speech_buffer_.end(), + remaining.begin(), remaining.end()); + // Save preview for listening + self->noise_reducer_->savePreview(self->speech_buffer_); + } + + float duration = static_cast(self->speech_buffer_.size()) / + (self->sample_rate_ * self->channels_); + std::cout << "[VAD] Speech ended (noise_floor=" << self->noise_floor_ + << "), flushing " << duration << "s (denoised)" << std::endl; + + if (self->callback_) { + self->callback_(self->speech_buffer_); + } + } else { + std::cout << "[VAD] 
Speech too short (" << self->speech_samples_count_ + << " samples), discarding" << std::endl; + } + + self->speech_buffer_.clear(); + self->speech_samples_count_ = 0; + // Reset stream for next segment + if (self->noise_reducer_) { + self->noise_reducer_->resetStream(); + } + } + } else { + self->is_speech_active_.store(false, std::memory_order_relaxed); } - // Sliding window: remove only step_samples, keep overlap for next chunk - self->buffer_.erase(self->buffer_.begin(), self->buffer_.begin() + step_samples); } return paContinue; @@ -71,7 +252,9 @@ bool AudioCapture::start(AudioCallback callback) { } callback_ = callback; - buffer_.clear(); + speech_buffer_.clear(); + silence_samples_count_ = 0; + speech_samples_count_ = 0; PaStreamParameters input_params; input_params.device = Pa_GetDefaultInputDevice(); @@ -88,7 +271,7 @@ bool AudioCapture::start(AudioCallback callback) { PaError err = Pa_OpenStream( &stream_, &input_params, - nullptr, // no output + nullptr, sample_rate_, paFramesPerBufferUnspecified, paClipOff, @@ -108,6 +291,9 @@ bool AudioCapture::start(AudioCallback callback) { } is_recording_ = true; + std::cout << "[VAD] Started with RMS threshold=" << vad_rms_threshold_ + << ", Peak threshold=" << vad_peak_threshold_ + << ", Silence duration=" << silence_duration_ms_ << "ms" << std::endl; return true; } @@ -116,8 +302,42 @@ void AudioCapture::stop() { return; } + // Flush any remaining audio + if (speech_buffer_.size() > 0 && callback_) { + int min_speech_samples = (min_speech_duration_ms_ * sample_rate_ * channels_) / 1000; + if (speech_samples_count_ >= min_speech_samples) { + std::cout << "[VAD] Flushing remaining audio on stop (denoised)" << std::endl; + + // Flush any remaining samples from the denoiser + if (noise_reducer_ && noise_reducer_->isEnabled()) { + auto remaining = noise_reducer_->flushStream(); + speech_buffer_.insert(speech_buffer_.end(), + remaining.begin(), remaining.end()); + // Save preview for listening + 
noise_reducer_->savePreview(speech_buffer_); + } + + callback_(speech_buffer_); + } + } + + // Reset the stream for next session + if (noise_reducer_) { + noise_reducer_->resetStream(); + } + Pa_StopStream(stream_); is_recording_ = false; } +void AudioCapture::setDenoiseEnabled(bool enabled) { + if (noise_reducer_) { + noise_reducer_->setEnabled(enabled); + } +} + +bool AudioCapture::isDenoiseEnabled() const { + return noise_reducer_ && noise_reducer_->isEnabled(); +} + } // namespace secondvoice diff --git a/src/audio/AudioCapture.h b/src/audio/AudioCapture.h index e21118f..af39627 100644 --- a/src/audio/AudioCapture.h +++ b/src/audio/AudioCapture.h @@ -3,16 +3,20 @@ #include #include #include +#include +#include +#include #include namespace secondvoice { +class NoiseReducer; + class AudioCapture { public: using AudioCallback = std::function&)>; - // chunk_duration = window size, chunk_step = how often to send (overlap = duration - step) - AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds = 0); + AudioCapture(int sample_rate, int channels); ~AudioCapture(); bool initialize(); @@ -20,6 +24,26 @@ public: void stop(); bool isRecording() const { return is_recording_; } + // Real-time audio levels (updated every callback, ~10ms) + float getCurrentRMS() const { return current_rms_; } + float getCurrentPeak() const { return current_peak_; } + + // VAD settings (can be adjusted in real-time from UI) + void setVadThresholds(float rms_threshold, float peak_threshold) { + vad_rms_threshold_ = rms_threshold; + vad_peak_threshold_ = peak_threshold; + } + void setSilenceDuration(int ms) { silence_duration_ms_ = ms; } + void setMinSpeechDuration(int ms) { min_speech_duration_ms_ = ms; } + void setMaxSpeechDuration(int ms) { max_speech_duration_ms_ = ms; } + + // VAD state (for UI display) + bool isSpeechDetected() const { return is_speech_active_; } + + // Noise reduction + void setDenoiseEnabled(bool enabled); + bool 
isDenoiseEnabled() const; + private: static int audioCallback(const void* input, void* output, unsigned long frame_count, @@ -29,13 +53,45 @@ private: int sample_rate_; int channels_; - int chunk_duration_seconds_; - int chunk_step_seconds_; // How often to emit chunks (0 = same as duration, no overlap) bool is_recording_ = false; PaStream* stream_ = nullptr; AudioCallback callback_; - std::vector buffer_; + + // Speech accumulation buffer + std::vector speech_buffer_; + + // VAD state + std::atomic is_speech_active_{false}; + int silence_samples_count_ = 0; // How many samples of silence + int speech_samples_count_ = 0; // How many samples of speech accumulated + + // VAD parameters - Higher threshold to avoid false triggers on filtered noise + std::atomic vad_rms_threshold_{0.02f}; // Was 0.01f + std::atomic vad_peak_threshold_{0.08f}; // Was 0.04f + int silence_duration_ms_ = 400; // Wait 400ms of silence before cutting + int min_speech_duration_ms_ = 300; // Minimum speech to send + int max_speech_duration_ms_ = 25000; // 25s max before forced flush + + // Adaptive noise floor + float noise_floor_ = 0.005f; // Estimated background noise level + float noise_floor_alpha_ = 0.001f; // Slower adaptation + + // Hang time - wait before cutting to avoid mid-sentence cuts + int hang_frames_ = 0; + int hang_frames_threshold_ = 20; // ~200ms tolerance for pauses + + // Zero-crossing rate for speech vs noise discrimination + float last_zcr_ = 0.0f; + float speech_zcr_min_ = 0.01f; // More permissive lower bound + float speech_zcr_max_ = 0.25f; // More permissive upper bound + + // Real-time audio levels (atomic for thread safety) + std::atomic current_rms_{0.0f}; + std::atomic current_peak_{0.0f}; + + // Noise reduction + std::unique_ptr noise_reducer_; }; } // namespace secondvoice diff --git a/src/audio/NoiseReducer.cpp b/src/audio/NoiseReducer.cpp new file mode 100644 index 0000000..7acaca0 --- /dev/null +++ b/src/audio/NoiseReducer.cpp @@ -0,0 +1,249 @@ +#include 
"NoiseReducer.h" +#include "AudioBuffer.h" +#include +#include +#include +#include +#include + +namespace secondvoice { + +NoiseReducer::NoiseReducer() { + state_ = rnnoise_create(nullptr); + upsample_buffer_.resize(RNNOISE_FRAME_SIZE); + downsample_buffer_.resize(INPUT_FRAME_SIZE); +} + +NoiseReducer::~NoiseReducer() { + if (state_) { + rnnoise_destroy(state_); + } +} + +void NoiseReducer::upsample(const float* in, float* out, int in_samples) { + // Simple linear interpolation 16kHz -> 48kHz (3x) + for (int i = 0; i < in_samples; ++i) { + int out_idx = i * 3; + float curr = in[i]; + float next = (i + 1 < in_samples) ? in[i + 1] : curr; + + out[out_idx] = curr; + out[out_idx + 1] = curr + (next - curr) * 0.333f; + out[out_idx + 2] = curr + (next - curr) * 0.667f; + } +} + +void NoiseReducer::downsample(const float* in, float* out, int out_samples) { + // Simple decimation 48kHz -> 16kHz (take every 3rd sample with averaging) + for (int i = 0; i < out_samples; ++i) { + int in_idx = i * 3; + // Average of 3 samples for anti-aliasing + out[i] = (in[in_idx] + in[in_idx + 1] + in[in_idx + 2]) / 3.0f; + } +} + +void NoiseReducer::suppressTransients(float* samples, size_t count) { + // Transient suppressor: attenuates sudden spikes (claps, clicks, pops) + // while preserving speech which has smoother amplitude changes + + int transients_detected = 0; + + for (size_t i = 0; i < count; ++i) { + float abs_sample = std::abs(samples[i]); + + // IMPORTANT: Use envelope BEFORE updating for transient detection + // Otherwise the envelope rises with the clap and we miss it! 
+ float safe_envelope = std::max(envelope_, 0.001f); + float ratio = abs_sample / safe_envelope; + + bool is_transient = (ratio > transient_threshold_); + + if (is_transient) { + // This is a transient - attenuate it heavily + float target_amplitude = safe_envelope * transient_ratio_; + float attenuation = target_amplitude / abs_sample; + samples[i] *= attenuation; + transients_detected++; + + // DON'T update envelope from transients - keep it stable + // This prevents the envelope from "learning" the clap + } else { + // Only update envelope from non-transient samples + if (abs_sample > envelope_) { + envelope_ = envelope_ * (1.0f - envelope_attack_) + abs_sample * envelope_attack_; + } else { + envelope_ = envelope_ * (1.0f - envelope_release_) + abs_sample * envelope_release_; + } + } + } + + if (transients_detected > 10) { + std::cout << "[RNNoise] Suppressed " << transients_detected << " transient samples (clap/click)" << std::endl; + } +} + +void NoiseReducer::process(std::vector& samples) { + if (!enabled_ || !state_ || samples.empty()) { + return; + } + + // Process in chunks of INPUT_FRAME_SIZE (160 samples = 10ms at 16kHz) + size_t num_frames = samples.size() / INPUT_FRAME_SIZE; + float duration_ms = static_cast(samples.size()) / 16.0f; // 16 samples/ms at 16kHz + std::cout << "[RNNoise] Denoising " << duration_ms << "ms of audio (" << num_frames << " frames)" << std::endl; + + for (size_t frame = 0; frame < num_frames; ++frame) { + float* frame_ptr = samples.data() + frame * INPUT_FRAME_SIZE; + + // Upsample 16kHz -> 48kHz + upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE); + + // RNNoise expects float input in range [-32768, 32768] (like int16) + // Our input is [-1, 1], so scale up + for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) { + upsample_buffer_[i] *= 32768.0f; + } + + // Apply RNNoise + rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data()); + + // Scale back to [-1, 1] + for (int i = 0; i < 
RNNOISE_FRAME_SIZE; ++i) { + upsample_buffer_[i] /= 32768.0f; + // Clamp to prevent any overflow + upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f); + } + + // Downsample 48kHz -> 16kHz + downsample(upsample_buffer_.data(), frame_ptr, INPUT_FRAME_SIZE); + } + + // Handle remaining samples (less than a full frame) - left un-denoised in place + // (batch mode keeps no carry-over buffer, so the tail is NOT deferred to a later call) +} + +std::vector NoiseReducer::processCopy(const std::vector& samples) { + std::vector result = samples; + process(result); + return result; +} + +std::vector NoiseReducer::processRealtime(const float* samples, size_t count) { + std::vector output; + + if (!enabled_ || !state_ || count == 0) { + // Pass through unchanged + output.assign(samples, samples + count); + return output; + } + + // Add new samples to streaming buffer + stream_input_buffer_.insert(stream_input_buffer_.end(), samples, samples + count); + + // Process complete frames + while (stream_input_buffer_.size() >= INPUT_FRAME_SIZE) { + // Get frame pointer + float* frame_ptr = stream_input_buffer_.data(); + + // Step 1: Suppress transients (claps, clicks, pops) BEFORE RNNoise + suppressTransients(frame_ptr, INPUT_FRAME_SIZE); + + // Step 2: Upsample 16kHz -> 48kHz + upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE); + + // Scale to int16 range for RNNoise + for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) { + upsample_buffer_[i] *= 32768.0f; + } + + // Apply RNNoise + rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data()); + + // Scale back to [-1, 1] + for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) { + upsample_buffer_[i] /= 32768.0f; + upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f); + } + + // Downsample 48kHz -> 16kHz and add to output + std::vector denoised_frame(INPUT_FRAME_SIZE); + downsample(upsample_buffer_.data(), denoised_frame.data(), INPUT_FRAME_SIZE); + output.insert(output.end(), denoised_frame.begin(), denoised_frame.end()); + + // Remove 
processed samples from buffer + stream_input_buffer_.erase(stream_input_buffer_.begin(), + stream_input_buffer_.begin() + INPUT_FRAME_SIZE); + } + + return output; +} + +std::vector NoiseReducer::flushStream() { + std::vector output; + + if (!enabled_ || !state_ || stream_input_buffer_.empty()) { + // Return any remaining samples as-is + output = std::move(stream_input_buffer_); + stream_input_buffer_.clear(); + return output; + } + + // Process remaining samples (zero-pad to complete frame if needed) + while (!stream_input_buffer_.empty()) { + // Pad with zeros if we have less than a full frame + while (stream_input_buffer_.size() < INPUT_FRAME_SIZE) { + stream_input_buffer_.push_back(0.0f); + } + + // Get frame pointer + float* frame_ptr = stream_input_buffer_.data(); + + // Step 1: Suppress transients + suppressTransients(frame_ptr, INPUT_FRAME_SIZE); + + // Step 2: Upsample 16kHz -> 48kHz + upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE); + + // Scale to int16 range for RNNoise + for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) { + upsample_buffer_[i] *= 32768.0f; + } + + // Apply RNNoise + rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data()); + + // Scale back to [-1, 1] + for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) { + upsample_buffer_[i] /= 32768.0f; + upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f); + } + + // Downsample 48kHz -> 16kHz and add to output + std::vector denoised_frame(INPUT_FRAME_SIZE); + downsample(upsample_buffer_.data(), denoised_frame.data(), INPUT_FRAME_SIZE); + output.insert(output.end(), denoised_frame.begin(), denoised_frame.end()); + + // Remove processed samples from buffer + stream_input_buffer_.erase(stream_input_buffer_.begin(), + stream_input_buffer_.begin() + INPUT_FRAME_SIZE); + } + + return output; +} + +void NoiseReducer::savePreview(const std::vector& samples) { + if (!save_preview_ || samples.empty()) { + return; + } + + 
std::filesystem::create_directories("./denoised"); + std::string filename = "./denoised/denoised_" + std::to_string(preview_count_++) + ".ogg"; + + AudioBuffer buffer(16000, 1); // 16kHz mono + buffer.addSamples(samples); + if (buffer.saveToOpus(filename)) { + std::cout << "[RNNoise] Saved denoised preview: " << filename << std::endl; + } +} + +} // namespace secondvoice diff --git a/src/audio/NoiseReducer.h b/src/audio/NoiseReducer.h new file mode 100644 index 0000000..ee6b650 --- /dev/null +++ b/src/audio/NoiseReducer.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include + +// Forward declaration +struct DenoiseState; + +namespace secondvoice { + +/** + * NoiseReducer - Wrapper around RNNoise for real-time audio denoising + * + * RNNoise expects 48kHz audio, but we work at 16kHz. + * This wrapper handles resampling transparently. + */ +class NoiseReducer { +public: + NoiseReducer(); + ~NoiseReducer(); + + // Process audio samples in-place (batch mode - for final output) + // Input/output: float samples normalized to [-1, 1] + void process(std::vector& samples); + + // Process a buffer, returns denoised copy + std::vector processCopy(const std::vector& samples); + + // Real-time streaming: process samples as they come in + // Returns denoised samples (may be slightly delayed due to frame buffering) + std::vector processRealtime(const float* samples, size_t count); + + // Flush any remaining samples in the stream buffer (zero-padded if needed) + std::vector flushStream(); + + // Clear the stream buffer (call when starting a new segment) + void resetStream() { stream_input_buffer_.clear(); } + + // Enable/disable denoising + void setEnabled(bool enabled) { enabled_ = enabled; } + bool isEnabled() const { return enabled_; } + + // Save denoised audio for preview + void setSavePreview(bool save) { save_preview_ = save; } + bool isSavePreviewEnabled() const { return save_preview_; } + + // Save audio samples to Opus file (for preview) + void savePreview(const 
std::vector& samples); + +private: + DenoiseState* state_ = nullptr; + bool enabled_ = true; + bool save_preview_ = true; // Save denoised audio for listening + int preview_count_ = 0; + + // RNNoise frame size (480 samples at 48kHz = 10ms) + static constexpr int RNNOISE_FRAME_SIZE = 480; + + // Our frame size (160 samples at 16kHz = 10ms) + static constexpr int INPUT_FRAME_SIZE = 160; + + // Resampling buffers + std::vector upsample_buffer_; + std::vector downsample_buffer_; + + // Streaming input buffer (for real-time processing) + std::vector stream_input_buffer_; + + // Transient suppressor state + float envelope_ = 0.0f; // Smoothed signal envelope + float envelope_attack_ = 0.01f; // Fast attack to track signal + float envelope_release_ = 0.0005f; // Slow release to maintain level + float transient_threshold_ = 4.0f; // Spike must be 4x envelope to be considered transient + float transient_ratio_ = 0.1f; // Attenuate transients to 10% of the envelope (a 90% reduction) + + // Suppress transient sounds (claps, clicks, pops) + void suppressTransients(float* samples, size_t count); + + // Upsample 16kHz to 48kHz (3x) + void upsample(const float* in, float* out, int in_samples); + + // Downsample 48kHz to 16kHz (3x) + void downsample(const float* in, float* out, int out_samples); +}; + +} // namespace secondvoice diff --git a/src/core/Pipeline.cpp b/src/core/Pipeline.cpp index 788bb8e..2057322 100644 --- a/src/core/Pipeline.cpp +++ b/src/core/Pipeline.cpp @@ -24,18 +24,13 @@ Pipeline::~Pipeline() { bool Pipeline::initialize() { auto& config = Config::getInstance(); - // Initialize audio capture with sliding window (overlap) + // Initialize audio capture with VAD-based segmentation audio_capture_ = std::make_unique( config.getAudioConfig().sample_rate, - config.getAudioConfig().channels, - config.getAudioConfig().chunk_duration_seconds, - config.getAudioConfig().chunk_step_seconds + config.getAudioConfig().channels ); - std::cout << "[Pipeline] Audio: " << 
config.getAudioConfig().chunk_duration_seconds - << "s window, " << config.getAudioConfig().chunk_step_seconds - << "s step (overlap: " << (config.getAudioConfig().chunk_duration_seconds - config.getAudioConfig().chunk_step_seconds) - << "s)" << std::endl; + std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl; if (!audio_capture_->initialize()) { std::cerr << "Failed to initialize audio capture" << std::endl; @@ -76,7 +71,7 @@ bool Pipeline::start() { running_ = true; - // Start background threads (NOT UI - that runs in main thread) + // Start background threads audio_thread_ = std::thread(&Pipeline::audioThread, this); processing_thread_ = std::thread(&Pipeline::processingThread, this); @@ -95,9 +90,8 @@ void Pipeline::stop() { audio_capture_->stop(); } - // Shutdown queues + // Shutdown queue audio_queue_.shutdown(); - transcription_queue_.shutdown(); // Wait for background threads if (audio_thread_.joinable()) { @@ -146,7 +140,7 @@ void Pipeline::audioThread() { // Keep thread alive while recording auto start_time = std::chrono::steady_clock::now(); while (running_ && audio_capture_->isRecording()) { - std::this_thread::sleep_for(std::chrono::seconds(1)); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Update duration auto now = std::chrono::steady_clock::now(); @@ -157,11 +151,6 @@ void Pipeline::audioThread() { void Pipeline::processingThread() { auto& config = Config::getInstance(); - // VAD threshold - audio RMS must exceed this to be considered speech - // Higher values = more aggressive noise filtering - constexpr float VAD_THRESHOLD = 0.02f; // RMS energy threshold - constexpr float VAD_MIN_PEAK = 0.08f; // Minimum peak amplitude - while (running_) { auto chunk_opt = audio_queue_.wait_and_pop(); if (!chunk_opt.has_value()) { @@ -169,22 +158,8 @@ void Pipeline::processingThread() { } auto& chunk = chunk_opt.value(); - - // Voice Activity Detection - skip silent/noise chunks - float sum_squared = 0.0f; - float 
max_amplitude = 0.0f; - for (const float sample : chunk.data) { - sum_squared += sample * sample; - max_amplitude = std::max(max_amplitude, std::abs(sample)); - } - float rms = std::sqrt(sum_squared / chunk.data.size()); - - if (rms < VAD_THRESHOLD || max_amplitude < VAD_MIN_PEAK) { - std::cout << "[Skip] Silent chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl; - continue; - } - - std::cout << "[Audio] Processing chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl; + float duration = static_cast(chunk.data.size()) / (chunk.sample_rate * chunk.channels); + std::cout << "[Processing] Speech segment: " << duration << "s" << std::endl; // Transcribe with Whisper auto whisper_result = whisper_client_->transcribe( @@ -203,8 +178,9 @@ void Pipeline::processingThread() { continue; } - // Filter out Whisper hallucinations (common when audio is silent/unclear) + // Filter out Whisper hallucinations std::string text = whisper_result->text; + // Trim whitespace size_t start = text.find_first_not_of(" \t\n\r"); size_t end = text.find_last_not_of(" \t\n\r"); @@ -215,28 +191,61 @@ void Pipeline::processingThread() { text = text.substr(start, end - start + 1); // Skip known hallucinations and garbage + // Whisper hallucinates these when given silence/noise bool is_garbage = false; - - // Too short to be meaningful if (text.length() < 4) { is_garbage = true; - } - // Known Whisper hallucinations - else if (text.find("Amara.org") != std::string::npos || - text.find("amara.org") != std::string::npos || - text.find("字幕") != std::string::npos || - text.find("subtitle") != std::string::npos || - text.find("Subtitle") != std::string::npos || - text.find("Thank you") != std::string::npos || - text.find("thanks for watching") != std::string::npos || - text.find("Subscribe") != std::string::npos || - text.find("谢谢观看") != std::string::npos || - text.find("订阅") != std::string::npos || - text.find("...") == 0) { // Starts with ellipsis + } else if ( + 
// Common Whisper hallucinations + text.find("Tingting") != std::string::npos || + text.find("tingting") != std::string::npos || + text.find("婷婷") != std::string::npos || + text.find("Thank you") != std::string::npos || + text.find("thanks for watching") != std::string::npos || + text.find("Thanks for watching") != std::string::npos || + text.find("Subscribe") != std::string::npos || + text.find("subscribe") != std::string::npos || + text.find("請訂閱") != std::string::npos || + text.find("请订阅") != std::string::npos || + text.find("訂閱") != std::string::npos || + text.find("订阅") != std::string::npos || + text.find("Amara.org") != std::string::npos || + text.find("amara.org") != std::string::npos || + text.find("字幕") != std::string::npos || + text.find("subtitle") != std::string::npos || + text.find("Subtitle") != std::string::npos || + text.find("谢谢观看") != std::string::npos || + text.find("謝謝觀看") != std::string::npos || + text.find("感谢收看") != std::string::npos || + text.find("感謝收看") != std::string::npos || + text.find("下期再见") != std::string::npos || + text.find("下期再見") != std::string::npos || + text.find("再见") != std::string::npos || + text.find("再見") != std::string::npos || + text.find("music") != std::string::npos || + text.find("Music") != std::string::npos || + text.find("♪") != std::string::npos || + text.find("silenc") != std::string::npos || // silence, silencieux + text.find("Silenc") != std::string::npos || + text.find("...") == 0 || + // Repetitive single words/sounds + text == "Bonjour" || + text == "Hello" || + text == "Hi" || + text == "你好" || + text == "好" || + text == "嗯" || + text == "啊" || + text == "哈" || + text == "呵呵" || + text == "哈哈" || + text == "嘻嘻" || + text == "Hmm" || + text == "Huh" || + text == "Um" || + text == "Uh") { is_garbage = true; - } - // Only punctuation or whitespace - else { + } else { bool has_content = false; for (char c : text) { if (std::isalnum(static_cast(c)) || (c & 0x80)) { @@ -244,9 +253,7 @@ void 
Pipeline::processingThread() { break; } } - if (!has_content) { - is_garbage = true; - } + if (!has_content) is_garbage = true; } if (is_garbage) { @@ -254,20 +261,14 @@ void Pipeline::processingThread() { continue; } - // Remove overlap with previous transcription - std::string deduplicated = removeOverlap(text, last_transcription_); - last_transcription_ = text; // Store full text for next comparison - - if (deduplicated.empty() || deduplicated.length() < 2) { - std::cout << "[Skip] Fully overlapping transcription" << std::endl; - continue; + // Track audio cost + if (ui_) { + ui_->addAudioCost(duration); } - std::cout << "[New content] " << deduplicated << std::endl; - - // Translate with Claude (only the new, deduplicated content) + // Translate with Claude auto claude_result = claude_client_->translate( - deduplicated, + text, config.getClaudeConfig().system_prompt, config.getClaudeConfig().max_tokens, config.getClaudeConfig().temperature @@ -278,10 +279,27 @@ void Pipeline::processingThread() { continue; } - // Add to UI - ui_->addTranslation(deduplicated, claude_result->text); + // Track Claude cost + if (ui_) { + ui_->addClaudeCost(); + } - std::cout << "CN: " << deduplicated << std::endl; + // Simple accumulation + if (!accumulated_chinese_.empty()) { + accumulated_chinese_ += " "; + accumulated_french_ += " "; + } + accumulated_chinese_ += text; + accumulated_french_ += claude_result->text; + + std::cout << "[Accumulated CN] " << accumulated_chinese_ << std::endl; + std::cout << "[Accumulated FR] " << accumulated_french_ << std::endl; + + // Update UI + ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_); + ui_->addTranslation(text, claude_result->text); + + std::cout << "CN: " << text << std::endl; std::cout << "FR: " << claude_result->text << std::endl; std::cout << "---" << std::endl; } @@ -290,58 +308,45 @@ void Pipeline::processingThread() { void Pipeline::update() { if (!ui_) return; + // Sync VAD thresholds from UI to AudioCapture + 
if (audio_capture_) { + audio_capture_->setVadThresholds( + ui_->getVadThreshold(), + ui_->getVadPeakThreshold() + ); + + // Update UI with audio levels + ui_->setCurrentRMS(audio_capture_->getCurrentRMS()); + ui_->setCurrentPeak(audio_capture_->getCurrentPeak()); + } + ui_->setRecordingDuration(recording_duration_); - ui_->setProcessingStatus("Processing..."); + ui_->setProcessingStatus(audio_capture_ && audio_capture_->isSpeechDetected() ? "Recording speech..." : "Listening..."); ui_->render(); // Check if stop was requested if (ui_->isStopRequested()) { running_ = false; } + + // Check if clear was requested + if (ui_->isClearRequested()) { + clearAccumulated(); + ui_->resetClearRequest(); + } } bool Pipeline::shouldClose() const { return ui_ ? ui_->shouldClose() : true; } -std::string Pipeline::removeOverlap(const std::string& current, const std::string& previous) { - if (previous.empty()) { - return current; +void Pipeline::clearAccumulated() { + accumulated_chinese_.clear(); + accumulated_french_.clear(); + if (ui_) { + ui_->setAccumulatedText("", ""); } - - // Try to find overlap: check if current starts with end of previous - // Start from half the previous text (since we have 50% overlap) - size_t min_overlap = 4; // Minimum characters to consider as overlap - size_t search_start = previous.length() / 3; // Start looking from 1/3 into previous - - for (size_t i = search_start; i < previous.length() - min_overlap; ++i) { - std::string suffix = previous.substr(i); - if (current.find(suffix) == 0) { - // Found overlap - return only the new part - std::string new_part = current.substr(suffix.length()); - if (!new_part.empty()) { - std::cout << "[Overlap] Removed: \"" << suffix << "\"" << std::endl; - return new_part; - } - } - } - - // No overlap found - check for partial word overlap at start - // This handles cases where the same content appears but tokenized differently - size_t max_check = std::min(current.length(), previous.length()) / 2; - for (size_t 
len = max_check; len >= min_overlap; --len) { - std::string end_prev = previous.substr(previous.length() - len); - std::string start_curr = current.substr(0, len); - if (end_prev == start_curr) { - std::string new_part = current.substr(len); - if (!new_part.empty()) { - std::cout << "[Overlap] Partial match removed: \"" << end_prev << "\"" << std::endl; - return new_part; - } - } - } - - return current; // No overlap detected + std::cout << "[Pipeline] Cleared accumulated text" << std::endl; } } // namespace secondvoice diff --git a/src/core/Pipeline.h b/src/core/Pipeline.h index 6647879..d32271a 100644 --- a/src/core/Pipeline.h +++ b/src/core/Pipeline.h @@ -21,10 +21,6 @@ struct AudioChunk { int channels; }; -struct TranscriptionResult { - std::string chinese_text; -}; - class Pipeline { public: Pipeline(); @@ -40,6 +36,9 @@ public: bool isRunning() const { return running_; } bool shouldClose() const; + // Clear accumulated translations + void clearAccumulated(); + private: void audioThread(); void processingThread(); @@ -51,7 +50,6 @@ private: std::unique_ptr full_recording_; ThreadSafeQueue audio_queue_; - ThreadSafeQueue transcription_queue_; std::thread audio_thread_; std::thread processing_thread_; @@ -59,9 +57,9 @@ private: std::atomic running_{false}; std::atomic recording_duration_{0}; - // For overlap deduplication - std::string last_transcription_; - std::string removeOverlap(const std::string& current, const std::string& previous); + // Simple accumulation + std::string accumulated_chinese_; + std::string accumulated_french_; }; } // namespace secondvoice diff --git a/src/ui/TranslationUI.cpp b/src/ui/TranslationUI.cpp index 5a61589..2004d25 100644 --- a/src/ui/TranslationUI.cpp +++ b/src/ui/TranslationUI.cpp @@ -199,10 +199,31 @@ void TranslationUI::render() { ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoCollapse); + // Two-column layout: main content on left, audio panel on right + float audio_panel_width = 280.0f; + float main_width = 
ImGui::GetWindowWidth() - audio_panel_width - 20.0f; + + // Left column - main content + ImGui::BeginChild("MainContent", ImVec2(main_width, -1), false); + + ImGui::Text("SecondVoice - Live Translation"); + ImGui::Separator(); + ImGui::Spacing(); + + renderAccumulated(); renderTranslations(); renderControls(); renderStatus(); + ImGui::EndChild(); + + ImGui::SameLine(); + + // Right column - audio panel + ImGui::BeginChild("AudioPanel", ImVec2(audio_panel_width, -1), true); + renderAudioPanel(); + ImGui::EndChild(); + ImGui::End(); // Rendering @@ -225,11 +246,49 @@ void TranslationUI::addTranslation(const std::string& chinese, const std::string messages_.push_back({chinese, french}); } -void TranslationUI::renderTranslations() { - ImGui::Text("SecondVoice - Live Translation"); +void TranslationUI::setAccumulatedText(const std::string& chinese, const std::string& french) { + accumulated_chinese_ = chinese; + accumulated_french_ = french; +} + +void TranslationUI::renderAccumulated() { + if (accumulated_chinese_.empty() && accumulated_french_.empty()) { + return; + } + + ImGui::Text("Texte Recomposé"); + ImGui::SameLine(); + if (ImGui::SmallButton("Clear")) { + clear_requested_ = true; + } ImGui::Separator(); - ImGui::BeginChild("Translations", ImVec2(0, -120), true); + ImGui::BeginChild("Accumulated", ImVec2(0, 150), true); + + // Chinese accumulated text + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.9f, 0.5f, 1.0f)); // Yellow + ImGui::TextWrapped("%s", accumulated_chinese_.c_str()); + ImGui::PopStyleColor(); + + ImGui::Spacing(); + + // French accumulated text + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 1.0f, 0.8f, 1.0f)); // Cyan + ImGui::TextWrapped("%s", accumulated_french_.c_str()); + ImGui::PopStyleColor(); + + ImGui::EndChild(); + + ImGui::Spacing(); +} + +void TranslationUI::renderTranslations() { + ImGui::Text("Segments (détail)"); + ImGui::Separator(); + + // Height depends on whether accumulated section is shown + float height = 
(accumulated_chinese_.empty() && accumulated_french_.empty()) ? -120 : -120; + ImGui::BeginChild("Translations", ImVec2(0, height), true); for (const auto& msg : messages_) { ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f)); @@ -281,4 +340,101 @@ void TranslationUI::renderStatus() { } } +void TranslationUI::renderAudioPanel() { + ImGui::Text("Audio Monitor"); + ImGui::Separator(); + ImGui::Spacing(); + + // Current volume display + ImGui::Text("Volume Level"); + + // RMS meter (horizontal bar) + float rms_normalized = std::min(current_rms_ * 10.0f, 1.0f); // Scale for visibility + ImVec4 rms_color = rms_normalized > vad_threshold_ * 10.0f ? + ImVec4(0.2f, 0.8f, 0.2f, 1.0f) : // Green if above threshold + ImVec4(0.5f, 0.5f, 0.5f, 1.0f); // Gray if below + + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, rms_color); + ImGui::ProgressBar(rms_normalized, ImVec2(-1, 20), ""); + ImGui::PopStyleColor(); + + ImGui::Text("RMS: %.4f", current_rms_); + + ImGui::Spacing(); + + // Peak meter + ImGui::Text("Peak Level"); + float peak_normalized = std::min(current_peak_, 1.0f); + ImVec4 peak_color = peak_normalized > vad_peak_threshold_ ? 
+ ImVec4(0.2f, 0.8f, 0.2f, 1.0f) : + ImVec4(0.5f, 0.5f, 0.5f, 1.0f); + + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, peak_color); + ImGui::ProgressBar(peak_normalized, ImVec2(-1, 20), ""); + ImGui::PopStyleColor(); + + ImGui::Text("Peak: %.4f", current_peak_); + + ImGui::Spacing(); + ImGui::Separator(); + ImGui::Spacing(); + + // VAD Threshold sliders + ImGui::Text("VAD Settings"); + ImGui::Spacing(); + + ImGui::Text("RMS Threshold"); + ImGui::SliderFloat("##rms_thresh", &vad_threshold_, 0.001f, 0.1f, "%.3f"); + + // Draw threshold line on the RMS meter area + ImGui::Spacing(); + + ImGui::Text("Peak Threshold"); + ImGui::SliderFloat("##peak_thresh", &vad_peak_threshold_, 0.01f, 0.3f, "%.3f"); + + ImGui::Spacing(); + ImGui::Separator(); + ImGui::Spacing(); + + // Status indicator + bool is_active = (current_rms_ >= vad_threshold_) && (current_peak_ >= vad_peak_threshold_); + if (is_active) { + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.2f, 1.0f, 0.2f, 1.0f)); + ImGui::Text(">> VOICE DETECTED <<"); + ImGui::PopStyleColor(); + } else { + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f)); + ImGui::Text(" (silence)"); + ImGui::PopStyleColor(); + } + + ImGui::Spacing(); + ImGui::Separator(); + ImGui::Spacing(); + + // Cost tracking + ImGui::Text("API Usage"); + ImGui::Spacing(); + + // Audio sent to Whisper + int minutes = static_cast(total_audio_seconds_) / 60; + int seconds = static_cast(total_audio_seconds_) % 60; + ImGui::Text("Audio: %d:%02d", minutes, seconds); + ImGui::Text("Whisper calls: %d", whisper_calls_); + ImGui::Text("Claude calls: %d", claude_calls_); + + ImGui::Spacing(); + + // Estimated cost + float cost = getEstimatedCost(); + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.8f, 0.3f, 1.0f)); // Gold + ImGui::Text("Est. 
cost: $%.4f", cost); + ImGui::PopStyleColor(); + + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f)); + ImGui::Text("(Whisper $0.006/min)"); + ImGui::Text("(Haiku ~$0.001/call)"); + ImGui::PopStyleColor(); +} + } // namespace secondvoice diff --git a/src/ui/TranslationUI.h b/src/ui/TranslationUI.h index 8b97f0d..cb6145e 100644 --- a/src/ui/TranslationUI.h +++ b/src/ui/TranslationUI.h @@ -23,27 +23,66 @@ public: bool shouldClose() const; void addTranslation(const std::string& chinese, const std::string& french); + void setAccumulatedText(const std::string& chinese, const std::string& french); bool isStopRequested() const { return stop_requested_; } void resetStopRequest() { stop_requested_ = false; } + bool isClearRequested() const { return clear_requested_; } + void resetClearRequest() { clear_requested_ = false; } + void setRecordingDuration(int seconds) { recording_duration_ = seconds; } void setProcessingStatus(const std::string& status) { processing_status_ = status; } + // Audio level monitoring + void setCurrentRMS(float rms) { current_rms_ = rms; } + void setCurrentPeak(float peak) { current_peak_ = peak; } + float getVadThreshold() const { return vad_threshold_; } + float getVadPeakThreshold() const { return vad_peak_threshold_; } + private: + void renderAccumulated(); void renderTranslations(); void renderControls(); void renderStatus(); + void renderAudioPanel(); int width_; int height_; GLFWwindow* window_ = nullptr; std::vector messages_; + std::string accumulated_chinese_; + std::string accumulated_french_; bool stop_requested_ = false; + bool clear_requested_ = false; bool auto_scroll_ = true; int recording_duration_ = 0; std::string processing_status_; + + // Audio monitoring + float current_rms_ = 0.0f; + float current_peak_ = 0.0f; + float vad_threshold_ = 0.02f; // 2x higher to avoid false triggers + float vad_peak_threshold_ = 0.08f; // 2x higher + + // Cost tracking + float total_audio_seconds_ = 0.0f; + int whisper_calls_ = 
0; + int claude_calls_ = 0; + +public: + void addAudioCost(float seconds) { + total_audio_seconds_ += seconds; + whisper_calls_++; + } + void addClaudeCost() { claude_calls_++; } + float getTotalAudioSeconds() const { return total_audio_seconds_; } + float getEstimatedCost() const { + // gpt-4o-mini-transcribe: $0.006/min = $0.0001/sec + // Haiku: ~$0.001 per short translation + return (total_audio_seconds_ * 0.0001f) + (claude_calls_ * 0.001f); + } }; } // namespace secondvoice