feat: Add RNNoise denoising + transient suppressor + VAD improvements

- Add RNNoise neural network audio denoising (16kHz↔48kHz resampling)
- Add transient suppressor to filter claps/clicks/pops before RNNoise
- VAD now works on FILTERED audio (not raw) to avoid false triggers
- Real-time denoised audio level display in UI
- Save denoised audio previews in Opus format (.ogg)
- Add extensive Whisper hallucination filter (Tingting, music, etc.)
- Add "Clear" button to reset accumulated translations
- Double VAD thresholds (0.02/0.08) for less sensitivity
- Update Claude prompt to handle offensive content gracefully

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Trouve Alexis 2025-11-23 16:46:38 +08:00
parent fa8ea2907b
commit 741ca09663
10 changed files with 996 additions and 150 deletions

View File

@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
project(SecondVoice VERSION 0.1.0 LANGUAGES CXX) project(SecondVoice VERSION 0.1.0 LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
@ -38,6 +38,35 @@ FetchContent_Declare(
FetchContent_MakeAvailable(ogg opus) FetchContent_MakeAvailable(ogg opus)
# Fetch RNNoise for noise reduction (using older stable version)
FetchContent_Declare(
    rnnoise
    GIT_REPOSITORY https://github.com/xiph/rnnoise.git
    GIT_TAG v0.1.1
)
# NOTE(review): assumes the v0.1.1 tag ships no CMakeLists.txt, so
# MakeAvailable only downloads/populates the sources and the library target
# must be declared by hand below — confirm against the fetched tree.
FetchContent_MakeAvailable(rnnoise)
# Build RNNoise as a static library (v0.1.1 structure)
add_library(rnnoise STATIC
    ${rnnoise_SOURCE_DIR}/src/denoise.c
    ${rnnoise_SOURCE_DIR}/src/rnn.c
    ${rnnoise_SOURCE_DIR}/src/rnn_data.c
    ${rnnoise_SOURCE_DIR}/src/pitch.c
    ${rnnoise_SOURCE_DIR}/src/kiss_fft.c
    ${rnnoise_SOURCE_DIR}/src/celt_lpc.c
)
# Public include/ (rnnoise.h for consumers) plus src/ for internal headers
# the C sources include.
target_include_directories(rnnoise PUBLIC
    ${rnnoise_SOURCE_DIR}/include
    ${rnnoise_SOURCE_DIR}/src
)
# RNNoise needs math library
if(UNIX)
    target_link_libraries(rnnoise PRIVATE m)
endif()
# Create ImGui library with OpenGL/GLFW backends # Create ImGui library with OpenGL/GLFW backends
add_library(imgui_backends add_library(imgui_backends
${imgui_SOURCE_DIR}/imgui.cpp ${imgui_SOURCE_DIR}/imgui.cpp
@ -70,6 +99,7 @@ set(SOURCES_UI
# Audio module # Audio module
src/audio/AudioCapture.cpp src/audio/AudioCapture.cpp
src/audio/AudioBuffer.cpp src/audio/AudioBuffer.cpp
src/audio/NoiseReducer.cpp
# API clients # API clients
src/api/WhisperClient.cpp src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp src/api/ClaudeClient.cpp
@ -88,6 +118,7 @@ set(SOURCES_CONSOLE
# Audio module # Audio module
src/audio/AudioCapture.cpp src/audio/AudioCapture.cpp
src/audio/AudioBuffer.cpp src/audio/AudioBuffer.cpp
src/audio/NoiseReducer.cpp
# API clients # API clients
src/api/WhisperClient.cpp src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp src/api/ClaudeClient.cpp
@ -107,9 +138,11 @@ add_executable(${PROJECT_NAME}_Console ${SOURCES_CONSOLE})
# Include directories # Include directories
target_include_directories(${PROJECT_NAME} PRIVATE target_include_directories(${PROJECT_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src
${rnnoise_SOURCE_DIR}/include
) )
target_include_directories(${PROJECT_NAME}_Console PRIVATE target_include_directories(${PROJECT_NAME}_Console PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src
${rnnoise_SOURCE_DIR}/include
) )
# Link libraries # Link libraries
@ -124,6 +157,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
OpenGL::GL OpenGL::GL
opus opus
ogg ogg
rnnoise
# Windows system libraries # Windows system libraries
winmm winmm
setupapi setupapi
@ -136,6 +170,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
nlohmann_json::nlohmann_json nlohmann_json::nlohmann_json
opus opus
ogg ogg
rnnoise
# Windows system libraries # Windows system libraries
winmm winmm
setupapi setupapi
@ -152,6 +187,7 @@ else()
OpenGL::GL OpenGL::GL
opus opus
ogg ogg
rnnoise
) )
# Console version - NO UI libs # Console version - NO UI libs
target_link_libraries(${PROJECT_NAME}_Console PRIVATE target_link_libraries(${PROJECT_NAME}_Console PRIVATE
@ -159,6 +195,7 @@ else()
nlohmann_json::nlohmann_json nlohmann_json::nlohmann_json
opus opus
ogg ogg
rnnoise
) )
endif() endif()

View File

@ -18,10 +18,10 @@
"model": "claude-3-5-haiku-20241022", "model": "claude-3-5-haiku-20241022",
"max_tokens": 1024, "max_tokens": 1024,
"temperature": 0.3, "temperature": 0.3,
"system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre." "system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre. Si du contenu offensant est présent, remplace-le par [contenu offensant] mais traduis impérativement le reste de la phrase."
}, },
"ui": { "ui": {
"window_width": 1200, "window_width": 1500,
"window_height": 800, "window_height": 800,
"font_size": 24, "font_size": 24,
"max_display_lines": 50 "max_display_lines": 50

View File

@ -1,14 +1,14 @@
#include "AudioCapture.h" #include "AudioCapture.h"
#include "NoiseReducer.h"
#include <iostream> #include <iostream>
namespace secondvoice { namespace secondvoice {
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds) AudioCapture::AudioCapture(int sample_rate, int channels)
: sample_rate_(sample_rate) : sample_rate_(sample_rate)
, channels_(channels) , channels_(channels)
, chunk_duration_seconds_(chunk_duration_seconds) , noise_reducer_(std::make_unique<NoiseReducer>()) {
, chunk_step_seconds_(chunk_step_seconds > 0 ? chunk_step_seconds : chunk_duration_seconds) { std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
// If step not specified, use duration (no overlap)
} }
AudioCapture::~AudioCapture() { AudioCapture::~AudioCapture() {
@ -35,31 +35,212 @@ int AudioCapture::audioCallback(const void* input, void* output,
const PaStreamCallbackTimeInfo* time_info, const PaStreamCallbackTimeInfo* time_info,
PaStreamCallbackFlags status_flags, PaStreamCallbackFlags status_flags,
void* user_data) { void* user_data) {
(void)output; // Unused (void)output;
(void)time_info; // Unused (void)time_info;
(void)status_flags; // Unused (void)status_flags;
AudioCapture* self = static_cast<AudioCapture*>(user_data); AudioCapture* self = static_cast<AudioCapture*>(user_data);
const float* in = static_cast<const float*>(input); const float* in = static_cast<const float*>(input);
unsigned long sample_count = frame_count * self->channels_;
// Accumulate audio data // === REAL-TIME DENOISING ===
for (unsigned long i = 0; i < frame_count * self->channels_; ++i) { // Process audio through RNNoise in real-time for meter display
self->buffer_.push_back(in[i]); std::vector<float> denoised_samples;
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
denoised_samples = self->noise_reducer_->processRealtime(in, sample_count);
} }
// Check if we have accumulated enough data for a chunk // Calculate RMS and Peak from RAW audio (for VAD detection)
size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_; float raw_sum_squared = 0.0f;
size_t step_samples = self->sample_rate_ * self->channels_ * self->chunk_step_seconds_; float raw_max_amp = 0.0f;
for (unsigned long i = 0; i < sample_count; ++i) {
if (self->buffer_.size() >= chunk_samples) { float sample = in[i];
// Call the callback with the full chunk raw_sum_squared += sample * sample;
if (self->callback_) { if (std::abs(sample) > raw_max_amp) {
// Send exactly chunk_samples (the window) raw_max_amp = std::abs(sample);
std::vector<float> chunk(self->buffer_.begin(), self->buffer_.begin() + chunk_samples); }
self->callback_(chunk); }
float instant_rms = std::sqrt(raw_sum_squared / sample_count);
float instant_peak = raw_max_amp;
// Calculate RMS and Peak from DENOISED audio (for UI meter)
float denoised_rms = 0.0f;
float denoised_peak = 0.0f;
if (!denoised_samples.empty()) {
float sum_squared = 0.0f;
float max_amp = 0.0f;
for (const float& sample : denoised_samples) {
sum_squared += sample * sample;
if (std::abs(sample) > max_amp) {
max_amp = std::abs(sample);
}
}
denoised_rms = std::sqrt(sum_squared / denoised_samples.size());
denoised_peak = max_amp;
} else {
// Fallback to raw if no denoised output yet
denoised_rms = instant_rms;
denoised_peak = instant_peak;
}
// Smooth RMS/Peak for display (fast attack, slow decay) - USE DENOISED VALUES
constexpr float alpha_rise = 0.1f;
constexpr float alpha_fall = 0.02f;
float old_rms = self->current_rms_.load(std::memory_order_relaxed);
float old_peak = self->current_peak_.load(std::memory_order_relaxed);
float alpha_rms = (denoised_rms > old_rms) ? alpha_rise : alpha_fall;
float alpha_peak = (denoised_peak > old_peak) ? alpha_rise : alpha_fall;
self->current_rms_.store(alpha_rms * denoised_rms + (1.0f - alpha_rms) * old_rms, std::memory_order_relaxed);
self->current_peak_.store(alpha_peak * denoised_peak + (1.0f - alpha_peak) * old_peak, std::memory_order_relaxed);
// === VAD NOW WORKS ON FILTERED AUDIO ===
// This way, filtered noise/claps won't trigger VAD!
// Calculate Zero-Crossing Rate on DENOISED audio
float zcr = 0.0f;
if (!denoised_samples.empty() && denoised_samples.size() > 1) {
int zero_crossings = 0;
for (size_t i = 1; i < denoised_samples.size(); ++i) {
if ((denoised_samples[i] >= 0 && denoised_samples[i-1] < 0) ||
(denoised_samples[i] < 0 && denoised_samples[i-1] >= 0)) {
zero_crossings++;
}
}
zcr = static_cast<float>(zero_crossings) / denoised_samples.size();
}
self->last_zcr_ = zcr;
// Get user-adjustable thresholds
float rms_thresh = self->vad_rms_threshold_.load(std::memory_order_relaxed);
float peak_thresh = self->vad_peak_threshold_.load(std::memory_order_relaxed);
// Adaptive noise floor: track minimum DENOISED energy level
if (denoised_rms < self->noise_floor_ * 2.0f) {
self->noise_floor_ = self->noise_floor_ * (1.0f - self->noise_floor_alpha_) +
denoised_rms * self->noise_floor_alpha_;
}
// Dynamic threshold: noise floor + user threshold as margin
float adaptive_rms_thresh = self->noise_floor_ + rms_thresh;
// Speech detection on DENOISED audio:
// 1. Energy above adaptive threshold
// 2. ZCR in speech range (not too high = noise, not too low = silence)
bool energy_ok = (denoised_rms >= adaptive_rms_thresh) || (denoised_peak >= peak_thresh);
bool zcr_ok = (zcr >= self->speech_zcr_min_) && (zcr <= self->speech_zcr_max_);
// Speech = energy OK AND (ZCR OK or very high energy)
bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f);
// Hang time logic: don't immediately cut on silence
if (frame_has_speech) {
self->hang_frames_ = self->hang_frames_threshold_; // Reset hang counter
} else if (self->hang_frames_ > 0) {
self->hang_frames_--;
frame_has_speech = true; // Keep "speaking" during hang time
}
// Calculate durations in samples
int silence_samples_threshold = (self->silence_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
if (frame_has_speech) {
// Speech detected
self->is_speech_active_.store(true, std::memory_order_relaxed);
self->silence_samples_count_ = 0;
// Add DENOISED audio to buffer (already processed by processRealtime above)
if (!denoised_samples.empty()) {
self->speech_buffer_.insert(self->speech_buffer_.end(),
denoised_samples.begin(), denoised_samples.end());
} else {
// Fallback to raw if denoising disabled
for (unsigned long i = 0; i < sample_count; ++i) {
self->speech_buffer_.push_back(in[i]);
}
}
self->speech_samples_count_ += sample_count;
// Force flush if too long
if (self->speech_samples_count_ >= max_speech_samples) {
std::cout << "[VAD] Max duration reached, forcing flush ("
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
// Flush any remaining samples from the denoiser
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
auto remaining = self->noise_reducer_->flushStream();
self->speech_buffer_.insert(self->speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
self->noise_reducer_->savePreview(self->speech_buffer_);
}
self->callback_(self->speech_buffer_);
}
self->speech_buffer_.clear();
self->speech_samples_count_ = 0;
// Reset stream for next segment
if (self->noise_reducer_) {
self->noise_reducer_->resetStream();
}
}
} else {
// True silence (after hang time expired)
self->silence_samples_count_ += sample_count;
// If we were speaking and now have enough silence, flush
if (self->speech_buffer_.size() > 0) {
// Add trailing silence (denoised)
if (!denoised_samples.empty()) {
self->speech_buffer_.insert(self->speech_buffer_.end(),
denoised_samples.begin(), denoised_samples.end());
} else {
for (unsigned long i = 0; i < sample_count; ++i) {
self->speech_buffer_.push_back(in[i]);
}
}
if (self->silence_samples_count_ >= silence_samples_threshold) {
self->is_speech_active_.store(false, std::memory_order_relaxed);
// Flush if we have enough speech
if (self->speech_samples_count_ >= min_speech_samples) {
// Flush any remaining samples from the denoiser
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
auto remaining = self->noise_reducer_->flushStream();
self->speech_buffer_.insert(self->speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
self->noise_reducer_->savePreview(self->speech_buffer_);
}
float duration = static_cast<float>(self->speech_buffer_.size()) /
(self->sample_rate_ * self->channels_);
std::cout << "[VAD] Speech ended (noise_floor=" << self->noise_floor_
<< "), flushing " << duration << "s (denoised)" << std::endl;
if (self->callback_) {
self->callback_(self->speech_buffer_);
}
} else {
std::cout << "[VAD] Speech too short (" << self->speech_samples_count_
<< " samples), discarding" << std::endl;
}
self->speech_buffer_.clear();
self->speech_samples_count_ = 0;
// Reset stream for next segment
if (self->noise_reducer_) {
self->noise_reducer_->resetStream();
}
}
} else {
self->is_speech_active_.store(false, std::memory_order_relaxed);
} }
// Sliding window: remove only step_samples, keep overlap for next chunk
self->buffer_.erase(self->buffer_.begin(), self->buffer_.begin() + step_samples);
} }
return paContinue; return paContinue;
@ -71,7 +252,9 @@ bool AudioCapture::start(AudioCallback callback) {
} }
callback_ = callback; callback_ = callback;
buffer_.clear(); speech_buffer_.clear();
silence_samples_count_ = 0;
speech_samples_count_ = 0;
PaStreamParameters input_params; PaStreamParameters input_params;
input_params.device = Pa_GetDefaultInputDevice(); input_params.device = Pa_GetDefaultInputDevice();
@ -88,7 +271,7 @@ bool AudioCapture::start(AudioCallback callback) {
PaError err = Pa_OpenStream( PaError err = Pa_OpenStream(
&stream_, &stream_,
&input_params, &input_params,
nullptr, // no output nullptr,
sample_rate_, sample_rate_,
paFramesPerBufferUnspecified, paFramesPerBufferUnspecified,
paClipOff, paClipOff,
@ -108,6 +291,9 @@ bool AudioCapture::start(AudioCallback callback) {
} }
is_recording_ = true; is_recording_ = true;
std::cout << "[VAD] Started with RMS threshold=" << vad_rms_threshold_
<< ", Peak threshold=" << vad_peak_threshold_
<< ", Silence duration=" << silence_duration_ms_ << "ms" << std::endl;
return true; return true;
} }
@ -116,8 +302,42 @@ void AudioCapture::stop() {
return; return;
} }
// Flush any remaining audio
if (speech_buffer_.size() > 0 && callback_) {
int min_speech_samples = (min_speech_duration_ms_ * sample_rate_ * channels_) / 1000;
if (speech_samples_count_ >= min_speech_samples) {
std::cout << "[VAD] Flushing remaining audio on stop (denoised)" << std::endl;
// Flush any remaining samples from the denoiser
if (noise_reducer_ && noise_reducer_->isEnabled()) {
auto remaining = noise_reducer_->flushStream();
speech_buffer_.insert(speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
noise_reducer_->savePreview(speech_buffer_);
}
callback_(speech_buffer_);
}
}
// Reset the stream for next session
if (noise_reducer_) {
noise_reducer_->resetStream();
}
Pa_StopStream(stream_); Pa_StopStream(stream_);
is_recording_ = false; is_recording_ = false;
} }
void AudioCapture::setDenoiseEnabled(bool enabled) {
if (noise_reducer_) {
noise_reducer_->setEnabled(enabled);
}
}
bool AudioCapture::isDenoiseEnabled() const {
return noise_reducer_ && noise_reducer_->isEnabled();
}
} // namespace secondvoice } // namespace secondvoice

View File

@ -3,16 +3,20 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <functional> #include <functional>
#include <atomic>
#include <cmath>
#include <memory>
#include <portaudio.h> #include <portaudio.h>
namespace secondvoice { namespace secondvoice {
class NoiseReducer;
class AudioCapture { class AudioCapture {
public: public:
using AudioCallback = std::function<void(const std::vector<float>&)>; using AudioCallback = std::function<void(const std::vector<float>&)>;
// chunk_duration = window size, chunk_step = how often to send (overlap = duration - step) AudioCapture(int sample_rate, int channels);
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds = 0);
~AudioCapture(); ~AudioCapture();
bool initialize(); bool initialize();
@ -20,6 +24,26 @@ public:
void stop(); void stop();
bool isRecording() const { return is_recording_; } bool isRecording() const { return is_recording_; }
// Real-time audio levels (updated every callback, ~10ms)
float getCurrentRMS() const { return current_rms_; }
float getCurrentPeak() const { return current_peak_; }
// VAD settings (can be adjusted in real-time from UI)
void setVadThresholds(float rms_threshold, float peak_threshold) {
vad_rms_threshold_ = rms_threshold;
vad_peak_threshold_ = peak_threshold;
}
void setSilenceDuration(int ms) { silence_duration_ms_ = ms; }
void setMinSpeechDuration(int ms) { min_speech_duration_ms_ = ms; }
void setMaxSpeechDuration(int ms) { max_speech_duration_ms_ = ms; }
// VAD state (for UI display)
bool isSpeechDetected() const { return is_speech_active_; }
// Noise reduction
void setDenoiseEnabled(bool enabled);
bool isDenoiseEnabled() const;
private: private:
static int audioCallback(const void* input, void* output, static int audioCallback(const void* input, void* output,
unsigned long frame_count, unsigned long frame_count,
@ -29,13 +53,45 @@ private:
int sample_rate_; int sample_rate_;
int channels_; int channels_;
int chunk_duration_seconds_;
int chunk_step_seconds_; // How often to emit chunks (0 = same as duration, no overlap)
bool is_recording_ = false; bool is_recording_ = false;
PaStream* stream_ = nullptr; PaStream* stream_ = nullptr;
AudioCallback callback_; AudioCallback callback_;
std::vector<float> buffer_;
// Speech accumulation buffer
std::vector<float> speech_buffer_;
// VAD state
std::atomic<bool> is_speech_active_{false};
int silence_samples_count_ = 0; // How many samples of silence
int speech_samples_count_ = 0; // How many samples of speech accumulated
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
int silence_duration_ms_ = 400; // Wait 400ms of silence before cutting
int min_speech_duration_ms_ = 300; // Minimum speech to send
int max_speech_duration_ms_ = 25000; // 25s max before forced flush
// Adaptive noise floor
float noise_floor_ = 0.005f; // Estimated background noise level
float noise_floor_alpha_ = 0.001f; // Slower adaptation
// Hang time - wait before cutting to avoid mid-sentence cuts
int hang_frames_ = 0;
int hang_frames_threshold_ = 20; // ~200ms tolerance for pauses
// Zero-crossing rate for speech vs noise discrimination
float last_zcr_ = 0.0f;
float speech_zcr_min_ = 0.01f; // More permissive lower bound
float speech_zcr_max_ = 0.25f; // More permissive upper bound
// Real-time audio levels (atomic for thread safety)
std::atomic<float> current_rms_{0.0f};
std::atomic<float> current_peak_{0.0f};
// Noise reduction
std::unique_ptr<NoiseReducer> noise_reducer_;
}; };
} // namespace secondvoice } // namespace secondvoice

249
src/audio/NoiseReducer.cpp Normal file
View File

@ -0,0 +1,249 @@
#include "NoiseReducer.h"
#include "AudioBuffer.h"
#include <rnnoise.h>
#include <cstring>
#include <algorithm>
#include <iostream>
#include <filesystem>
namespace secondvoice {
// Construct the denoiser: allocate the RNNoise state and size the
// fixed scratch buffers used by the resampling chain.
NoiseReducer::NoiseReducer() {
    // NOTE(review): the model argument to rnnoise_create() was added after
    // v0.1.1; with the v0.1.1 tag pinned in CMake the function is declared
    // with no parameters — confirm this call compiles against the fetched
    // header.
    state_ = rnnoise_create(nullptr);
    // 48 kHz scratch frame fed to RNNoise (480 samples = 10 ms).
    upsample_buffer_.resize(RNNOISE_FRAME_SIZE);
    // NOTE(review): downsample_buffer_ is sized here but the processing
    // paths visible in this file write downsampled output directly into
    // caller/output buffers — possibly unused; verify before relying on it.
    downsample_buffer_.resize(INPUT_FRAME_SIZE);
}
// Tear down the denoiser, releasing the RNNoise state if one was created.
NoiseReducer::~NoiseReducer() {
    if (state_ != nullptr) {
        rnnoise_destroy(state_);
    }
}
// Upsample 16 kHz -> 48 kHz (3x) by linear interpolation: every input
// sample expands into three output samples interpolated toward its
// successor (the last sample is simply held).
void NoiseReducer::upsample(const float* in, float* out, int in_samples) {
    for (int src = 0; src < in_samples; ++src) {
        const float a = in[src];
        const float b = (src + 1 < in_samples) ? in[src + 1] : a;
        const float step = b - a;
        float* dst = out + src * 3;
        dst[0] = a;
        dst[1] = a + step * 0.333f;
        dst[2] = a + step * 0.667f;
    }
}
// Downsample 48 kHz -> 16 kHz (3x): each output sample is the mean of
// three consecutive input samples (a cheap low-pass before decimation).
void NoiseReducer::downsample(const float* in, float* out, int out_samples) {
    for (int dst = 0; dst < out_samples; ++dst) {
        const float* src = in + dst * 3;
        out[dst] = (src[0] + src[1] + src[2]) / 3.0f;
    }
}
// Transient suppressor: attenuates sudden spikes (claps, clicks, pops)
// while preserving speech, which has smoother amplitude changes.
// Mutates `samples` in place. Relies on the persistent envelope_ member,
// so frames must be fed in stream order.
void NoiseReducer::suppressTransients(float* samples, size_t count) {
    int transients_detected = 0;
    for (size_t i = 0; i < count; ++i) {
        float abs_sample = std::abs(samples[i]);
        // IMPORTANT: Use envelope BEFORE updating for transient detection
        // Otherwise the envelope rises with the clap and we miss it!
        // The 0.001f floor also guards the division below.
        float safe_envelope = std::max(envelope_, 0.001f);
        float ratio = abs_sample / safe_envelope;
        bool is_transient = (ratio > transient_threshold_);
        if (is_transient) {
            // This is a transient - attenuate it heavily, down to
            // transient_ratio_ of the current envelope. abs_sample > 0 here
            // (ratio exceeded the threshold), so the division is safe.
            float target_amplitude = safe_envelope * transient_ratio_;
            float attenuation = target_amplitude / abs_sample;
            samples[i] *= attenuation;
            transients_detected++;
            // DON'T update envelope from transients - keep it stable
            // This prevents the envelope from "learning" the clap
        } else {
            // Only update envelope from non-transient samples:
            // fast attack when the signal rises, slow release as it falls.
            if (abs_sample > envelope_) {
                envelope_ = envelope_ * (1.0f - envelope_attack_) + abs_sample * envelope_attack_;
            } else {
                envelope_ = envelope_ * (1.0f - envelope_release_) + abs_sample * envelope_release_;
            }
        }
    }
    // Only log sustained bursts (>10 attenuated samples) to avoid log spam.
    if (transients_detected > 10) {
        std::cout << "[RNNoise] Suppressed " << transients_detected << " transient samples (clap/click)" << std::endl;
    }
}
// Batch denoiser: rewrites `samples` in place, one 10 ms frame at a time.
// Unlike processRealtime(), batch mode keeps no carry-over state: any
// samples beyond the last complete frame are left untouched (NOT denoised)
// and are not revisited by a later call.
void NoiseReducer::process(std::vector<float>& samples) {
    if (!enabled_ || !state_ || samples.empty()) {
        return;
    }
    // Process in chunks of INPUT_FRAME_SIZE (160 samples = 10ms at 16kHz)
    size_t num_frames = samples.size() / INPUT_FRAME_SIZE;
    float duration_ms = static_cast<float>(samples.size()) / 16.0f; // 16 samples/ms at 16kHz
    std::cout << "[RNNoise] Denoising " << duration_ms << "ms of audio (" << num_frames << " frames)" << std::endl;
    for (size_t frame = 0; frame < num_frames; ++frame) {
        float* frame_ptr = samples.data() + frame * INPUT_FRAME_SIZE;
        // Upsample 16kHz -> 48kHz (RNNoise operates on 48 kHz frames)
        upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);
        // RNNoise expects float input in range [-32768, 32768] (like int16)
        // Our input is [-1, 1], so scale up
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] *= 32768.0f;
        }
        // Apply RNNoise (in-place: input and output share the buffer)
        rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());
        // Scale back to [-1, 1]
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] /= 32768.0f;
            // Clamp to prevent any overflow
            upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
        }
        // Downsample 48kHz -> 16kHz back into the caller's buffer
        downsample(upsample_buffer_.data(), frame_ptr, INPUT_FRAME_SIZE);
    }
    // Trailing partial frame (< INPUT_FRAME_SIZE samples) is intentionally
    // left as-is; batch mode has no stream buffer to carry it over with.
}
// Denoise a copy of `samples`, leaving the caller's buffer untouched.
// Returns the denoised audio (identical contract to process(), out-of-place).
std::vector<float> NoiseReducer::processCopy(const std::vector<float>& samples) {
    std::vector<float> denoised(samples);
    process(denoised);
    return denoised;
}
// Streaming denoiser: buffers incoming samples and emits denoised audio in
// whole 10 ms frames (INPUT_FRAME_SIZE samples at 16 kHz). Output may lag
// input by up to one frame; the partial tail stays in stream_input_buffer_
// until the next call or flushStream().
//   samples/count : float samples in [-1, 1]
//   returns       : denoised samples (pass-through when disabled)
std::vector<float> NoiseReducer::processRealtime(const float* samples, size_t count) {
    std::vector<float> output;
    if (!enabled_ || !state_ || count == 0) {
        // Denoising off (or no state): pass the input through unchanged.
        output.assign(samples, samples + count);
        return output;
    }

    // Append the new samples to the pending stream buffer.
    stream_input_buffer_.insert(stream_input_buffer_.end(), samples, samples + count);

    const size_t frame_size = static_cast<size_t>(INPUT_FRAME_SIZE);
    size_t processed = 0;  // samples consumed from the front of the buffer

    // Process every complete frame in place, tracking an offset instead of
    // erasing from the vector front per frame (the old per-frame erase was
    // O(n^2) in the audio path) and writing straight into `output` instead
    // of going through a per-frame temporary vector.
    while (stream_input_buffer_.size() - processed >= frame_size) {
        float* frame_ptr = stream_input_buffer_.data() + processed;

        // Step 1: Suppress transients (claps, clicks, pops) BEFORE RNNoise
        suppressTransients(frame_ptr, INPUT_FRAME_SIZE);

        // Step 2: Upsample 16kHz -> 48kHz (RNNoise operates at 48 kHz)
        upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);

        // RNNoise expects int16-range floats; our samples are in [-1, 1].
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] *= 32768.0f;
        }

        // Apply RNNoise (in-place)
        rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());

        // Scale back to [-1, 1], clamped to guard against overshoot.
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] /= 32768.0f;
            upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
        }

        // Step 3: Downsample 48kHz -> 16kHz directly into the output.
        const size_t out_pos = output.size();
        output.resize(out_pos + frame_size);
        downsample(upsample_buffer_.data(), output.data() + out_pos, INPUT_FRAME_SIZE);

        processed += frame_size;
    }

    // Drop all consumed samples with a single erase; the partial tail stays.
    stream_input_buffer_.erase(stream_input_buffer_.begin(),
                               stream_input_buffer_.begin() + processed);
    return output;
}
// Drain whatever remains in the streaming buffer. The tail is zero-padded
// up to a whole frame so RNNoise can process it; stream_input_buffer_ is
// empty on return. When denoising is disabled (or there is nothing
// pending) the leftover raw samples are returned unmodified.
std::vector<float> NoiseReducer::flushStream() {
    std::vector<float> output;
    if (!enabled_ || !state_ || stream_input_buffer_.empty()) {
        // Return any remaining samples as-is.
        output = std::move(stream_input_buffer_);
        stream_input_buffer_.clear();  // moved-from: reset to a known-empty state
        return output;
    }

    const size_t frame_size = static_cast<size_t>(INPUT_FRAME_SIZE);
    // Zero-pad once so the buffer holds an exact number of frames, instead
    // of pushing zeros one sample at a time inside the processing loop.
    const size_t padded_size =
        ((stream_input_buffer_.size() + frame_size - 1) / frame_size) * frame_size;
    stream_input_buffer_.resize(padded_size, 0.0f);

    output.reserve(padded_size);
    // Same chain as processRealtime: transient suppression, upsample,
    // RNNoise at int16 scale, rescale/clamp, downsample. Frames are walked
    // by offset and the buffer cleared once at the end (no per-frame erase).
    for (size_t offset = 0; offset < padded_size; offset += frame_size) {
        float* frame_ptr = stream_input_buffer_.data() + offset;

        suppressTransients(frame_ptr, INPUT_FRAME_SIZE);
        upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] *= 32768.0f;
        }
        rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());
        for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
            upsample_buffer_[i] /= 32768.0f;
            upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
        }
        const size_t out_pos = output.size();
        output.resize(out_pos + frame_size);
        downsample(upsample_buffer_.data(), output.data() + out_pos, INPUT_FRAME_SIZE);
    }

    stream_input_buffer_.clear();
    return output;
}
// Write the denoised segment to ./denoised/denoised_<n>.ogg so the result
// can be auditioned. No-op when previews are disabled or there is no audio.
// preview_count_ provides a monotonically increasing filename suffix.
void NoiseReducer::savePreview(const std::vector<float>& samples) {
    if (!save_preview_ || samples.empty()) {
        return;
    }
    std::filesystem::create_directories("./denoised");
    std::string filename = "./denoised/denoised_" + std::to_string(preview_count_++) + ".ogg";
    // AudioBuffer handles the Opus/Ogg encoding; 16 kHz mono matches the
    // pipeline's working format.
    AudioBuffer buffer(16000, 1); // 16kHz mono
    buffer.addSamples(samples);
    if (buffer.saveToOpus(filename)) {
        std::cout << "[RNNoise] Saved denoised preview: " << filename << std::endl;
    }
    // NOTE(review): this does directory creation and file I/O, and the
    // call sites in AudioCapture appear to be on the PortAudio callback
    // path — confirm that latency/RT-safety is acceptable there.
}
} // namespace secondvoice

86
src/audio/NoiseReducer.h Normal file
View File

@ -0,0 +1,86 @@
#pragma once
#include <vector>
#include <memory>
// Forward declaration
struct DenoiseState;
namespace secondvoice {
/**
 * NoiseReducer - Wrapper around RNNoise for real-time audio denoising.
 *
 * RNNoise expects 48kHz audio, but the pipeline works at 16kHz; this
 * wrapper handles the 3x up/down resampling transparently. It also runs a
 * transient suppressor (claps/clicks/pops) ahead of RNNoise. Not
 * thread-safe: all processing calls share the streaming/envelope state.
 */
class NoiseReducer {
public:
    NoiseReducer();
    ~NoiseReducer();
    // Process audio samples in-place (batch mode - for final output).
    // Input/output: float samples normalized to [-1, 1]. A trailing partial
    // frame is left untouched (batch mode keeps no carry-over state).
    void process(std::vector<float>& samples);
    // Process a buffer, returns denoised copy (out-of-place variant).
    std::vector<float> processCopy(const std::vector<float>& samples);
    // Real-time streaming: process samples as they come in.
    // Returns denoised samples (may be slightly delayed due to frame buffering).
    std::vector<float> processRealtime(const float* samples, size_t count);
    // Flush any remaining samples in the stream buffer (zero-padded if needed).
    std::vector<float> flushStream();
    // Clear the stream buffer (call when starting a new segment).
    void resetStream() { stream_input_buffer_.clear(); }
    // Enable/disable denoising (disabled = pass-through).
    void setEnabled(bool enabled) { enabled_ = enabled; }
    bool isEnabled() const { return enabled_; }
    // Toggle saving of denoised audio previews to disk.
    void setSavePreview(bool save) { save_preview_ = save; }
    bool isSavePreviewEnabled() const { return save_preview_; }
    // Save audio samples to Opus file (for preview/listening).
    void savePreview(const std::vector<float>& samples);
private:
    DenoiseState* state_ = nullptr;     // RNNoise handle (null if create failed)
    bool enabled_ = true;               // pass-through when false
    bool save_preview_ = true;          // save denoised audio for listening
    int preview_count_ = 0;             // suffix for preview filenames
    // RNNoise frame size (480 samples at 48kHz = 10ms)
    static constexpr int RNNOISE_FRAME_SIZE = 480;
    // Our frame size (160 samples at 16kHz = 10ms)
    static constexpr int INPUT_FRAME_SIZE = 160;
    // Resampling buffers (48 kHz scratch frame, 16 kHz scratch frame)
    std::vector<float> upsample_buffer_;
    std::vector<float> downsample_buffer_;
    // Streaming input buffer (for real-time processing; holds partial frames)
    std::vector<float> stream_input_buffer_;
    // Transient suppressor state
    float envelope_ = 0.0f;             // smoothed signal envelope
    float envelope_attack_ = 0.01f;     // fast attack to track rising signal
    float envelope_release_ = 0.0005f;  // slow release to maintain level
    float transient_threshold_ = 4.0f;  // spike must be 4x envelope to count as transient
    float transient_ratio_ = 0.1f;      // transients attenuated to 10% of envelope (90% reduction)
    // Suppress transient sounds (claps, clicks, pops); mutates in place.
    void suppressTransients(float* samples, size_t count);
    // Upsample 16kHz to 48kHz (3x linear interpolation).
    void upsample(const float* in, float* out, int in_samples);
    // Downsample 48kHz to 16kHz (3x, averaging triples).
    void downsample(const float* in, float* out, int out_samples);
};
} // namespace secondvoice

View File

@ -24,18 +24,13 @@ Pipeline::~Pipeline() {
bool Pipeline::initialize() { bool Pipeline::initialize() {
auto& config = Config::getInstance(); auto& config = Config::getInstance();
// Initialize audio capture with sliding window (overlap) // Initialize audio capture with VAD-based segmentation
audio_capture_ = std::make_unique<AudioCapture>( audio_capture_ = std::make_unique<AudioCapture>(
config.getAudioConfig().sample_rate, config.getAudioConfig().sample_rate,
config.getAudioConfig().channels, config.getAudioConfig().channels
config.getAudioConfig().chunk_duration_seconds,
config.getAudioConfig().chunk_step_seconds
); );
std::cout << "[Pipeline] Audio: " << config.getAudioConfig().chunk_duration_seconds std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;
<< "s window, " << config.getAudioConfig().chunk_step_seconds
<< "s step (overlap: " << (config.getAudioConfig().chunk_duration_seconds - config.getAudioConfig().chunk_step_seconds)
<< "s)" << std::endl;
if (!audio_capture_->initialize()) { if (!audio_capture_->initialize()) {
std::cerr << "Failed to initialize audio capture" << std::endl; std::cerr << "Failed to initialize audio capture" << std::endl;
@ -76,7 +71,7 @@ bool Pipeline::start() {
running_ = true; running_ = true;
// Start background threads (NOT UI - that runs in main thread) // Start background threads
audio_thread_ = std::thread(&Pipeline::audioThread, this); audio_thread_ = std::thread(&Pipeline::audioThread, this);
processing_thread_ = std::thread(&Pipeline::processingThread, this); processing_thread_ = std::thread(&Pipeline::processingThread, this);
@ -95,9 +90,8 @@ void Pipeline::stop() {
audio_capture_->stop(); audio_capture_->stop();
} }
// Shutdown queues // Shutdown queue
audio_queue_.shutdown(); audio_queue_.shutdown();
transcription_queue_.shutdown();
// Wait for background threads // Wait for background threads
if (audio_thread_.joinable()) { if (audio_thread_.joinable()) {
@ -146,7 +140,7 @@ void Pipeline::audioThread() {
// Keep thread alive while recording // Keep thread alive while recording
auto start_time = std::chrono::steady_clock::now(); auto start_time = std::chrono::steady_clock::now();
while (running_ && audio_capture_->isRecording()) { while (running_ && audio_capture_->isRecording()) {
std::this_thread::sleep_for(std::chrono::seconds(1)); std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Update duration // Update duration
auto now = std::chrono::steady_clock::now(); auto now = std::chrono::steady_clock::now();
@ -157,11 +151,6 @@ void Pipeline::audioThread() {
void Pipeline::processingThread() { void Pipeline::processingThread() {
auto& config = Config::getInstance(); auto& config = Config::getInstance();
// VAD threshold - audio RMS must exceed this to be considered speech
// Higher values = more aggressive noise filtering
constexpr float VAD_THRESHOLD = 0.02f; // RMS energy threshold
constexpr float VAD_MIN_PEAK = 0.08f; // Minimum peak amplitude
while (running_) { while (running_) {
auto chunk_opt = audio_queue_.wait_and_pop(); auto chunk_opt = audio_queue_.wait_and_pop();
if (!chunk_opt.has_value()) { if (!chunk_opt.has_value()) {
@ -169,22 +158,8 @@ void Pipeline::processingThread() {
} }
auto& chunk = chunk_opt.value(); auto& chunk = chunk_opt.value();
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
// Voice Activity Detection - skip silent/noise chunks std::cout << "[Processing] Speech segment: " << duration << "s" << std::endl;
float sum_squared = 0.0f;
float max_amplitude = 0.0f;
for (const float sample : chunk.data) {
sum_squared += sample * sample;
max_amplitude = std::max(max_amplitude, std::abs(sample));
}
float rms = std::sqrt(sum_squared / chunk.data.size());
if (rms < VAD_THRESHOLD || max_amplitude < VAD_MIN_PEAK) {
std::cout << "[Skip] Silent chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
continue;
}
std::cout << "[Audio] Processing chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
// Transcribe with Whisper // Transcribe with Whisper
auto whisper_result = whisper_client_->transcribe( auto whisper_result = whisper_client_->transcribe(
@ -203,8 +178,9 @@ void Pipeline::processingThread() {
continue; continue;
} }
// Filter out Whisper hallucinations (common when audio is silent/unclear) // Filter out Whisper hallucinations
std::string text = whisper_result->text; std::string text = whisper_result->text;
// Trim whitespace // Trim whitespace
size_t start = text.find_first_not_of(" \t\n\r"); size_t start = text.find_first_not_of(" \t\n\r");
size_t end = text.find_last_not_of(" \t\n\r"); size_t end = text.find_last_not_of(" \t\n\r");
@ -215,28 +191,61 @@ void Pipeline::processingThread() {
text = text.substr(start, end - start + 1); text = text.substr(start, end - start + 1);
// Skip known hallucinations and garbage // Skip known hallucinations and garbage
// Whisper hallucinates these when given silence/noise
bool is_garbage = false; bool is_garbage = false;
// Too short to be meaningful
if (text.length() < 4) { if (text.length() < 4) {
is_garbage = true; is_garbage = true;
} } else if (
// Known Whisper hallucinations // Common Whisper hallucinations
else if (text.find("Amara.org") != std::string::npos || text.find("Tingting") != std::string::npos ||
text.find("amara.org") != std::string::npos || text.find("tingting") != std::string::npos ||
text.find("字幕") != std::string::npos || text.find("婷婷") != std::string::npos ||
text.find("subtitle") != std::string::npos || text.find("Thank you") != std::string::npos ||
text.find("Subtitle") != std::string::npos || text.find("thanks for watching") != std::string::npos ||
text.find("Thank you") != std::string::npos || text.find("Thanks for watching") != std::string::npos ||
text.find("thanks for watching") != std::string::npos || text.find("Subscribe") != std::string::npos ||
text.find("Subscribe") != std::string::npos || text.find("subscribe") != std::string::npos ||
text.find("谢谢观看") != std::string::npos || text.find("請訂閱") != std::string::npos ||
text.find("订阅") != std::string::npos || text.find("请订阅") != std::string::npos ||
text.find("...") == 0) { // Starts with ellipsis text.find("訂閱") != std::string::npos ||
text.find("订阅") != std::string::npos ||
text.find("Amara.org") != std::string::npos ||
text.find("amara.org") != std::string::npos ||
text.find("字幕") != std::string::npos ||
text.find("subtitle") != std::string::npos ||
text.find("Subtitle") != std::string::npos ||
text.find("谢谢观看") != std::string::npos ||
text.find("謝謝觀看") != std::string::npos ||
text.find("感谢收看") != std::string::npos ||
text.find("感謝收看") != std::string::npos ||
text.find("下期再见") != std::string::npos ||
text.find("下期再見") != std::string::npos ||
text.find("再见") != std::string::npos ||
text.find("再見") != std::string::npos ||
text.find("music") != std::string::npos ||
text.find("Music") != std::string::npos ||
text.find("") != std::string::npos ||
text.find("silenc") != std::string::npos || // silence, silencieux
text.find("Silenc") != std::string::npos ||
text.find("...") == 0 ||
// Repetitive single words/sounds
text == "Bonjour" ||
text == "Hello" ||
text == "Hi" ||
text == "你好" ||
text == "" ||
text == "" ||
text == "" ||
text == "" ||
text == "呵呵" ||
text == "哈哈" ||
text == "嘻嘻" ||
text == "Hmm" ||
text == "Huh" ||
text == "Um" ||
text == "Uh") {
is_garbage = true; is_garbage = true;
} } else {
// Only punctuation or whitespace
else {
bool has_content = false; bool has_content = false;
for (char c : text) { for (char c : text) {
if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) { if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) {
@ -244,9 +253,7 @@ void Pipeline::processingThread() {
break; break;
} }
} }
if (!has_content) { if (!has_content) is_garbage = true;
is_garbage = true;
}
} }
if (is_garbage) { if (is_garbage) {
@ -254,20 +261,14 @@ void Pipeline::processingThread() {
continue; continue;
} }
// Remove overlap with previous transcription // Track audio cost
std::string deduplicated = removeOverlap(text, last_transcription_); if (ui_) {
last_transcription_ = text; // Store full text for next comparison ui_->addAudioCost(duration);
if (deduplicated.empty() || deduplicated.length() < 2) {
std::cout << "[Skip] Fully overlapping transcription" << std::endl;
continue;
} }
std::cout << "[New content] " << deduplicated << std::endl; // Translate with Claude
// Translate with Claude (only the new, deduplicated content)
auto claude_result = claude_client_->translate( auto claude_result = claude_client_->translate(
deduplicated, text,
config.getClaudeConfig().system_prompt, config.getClaudeConfig().system_prompt,
config.getClaudeConfig().max_tokens, config.getClaudeConfig().max_tokens,
config.getClaudeConfig().temperature config.getClaudeConfig().temperature
@ -278,10 +279,27 @@ void Pipeline::processingThread() {
continue; continue;
} }
// Add to UI // Track Claude cost
ui_->addTranslation(deduplicated, claude_result->text); if (ui_) {
ui_->addClaudeCost();
}
std::cout << "CN: " << deduplicated << std::endl; // Simple accumulation
if (!accumulated_chinese_.empty()) {
accumulated_chinese_ += " ";
accumulated_french_ += " ";
}
accumulated_chinese_ += text;
accumulated_french_ += claude_result->text;
std::cout << "[Accumulated CN] " << accumulated_chinese_ << std::endl;
std::cout << "[Accumulated FR] " << accumulated_french_ << std::endl;
// Update UI
ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_);
ui_->addTranslation(text, claude_result->text);
std::cout << "CN: " << text << std::endl;
std::cout << "FR: " << claude_result->text << std::endl; std::cout << "FR: " << claude_result->text << std::endl;
std::cout << "---" << std::endl; std::cout << "---" << std::endl;
} }
@ -290,58 +308,45 @@ void Pipeline::processingThread() {
void Pipeline::update() { void Pipeline::update() {
if (!ui_) return; if (!ui_) return;
// Sync VAD thresholds from UI to AudioCapture
if (audio_capture_) {
audio_capture_->setVadThresholds(
ui_->getVadThreshold(),
ui_->getVadPeakThreshold()
);
// Update UI with audio levels
ui_->setCurrentRMS(audio_capture_->getCurrentRMS());
ui_->setCurrentPeak(audio_capture_->getCurrentPeak());
}
ui_->setRecordingDuration(recording_duration_); ui_->setRecordingDuration(recording_duration_);
ui_->setProcessingStatus("Processing..."); ui_->setProcessingStatus(audio_capture_ && audio_capture_->isSpeechDetected() ? "Recording speech..." : "Listening...");
ui_->render(); ui_->render();
// Check if stop was requested // Check if stop was requested
if (ui_->isStopRequested()) { if (ui_->isStopRequested()) {
running_ = false; running_ = false;
} }
// Check if clear was requested
if (ui_->isClearRequested()) {
clearAccumulated();
ui_->resetClearRequest();
}
} }
bool Pipeline::shouldClose() const { bool Pipeline::shouldClose() const {
return ui_ ? ui_->shouldClose() : true; return ui_ ? ui_->shouldClose() : true;
} }
std::string Pipeline::removeOverlap(const std::string& current, const std::string& previous) { void Pipeline::clearAccumulated() {
if (previous.empty()) { accumulated_chinese_.clear();
return current; accumulated_french_.clear();
if (ui_) {
ui_->setAccumulatedText("", "");
} }
std::cout << "[Pipeline] Cleared accumulated text" << std::endl;
// Try to find overlap: check if current starts with end of previous
// Start from half the previous text (since we have 50% overlap)
size_t min_overlap = 4; // Minimum characters to consider as overlap
size_t search_start = previous.length() / 3; // Start looking from 1/3 into previous
for (size_t i = search_start; i < previous.length() - min_overlap; ++i) {
std::string suffix = previous.substr(i);
if (current.find(suffix) == 0) {
// Found overlap - return only the new part
std::string new_part = current.substr(suffix.length());
if (!new_part.empty()) {
std::cout << "[Overlap] Removed: \"" << suffix << "\"" << std::endl;
return new_part;
}
}
}
// No overlap found - check for partial word overlap at start
// This handles cases where the same content appears but tokenized differently
size_t max_check = std::min(current.length(), previous.length()) / 2;
for (size_t len = max_check; len >= min_overlap; --len) {
std::string end_prev = previous.substr(previous.length() - len);
std::string start_curr = current.substr(0, len);
if (end_prev == start_curr) {
std::string new_part = current.substr(len);
if (!new_part.empty()) {
std::cout << "[Overlap] Partial match removed: \"" << end_prev << "\"" << std::endl;
return new_part;
}
}
}
return current; // No overlap detected
} }
} // namespace secondvoice } // namespace secondvoice

View File

@ -21,10 +21,6 @@ struct AudioChunk {
int channels; int channels;
}; };
struct TranscriptionResult {
std::string chinese_text;
};
class Pipeline { class Pipeline {
public: public:
Pipeline(); Pipeline();
@ -40,6 +36,9 @@ public:
bool isRunning() const { return running_; } bool isRunning() const { return running_; }
bool shouldClose() const; bool shouldClose() const;
// Clear accumulated translations
void clearAccumulated();
private: private:
void audioThread(); void audioThread();
void processingThread(); void processingThread();
@ -51,7 +50,6 @@ private:
std::unique_ptr<AudioBuffer> full_recording_; std::unique_ptr<AudioBuffer> full_recording_;
ThreadSafeQueue<AudioChunk> audio_queue_; ThreadSafeQueue<AudioChunk> audio_queue_;
ThreadSafeQueue<TranscriptionResult> transcription_queue_;
std::thread audio_thread_; std::thread audio_thread_;
std::thread processing_thread_; std::thread processing_thread_;
@ -59,9 +57,9 @@ private:
std::atomic<bool> running_{false}; std::atomic<bool> running_{false};
std::atomic<int> recording_duration_{0}; std::atomic<int> recording_duration_{0};
// For overlap deduplication // Simple accumulation
std::string last_transcription_; std::string accumulated_chinese_;
std::string removeOverlap(const std::string& current, const std::string& previous); std::string accumulated_french_;
}; };
} // namespace secondvoice } // namespace secondvoice

View File

@ -199,10 +199,31 @@ void TranslationUI::render() {
ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoMove |
ImGuiWindowFlags_NoCollapse); ImGuiWindowFlags_NoCollapse);
// Two-column layout: main content on left, audio panel on right
float audio_panel_width = 280.0f;
float main_width = ImGui::GetWindowWidth() - audio_panel_width - 20.0f;
// Left column - main content
ImGui::BeginChild("MainContent", ImVec2(main_width, -1), false);
ImGui::Text("SecondVoice - Live Translation");
ImGui::Separator();
ImGui::Spacing();
renderAccumulated();
renderTranslations(); renderTranslations();
renderControls(); renderControls();
renderStatus(); renderStatus();
ImGui::EndChild();
ImGui::SameLine();
// Right column - audio panel
ImGui::BeginChild("AudioPanel", ImVec2(audio_panel_width, -1), true);
renderAudioPanel();
ImGui::EndChild();
ImGui::End(); ImGui::End();
// Rendering // Rendering
@ -225,11 +246,49 @@ void TranslationUI::addTranslation(const std::string& chinese, const std::string
messages_.push_back({chinese, french}); messages_.push_back({chinese, french});
} }
void TranslationUI::renderTranslations() { void TranslationUI::setAccumulatedText(const std::string& chinese, const std::string& french) {
ImGui::Text("SecondVoice - Live Translation"); accumulated_chinese_ = chinese;
accumulated_french_ = french;
}
void TranslationUI::renderAccumulated() {
if (accumulated_chinese_.empty() && accumulated_french_.empty()) {
return;
}
ImGui::Text("Texte Recomposé");
ImGui::SameLine();
if (ImGui::SmallButton("Clear")) {
clear_requested_ = true;
}
ImGui::Separator(); ImGui::Separator();
ImGui::BeginChild("Translations", ImVec2(0, -120), true); ImGui::BeginChild("Accumulated", ImVec2(0, 150), true);
// Chinese accumulated text
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.9f, 0.5f, 1.0f)); // Yellow
ImGui::TextWrapped("%s", accumulated_chinese_.c_str());
ImGui::PopStyleColor();
ImGui::Spacing();
// French accumulated text
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 1.0f, 0.8f, 1.0f)); // Cyan
ImGui::TextWrapped("%s", accumulated_french_.c_str());
ImGui::PopStyleColor();
ImGui::EndChild();
ImGui::Spacing();
}
void TranslationUI::renderTranslations() {
ImGui::Text("Segments (détail)");
ImGui::Separator();
// Height depends on whether accumulated section is shown
float height = (accumulated_chinese_.empty() && accumulated_french_.empty()) ? -120 : -120;
ImGui::BeginChild("Translations", ImVec2(0, height), true);
for (const auto& msg : messages_) { for (const auto& msg : messages_) {
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f)); ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f));
@ -281,4 +340,101 @@ void TranslationUI::renderStatus() {
} }
} }
void TranslationUI::renderAudioPanel() {
ImGui::Text("Audio Monitor");
ImGui::Separator();
ImGui::Spacing();
// Current volume display
ImGui::Text("Volume Level");
// RMS meter (horizontal bar)
float rms_normalized = std::min(current_rms_ * 10.0f, 1.0f); // Scale for visibility
ImVec4 rms_color = rms_normalized > vad_threshold_ * 10.0f ?
ImVec4(0.2f, 0.8f, 0.2f, 1.0f) : // Green if above threshold
ImVec4(0.5f, 0.5f, 0.5f, 1.0f); // Gray if below
ImGui::PushStyleColor(ImGuiCol_PlotHistogram, rms_color);
ImGui::ProgressBar(rms_normalized, ImVec2(-1, 20), "");
ImGui::PopStyleColor();
ImGui::Text("RMS: %.4f", current_rms_);
ImGui::Spacing();
// Peak meter
ImGui::Text("Peak Level");
float peak_normalized = std::min(current_peak_, 1.0f);
ImVec4 peak_color = peak_normalized > vad_peak_threshold_ ?
ImVec4(0.2f, 0.8f, 0.2f, 1.0f) :
ImVec4(0.5f, 0.5f, 0.5f, 1.0f);
ImGui::PushStyleColor(ImGuiCol_PlotHistogram, peak_color);
ImGui::ProgressBar(peak_normalized, ImVec2(-1, 20), "");
ImGui::PopStyleColor();
ImGui::Text("Peak: %.4f", current_peak_);
ImGui::Spacing();
ImGui::Separator();
ImGui::Spacing();
// VAD Threshold sliders
ImGui::Text("VAD Settings");
ImGui::Spacing();
ImGui::Text("RMS Threshold");
ImGui::SliderFloat("##rms_thresh", &vad_threshold_, 0.001f, 0.1f, "%.3f");
// Draw threshold line on the RMS meter area
ImGui::Spacing();
ImGui::Text("Peak Threshold");
ImGui::SliderFloat("##peak_thresh", &vad_peak_threshold_, 0.01f, 0.3f, "%.3f");
ImGui::Spacing();
ImGui::Separator();
ImGui::Spacing();
// Status indicator
bool is_active = (current_rms_ >= vad_threshold_) && (current_peak_ >= vad_peak_threshold_);
if (is_active) {
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.2f, 1.0f, 0.2f, 1.0f));
ImGui::Text(">> VOICE DETECTED <<");
ImGui::PopStyleColor();
} else {
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f));
ImGui::Text(" (silence)");
ImGui::PopStyleColor();
}
ImGui::Spacing();
ImGui::Separator();
ImGui::Spacing();
// Cost tracking
ImGui::Text("API Usage");
ImGui::Spacing();
// Audio sent to Whisper
int minutes = static_cast<int>(total_audio_seconds_) / 60;
int seconds = static_cast<int>(total_audio_seconds_) % 60;
ImGui::Text("Audio: %d:%02d", minutes, seconds);
ImGui::Text("Whisper calls: %d", whisper_calls_);
ImGui::Text("Claude calls: %d", claude_calls_);
ImGui::Spacing();
// Estimated cost
float cost = getEstimatedCost();
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.8f, 0.3f, 1.0f)); // Gold
ImGui::Text("Est. cost: $%.4f", cost);
ImGui::PopStyleColor();
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f));
ImGui::Text("(Whisper $0.006/min)");
ImGui::Text("(Haiku ~$0.001/call)");
ImGui::PopStyleColor();
}
} // namespace secondvoice } // namespace secondvoice

View File

@ -23,27 +23,66 @@ public:
bool shouldClose() const; bool shouldClose() const;
void addTranslation(const std::string& chinese, const std::string& french); void addTranslation(const std::string& chinese, const std::string& french);
void setAccumulatedText(const std::string& chinese, const std::string& french);
bool isStopRequested() const { return stop_requested_; } bool isStopRequested() const { return stop_requested_; }
void resetStopRequest() { stop_requested_ = false; } void resetStopRequest() { stop_requested_ = false; }
bool isClearRequested() const { return clear_requested_; }
void resetClearRequest() { clear_requested_ = false; }
void setRecordingDuration(int seconds) { recording_duration_ = seconds; } void setRecordingDuration(int seconds) { recording_duration_ = seconds; }
void setProcessingStatus(const std::string& status) { processing_status_ = status; } void setProcessingStatus(const std::string& status) { processing_status_ = status; }
// Audio level monitoring
void setCurrentRMS(float rms) { current_rms_ = rms; }
void setCurrentPeak(float peak) { current_peak_ = peak; }
float getVadThreshold() const { return vad_threshold_; }
float getVadPeakThreshold() const { return vad_peak_threshold_; }
private: private:
void renderAccumulated();
void renderTranslations(); void renderTranslations();
void renderControls(); void renderControls();
void renderStatus(); void renderStatus();
void renderAudioPanel();
int width_; int width_;
int height_; int height_;
GLFWwindow* window_ = nullptr; GLFWwindow* window_ = nullptr;
std::vector<TranslationMessage> messages_; std::vector<TranslationMessage> messages_;
std::string accumulated_chinese_;
std::string accumulated_french_;
bool stop_requested_ = false; bool stop_requested_ = false;
bool clear_requested_ = false;
bool auto_scroll_ = true; bool auto_scroll_ = true;
int recording_duration_ = 0; int recording_duration_ = 0;
std::string processing_status_; std::string processing_status_;
// Audio monitoring
float current_rms_ = 0.0f;
float current_peak_ = 0.0f;
float vad_threshold_ = 0.02f; // 2x higher to avoid false triggers
float vad_peak_threshold_ = 0.08f; // 2x higher
// Cost tracking
float total_audio_seconds_ = 0.0f;
int whisper_calls_ = 0;
int claude_calls_ = 0;
public:
void addAudioCost(float seconds) {
total_audio_seconds_ += seconds;
whisper_calls_++;
}
void addClaudeCost() { claude_calls_++; }
float getTotalAudioSeconds() const { return total_audio_seconds_; }
float getEstimatedCost() const {
// gpt-4o-mini-transcribe: $0.006/min = $0.0001/sec
// Haiku: ~$0.001 per short translation
return (total_audio_seconds_ * 0.0001f) + (claude_calls_ * 0.001f);
}
}; };
} // namespace secondvoice } // namespace secondvoice