feat: Add RNNoise denoising + transient suppressor + VAD improvements
- Add RNNoise neural network audio denoising (16kHz↔48kHz resampling)
- Add transient suppressor to filter claps/clicks/pops before RNNoise
- VAD now works on FILTERED audio (not raw) to avoid false triggers
- Real-time denoised audio level display in UI
- Save denoised audio previews in Opus format (.ogg)
- Add extensive Whisper hallucination filter (Tingting, music, etc.)
- Add "Clear" button to reset accumulated translations
- Double VAD thresholds (0.02/0.08) for less sensitivity
- Update Claude prompt to handle offensive content gracefully

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
fa8ea2907b
commit
741ca09663
@ -1,5 +1,5 @@
|
|||||||
cmake_minimum_required(VERSION 3.20)
|
cmake_minimum_required(VERSION 3.20)
|
||||||
project(SecondVoice VERSION 0.1.0 LANGUAGES CXX)
|
project(SecondVoice VERSION 0.1.0 LANGUAGES C CXX)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
@ -38,6 +38,35 @@ FetchContent_Declare(
|
|||||||
|
|
||||||
FetchContent_MakeAvailable(ogg opus)
|
FetchContent_MakeAvailable(ogg opus)
|
||||||
|
|
||||||
|
# Fetch RNNoise for noise reduction (using older stable version)
|
||||||
|
FetchContent_Declare(
|
||||||
|
rnnoise
|
||||||
|
GIT_REPOSITORY https://github.com/xiph/rnnoise.git
|
||||||
|
GIT_TAG v0.1.1
|
||||||
|
)
|
||||||
|
|
||||||
|
FetchContent_MakeAvailable(rnnoise)
|
||||||
|
|
||||||
|
# Build RNNoise as a static library (v0.1.1 structure)
|
||||||
|
add_library(rnnoise STATIC
|
||||||
|
${rnnoise_SOURCE_DIR}/src/denoise.c
|
||||||
|
${rnnoise_SOURCE_DIR}/src/rnn.c
|
||||||
|
${rnnoise_SOURCE_DIR}/src/rnn_data.c
|
||||||
|
${rnnoise_SOURCE_DIR}/src/pitch.c
|
||||||
|
${rnnoise_SOURCE_DIR}/src/kiss_fft.c
|
||||||
|
${rnnoise_SOURCE_DIR}/src/celt_lpc.c
|
||||||
|
)
|
||||||
|
|
||||||
|
target_include_directories(rnnoise PUBLIC
|
||||||
|
${rnnoise_SOURCE_DIR}/include
|
||||||
|
${rnnoise_SOURCE_DIR}/src
|
||||||
|
)
|
||||||
|
|
||||||
|
# RNNoise needs math library
|
||||||
|
if(UNIX)
|
||||||
|
target_link_libraries(rnnoise PRIVATE m)
|
||||||
|
endif()
|
||||||
|
|
||||||
# Create ImGui library with OpenGL/GLFW backends
|
# Create ImGui library with OpenGL/GLFW backends
|
||||||
add_library(imgui_backends
|
add_library(imgui_backends
|
||||||
${imgui_SOURCE_DIR}/imgui.cpp
|
${imgui_SOURCE_DIR}/imgui.cpp
|
||||||
@ -70,6 +99,7 @@ set(SOURCES_UI
|
|||||||
# Audio module
|
# Audio module
|
||||||
src/audio/AudioCapture.cpp
|
src/audio/AudioCapture.cpp
|
||||||
src/audio/AudioBuffer.cpp
|
src/audio/AudioBuffer.cpp
|
||||||
|
src/audio/NoiseReducer.cpp
|
||||||
# API clients
|
# API clients
|
||||||
src/api/WhisperClient.cpp
|
src/api/WhisperClient.cpp
|
||||||
src/api/ClaudeClient.cpp
|
src/api/ClaudeClient.cpp
|
||||||
@ -88,6 +118,7 @@ set(SOURCES_CONSOLE
|
|||||||
# Audio module
|
# Audio module
|
||||||
src/audio/AudioCapture.cpp
|
src/audio/AudioCapture.cpp
|
||||||
src/audio/AudioBuffer.cpp
|
src/audio/AudioBuffer.cpp
|
||||||
|
src/audio/NoiseReducer.cpp
|
||||||
# API clients
|
# API clients
|
||||||
src/api/WhisperClient.cpp
|
src/api/WhisperClient.cpp
|
||||||
src/api/ClaudeClient.cpp
|
src/api/ClaudeClient.cpp
|
||||||
@ -107,9 +138,11 @@ add_executable(${PROJECT_NAME}_Console ${SOURCES_CONSOLE})
|
|||||||
# Include directories
|
# Include directories
|
||||||
target_include_directories(${PROJECT_NAME} PRIVATE
|
target_include_directories(${PROJECT_NAME} PRIVATE
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/src
|
${CMAKE_CURRENT_SOURCE_DIR}/src
|
||||||
|
${rnnoise_SOURCE_DIR}/include
|
||||||
)
|
)
|
||||||
target_include_directories(${PROJECT_NAME}_Console PRIVATE
|
target_include_directories(${PROJECT_NAME}_Console PRIVATE
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/src
|
${CMAKE_CURRENT_SOURCE_DIR}/src
|
||||||
|
${rnnoise_SOURCE_DIR}/include
|
||||||
)
|
)
|
||||||
|
|
||||||
# Link libraries
|
# Link libraries
|
||||||
@ -124,6 +157,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
|
|||||||
OpenGL::GL
|
OpenGL::GL
|
||||||
opus
|
opus
|
||||||
ogg
|
ogg
|
||||||
|
rnnoise
|
||||||
# Windows system libraries
|
# Windows system libraries
|
||||||
winmm
|
winmm
|
||||||
setupapi
|
setupapi
|
||||||
@ -136,6 +170,7 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
|
|||||||
nlohmann_json::nlohmann_json
|
nlohmann_json::nlohmann_json
|
||||||
opus
|
opus
|
||||||
ogg
|
ogg
|
||||||
|
rnnoise
|
||||||
# Windows system libraries
|
# Windows system libraries
|
||||||
winmm
|
winmm
|
||||||
setupapi
|
setupapi
|
||||||
@ -152,6 +187,7 @@ else()
|
|||||||
OpenGL::GL
|
OpenGL::GL
|
||||||
opus
|
opus
|
||||||
ogg
|
ogg
|
||||||
|
rnnoise
|
||||||
)
|
)
|
||||||
# Console version - NO UI libs
|
# Console version - NO UI libs
|
||||||
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
|
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
|
||||||
@ -159,6 +195,7 @@ else()
|
|||||||
nlohmann_json::nlohmann_json
|
nlohmann_json::nlohmann_json
|
||||||
opus
|
opus
|
||||||
ogg
|
ogg
|
||||||
|
rnnoise
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@ -18,10 +18,10 @@
|
|||||||
"model": "claude-3-5-haiku-20241022",
|
"model": "claude-3-5-haiku-20241022",
|
||||||
"max_tokens": 1024,
|
"max_tokens": 1024,
|
||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre."
|
"system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre. Si du contenu offensant est présent, remplace-le par [contenu offensant] mais traduis impérativement le reste de la phrase."
|
||||||
},
|
},
|
||||||
"ui": {
|
"ui": {
|
||||||
"window_width": 1200,
|
"window_width": 1500,
|
||||||
"window_height": 800,
|
"window_height": 800,
|
||||||
"font_size": 24,
|
"font_size": 24,
|
||||||
"max_display_lines": 50
|
"max_display_lines": 50
|
||||||
|
|||||||
@ -1,14 +1,14 @@
|
|||||||
#include "AudioCapture.h"
|
#include "AudioCapture.h"
|
||||||
|
#include "NoiseReducer.h"
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
namespace secondvoice {
|
namespace secondvoice {
|
||||||
|
|
||||||
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds)
|
AudioCapture::AudioCapture(int sample_rate, int channels)
|
||||||
: sample_rate_(sample_rate)
|
: sample_rate_(sample_rate)
|
||||||
, channels_(channels)
|
, channels_(channels)
|
||||||
, chunk_duration_seconds_(chunk_duration_seconds)
|
, noise_reducer_(std::make_unique<NoiseReducer>()) {
|
||||||
, chunk_step_seconds_(chunk_step_seconds > 0 ? chunk_step_seconds : chunk_duration_seconds) {
|
std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
|
||||||
// If step not specified, use duration (no overlap)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
AudioCapture::~AudioCapture() {
|
AudioCapture::~AudioCapture() {
|
||||||
@ -35,31 +35,212 @@ int AudioCapture::audioCallback(const void* input, void* output,
|
|||||||
const PaStreamCallbackTimeInfo* time_info,
|
const PaStreamCallbackTimeInfo* time_info,
|
||||||
PaStreamCallbackFlags status_flags,
|
PaStreamCallbackFlags status_flags,
|
||||||
void* user_data) {
|
void* user_data) {
|
||||||
(void)output; // Unused
|
(void)output;
|
||||||
(void)time_info; // Unused
|
(void)time_info;
|
||||||
(void)status_flags; // Unused
|
(void)status_flags;
|
||||||
|
|
||||||
AudioCapture* self = static_cast<AudioCapture*>(user_data);
|
AudioCapture* self = static_cast<AudioCapture*>(user_data);
|
||||||
const float* in = static_cast<const float*>(input);
|
const float* in = static_cast<const float*>(input);
|
||||||
|
unsigned long sample_count = frame_count * self->channels_;
|
||||||
|
|
||||||
// Accumulate audio data
|
// === REAL-TIME DENOISING ===
|
||||||
for (unsigned long i = 0; i < frame_count * self->channels_; ++i) {
|
// Process audio through RNNoise in real-time for meter display
|
||||||
self->buffer_.push_back(in[i]);
|
std::vector<float> denoised_samples;
|
||||||
|
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
||||||
|
denoised_samples = self->noise_reducer_->processRealtime(in, sample_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if we have accumulated enough data for a chunk
|
// Calculate RMS and Peak from RAW audio (for VAD detection)
|
||||||
size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_;
|
float raw_sum_squared = 0.0f;
|
||||||
size_t step_samples = self->sample_rate_ * self->channels_ * self->chunk_step_seconds_;
|
float raw_max_amp = 0.0f;
|
||||||
|
for (unsigned long i = 0; i < sample_count; ++i) {
|
||||||
if (self->buffer_.size() >= chunk_samples) {
|
float sample = in[i];
|
||||||
// Call the callback with the full chunk
|
raw_sum_squared += sample * sample;
|
||||||
if (self->callback_) {
|
if (std::abs(sample) > raw_max_amp) {
|
||||||
// Send exactly chunk_samples (the window)
|
raw_max_amp = std::abs(sample);
|
||||||
std::vector<float> chunk(self->buffer_.begin(), self->buffer_.begin() + chunk_samples);
|
}
|
||||||
self->callback_(chunk);
|
}
|
||||||
|
float instant_rms = std::sqrt(raw_sum_squared / sample_count);
|
||||||
|
float instant_peak = raw_max_amp;
|
||||||
|
|
||||||
|
// Calculate RMS and Peak from DENOISED audio (for UI meter)
|
||||||
|
float denoised_rms = 0.0f;
|
||||||
|
float denoised_peak = 0.0f;
|
||||||
|
if (!denoised_samples.empty()) {
|
||||||
|
float sum_squared = 0.0f;
|
||||||
|
float max_amp = 0.0f;
|
||||||
|
for (const float& sample : denoised_samples) {
|
||||||
|
sum_squared += sample * sample;
|
||||||
|
if (std::abs(sample) > max_amp) {
|
||||||
|
max_amp = std::abs(sample);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
denoised_rms = std::sqrt(sum_squared / denoised_samples.size());
|
||||||
|
denoised_peak = max_amp;
|
||||||
|
} else {
|
||||||
|
// Fallback to raw if no denoised output yet
|
||||||
|
denoised_rms = instant_rms;
|
||||||
|
denoised_peak = instant_peak;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Smooth RMS/Peak for display (fast attack, slow decay) - USE DENOISED VALUES
|
||||||
|
constexpr float alpha_rise = 0.1f;
|
||||||
|
constexpr float alpha_fall = 0.02f;
|
||||||
|
|
||||||
|
float old_rms = self->current_rms_.load(std::memory_order_relaxed);
|
||||||
|
float old_peak = self->current_peak_.load(std::memory_order_relaxed);
|
||||||
|
|
||||||
|
float alpha_rms = (denoised_rms > old_rms) ? alpha_rise : alpha_fall;
|
||||||
|
float alpha_peak = (denoised_peak > old_peak) ? alpha_rise : alpha_fall;
|
||||||
|
|
||||||
|
self->current_rms_.store(alpha_rms * denoised_rms + (1.0f - alpha_rms) * old_rms, std::memory_order_relaxed);
|
||||||
|
self->current_peak_.store(alpha_peak * denoised_peak + (1.0f - alpha_peak) * old_peak, std::memory_order_relaxed);
|
||||||
|
|
||||||
|
// === VAD NOW WORKS ON FILTERED AUDIO ===
|
||||||
|
// This way, filtered noise/claps won't trigger VAD!
|
||||||
|
|
||||||
|
// Calculate Zero-Crossing Rate on DENOISED audio
|
||||||
|
float zcr = 0.0f;
|
||||||
|
if (!denoised_samples.empty() && denoised_samples.size() > 1) {
|
||||||
|
int zero_crossings = 0;
|
||||||
|
for (size_t i = 1; i < denoised_samples.size(); ++i) {
|
||||||
|
if ((denoised_samples[i] >= 0 && denoised_samples[i-1] < 0) ||
|
||||||
|
(denoised_samples[i] < 0 && denoised_samples[i-1] >= 0)) {
|
||||||
|
zero_crossings++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
zcr = static_cast<float>(zero_crossings) / denoised_samples.size();
|
||||||
|
}
|
||||||
|
self->last_zcr_ = zcr;
|
||||||
|
|
||||||
|
// Get user-adjustable thresholds
|
||||||
|
float rms_thresh = self->vad_rms_threshold_.load(std::memory_order_relaxed);
|
||||||
|
float peak_thresh = self->vad_peak_threshold_.load(std::memory_order_relaxed);
|
||||||
|
|
||||||
|
// Adaptive noise floor: track minimum DENOISED energy level
|
||||||
|
if (denoised_rms < self->noise_floor_ * 2.0f) {
|
||||||
|
self->noise_floor_ = self->noise_floor_ * (1.0f - self->noise_floor_alpha_) +
|
||||||
|
denoised_rms * self->noise_floor_alpha_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dynamic threshold: noise floor + user threshold as margin
|
||||||
|
float adaptive_rms_thresh = self->noise_floor_ + rms_thresh;
|
||||||
|
|
||||||
|
// Speech detection on DENOISED audio:
|
||||||
|
// 1. Energy above adaptive threshold
|
||||||
|
// 2. ZCR in speech range (not too high = noise, not too low = silence)
|
||||||
|
bool energy_ok = (denoised_rms >= adaptive_rms_thresh) || (denoised_peak >= peak_thresh);
|
||||||
|
bool zcr_ok = (zcr >= self->speech_zcr_min_) && (zcr <= self->speech_zcr_max_);
|
||||||
|
|
||||||
|
// Speech = energy OK AND (ZCR OK or very high energy)
|
||||||
|
bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f);
|
||||||
|
|
||||||
|
// Hang time logic: don't immediately cut on silence
|
||||||
|
if (frame_has_speech) {
|
||||||
|
self->hang_frames_ = self->hang_frames_threshold_; // Reset hang counter
|
||||||
|
} else if (self->hang_frames_ > 0) {
|
||||||
|
self->hang_frames_--;
|
||||||
|
frame_has_speech = true; // Keep "speaking" during hang time
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate durations in samples
|
||||||
|
int silence_samples_threshold = (self->silence_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
|
||||||
|
int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
|
||||||
|
int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
|
||||||
|
|
||||||
|
if (frame_has_speech) {
|
||||||
|
// Speech detected
|
||||||
|
self->is_speech_active_.store(true, std::memory_order_relaxed);
|
||||||
|
self->silence_samples_count_ = 0;
|
||||||
|
|
||||||
|
// Add DENOISED audio to buffer (already processed by processRealtime above)
|
||||||
|
if (!denoised_samples.empty()) {
|
||||||
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
||||||
|
denoised_samples.begin(), denoised_samples.end());
|
||||||
|
} else {
|
||||||
|
// Fallback to raw if denoising disabled
|
||||||
|
for (unsigned long i = 0; i < sample_count; ++i) {
|
||||||
|
self->speech_buffer_.push_back(in[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self->speech_samples_count_ += sample_count;
|
||||||
|
|
||||||
|
// Force flush if too long
|
||||||
|
if (self->speech_samples_count_ >= max_speech_samples) {
|
||||||
|
std::cout << "[VAD] Max duration reached, forcing flush ("
|
||||||
|
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
|
||||||
|
|
||||||
|
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
|
||||||
|
// Flush any remaining samples from the denoiser
|
||||||
|
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
||||||
|
auto remaining = self->noise_reducer_->flushStream();
|
||||||
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
||||||
|
remaining.begin(), remaining.end());
|
||||||
|
// Save preview for listening
|
||||||
|
self->noise_reducer_->savePreview(self->speech_buffer_);
|
||||||
|
}
|
||||||
|
self->callback_(self->speech_buffer_);
|
||||||
|
}
|
||||||
|
self->speech_buffer_.clear();
|
||||||
|
self->speech_samples_count_ = 0;
|
||||||
|
// Reset stream for next segment
|
||||||
|
if (self->noise_reducer_) {
|
||||||
|
self->noise_reducer_->resetStream();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// True silence (after hang time expired)
|
||||||
|
self->silence_samples_count_ += sample_count;
|
||||||
|
|
||||||
|
// If we were speaking and now have enough silence, flush
|
||||||
|
if (self->speech_buffer_.size() > 0) {
|
||||||
|
// Add trailing silence (denoised)
|
||||||
|
if (!denoised_samples.empty()) {
|
||||||
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
||||||
|
denoised_samples.begin(), denoised_samples.end());
|
||||||
|
} else {
|
||||||
|
for (unsigned long i = 0; i < sample_count; ++i) {
|
||||||
|
self->speech_buffer_.push_back(in[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (self->silence_samples_count_ >= silence_samples_threshold) {
|
||||||
|
self->is_speech_active_.store(false, std::memory_order_relaxed);
|
||||||
|
|
||||||
|
// Flush if we have enough speech
|
||||||
|
if (self->speech_samples_count_ >= min_speech_samples) {
|
||||||
|
// Flush any remaining samples from the denoiser
|
||||||
|
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
||||||
|
auto remaining = self->noise_reducer_->flushStream();
|
||||||
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
||||||
|
remaining.begin(), remaining.end());
|
||||||
|
// Save preview for listening
|
||||||
|
self->noise_reducer_->savePreview(self->speech_buffer_);
|
||||||
|
}
|
||||||
|
|
||||||
|
float duration = static_cast<float>(self->speech_buffer_.size()) /
|
||||||
|
(self->sample_rate_ * self->channels_);
|
||||||
|
std::cout << "[VAD] Speech ended (noise_floor=" << self->noise_floor_
|
||||||
|
<< "), flushing " << duration << "s (denoised)" << std::endl;
|
||||||
|
|
||||||
|
if (self->callback_) {
|
||||||
|
self->callback_(self->speech_buffer_);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
std::cout << "[VAD] Speech too short (" << self->speech_samples_count_
|
||||||
|
<< " samples), discarding" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
self->speech_buffer_.clear();
|
||||||
|
self->speech_samples_count_ = 0;
|
||||||
|
// Reset stream for next segment
|
||||||
|
if (self->noise_reducer_) {
|
||||||
|
self->noise_reducer_->resetStream();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
self->is_speech_active_.store(false, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
// Sliding window: remove only step_samples, keep overlap for next chunk
|
|
||||||
self->buffer_.erase(self->buffer_.begin(), self->buffer_.begin() + step_samples);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return paContinue;
|
return paContinue;
|
||||||
@ -71,7 +252,9 @@ bool AudioCapture::start(AudioCallback callback) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
callback_ = callback;
|
callback_ = callback;
|
||||||
buffer_.clear();
|
speech_buffer_.clear();
|
||||||
|
silence_samples_count_ = 0;
|
||||||
|
speech_samples_count_ = 0;
|
||||||
|
|
||||||
PaStreamParameters input_params;
|
PaStreamParameters input_params;
|
||||||
input_params.device = Pa_GetDefaultInputDevice();
|
input_params.device = Pa_GetDefaultInputDevice();
|
||||||
@ -88,7 +271,7 @@ bool AudioCapture::start(AudioCallback callback) {
|
|||||||
PaError err = Pa_OpenStream(
|
PaError err = Pa_OpenStream(
|
||||||
&stream_,
|
&stream_,
|
||||||
&input_params,
|
&input_params,
|
||||||
nullptr, // no output
|
nullptr,
|
||||||
sample_rate_,
|
sample_rate_,
|
||||||
paFramesPerBufferUnspecified,
|
paFramesPerBufferUnspecified,
|
||||||
paClipOff,
|
paClipOff,
|
||||||
@ -108,6 +291,9 @@ bool AudioCapture::start(AudioCallback callback) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
is_recording_ = true;
|
is_recording_ = true;
|
||||||
|
std::cout << "[VAD] Started with RMS threshold=" << vad_rms_threshold_
|
||||||
|
<< ", Peak threshold=" << vad_peak_threshold_
|
||||||
|
<< ", Silence duration=" << silence_duration_ms_ << "ms" << std::endl;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,8 +302,42 @@ void AudioCapture::stop() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Flush any remaining audio
|
||||||
|
if (speech_buffer_.size() > 0 && callback_) {
|
||||||
|
int min_speech_samples = (min_speech_duration_ms_ * sample_rate_ * channels_) / 1000;
|
||||||
|
if (speech_samples_count_ >= min_speech_samples) {
|
||||||
|
std::cout << "[VAD] Flushing remaining audio on stop (denoised)" << std::endl;
|
||||||
|
|
||||||
|
// Flush any remaining samples from the denoiser
|
||||||
|
if (noise_reducer_ && noise_reducer_->isEnabled()) {
|
||||||
|
auto remaining = noise_reducer_->flushStream();
|
||||||
|
speech_buffer_.insert(speech_buffer_.end(),
|
||||||
|
remaining.begin(), remaining.end());
|
||||||
|
// Save preview for listening
|
||||||
|
noise_reducer_->savePreview(speech_buffer_);
|
||||||
|
}
|
||||||
|
|
||||||
|
callback_(speech_buffer_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset the stream for next session
|
||||||
|
if (noise_reducer_) {
|
||||||
|
noise_reducer_->resetStream();
|
||||||
|
}
|
||||||
|
|
||||||
Pa_StopStream(stream_);
|
Pa_StopStream(stream_);
|
||||||
is_recording_ = false;
|
is_recording_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void AudioCapture::setDenoiseEnabled(bool enabled) {
|
||||||
|
if (noise_reducer_) {
|
||||||
|
noise_reducer_->setEnabled(enabled);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool AudioCapture::isDenoiseEnabled() const {
|
||||||
|
return noise_reducer_ && noise_reducer_->isEnabled();
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace secondvoice
|
} // namespace secondvoice
|
||||||
|
|||||||
@ -3,16 +3,20 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <atomic>
|
||||||
|
#include <cmath>
|
||||||
|
#include <memory>
|
||||||
#include <portaudio.h>
|
#include <portaudio.h>
|
||||||
|
|
||||||
namespace secondvoice {
|
namespace secondvoice {
|
||||||
|
|
||||||
|
class NoiseReducer;
|
||||||
|
|
||||||
class AudioCapture {
|
class AudioCapture {
|
||||||
public:
|
public:
|
||||||
using AudioCallback = std::function<void(const std::vector<float>&)>;
|
using AudioCallback = std::function<void(const std::vector<float>&)>;
|
||||||
|
|
||||||
// chunk_duration = window size, chunk_step = how often to send (overlap = duration - step)
|
AudioCapture(int sample_rate, int channels);
|
||||||
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds = 0);
|
|
||||||
~AudioCapture();
|
~AudioCapture();
|
||||||
|
|
||||||
bool initialize();
|
bool initialize();
|
||||||
@ -20,6 +24,26 @@ public:
|
|||||||
void stop();
|
void stop();
|
||||||
bool isRecording() const { return is_recording_; }
|
bool isRecording() const { return is_recording_; }
|
||||||
|
|
||||||
|
// Real-time audio levels (updated every callback, ~10ms)
|
||||||
|
float getCurrentRMS() const { return current_rms_; }
|
||||||
|
float getCurrentPeak() const { return current_peak_; }
|
||||||
|
|
||||||
|
// VAD settings (can be adjusted in real-time from UI)
|
||||||
|
void setVadThresholds(float rms_threshold, float peak_threshold) {
|
||||||
|
vad_rms_threshold_ = rms_threshold;
|
||||||
|
vad_peak_threshold_ = peak_threshold;
|
||||||
|
}
|
||||||
|
void setSilenceDuration(int ms) { silence_duration_ms_ = ms; }
|
||||||
|
void setMinSpeechDuration(int ms) { min_speech_duration_ms_ = ms; }
|
||||||
|
void setMaxSpeechDuration(int ms) { max_speech_duration_ms_ = ms; }
|
||||||
|
|
||||||
|
// VAD state (for UI display)
|
||||||
|
bool isSpeechDetected() const { return is_speech_active_; }
|
||||||
|
|
||||||
|
// Noise reduction
|
||||||
|
void setDenoiseEnabled(bool enabled);
|
||||||
|
bool isDenoiseEnabled() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static int audioCallback(const void* input, void* output,
|
static int audioCallback(const void* input, void* output,
|
||||||
unsigned long frame_count,
|
unsigned long frame_count,
|
||||||
@ -29,13 +53,45 @@ private:
|
|||||||
|
|
||||||
int sample_rate_;
|
int sample_rate_;
|
||||||
int channels_;
|
int channels_;
|
||||||
int chunk_duration_seconds_;
|
|
||||||
int chunk_step_seconds_; // How often to emit chunks (0 = same as duration, no overlap)
|
|
||||||
bool is_recording_ = false;
|
bool is_recording_ = false;
|
||||||
|
|
||||||
PaStream* stream_ = nullptr;
|
PaStream* stream_ = nullptr;
|
||||||
AudioCallback callback_;
|
AudioCallback callback_;
|
||||||
std::vector<float> buffer_;
|
|
||||||
|
// Speech accumulation buffer
|
||||||
|
std::vector<float> speech_buffer_;
|
||||||
|
|
||||||
|
// VAD state
|
||||||
|
std::atomic<bool> is_speech_active_{false};
|
||||||
|
int silence_samples_count_ = 0; // How many samples of silence
|
||||||
|
int speech_samples_count_ = 0; // How many samples of speech accumulated
|
||||||
|
|
||||||
|
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
|
||||||
|
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
|
||||||
|
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
|
||||||
|
int silence_duration_ms_ = 400; // Wait 400ms of silence before cutting
|
||||||
|
int min_speech_duration_ms_ = 300; // Minimum speech to send
|
||||||
|
int max_speech_duration_ms_ = 25000; // 25s max before forced flush
|
||||||
|
|
||||||
|
// Adaptive noise floor
|
||||||
|
float noise_floor_ = 0.005f; // Estimated background noise level
|
||||||
|
float noise_floor_alpha_ = 0.001f; // Slower adaptation
|
||||||
|
|
||||||
|
// Hang time - wait before cutting to avoid mid-sentence cuts
|
||||||
|
int hang_frames_ = 0;
|
||||||
|
int hang_frames_threshold_ = 20; // ~200ms tolerance for pauses
|
||||||
|
|
||||||
|
// Zero-crossing rate for speech vs noise discrimination
|
||||||
|
float last_zcr_ = 0.0f;
|
||||||
|
float speech_zcr_min_ = 0.01f; // More permissive lower bound
|
||||||
|
float speech_zcr_max_ = 0.25f; // More permissive upper bound
|
||||||
|
|
||||||
|
// Real-time audio levels (atomic for thread safety)
|
||||||
|
std::atomic<float> current_rms_{0.0f};
|
||||||
|
std::atomic<float> current_peak_{0.0f};
|
||||||
|
|
||||||
|
// Noise reduction
|
||||||
|
std::unique_ptr<NoiseReducer> noise_reducer_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace secondvoice
|
} // namespace secondvoice
|
||||||
|
|||||||
249
src/audio/NoiseReducer.cpp
Normal file
249
src/audio/NoiseReducer.cpp
Normal file
@ -0,0 +1,249 @@
|
|||||||
|
#include "NoiseReducer.h"
|
||||||
|
#include "AudioBuffer.h"
|
||||||
|
#include <rnnoise.h>
|
||||||
|
#include <cstring>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <filesystem>
|
||||||
|
|
||||||
|
namespace secondvoice {
|
||||||
|
|
||||||
|
/// Creates the RNNoise state and sizes the two scratch buffers used for
/// resampling: one RNNoise-sized frame and one input-sized frame.
NoiseReducer::NoiseReducer()
    : state_(rnnoise_create(nullptr)) {
    downsample_buffer_.resize(INPUT_FRAME_SIZE);
    upsample_buffer_.resize(RNNOISE_FRAME_SIZE);
}
|
||||||
|
|
||||||
|
/// Releases the RNNoise state if one was created.
NoiseReducer::~NoiseReducer() {
    if (state_ == nullptr) {
        return;
    }
    rnnoise_destroy(state_);
}
|
||||||
|
|
||||||
|
/// Expands `in_samples` input samples into `3 * in_samples` output samples
/// by linear interpolation (the 16kHz -> 48kHz step).
///
/// @param in          Source samples; `in_samples` values are read.
/// @param out         Destination; `3 * in_samples` values are written.
/// @param in_samples  Number of input samples.
///
/// The final input sample has no successor, so it is interpolated against
/// itself and its three output slots all hold the same value.
void NoiseReducer::upsample(const float* in, float* out, int in_samples) {
    float* dst = out;
    for (int src = 0; src < in_samples; ++src) {
        const float current = in[src];
        const bool has_successor = (src + 1 < in_samples);
        const float following = has_successor ? in[src + 1] : current;
        const float delta = following - current;

        dst[0] = current;
        dst[1] = current + delta * 0.333f;
        dst[2] = current + delta * 0.667f;
        dst += 3;
    }
}
|
||||||
|
|
||||||
|
/// Reduces the sample count by a factor of three (the 48kHz -> 16kHz step):
/// each output sample is the mean of three consecutive input samples.
///
/// @param in           Source samples; `3 * out_samples` values are read.
/// @param out          Destination; `out_samples` values are written.
/// @param out_samples  Number of output samples to produce.
void NoiseReducer::downsample(const float* in, float* out, int out_samples) {
    const float* triple = in;
    for (int dst = 0; dst < out_samples; ++dst) {
        // Averaging the group acts as a crude low-pass before decimation.
        out[dst] = (triple[0] + triple[1] + triple[2]) / 3.0f;
        triple += 3;
    }
}
|
||||||
|
|
||||||
|
void NoiseReducer::suppressTransients(float* samples, size_t count) {
|
||||||
|
// Transient suppressor: attenuates sudden spikes (claps, clicks, pops)
|
||||||
|
// while preserving speech which has smoother amplitude changes
|
||||||
|
|
||||||
|
int transients_detected = 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < count; ++i) {
|
||||||
|
float abs_sample = std::abs(samples[i]);
|
||||||
|
|
||||||
|
// IMPORTANT: Use envelope BEFORE updating for transient detection
|
||||||
|
// Otherwise the envelope rises with the clap and we miss it!
|
||||||
|
float safe_envelope = std::max(envelope_, 0.001f);
|
||||||
|
float ratio = abs_sample / safe_envelope;
|
||||||
|
|
||||||
|
bool is_transient = (ratio > transient_threshold_);
|
||||||
|
|
||||||
|
if (is_transient) {
|
||||||
|
// This is a transient - attenuate it heavily
|
||||||
|
float target_amplitude = safe_envelope * transient_ratio_;
|
||||||
|
float attenuation = target_amplitude / abs_sample;
|
||||||
|
samples[i] *= attenuation;
|
||||||
|
transients_detected++;
|
||||||
|
|
||||||
|
// DON'T update envelope from transients - keep it stable
|
||||||
|
// This prevents the envelope from "learning" the clap
|
||||||
|
} else {
|
||||||
|
// Only update envelope from non-transient samples
|
||||||
|
if (abs_sample > envelope_) {
|
||||||
|
envelope_ = envelope_ * (1.0f - envelope_attack_) + abs_sample * envelope_attack_;
|
||||||
|
} else {
|
||||||
|
envelope_ = envelope_ * (1.0f - envelope_release_) + abs_sample * envelope_release_;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (transients_detected > 10) {
|
||||||
|
std::cout << "[RNNoise] Suppressed " << transients_detected << " transient samples (clap/click)" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void NoiseReducer::process(std::vector<float>& samples) {
|
||||||
|
if (!enabled_ || !state_ || samples.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process in chunks of INPUT_FRAME_SIZE (160 samples = 10ms at 16kHz)
|
||||||
|
size_t num_frames = samples.size() / INPUT_FRAME_SIZE;
|
||||||
|
float duration_ms = static_cast<float>(samples.size()) / 16.0f; // 16 samples/ms at 16kHz
|
||||||
|
std::cout << "[RNNoise] Denoising " << duration_ms << "ms of audio (" << num_frames << " frames)" << std::endl;
|
||||||
|
|
||||||
|
for (size_t frame = 0; frame < num_frames; ++frame) {
|
||||||
|
float* frame_ptr = samples.data() + frame * INPUT_FRAME_SIZE;
|
||||||
|
|
||||||
|
// Upsample 16kHz -> 48kHz
|
||||||
|
upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);
|
||||||
|
|
||||||
|
// RNNoise expects float input in range [-32768, 32768] (like int16)
|
||||||
|
// Our input is [-1, 1], so scale up
|
||||||
|
for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
|
||||||
|
upsample_buffer_[i] *= 32768.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply RNNoise
|
||||||
|
rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());
|
||||||
|
|
||||||
|
// Scale back to [-1, 1]
|
||||||
|
for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
|
||||||
|
upsample_buffer_[i] /= 32768.0f;
|
||||||
|
// Clamp to prevent any overflow
|
||||||
|
upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Downsample 48kHz -> 16kHz
|
||||||
|
downsample(upsample_buffer_.data(), frame_ptr, INPUT_FRAME_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle remaining samples (less than a full frame) - just leave them as-is
|
||||||
|
// They'll be processed in the next call
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Non-mutating variant of process(): copies `samples`, runs process() on
/// the copy and returns it. The input vector is not modified.
std::vector<float> NoiseReducer::processCopy(const std::vector<float>& samples) {
    std::vector<float> denoised(samples.begin(), samples.end());
    process(denoised);
    return denoised;
}
|
||||||
|
|
||||||
|
std::vector<float> NoiseReducer::processRealtime(const float* samples, size_t count) {
|
||||||
|
std::vector<float> output;
|
||||||
|
|
||||||
|
if (!enabled_ || !state_ || count == 0) {
|
||||||
|
// Pass through unchanged
|
||||||
|
output.assign(samples, samples + count);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add new samples to streaming buffer
|
||||||
|
stream_input_buffer_.insert(stream_input_buffer_.end(), samples, samples + count);
|
||||||
|
|
||||||
|
// Process complete frames
|
||||||
|
while (stream_input_buffer_.size() >= INPUT_FRAME_SIZE) {
|
||||||
|
// Get frame pointer
|
||||||
|
float* frame_ptr = stream_input_buffer_.data();
|
||||||
|
|
||||||
|
// Step 1: Suppress transients (claps, clicks, pops) BEFORE RNNoise
|
||||||
|
suppressTransients(frame_ptr, INPUT_FRAME_SIZE);
|
||||||
|
|
||||||
|
// Step 2: Upsample 16kHz -> 48kHz
|
||||||
|
upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);
|
||||||
|
|
||||||
|
// Scale to int16 range for RNNoise
|
||||||
|
for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
|
||||||
|
upsample_buffer_[i] *= 32768.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply RNNoise
|
||||||
|
rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());
|
||||||
|
|
||||||
|
// Scale back to [-1, 1]
|
||||||
|
for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
|
||||||
|
upsample_buffer_[i] /= 32768.0f;
|
||||||
|
upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Downsample 48kHz -> 16kHz and add to output
|
||||||
|
std::vector<float> denoised_frame(INPUT_FRAME_SIZE);
|
||||||
|
downsample(upsample_buffer_.data(), denoised_frame.data(), INPUT_FRAME_SIZE);
|
||||||
|
output.insert(output.end(), denoised_frame.begin(), denoised_frame.end());
|
||||||
|
|
||||||
|
// Remove processed samples from buffer
|
||||||
|
stream_input_buffer_.erase(stream_input_buffer_.begin(),
|
||||||
|
stream_input_buffer_.begin() + INPUT_FRAME_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> NoiseReducer::flushStream() {
|
||||||
|
std::vector<float> output;
|
||||||
|
|
||||||
|
if (!enabled_ || !state_ || stream_input_buffer_.empty()) {
|
||||||
|
// Return any remaining samples as-is
|
||||||
|
output = std::move(stream_input_buffer_);
|
||||||
|
stream_input_buffer_.clear();
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process remaining samples (zero-pad to complete frame if needed)
|
||||||
|
while (!stream_input_buffer_.empty()) {
|
||||||
|
// Pad with zeros if we have less than a full frame
|
||||||
|
while (stream_input_buffer_.size() < INPUT_FRAME_SIZE) {
|
||||||
|
stream_input_buffer_.push_back(0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get frame pointer
|
||||||
|
float* frame_ptr = stream_input_buffer_.data();
|
||||||
|
|
||||||
|
// Step 1: Suppress transients
|
||||||
|
suppressTransients(frame_ptr, INPUT_FRAME_SIZE);
|
||||||
|
|
||||||
|
// Step 2: Upsample 16kHz -> 48kHz
|
||||||
|
upsample(frame_ptr, upsample_buffer_.data(), INPUT_FRAME_SIZE);
|
||||||
|
|
||||||
|
// Scale to int16 range for RNNoise
|
||||||
|
for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
|
||||||
|
upsample_buffer_[i] *= 32768.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply RNNoise
|
||||||
|
rnnoise_process_frame(state_, upsample_buffer_.data(), upsample_buffer_.data());
|
||||||
|
|
||||||
|
// Scale back to [-1, 1]
|
||||||
|
for (int i = 0; i < RNNOISE_FRAME_SIZE; ++i) {
|
||||||
|
upsample_buffer_[i] /= 32768.0f;
|
||||||
|
upsample_buffer_[i] = std::clamp(upsample_buffer_[i], -1.0f, 1.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Downsample 48kHz -> 16kHz and add to output
|
||||||
|
std::vector<float> denoised_frame(INPUT_FRAME_SIZE);
|
||||||
|
downsample(upsample_buffer_.data(), denoised_frame.data(), INPUT_FRAME_SIZE);
|
||||||
|
output.insert(output.end(), denoised_frame.begin(), denoised_frame.end());
|
||||||
|
|
||||||
|
// Remove processed samples from buffer
|
||||||
|
stream_input_buffer_.erase(stream_input_buffer_.begin(),
|
||||||
|
stream_input_buffer_.begin() + INPUT_FRAME_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
void NoiseReducer::savePreview(const std::vector<float>& samples) {
|
||||||
|
if (!save_preview_ || samples.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::filesystem::create_directories("./denoised");
|
||||||
|
std::string filename = "./denoised/denoised_" + std::to_string(preview_count_++) + ".ogg";
|
||||||
|
|
||||||
|
AudioBuffer buffer(16000, 1); // 16kHz mono
|
||||||
|
buffer.addSamples(samples);
|
||||||
|
if (buffer.saveToOpus(filename)) {
|
||||||
|
std::cout << "[RNNoise] Saved denoised preview: " << filename << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace secondvoice
|
||||||
86
src/audio/NoiseReducer.h
Normal file
86
src/audio/NoiseReducer.h
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
// Forward declaration of the opaque RNNoise state (defined by the RNNoise library)
struct DenoiseState;

namespace secondvoice {

/**
 * NoiseReducer - Wrapper around RNNoise for real-time audio denoising
 *
 * RNNoise expects 48kHz audio, but we work at 16kHz.
 * This wrapper handles resampling transparently.
 *
 * Two modes are offered:
 *  - batch:     process() / processCopy() on a complete buffer;
 *  - streaming: processRealtime() for incoming chunks, flushStream() at the
 *               end of a segment, resetStream() before starting a new one.
 *
 * The object holds a raw DenoiseState handle, so it is non-copyable.
 */
class NoiseReducer {
public:
    NoiseReducer();
    ~NoiseReducer();

    // state_ is a raw handle paired with a user-declared destructor; a
    // member-wise copy would leave two objects sharing (and presumably both
    // releasing) the same RNNoise state, so copying is disabled.
    NoiseReducer(const NoiseReducer&) = delete;
    NoiseReducer& operator=(const NoiseReducer&) = delete;

    // Process audio samples in-place (batch mode - for final output)
    // Input/output: float samples normalized to [-1, 1]
    void process(std::vector<float>& samples);

    // Process a buffer, returns denoised copy (input is left untouched)
    std::vector<float> processCopy(const std::vector<float>& samples);

    // Real-time streaming: process samples as they come in
    // Returns denoised samples (may be slightly delayed due to frame buffering)
    std::vector<float> processRealtime(const float* samples, size_t count);

    // Flush any remaining samples in the stream buffer (zero-padded if needed)
    std::vector<float> flushStream();

    // Clear the stream buffer (call when starting a new segment)
    void resetStream() { stream_input_buffer_.clear(); }

    // Enable/disable denoising
    void setEnabled(bool enabled) { enabled_ = enabled; }
    bool isEnabled() const { return enabled_; }

    // Enable/disable saving denoised audio for preview
    void setSavePreview(bool save) { save_preview_ = save; }
    bool isSavePreviewEnabled() const { return save_preview_; }

    // Save audio samples to Opus file (for preview)
    void savePreview(const std::vector<float>& samples);

private:
    DenoiseState* state_ = nullptr;  // RNNoise handle; null means "no denoiser available"
    bool enabled_ = true;
    bool save_preview_ = true;  // Save denoised audio for listening
    int preview_count_ = 0;     // Running index used in preview file names

    // RNNoise frame size (480 samples at 48kHz = 10ms)
    static constexpr int RNNOISE_FRAME_SIZE = 480;

    // Our frame size (160 samples at 16kHz = 10ms)
    static constexpr int INPUT_FRAME_SIZE = 160;

    // Resampling buffers
    std::vector<float> upsample_buffer_;
    std::vector<float> downsample_buffer_;

    // Streaming input buffer (for real-time processing)
    std::vector<float> stream_input_buffer_;

    // Transient suppressor state
    float envelope_ = 0.0f;              // Smoothed signal envelope
    float envelope_attack_ = 0.01f;      // Fast attack to track signal
    float envelope_release_ = 0.0005f;   // Slow release to maintain level
    float transient_threshold_ = 4.0f;   // Spike must be 4x envelope to be considered transient
    float transient_ratio_ = 0.1f;       // Attenuate transients to 10% of their level (a 90% reduction)

    // Suppress transient sounds (claps, clicks, pops)
    void suppressTransients(float* samples, size_t count);

    // Upsample 16kHz to 48kHz (3x)
    void upsample(const float* in, float* out, int in_samples);

    // Downsample 48kHz to 16kHz (3x)
    void downsample(const float* in, float* out, int out_samples);
};

} // namespace secondvoice
|
||||||
@ -24,18 +24,13 @@ Pipeline::~Pipeline() {
|
|||||||
bool Pipeline::initialize() {
|
bool Pipeline::initialize() {
|
||||||
auto& config = Config::getInstance();
|
auto& config = Config::getInstance();
|
||||||
|
|
||||||
// Initialize audio capture with sliding window (overlap)
|
// Initialize audio capture with VAD-based segmentation
|
||||||
audio_capture_ = std::make_unique<AudioCapture>(
|
audio_capture_ = std::make_unique<AudioCapture>(
|
||||||
config.getAudioConfig().sample_rate,
|
config.getAudioConfig().sample_rate,
|
||||||
config.getAudioConfig().channels,
|
config.getAudioConfig().channels
|
||||||
config.getAudioConfig().chunk_duration_seconds,
|
|
||||||
config.getAudioConfig().chunk_step_seconds
|
|
||||||
);
|
);
|
||||||
|
|
||||||
std::cout << "[Pipeline] Audio: " << config.getAudioConfig().chunk_duration_seconds
|
std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;
|
||||||
<< "s window, " << config.getAudioConfig().chunk_step_seconds
|
|
||||||
<< "s step (overlap: " << (config.getAudioConfig().chunk_duration_seconds - config.getAudioConfig().chunk_step_seconds)
|
|
||||||
<< "s)" << std::endl;
|
|
||||||
|
|
||||||
if (!audio_capture_->initialize()) {
|
if (!audio_capture_->initialize()) {
|
||||||
std::cerr << "Failed to initialize audio capture" << std::endl;
|
std::cerr << "Failed to initialize audio capture" << std::endl;
|
||||||
@ -76,7 +71,7 @@ bool Pipeline::start() {
|
|||||||
|
|
||||||
running_ = true;
|
running_ = true;
|
||||||
|
|
||||||
// Start background threads (NOT UI - that runs in main thread)
|
// Start background threads
|
||||||
audio_thread_ = std::thread(&Pipeline::audioThread, this);
|
audio_thread_ = std::thread(&Pipeline::audioThread, this);
|
||||||
processing_thread_ = std::thread(&Pipeline::processingThread, this);
|
processing_thread_ = std::thread(&Pipeline::processingThread, this);
|
||||||
|
|
||||||
@ -95,9 +90,8 @@ void Pipeline::stop() {
|
|||||||
audio_capture_->stop();
|
audio_capture_->stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Shutdown queues
|
// Shutdown queue
|
||||||
audio_queue_.shutdown();
|
audio_queue_.shutdown();
|
||||||
transcription_queue_.shutdown();
|
|
||||||
|
|
||||||
// Wait for background threads
|
// Wait for background threads
|
||||||
if (audio_thread_.joinable()) {
|
if (audio_thread_.joinable()) {
|
||||||
@ -146,7 +140,7 @@ void Pipeline::audioThread() {
|
|||||||
// Keep thread alive while recording
|
// Keep thread alive while recording
|
||||||
auto start_time = std::chrono::steady_clock::now();
|
auto start_time = std::chrono::steady_clock::now();
|
||||||
while (running_ && audio_capture_->isRecording()) {
|
while (running_ && audio_capture_->isRecording()) {
|
||||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||||
|
|
||||||
// Update duration
|
// Update duration
|
||||||
auto now = std::chrono::steady_clock::now();
|
auto now = std::chrono::steady_clock::now();
|
||||||
@ -157,11 +151,6 @@ void Pipeline::audioThread() {
|
|||||||
void Pipeline::processingThread() {
|
void Pipeline::processingThread() {
|
||||||
auto& config = Config::getInstance();
|
auto& config = Config::getInstance();
|
||||||
|
|
||||||
// VAD threshold - audio RMS must exceed this to be considered speech
|
|
||||||
// Higher values = more aggressive noise filtering
|
|
||||||
constexpr float VAD_THRESHOLD = 0.02f; // RMS energy threshold
|
|
||||||
constexpr float VAD_MIN_PEAK = 0.08f; // Minimum peak amplitude
|
|
||||||
|
|
||||||
while (running_) {
|
while (running_) {
|
||||||
auto chunk_opt = audio_queue_.wait_and_pop();
|
auto chunk_opt = audio_queue_.wait_and_pop();
|
||||||
if (!chunk_opt.has_value()) {
|
if (!chunk_opt.has_value()) {
|
||||||
@ -169,22 +158,8 @@ void Pipeline::processingThread() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto& chunk = chunk_opt.value();
|
auto& chunk = chunk_opt.value();
|
||||||
|
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
|
||||||
// Voice Activity Detection - skip silent/noise chunks
|
std::cout << "[Processing] Speech segment: " << duration << "s" << std::endl;
|
||||||
float sum_squared = 0.0f;
|
|
||||||
float max_amplitude = 0.0f;
|
|
||||||
for (const float sample : chunk.data) {
|
|
||||||
sum_squared += sample * sample;
|
|
||||||
max_amplitude = std::max(max_amplitude, std::abs(sample));
|
|
||||||
}
|
|
||||||
float rms = std::sqrt(sum_squared / chunk.data.size());
|
|
||||||
|
|
||||||
if (rms < VAD_THRESHOLD || max_amplitude < VAD_MIN_PEAK) {
|
|
||||||
std::cout << "[Skip] Silent chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << "[Audio] Processing chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
|
|
||||||
|
|
||||||
// Transcribe with Whisper
|
// Transcribe with Whisper
|
||||||
auto whisper_result = whisper_client_->transcribe(
|
auto whisper_result = whisper_client_->transcribe(
|
||||||
@ -203,8 +178,9 @@ void Pipeline::processingThread() {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Filter out Whisper hallucinations (common when audio is silent/unclear)
|
// Filter out Whisper hallucinations
|
||||||
std::string text = whisper_result->text;
|
std::string text = whisper_result->text;
|
||||||
|
|
||||||
// Trim whitespace
|
// Trim whitespace
|
||||||
size_t start = text.find_first_not_of(" \t\n\r");
|
size_t start = text.find_first_not_of(" \t\n\r");
|
||||||
size_t end = text.find_last_not_of(" \t\n\r");
|
size_t end = text.find_last_not_of(" \t\n\r");
|
||||||
@ -215,28 +191,61 @@ void Pipeline::processingThread() {
|
|||||||
text = text.substr(start, end - start + 1);
|
text = text.substr(start, end - start + 1);
|
||||||
|
|
||||||
// Skip known hallucinations and garbage
|
// Skip known hallucinations and garbage
|
||||||
|
// Whisper hallucinates these when given silence/noise
|
||||||
bool is_garbage = false;
|
bool is_garbage = false;
|
||||||
|
|
||||||
// Too short to be meaningful
|
|
||||||
if (text.length() < 4) {
|
if (text.length() < 4) {
|
||||||
is_garbage = true;
|
is_garbage = true;
|
||||||
}
|
} else if (
|
||||||
// Known Whisper hallucinations
|
// Common Whisper hallucinations
|
||||||
else if (text.find("Amara.org") != std::string::npos ||
|
text.find("Tingting") != std::string::npos ||
|
||||||
text.find("amara.org") != std::string::npos ||
|
text.find("tingting") != std::string::npos ||
|
||||||
text.find("字幕") != std::string::npos ||
|
text.find("婷婷") != std::string::npos ||
|
||||||
text.find("subtitle") != std::string::npos ||
|
text.find("Thank you") != std::string::npos ||
|
||||||
text.find("Subtitle") != std::string::npos ||
|
text.find("thanks for watching") != std::string::npos ||
|
||||||
text.find("Thank you") != std::string::npos ||
|
text.find("Thanks for watching") != std::string::npos ||
|
||||||
text.find("thanks for watching") != std::string::npos ||
|
text.find("Subscribe") != std::string::npos ||
|
||||||
text.find("Subscribe") != std::string::npos ||
|
text.find("subscribe") != std::string::npos ||
|
||||||
text.find("谢谢观看") != std::string::npos ||
|
text.find("請訂閱") != std::string::npos ||
|
||||||
text.find("订阅") != std::string::npos ||
|
text.find("请订阅") != std::string::npos ||
|
||||||
text.find("...") == 0) { // Starts with ellipsis
|
text.find("訂閱") != std::string::npos ||
|
||||||
|
text.find("订阅") != std::string::npos ||
|
||||||
|
text.find("Amara.org") != std::string::npos ||
|
||||||
|
text.find("amara.org") != std::string::npos ||
|
||||||
|
text.find("字幕") != std::string::npos ||
|
||||||
|
text.find("subtitle") != std::string::npos ||
|
||||||
|
text.find("Subtitle") != std::string::npos ||
|
||||||
|
text.find("谢谢观看") != std::string::npos ||
|
||||||
|
text.find("謝謝觀看") != std::string::npos ||
|
||||||
|
text.find("感谢收看") != std::string::npos ||
|
||||||
|
text.find("感謝收看") != std::string::npos ||
|
||||||
|
text.find("下期再见") != std::string::npos ||
|
||||||
|
text.find("下期再見") != std::string::npos ||
|
||||||
|
text.find("再见") != std::string::npos ||
|
||||||
|
text.find("再見") != std::string::npos ||
|
||||||
|
text.find("music") != std::string::npos ||
|
||||||
|
text.find("Music") != std::string::npos ||
|
||||||
|
text.find("♪") != std::string::npos ||
|
||||||
|
text.find("silenc") != std::string::npos || // silence, silencieux
|
||||||
|
text.find("Silenc") != std::string::npos ||
|
||||||
|
text.find("...") == 0 ||
|
||||||
|
// Repetitive single words/sounds
|
||||||
|
text == "Bonjour" ||
|
||||||
|
text == "Hello" ||
|
||||||
|
text == "Hi" ||
|
||||||
|
text == "你好" ||
|
||||||
|
text == "好" ||
|
||||||
|
text == "嗯" ||
|
||||||
|
text == "啊" ||
|
||||||
|
text == "哈" ||
|
||||||
|
text == "呵呵" ||
|
||||||
|
text == "哈哈" ||
|
||||||
|
text == "嘻嘻" ||
|
||||||
|
text == "Hmm" ||
|
||||||
|
text == "Huh" ||
|
||||||
|
text == "Um" ||
|
||||||
|
text == "Uh") {
|
||||||
is_garbage = true;
|
is_garbage = true;
|
||||||
}
|
} else {
|
||||||
// Only punctuation or whitespace
|
|
||||||
else {
|
|
||||||
bool has_content = false;
|
bool has_content = false;
|
||||||
for (char c : text) {
|
for (char c : text) {
|
||||||
if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) {
|
if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) {
|
||||||
@ -244,9 +253,7 @@ void Pipeline::processingThread() {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!has_content) {
|
if (!has_content) is_garbage = true;
|
||||||
is_garbage = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is_garbage) {
|
if (is_garbage) {
|
||||||
@ -254,20 +261,14 @@ void Pipeline::processingThread() {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove overlap with previous transcription
|
// Track audio cost
|
||||||
std::string deduplicated = removeOverlap(text, last_transcription_);
|
if (ui_) {
|
||||||
last_transcription_ = text; // Store full text for next comparison
|
ui_->addAudioCost(duration);
|
||||||
|
|
||||||
if (deduplicated.empty() || deduplicated.length() < 2) {
|
|
||||||
std::cout << "[Skip] Fully overlapping transcription" << std::endl;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << "[New content] " << deduplicated << std::endl;
|
// Translate with Claude
|
||||||
|
|
||||||
// Translate with Claude (only the new, deduplicated content)
|
|
||||||
auto claude_result = claude_client_->translate(
|
auto claude_result = claude_client_->translate(
|
||||||
deduplicated,
|
text,
|
||||||
config.getClaudeConfig().system_prompt,
|
config.getClaudeConfig().system_prompt,
|
||||||
config.getClaudeConfig().max_tokens,
|
config.getClaudeConfig().max_tokens,
|
||||||
config.getClaudeConfig().temperature
|
config.getClaudeConfig().temperature
|
||||||
@ -278,10 +279,27 @@ void Pipeline::processingThread() {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add to UI
|
// Track Claude cost
|
||||||
ui_->addTranslation(deduplicated, claude_result->text);
|
if (ui_) {
|
||||||
|
ui_->addClaudeCost();
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << "CN: " << deduplicated << std::endl;
|
// Simple accumulation
|
||||||
|
if (!accumulated_chinese_.empty()) {
|
||||||
|
accumulated_chinese_ += " ";
|
||||||
|
accumulated_french_ += " ";
|
||||||
|
}
|
||||||
|
accumulated_chinese_ += text;
|
||||||
|
accumulated_french_ += claude_result->text;
|
||||||
|
|
||||||
|
std::cout << "[Accumulated CN] " << accumulated_chinese_ << std::endl;
|
||||||
|
std::cout << "[Accumulated FR] " << accumulated_french_ << std::endl;
|
||||||
|
|
||||||
|
// Update UI
|
||||||
|
ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_);
|
||||||
|
ui_->addTranslation(text, claude_result->text);
|
||||||
|
|
||||||
|
std::cout << "CN: " << text << std::endl;
|
||||||
std::cout << "FR: " << claude_result->text << std::endl;
|
std::cout << "FR: " << claude_result->text << std::endl;
|
||||||
std::cout << "---" << std::endl;
|
std::cout << "---" << std::endl;
|
||||||
}
|
}
|
||||||
@ -290,58 +308,45 @@ void Pipeline::processingThread() {
|
|||||||
void Pipeline::update() {
|
void Pipeline::update() {
|
||||||
if (!ui_) return;
|
if (!ui_) return;
|
||||||
|
|
||||||
|
// Sync VAD thresholds from UI to AudioCapture
|
||||||
|
if (audio_capture_) {
|
||||||
|
audio_capture_->setVadThresholds(
|
||||||
|
ui_->getVadThreshold(),
|
||||||
|
ui_->getVadPeakThreshold()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update UI with audio levels
|
||||||
|
ui_->setCurrentRMS(audio_capture_->getCurrentRMS());
|
||||||
|
ui_->setCurrentPeak(audio_capture_->getCurrentPeak());
|
||||||
|
}
|
||||||
|
|
||||||
ui_->setRecordingDuration(recording_duration_);
|
ui_->setRecordingDuration(recording_duration_);
|
||||||
ui_->setProcessingStatus("Processing...");
|
ui_->setProcessingStatus(audio_capture_ && audio_capture_->isSpeechDetected() ? "Recording speech..." : "Listening...");
|
||||||
ui_->render();
|
ui_->render();
|
||||||
|
|
||||||
// Check if stop was requested
|
// Check if stop was requested
|
||||||
if (ui_->isStopRequested()) {
|
if (ui_->isStopRequested()) {
|
||||||
running_ = false;
|
running_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check if clear was requested
|
||||||
|
if (ui_->isClearRequested()) {
|
||||||
|
clearAccumulated();
|
||||||
|
ui_->resetClearRequest();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Pipeline::shouldClose() const {
|
bool Pipeline::shouldClose() const {
|
||||||
return ui_ ? ui_->shouldClose() : true;
|
return ui_ ? ui_->shouldClose() : true;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string Pipeline::removeOverlap(const std::string& current, const std::string& previous) {
|
void Pipeline::clearAccumulated() {
|
||||||
if (previous.empty()) {
|
accumulated_chinese_.clear();
|
||||||
return current;
|
accumulated_french_.clear();
|
||||||
|
if (ui_) {
|
||||||
|
ui_->setAccumulatedText("", "");
|
||||||
}
|
}
|
||||||
|
std::cout << "[Pipeline] Cleared accumulated text" << std::endl;
|
||||||
// Try to find overlap: check if current starts with end of previous
|
|
||||||
// Start from half the previous text (since we have 50% overlap)
|
|
||||||
size_t min_overlap = 4; // Minimum characters to consider as overlap
|
|
||||||
size_t search_start = previous.length() / 3; // Start looking from 1/3 into previous
|
|
||||||
|
|
||||||
for (size_t i = search_start; i < previous.length() - min_overlap; ++i) {
|
|
||||||
std::string suffix = previous.substr(i);
|
|
||||||
if (current.find(suffix) == 0) {
|
|
||||||
// Found overlap - return only the new part
|
|
||||||
std::string new_part = current.substr(suffix.length());
|
|
||||||
if (!new_part.empty()) {
|
|
||||||
std::cout << "[Overlap] Removed: \"" << suffix << "\"" << std::endl;
|
|
||||||
return new_part;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// No overlap found - check for partial word overlap at start
|
|
||||||
// This handles cases where the same content appears but tokenized differently
|
|
||||||
size_t max_check = std::min(current.length(), previous.length()) / 2;
|
|
||||||
for (size_t len = max_check; len >= min_overlap; --len) {
|
|
||||||
std::string end_prev = previous.substr(previous.length() - len);
|
|
||||||
std::string start_curr = current.substr(0, len);
|
|
||||||
if (end_prev == start_curr) {
|
|
||||||
std::string new_part = current.substr(len);
|
|
||||||
if (!new_part.empty()) {
|
|
||||||
std::cout << "[Overlap] Partial match removed: \"" << end_prev << "\"" << std::endl;
|
|
||||||
return new_part;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return current; // No overlap detected
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace secondvoice
|
} // namespace secondvoice
|
||||||
|
|||||||
@ -21,10 +21,6 @@ struct AudioChunk {
|
|||||||
int channels;
|
int channels;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TranscriptionResult {
|
|
||||||
std::string chinese_text;
|
|
||||||
};
|
|
||||||
|
|
||||||
class Pipeline {
|
class Pipeline {
|
||||||
public:
|
public:
|
||||||
Pipeline();
|
Pipeline();
|
||||||
@ -40,6 +36,9 @@ public:
|
|||||||
bool isRunning() const { return running_; }
|
bool isRunning() const { return running_; }
|
||||||
bool shouldClose() const;
|
bool shouldClose() const;
|
||||||
|
|
||||||
|
// Clear accumulated translations
|
||||||
|
void clearAccumulated();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void audioThread();
|
void audioThread();
|
||||||
void processingThread();
|
void processingThread();
|
||||||
@ -51,7 +50,6 @@ private:
|
|||||||
std::unique_ptr<AudioBuffer> full_recording_;
|
std::unique_ptr<AudioBuffer> full_recording_;
|
||||||
|
|
||||||
ThreadSafeQueue<AudioChunk> audio_queue_;
|
ThreadSafeQueue<AudioChunk> audio_queue_;
|
||||||
ThreadSafeQueue<TranscriptionResult> transcription_queue_;
|
|
||||||
|
|
||||||
std::thread audio_thread_;
|
std::thread audio_thread_;
|
||||||
std::thread processing_thread_;
|
std::thread processing_thread_;
|
||||||
@ -59,9 +57,9 @@ private:
|
|||||||
std::atomic<bool> running_{false};
|
std::atomic<bool> running_{false};
|
||||||
std::atomic<int> recording_duration_{0};
|
std::atomic<int> recording_duration_{0};
|
||||||
|
|
||||||
// For overlap deduplication
|
// Simple accumulation
|
||||||
std::string last_transcription_;
|
std::string accumulated_chinese_;
|
||||||
std::string removeOverlap(const std::string& current, const std::string& previous);
|
std::string accumulated_french_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace secondvoice
|
} // namespace secondvoice
|
||||||
|
|||||||
@ -199,10 +199,31 @@ void TranslationUI::render() {
|
|||||||
ImGuiWindowFlags_NoMove |
|
ImGuiWindowFlags_NoMove |
|
||||||
ImGuiWindowFlags_NoCollapse);
|
ImGuiWindowFlags_NoCollapse);
|
||||||
|
|
||||||
|
// Two-column layout: main content on left, audio panel on right
|
||||||
|
float audio_panel_width = 280.0f;
|
||||||
|
float main_width = ImGui::GetWindowWidth() - audio_panel_width - 20.0f;
|
||||||
|
|
||||||
|
// Left column - main content
|
||||||
|
ImGui::BeginChild("MainContent", ImVec2(main_width, -1), false);
|
||||||
|
|
||||||
|
ImGui::Text("SecondVoice - Live Translation");
|
||||||
|
ImGui::Separator();
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
renderAccumulated();
|
||||||
renderTranslations();
|
renderTranslations();
|
||||||
renderControls();
|
renderControls();
|
||||||
renderStatus();
|
renderStatus();
|
||||||
|
|
||||||
|
ImGui::EndChild();
|
||||||
|
|
||||||
|
ImGui::SameLine();
|
||||||
|
|
||||||
|
// Right column - audio panel
|
||||||
|
ImGui::BeginChild("AudioPanel", ImVec2(audio_panel_width, -1), true);
|
||||||
|
renderAudioPanel();
|
||||||
|
ImGui::EndChild();
|
||||||
|
|
||||||
ImGui::End();
|
ImGui::End();
|
||||||
|
|
||||||
// Rendering
|
// Rendering
|
||||||
@ -225,11 +246,49 @@ void TranslationUI::addTranslation(const std::string& chinese, const std::string
|
|||||||
messages_.push_back({chinese, french});
|
messages_.push_back({chinese, french});
|
||||||
}
|
}
|
||||||
|
|
||||||
void TranslationUI::renderTranslations() {
|
void TranslationUI::setAccumulatedText(const std::string& chinese, const std::string& french) {
|
||||||
ImGui::Text("SecondVoice - Live Translation");
|
accumulated_chinese_ = chinese;
|
||||||
|
accumulated_french_ = french;
|
||||||
|
}
|
||||||
|
|
||||||
|
void TranslationUI::renderAccumulated() {
|
||||||
|
if (accumulated_chinese_.empty() && accumulated_french_.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ImGui::Text("Texte Recomposé");
|
||||||
|
ImGui::SameLine();
|
||||||
|
if (ImGui::SmallButton("Clear")) {
|
||||||
|
clear_requested_ = true;
|
||||||
|
}
|
||||||
ImGui::Separator();
|
ImGui::Separator();
|
||||||
|
|
||||||
ImGui::BeginChild("Translations", ImVec2(0, -120), true);
|
ImGui::BeginChild("Accumulated", ImVec2(0, 150), true);
|
||||||
|
|
||||||
|
// Chinese accumulated text
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.9f, 0.5f, 1.0f)); // Yellow
|
||||||
|
ImGui::TextWrapped("%s", accumulated_chinese_.c_str());
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// French accumulated text
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 1.0f, 0.8f, 1.0f)); // Cyan
|
||||||
|
ImGui::TextWrapped("%s", accumulated_french_.c_str());
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
|
||||||
|
ImGui::EndChild();
|
||||||
|
|
||||||
|
ImGui::Spacing();
|
||||||
|
}
|
||||||
|
|
||||||
|
void TranslationUI::renderTranslations() {
|
||||||
|
ImGui::Text("Segments (détail)");
|
||||||
|
ImGui::Separator();
|
||||||
|
|
||||||
|
// Height depends on whether accumulated section is shown
|
||||||
|
float height = (accumulated_chinese_.empty() && accumulated_french_.empty()) ? -120 : -120;
|
||||||
|
ImGui::BeginChild("Translations", ImVec2(0, height), true);
|
||||||
|
|
||||||
for (const auto& msg : messages_) {
|
for (const auto& msg : messages_) {
|
||||||
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f));
|
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.5f, 0.8f, 1.0f, 1.0f));
|
||||||
@ -281,4 +340,101 @@ void TranslationUI::renderStatus() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void TranslationUI::renderAudioPanel() {
|
||||||
|
ImGui::Text("Audio Monitor");
|
||||||
|
ImGui::Separator();
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// Current volume display
|
||||||
|
ImGui::Text("Volume Level");
|
||||||
|
|
||||||
|
// RMS meter (horizontal bar)
|
||||||
|
float rms_normalized = std::min(current_rms_ * 10.0f, 1.0f); // Scale for visibility
|
||||||
|
ImVec4 rms_color = rms_normalized > vad_threshold_ * 10.0f ?
|
||||||
|
ImVec4(0.2f, 0.8f, 0.2f, 1.0f) : // Green if above threshold
|
||||||
|
ImVec4(0.5f, 0.5f, 0.5f, 1.0f); // Gray if below
|
||||||
|
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_PlotHistogram, rms_color);
|
||||||
|
ImGui::ProgressBar(rms_normalized, ImVec2(-1, 20), "");
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
|
||||||
|
ImGui::Text("RMS: %.4f", current_rms_);
|
||||||
|
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// Peak meter
|
||||||
|
ImGui::Text("Peak Level");
|
||||||
|
float peak_normalized = std::min(current_peak_, 1.0f);
|
||||||
|
ImVec4 peak_color = peak_normalized > vad_peak_threshold_ ?
|
||||||
|
ImVec4(0.2f, 0.8f, 0.2f, 1.0f) :
|
||||||
|
ImVec4(0.5f, 0.5f, 0.5f, 1.0f);
|
||||||
|
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_PlotHistogram, peak_color);
|
||||||
|
ImGui::ProgressBar(peak_normalized, ImVec2(-1, 20), "");
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
|
||||||
|
ImGui::Text("Peak: %.4f", current_peak_);
|
||||||
|
|
||||||
|
ImGui::Spacing();
|
||||||
|
ImGui::Separator();
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// VAD Threshold sliders
|
||||||
|
ImGui::Text("VAD Settings");
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
ImGui::Text("RMS Threshold");
|
||||||
|
ImGui::SliderFloat("##rms_thresh", &vad_threshold_, 0.001f, 0.1f, "%.3f");
|
||||||
|
|
||||||
|
// Draw threshold line on the RMS meter area
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
ImGui::Text("Peak Threshold");
|
||||||
|
ImGui::SliderFloat("##peak_thresh", &vad_peak_threshold_, 0.01f, 0.3f, "%.3f");
|
||||||
|
|
||||||
|
ImGui::Spacing();
|
||||||
|
ImGui::Separator();
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// Status indicator
|
||||||
|
bool is_active = (current_rms_ >= vad_threshold_) && (current_peak_ >= vad_peak_threshold_);
|
||||||
|
if (is_active) {
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.2f, 1.0f, 0.2f, 1.0f));
|
||||||
|
ImGui::Text(">> VOICE DETECTED <<");
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
} else {
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f));
|
||||||
|
ImGui::Text(" (silence)");
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
}
|
||||||
|
|
||||||
|
ImGui::Spacing();
|
||||||
|
ImGui::Separator();
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// Cost tracking
|
||||||
|
ImGui::Text("API Usage");
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// Audio sent to Whisper
|
||||||
|
int minutes = static_cast<int>(total_audio_seconds_) / 60;
|
||||||
|
int seconds = static_cast<int>(total_audio_seconds_) % 60;
|
||||||
|
ImGui::Text("Audio: %d:%02d", minutes, seconds);
|
||||||
|
ImGui::Text("Whisper calls: %d", whisper_calls_);
|
||||||
|
ImGui::Text("Claude calls: %d", claude_calls_);
|
||||||
|
|
||||||
|
ImGui::Spacing();
|
||||||
|
|
||||||
|
// Estimated cost
|
||||||
|
float cost = getEstimatedCost();
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(1.0f, 0.8f, 0.3f, 1.0f)); // Gold
|
||||||
|
ImGui::Text("Est. cost: $%.4f", cost);
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
|
||||||
|
ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.6f, 0.6f, 0.6f, 1.0f));
|
||||||
|
ImGui::Text("(Whisper $0.006/min)");
|
||||||
|
ImGui::Text("(Haiku ~$0.001/call)");
|
||||||
|
ImGui::PopStyleColor();
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace secondvoice
|
} // namespace secondvoice
|
||||||
|
|||||||
@ -23,27 +23,66 @@ public:
|
|||||||
bool shouldClose() const;
|
bool shouldClose() const;
|
||||||
|
|
||||||
void addTranslation(const std::string& chinese, const std::string& french);
|
void addTranslation(const std::string& chinese, const std::string& french);
|
||||||
|
void setAccumulatedText(const std::string& chinese, const std::string& french);
|
||||||
// Reports the current value of the stop-request flag.
bool isStopRequested() const {
    return stop_requested_;
}
|
||||||
// Clears the stop-request flag so that isStopRequested() reports false again.
void resetStopRequest() {
    stop_requested_ = false;
}
|
||||||
|
|
||||||
|
// Reports the current value of the clear-request flag.
bool isClearRequested() const {
    return clear_requested_;
}
|
||||||
|
// Clears the clear-request flag so that isClearRequested() reports false again.
void resetClearRequest() {
    clear_requested_ = false;
}
|
||||||
|
|
||||||
// Stores the recording duration, expressed in seconds.
void setRecordingDuration(int seconds) {
    recording_duration_ = seconds;
}
|
||||||
// Replaces the stored processing-status text with a copy of `status`.
void setProcessingStatus(const std::string& status) {
    processing_status_ = status;
}
|
||||||
|
|
||||||
|
// Audio level monitoring
|
||||||
|
// Stores the latest RMS level; the audio panel reads it for the volume
// meter and the "RMS: ..." readout.
void setCurrentRMS(float rms) {
    current_rms_ = rms;
}
|
||||||
|
// Stores the latest peak level; the audio panel reads it for the peak
// meter and the "Peak: ..." readout.
void setCurrentPeak(float peak) {
    current_peak_ = peak;
}
|
||||||
|
// Returns the RMS threshold used for voice-activity detection. The value
// can be changed at runtime through the "RMS Threshold" slider.
float getVadThreshold() const {
    return vad_threshold_;
}
|
||||||
|
// Returns the peak threshold used for voice-activity detection. The value
// can be changed at runtime through the "Peak Threshold" slider.
float getVadPeakThreshold() const {
    return vad_peak_threshold_;
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void renderAccumulated();
|
||||||
void renderTranslations();
|
void renderTranslations();
|
||||||
void renderControls();
|
void renderControls();
|
||||||
void renderStatus();
|
void renderStatus();
|
||||||
|
void renderAudioPanel();
|
||||||
|
|
||||||
int width_;
|
int width_;
|
||||||
int height_;
|
int height_;
|
||||||
GLFWwindow* window_ = nullptr;
|
GLFWwindow* window_ = nullptr;
|
||||||
|
|
||||||
std::vector<TranslationMessage> messages_;
|
std::vector<TranslationMessage> messages_;
|
||||||
|
std::string accumulated_chinese_;
|
||||||
|
std::string accumulated_french_;
|
||||||
bool stop_requested_ = false;
|
bool stop_requested_ = false;
|
||||||
|
bool clear_requested_ = false;
|
||||||
bool auto_scroll_ = true;
|
bool auto_scroll_ = true;
|
||||||
|
|
||||||
int recording_duration_ = 0;
|
int recording_duration_ = 0;
|
||||||
std::string processing_status_;
|
std::string processing_status_;
|
||||||
|
|
||||||
|
// Audio monitoring
|
||||||
|
float current_rms_ = 0.0f;
|
||||||
|
float current_peak_ = 0.0f;
|
||||||
|
float vad_threshold_ = 0.02f; // 2x higher to avoid false triggers
|
||||||
|
float vad_peak_threshold_ = 0.08f; // 2x higher
|
||||||
|
|
||||||
|
// Cost tracking
|
||||||
|
float total_audio_seconds_ = 0.0f;
|
||||||
|
int whisper_calls_ = 0;
|
||||||
|
int claude_calls_ = 0;
|
||||||
|
|
||||||
|
public:
|
||||||
|
// Records one transcription request: bumps the Whisper call counter and
// adds `seconds` of audio to the running total used for the cost estimate.
void addAudioCost(float seconds) {
    ++whisper_calls_;
    total_audio_seconds_ += seconds;
}
|
||||||
|
// Records one Claude request by incrementing the call counter.
void addClaudeCost() {
    ++claude_calls_;
}
|
||||||
|
// Returns the accumulated audio duration, in seconds, recorded through
// addAudioCost().
float getTotalAudioSeconds() const {
    return total_audio_seconds_;
}
|
||||||
|
float getEstimatedCost() const {
|
||||||
|
// gpt-4o-mini-transcribe: $0.006/min = $0.0001/sec
|
||||||
|
// Haiku: ~$0.001 per short translation
|
||||||
|
return (total_audio_seconds_ * 0.0001f) + (claude_calls_ * 0.001f);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace secondvoice
|
} // namespace secondvoice
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user