secondvoice/src/audio/AudioCapture.cpp
StillHammer db0f8e5990 refactor: Improve VAD trailing silence detection and update docs
- Replace hang time logic with consecutive silence frame counter for more precise speech end detection
- Update Whisper prompt to utilize previous context for better transcription coherence
- Expand README with comprehensive feature list, architecture details, debugging status, and session logging structure
- Add troubleshooting section for real-world testing conditions and known issues
2025-12-02 09:44:06 +08:00

353 lines
14 KiB
C++

#include "AudioCapture.h"
#include "NoiseReducer.h"

#include <cmath>
#include <iostream>
#include <memory>
#include <vector>
namespace secondvoice {
// Construct a capture engine for the requested stream format and attach
// an RNNoise-based reducer; denoising is reported as active immediately.
AudioCapture::AudioCapture(int sample_rate, int channels)
    : sample_rate_(sample_rate),
      channels_(channels),
      noise_reducer_(std::make_unique<NoiseReducer>()) {
    std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
}
// Stop capture (flushing any pending speech), release the PortAudio
// stream handle if one was opened, then shut the library down.
AudioCapture::~AudioCapture() {
    stop();
    if (stream_ != nullptr) {
        Pa_CloseStream(stream_);
    }
    Pa_Terminate();
}
bool AudioCapture::initialize() {
std::cout << "[Audio] Initializing PortAudio..." << std::endl;
PaError err = Pa_Initialize();
if (err != paNoError) {
std::cerr << "[Audio] PortAudio init error: " << Pa_GetErrorText(err) << std::endl;
return false;
}
std::cout << "[Audio] PortAudio initialized successfully" << std::endl;
return true;
}
// PortAudio stream callback — runs on the real-time audio thread.
// Per buffer it:
//   1. Denoises the incoming interleaved float32 samples (RNNoise).
//   2. Computes RMS/peak on raw and denoised audio; smooths the denoised
//      values (fast attack, slow decay) into the atomics read by the UI.
//   3. Runs energy + zero-crossing-rate VAD on the DENOISED audio with an
//      adaptive noise floor.
//   4. Accumulates speech into speech_buffer_ and invokes callback_ when
//      trailing silence or the max segment duration is reached.
// NOTE(review): non-atomic members (speech_buffer_, noise_floor_, the
// frame/sample counters) appear to be touched only from this callback;
// the atomics are the cross-thread interface — confirm no other thread
// mutates the rest while the stream is running.
int AudioCapture::audioCallback(const void* input, void* output,
unsigned long frame_count,
const PaStreamCallbackTimeInfo* time_info,
PaStreamCallbackFlags status_flags,
void* user_data) {
(void)output;
(void)time_info;
(void)status_flags;
AudioCapture* self = static_cast<AudioCapture*>(user_data);
const float* in = static_cast<const float*>(input);
// Interleaved sample total for this buffer: frames * channels.
unsigned long sample_count = frame_count * self->channels_;
// === REAL-TIME DENOISING ===
// Process audio through RNNoise in real-time for meter display
std::vector<float> denoised_samples;
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
denoised_samples = self->noise_reducer_->processRealtime(in, sample_count);
}
// Calculate RMS and Peak from RAW audio (for VAD detection)
float raw_sum_squared = 0.0f;
float raw_max_amp = 0.0f;
for (unsigned long i = 0; i < sample_count; ++i) {
float sample = in[i];
raw_sum_squared += sample * sample;
if (std::abs(sample) > raw_max_amp) {
raw_max_amp = std::abs(sample);
}
}
// NOTE(review): divides by sample_count — assumes PortAudio never
// delivers an empty buffer (frame_count == 0); confirm, else this is NaN.
float instant_rms = std::sqrt(raw_sum_squared / sample_count);
float instant_peak = raw_max_amp;
// Calculate RMS and Peak from DENOISED audio (for UI meter)
float denoised_rms = 0.0f;
float denoised_peak = 0.0f;
if (!denoised_samples.empty()) {
float sum_squared = 0.0f;
float max_amp = 0.0f;
for (const float& sample : denoised_samples) {
sum_squared += sample * sample;
if (std::abs(sample) > max_amp) {
max_amp = std::abs(sample);
}
}
denoised_rms = std::sqrt(sum_squared / denoised_samples.size());
denoised_peak = max_amp;
} else {
// Fallback to raw if no denoised output yet
denoised_rms = instant_rms;
denoised_peak = instant_peak;
}
// Smooth RMS/Peak for display (fast attack, slow decay) - USE DENOISED VALUES
constexpr float alpha_rise = 0.1f;
constexpr float alpha_fall = 0.02f;
float old_rms = self->current_rms_.load(std::memory_order_relaxed);
float old_peak = self->current_peak_.load(std::memory_order_relaxed);
// Rising levels track quickly; falling levels decay slowly so the
// meter does not flicker.
float alpha_rms = (denoised_rms > old_rms) ? alpha_rise : alpha_fall;
float alpha_peak = (denoised_peak > old_peak) ? alpha_rise : alpha_fall;
self->current_rms_.store(alpha_rms * denoised_rms + (1.0f - alpha_rms) * old_rms, std::memory_order_relaxed);
self->current_peak_.store(alpha_peak * denoised_peak + (1.0f - alpha_peak) * old_peak, std::memory_order_relaxed);
// === VAD NOW WORKS ON FILTERED AUDIO ===
// This way, filtered noise/claps won't trigger VAD!
// Calculate Zero-Crossing Rate on DENOISED audio
float zcr = 0.0f;
if (!denoised_samples.empty() && denoised_samples.size() > 1) {
int zero_crossings = 0;
for (size_t i = 1; i < denoised_samples.size(); ++i) {
if ((denoised_samples[i] >= 0 && denoised_samples[i-1] < 0) ||
(denoised_samples[i] < 0 && denoised_samples[i-1] >= 0)) {
zero_crossings++;
}
}
zcr = static_cast<float>(zero_crossings) / denoised_samples.size();
}
self->last_zcr_ = zcr;
// Get user-adjustable thresholds
float rms_thresh = self->vad_rms_threshold_.load(std::memory_order_relaxed);
float peak_thresh = self->vad_peak_threshold_.load(std::memory_order_relaxed);
// Adaptive noise floor: track minimum DENOISED energy level.
// Only adapt while the level is near the current floor (< 2x) so that
// speech itself does not drag the floor upward.
if (denoised_rms < self->noise_floor_ * 2.0f) {
self->noise_floor_ = self->noise_floor_ * (1.0f - self->noise_floor_alpha_) +
denoised_rms * self->noise_floor_alpha_;
}
// Dynamic threshold: noise floor + user threshold as margin
float adaptive_rms_thresh = self->noise_floor_ + rms_thresh;
// Speech detection on DENOISED audio:
// 1. Energy above adaptive threshold
// 2. ZCR in speech range (not too high = noise, not too low = silence)
bool energy_ok = (denoised_rms >= adaptive_rms_thresh) || (denoised_peak >= peak_thresh);
bool zcr_ok = (zcr >= self->speech_zcr_min_) && (zcr <= self->speech_zcr_max_);
// Speech = energy OK AND (ZCR OK or very high energy)
bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f);
// Reset trailing silence counter when speech detected
if (frame_has_speech) {
self->consecutive_silence_frames_ = 0;
}
// Calculate durations in samples (interleaved: frames * channels)
int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
if (frame_has_speech) {
// Speech detected
self->is_speech_active_.store(true, std::memory_order_relaxed);
self->silence_samples_count_ = 0;
// Add DENOISED audio to buffer (already processed by processRealtime above)
if (!denoised_samples.empty()) {
self->speech_buffer_.insert(self->speech_buffer_.end(),
denoised_samples.begin(), denoised_samples.end());
} else {
// Fallback to raw if denoising disabled
for (unsigned long i = 0; i < sample_count; ++i) {
self->speech_buffer_.push_back(in[i]);
}
}
// NOTE(review): counter advances by the RAW sample count even though
// the buffer may hold a different number of DENOISED samples (the
// denoiser can buffer internally) — buffer size and this counter can
// drift; confirm intended.
self->speech_samples_count_ += sample_count;
// Force flush if too long
if (self->speech_samples_count_ >= max_speech_samples) {
std::cout << "[VAD] Max duration reached, forcing flush ("
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
// Flush any remaining samples from the denoiser
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
auto remaining = self->noise_reducer_->flushStream();
self->speech_buffer_.insert(self->speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
self->noise_reducer_->savePreview(self->speech_buffer_);
}
// NOTE(review): callback_ runs on the real-time audio thread —
// presumably it only hands the buffer off; verify it does no
// blocking work.
self->callback_(self->speech_buffer_);
}
self->speech_buffer_.clear();
self->speech_samples_count_ = 0;
self->consecutive_silence_frames_ = 0; // Reset after forced flush
// Reset stream for next segment
if (self->noise_reducer_) {
self->noise_reducer_->resetStream();
}
}
} else {
// Silence detected
self->silence_samples_count_ += sample_count;
// If we were speaking and now have silence, track consecutive silence frames
if (self->speech_buffer_.size() > 0) {
// Add trailing silence (denoised) so flushed segments do not end
// abruptly mid-word.
if (!denoised_samples.empty()) {
self->speech_buffer_.insert(self->speech_buffer_.end(),
denoised_samples.begin(), denoised_samples.end());
} else {
for (unsigned long i = 0; i < sample_count; ++i) {
self->speech_buffer_.push_back(in[i]);
}
}
// Increment consecutive silence frame counter
self->consecutive_silence_frames_++;
// Calculate threshold in frames (callbacks)
// frames_per_buffer = frame_count from callback
// NOTE(review): assumes frame_count is stable across callbacks
// (fixed buffer size); with paFramesPerBufferUnspecified the host
// may vary it — confirm the threshold stays meaningful.
int frames_per_buffer = static_cast<int>(frame_count);
int silence_threshold_frames = (self->silence_duration_ms_ * self->sample_rate_) / (1000 * frames_per_buffer);
// Flush when consecutive silence exceeds threshold
if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
self->is_speech_active_.store(false, std::memory_order_relaxed);
// Flush if we have enough speech
if (self->speech_samples_count_ >= min_speech_samples) {
// Flush any remaining samples from the denoiser
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
auto remaining = self->noise_reducer_->flushStream();
self->speech_buffer_.insert(self->speech_buffer_.end(),
remaining.begin(), remaining.end());
// Save preview for listening
self->noise_reducer_->savePreview(self->speech_buffer_);
}
float duration = static_cast<float>(self->speech_buffer_.size()) /
(self->sample_rate_ * self->channels_);
std::cout << "[VAD] Speech ended (trailing silence detected, "
<< self->consecutive_silence_frames_ << " frames, "
<< "noise_floor=" << self->noise_floor_
<< "), flushing " << duration << "s (denoised)" << std::endl;
if (self->callback_) {
self->callback_(self->speech_buffer_);
}
} else {
std::cout << "[VAD] Speech too short (" << self->speech_samples_count_
<< " samples), discarding" << std::endl;
}
self->speech_buffer_.clear();
self->speech_samples_count_ = 0;
self->consecutive_silence_frames_ = 0; // Reset after flush
// Reset stream for next segment
if (self->noise_reducer_) {
self->noise_reducer_->resetStream();
}
}
} else {
self->is_speech_active_.store(false, std::memory_order_relaxed);
}
}
return paContinue;
}
// Open the default input device and begin capturing.
//
// @param callback  Invoked with each flushed speech segment (denoised
//                  float samples); stored for the lifetime of the session.
// @return true if the stream was opened and started; false on any
//         PortAudio error or if already recording.
//
// Fixes vs. previous revision:
//  - consecutive_silence_frames_ is reset, so stale trailing-silence
//    state from a prior session cannot trigger a premature flush.
//  - Pa_GetDeviceInfo() is null-checked before dereferencing.
//  - On Pa_StartStream failure the opened stream is closed instead of
//    being leaked (a later start() would have overwritten stream_).
bool AudioCapture::start(AudioCallback callback) {
    if (is_recording_) {
        return false;
    }
    callback_ = std::move(callback);
    // Reset all per-session VAD state so nothing leaks across sessions.
    speech_buffer_.clear();
    silence_samples_count_ = 0;
    speech_samples_count_ = 0;
    consecutive_silence_frames_ = 0;
    PaStreamParameters input_params;
    input_params.device = Pa_GetDefaultInputDevice();
    if (input_params.device == paNoDevice) {
        std::cerr << "No default input device" << std::endl;
        return false;
    }
    const PaDeviceInfo* device_info = Pa_GetDeviceInfo(input_params.device);
    if (device_info == nullptr) {
        std::cerr << "Failed to query input device info" << std::endl;
        return false;
    }
    input_params.channelCount = channels_;
    input_params.sampleFormat = paFloat32;
    input_params.suggestedLatency = device_info->defaultLowInputLatency;
    input_params.hostApiSpecificStreamInfo = nullptr;
    PaError err = Pa_OpenStream(
        &stream_,
        &input_params,
        nullptr,            // no output (capture-only stream)
        sample_rate_,
        paFramesPerBufferUnspecified,
        paClipOff,
        &AudioCapture::audioCallback,
        this
    );
    if (err != paNoError) {
        std::cerr << "PortAudio open stream error: " << Pa_GetErrorText(err) << std::endl;
        return false;
    }
    err = Pa_StartStream(stream_);
    if (err != paNoError) {
        std::cerr << "PortAudio start stream error: " << Pa_GetErrorText(err) << std::endl;
        // Don't leak the opened-but-unstarted stream.
        Pa_CloseStream(stream_);
        stream_ = nullptr;
        return false;
    }
    is_recording_ = true;
    std::cout << "[VAD] Started with RMS threshold=" << vad_rms_threshold_
              << ", Peak threshold=" << vad_peak_threshold_
              << ", Silence duration=" << silence_duration_ms_ << "ms" << std::endl;
    return true;
}
// Stop capturing. Any buffered speech that meets the minimum-duration
// requirement is flushed to the callback first (with the denoiser's
// internal tail appended); then the denoiser and PortAudio stream are
// stopped for the next session.
void AudioCapture::stop() {
    if (!is_recording_ || !stream_) {
        return;
    }
    // Flush whatever speech is still pending, if it is long enough.
    if (!speech_buffer_.empty() && callback_) {
        const int min_speech_samples =
            (min_speech_duration_ms_ * sample_rate_ * channels_) / 1000;
        if (speech_samples_count_ >= min_speech_samples) {
            std::cout << "[VAD] Flushing remaining audio on stop (denoised)" << std::endl;
            if (noise_reducer_ && noise_reducer_->isEnabled()) {
                // Drain the denoiser's internal buffer into the segment.
                auto tail = noise_reducer_->flushStream();
                speech_buffer_.insert(speech_buffer_.end(), tail.begin(), tail.end());
                // Save preview for listening
                noise_reducer_->savePreview(speech_buffer_);
            }
            callback_(speech_buffer_);
        }
    }
    // Reset the denoiser state for the next session.
    if (noise_reducer_) {
        noise_reducer_->resetStream();
    }
    Pa_StopStream(stream_);
    is_recording_ = false;
}
// Toggle RNNoise processing; a no-op when no reducer was constructed.
void AudioCapture::setDenoiseEnabled(bool enabled) {
    if (noise_reducer_ == nullptr) {
        return;
    }
    noise_reducer_->setEnabled(enabled);
}
// Report whether RNNoise processing is currently active
// (false when no reducer exists).
bool AudioCapture::isDenoiseEnabled() const {
    if (!noise_reducer_) {
        return false;
    }
    return noise_reducer_->isEnabled();
}
} // namespace secondvoice