#include "AudioCapture.h" #include "NoiseReducer.h" #include namespace secondvoice { AudioCapture::AudioCapture(int sample_rate, int channels) : sample_rate_(sample_rate) , channels_(channels) , noise_reducer_(std::make_unique()) { std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl; } AudioCapture::~AudioCapture() { stop(); if (stream_) { Pa_CloseStream(stream_); } Pa_Terminate(); } bool AudioCapture::initialize() { std::cout << "[Audio] Initializing PortAudio..." << std::endl; PaError err = Pa_Initialize(); if (err != paNoError) { std::cerr << "[Audio] PortAudio init error: " << Pa_GetErrorText(err) << std::endl; return false; } std::cout << "[Audio] PortAudio initialized successfully" << std::endl; return true; } int AudioCapture::audioCallback(const void* input, void* output, unsigned long frame_count, const PaStreamCallbackTimeInfo* time_info, PaStreamCallbackFlags status_flags, void* user_data) { (void)output; (void)time_info; (void)status_flags; AudioCapture* self = static_cast(user_data); const float* in = static_cast(input); unsigned long sample_count = frame_count * self->channels_; // === REAL-TIME DENOISING === // Process audio through RNNoise in real-time for meter display std::vector denoised_samples; if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) { denoised_samples = self->noise_reducer_->processRealtime(in, sample_count); } // Calculate RMS and Peak from RAW audio (for VAD detection) float raw_sum_squared = 0.0f; float raw_max_amp = 0.0f; for (unsigned long i = 0; i < sample_count; ++i) { float sample = in[i]; raw_sum_squared += sample * sample; if (std::abs(sample) > raw_max_amp) { raw_max_amp = std::abs(sample); } } float instant_rms = std::sqrt(raw_sum_squared / sample_count); float instant_peak = raw_max_amp; // Calculate RMS and Peak from DENOISED audio (for UI meter) float denoised_rms = 0.0f; float denoised_peak = 0.0f; if (!denoised_samples.empty()) { float sum_squared = 0.0f; float max_amp = 0.0f; for (const float& sample : denoised_samples) { sum_squared += sample * sample; if (std::abs(sample) > max_amp) { max_amp = std::abs(sample); } } denoised_rms = std::sqrt(sum_squared / denoised_samples.size()); denoised_peak = max_amp; } else { // Fallback to raw if no denoised output yet denoised_rms = instant_rms; denoised_peak = instant_peak; } // Smooth RMS/Peak for display (fast attack, slow decay) - USE DENOISED VALUES constexpr float alpha_rise = 0.1f; constexpr float alpha_fall = 0.02f; float old_rms = self->current_rms_.load(std::memory_order_relaxed); float old_peak = self->current_peak_.load(std::memory_order_relaxed); float alpha_rms = (denoised_rms > old_rms) ? alpha_rise : alpha_fall; float alpha_peak = (denoised_peak > old_peak) ? alpha_rise : alpha_fall; self->current_rms_.store(alpha_rms * denoised_rms + (1.0f - alpha_rms) * old_rms, std::memory_order_relaxed); self->current_peak_.store(alpha_peak * denoised_peak + (1.0f - alpha_peak) * old_peak, std::memory_order_relaxed); // === VAD NOW WORKS ON FILTERED AUDIO === // This way, filtered noise/claps won't trigger VAD! 
    // Calculate Zero-Crossing Rate on DENOISED audio
    float zcr = 0.0f;
    if (!denoised_samples.empty() && denoised_samples.size() > 1) {
        int zero_crossings = 0;
        for (size_t i = 1; i < denoised_samples.size(); ++i) {
            if ((denoised_samples[i] >= 0 && denoised_samples[i - 1] < 0) ||
                (denoised_samples[i] < 0 && denoised_samples[i - 1] >= 0)) {
                zero_crossings++;
            }
        }
        zcr = static_cast<float>(zero_crossings) / denoised_samples.size();
    }
    self->last_zcr_ = zcr;

    // Get user-adjustable thresholds
    float rms_thresh = self->vad_rms_threshold_.load(std::memory_order_relaxed);
    float peak_thresh = self->vad_peak_threshold_.load(std::memory_order_relaxed);

    // Adaptive noise floor: track minimum DENOISED energy level
    if (denoised_rms < self->noise_floor_ * 2.0f) {
        self->noise_floor_ = self->noise_floor_ * (1.0f - self->noise_floor_alpha_) +
                             denoised_rms * self->noise_floor_alpha_;
    }

    // Dynamic threshold: noise floor + user threshold as margin
    float adaptive_rms_thresh = self->noise_floor_ + rms_thresh;

    // Speech detection on DENOISED audio:
    // 1. Energy above adaptive threshold
    // 2. ZCR in speech range (not too high = noise, not too low = silence)
    bool energy_ok = (denoised_rms >= adaptive_rms_thresh) || (denoised_peak >= peak_thresh);
    bool zcr_ok = (zcr >= self->speech_zcr_min_) && (zcr <= self->speech_zcr_max_);

    // Speech = energy OK AND (ZCR OK or very high energy)
    bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f);

    // Reset trailing silence counter when speech detected
    if (frame_has_speech) {
        self->consecutive_silence_frames_ = 0;
    }

    // Calculate durations in samples
    int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
    int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;

    if (frame_has_speech) {
        // Speech detected
        self->is_speech_active_.store(true, std::memory_order_relaxed);
        self->silence_samples_count_ = 0;

        // Add DENOISED audio to buffer (already processed by processRealtime above)
        if (!denoised_samples.empty()) {
            self->speech_buffer_.insert(self->speech_buffer_.end(),
                                        denoised_samples.begin(), denoised_samples.end());
        } else {
            // Fallback to raw if denoising disabled
            for (unsigned long i = 0; i < sample_count; ++i) {
                self->speech_buffer_.push_back(in[i]);
            }
        }
        self->speech_samples_count_ += sample_count;

        // Force flush if too long
        if (self->speech_samples_count_ >= max_speech_samples) {
            std::cout << "[VAD] Max duration reached, forcing flush ("
                      << self->speech_samples_count_ / (self->sample_rate_ * self->channels_)
                      << "s)" << std::endl;
            if (self->callback_ &&
                self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
                // Flush any remaining samples from the denoiser
                if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
                    auto remaining = self->noise_reducer_->flushStream();
                    self->speech_buffer_.insert(self->speech_buffer_.end(),
                                                remaining.begin(), remaining.end());
                    // Save preview for listening
                    self->noise_reducer_->savePreview(self->speech_buffer_);
                }
                self->callback_(self->speech_buffer_);
            }
            self->speech_buffer_.clear();
            self->speech_samples_count_ = 0;
            self->consecutive_silence_frames_ = 0;  // Reset after forced flush

            // Reset stream for next segment
            if (self->noise_reducer_) {
                self->noise_reducer_->resetStream();
            }
        }
    } else {
        // Silence detected
        self->silence_samples_count_ += sample_count;
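        // The segment is not flushed on the first silent buffer: trailing
        // silence is appended and counted in whole callbacks, and the segment
        // ends only once silence_duration_ms_ worth of consecutive silent
        // callbacks has accumulated. Illustrative numbers only (the real
        // values come from the header / user settings): at 16 kHz with
        // 512-frame buffers, 800 ms of silence is (800 * 16000) / (1000 * 512)
        // = 25 consecutive callbacks.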
        // If we were speaking and now have silence, track consecutive silence frames
        if (self->speech_buffer_.size() > 0) {
            // Add trailing silence (denoised)
            if (!denoised_samples.empty()) {
                self->speech_buffer_.insert(self->speech_buffer_.end(),
                                            denoised_samples.begin(), denoised_samples.end());
            } else {
                for (unsigned long i = 0; i < sample_count; ++i) {
                    self->speech_buffer_.push_back(in[i]);
                }
            }

            // Increment consecutive silence frame counter
            self->consecutive_silence_frames_++;

            // Calculate threshold in frames (callbacks)
            // frames_per_buffer = frame_count from callback
            int frames_per_buffer = static_cast<int>(frame_count);
            int silence_threshold_frames =
                (self->silence_duration_ms_ * self->sample_rate_) / (1000 * frames_per_buffer);

            // Flush when consecutive silence exceeds threshold
            if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
                self->is_speech_active_.store(false, std::memory_order_relaxed);

                // Flush if we have enough speech
                if (self->speech_samples_count_ >= min_speech_samples) {
                    // Flush any remaining samples from the denoiser
                    if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
                        auto remaining = self->noise_reducer_->flushStream();
                        self->speech_buffer_.insert(self->speech_buffer_.end(),
                                                    remaining.begin(), remaining.end());
                        // Save preview for listening
                        self->noise_reducer_->savePreview(self->speech_buffer_);
                    }
                    float duration = static_cast<float>(self->speech_buffer_.size()) /
                                     (self->sample_rate_ * self->channels_);
                    std::cout << "[VAD] Speech ended (trailing silence detected, "
                              << self->consecutive_silence_frames_ << " frames, "
                              << "noise_floor=" << self->noise_floor_ << "), flushing "
                              << duration << "s (denoised)" << std::endl;
                    if (self->callback_) {
                        self->callback_(self->speech_buffer_);
                    }
                } else {
                    std::cout << "[VAD] Speech too short (" << self->speech_samples_count_
                              << " samples), discarding" << std::endl;
                }
                self->speech_buffer_.clear();
                self->speech_samples_count_ = 0;
                self->consecutive_silence_frames_ = 0;  // Reset after flush

                // Reset stream for next segment
                if (self->noise_reducer_) {
                    self->noise_reducer_->resetStream();
                }
            }
        } else {
            self->is_speech_active_.store(false, std::memory_order_relaxed);
        }
    }

    return paContinue;
}

bool AudioCapture::start(AudioCallback callback) {
    if (is_recording_) {
        return false;
    }

    callback_ = callback;
    speech_buffer_.clear();
    silence_samples_count_ = 0;
    speech_samples_count_ = 0;

    PaStreamParameters input_params;
    input_params.device = Pa_GetDefaultInputDevice();
    if (input_params.device == paNoDevice) {
        std::cerr << "No default input device" << std::endl;
        return false;
    }
    input_params.channelCount = channels_;
    input_params.sampleFormat = paFloat32;
    input_params.suggestedLatency = Pa_GetDeviceInfo(input_params.device)->defaultLowInputLatency;
    input_params.hostApiSpecificStreamInfo = nullptr;

    // paFramesPerBufferUnspecified lets PortAudio pick the buffer size, which is
    // why the callback derives silence_threshold_frames from frame_count at runtime.
    PaError err = Pa_OpenStream(
        &stream_,
        &input_params,
        nullptr,
        sample_rate_,
        paFramesPerBufferUnspecified,
        paClipOff,
        &AudioCapture::audioCallback,
        this
    );
    if (err != paNoError) {
        std::cerr << "PortAudio open stream error: " << Pa_GetErrorText(err) << std::endl;
        return false;
    }

    err = Pa_StartStream(stream_);
    if (err != paNoError) {
        std::cerr << "PortAudio start stream error: " << Pa_GetErrorText(err) << std::endl;
        return false;
    }

    is_recording_ = true;
    std::cout << "[VAD] Started with RMS threshold=" << vad_rms_threshold_
              << ", Peak threshold=" << vad_peak_threshold_
              << ", Silence duration=" << silence_duration_ms_ << "ms" << std::endl;
    return true;
}

void AudioCapture::stop() {
    if (!is_recording_ || !stream_) {
        return;
    }
    // Flush any remaining audio
    if (speech_buffer_.size() > 0 && callback_) {
        int min_speech_samples = (min_speech_duration_ms_ * sample_rate_ * channels_) / 1000;
        if (speech_samples_count_ >= min_speech_samples) {
            std::cout << "[VAD] Flushing remaining audio on stop (denoised)" << std::endl;

            // Flush any remaining samples from the denoiser
            if (noise_reducer_ && noise_reducer_->isEnabled()) {
                auto remaining = noise_reducer_->flushStream();
                speech_buffer_.insert(speech_buffer_.end(), remaining.begin(), remaining.end());
                // Save preview for listening
                noise_reducer_->savePreview(speech_buffer_);
            }
            callback_(speech_buffer_);
        }
    }

    // Reset the stream for next session
    if (noise_reducer_) {
        noise_reducer_->resetStream();
    }

    Pa_StopStream(stream_);
    is_recording_ = false;
}

void AudioCapture::setDenoiseEnabled(bool enabled) {
    if (noise_reducer_) {
        noise_reducer_->setEnabled(enabled);
    }
}

bool AudioCapture::isDenoiseEnabled() const {
    return noise_reducer_ && noise_reducer_->isEnabled();
}

}  // namespace secondvoice
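
// Usage sketch (illustrative only, not part of this translation unit). It assumes
// the AudioCallback type declared in AudioCapture.h is compatible with
// std::function<void(const std::vector<float>&)>; the 16 kHz / mono values are
// placeholders:
//
//   secondvoice::AudioCapture capture(16000, 1);
//   if (capture.initialize() &&
//       capture.start([](const std::vector<float>& segment) {
//           // Each flushed segment is one denoised speech chunk ready for
//           // downstream processing (e.g. transcription).
//       })) {
//       // ... record until the user stops ...
//       capture.stop();
//   }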