- Replace hang time logic with consecutive silence frame counter for more precise speech end detection - Update Whisper prompt to utilize previous context for better transcription coherence - Expand README with comprehensive feature list, architecture details, debugging status, and session logging structure - Add troubleshooting section for real-world testing conditions and known issues
353 lines
14 KiB
C++
353 lines
14 KiB
C++
#include "AudioCapture.h"

#include "NoiseReducer.h"

#include <cmath>
#include <iostream>
#include <vector>
|
|
|
|
namespace secondvoice {
|
|
|
|
// Construct a capture pipeline for the given format. The RNNoise-backed
// reducer is created eagerly so real-time denoising is available from the
// very first callback.
AudioCapture::AudioCapture(int sample_rate, int channels)
    : sample_rate_(sample_rate)
    , channels_(channels)
    , noise_reducer_(std::make_unique<NoiseReducer>()) {
    std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
}
|
|
|
|
// Tear down in reverse order of construction: halt capture (which flushes
// any buffered speech), release the stream handle, then shut PortAudio down.
AudioCapture::~AudioCapture() {
    stop();
    if (stream_ != nullptr) {
        Pa_CloseStream(stream_);
    }
    Pa_Terminate();
}
|
|
|
|
bool AudioCapture::initialize() {
|
|
std::cout << "[Audio] Initializing PortAudio..." << std::endl;
|
|
PaError err = Pa_Initialize();
|
|
if (err != paNoError) {
|
|
std::cerr << "[Audio] PortAudio init error: " << Pa_GetErrorText(err) << std::endl;
|
|
return false;
|
|
}
|
|
std::cout << "[Audio] PortAudio initialized successfully" << std::endl;
|
|
return true;
|
|
}
|
|
|
|
int AudioCapture::audioCallback(const void* input, void* output,
|
|
unsigned long frame_count,
|
|
const PaStreamCallbackTimeInfo* time_info,
|
|
PaStreamCallbackFlags status_flags,
|
|
void* user_data) {
|
|
(void)output;
|
|
(void)time_info;
|
|
(void)status_flags;
|
|
|
|
AudioCapture* self = static_cast<AudioCapture*>(user_data);
|
|
const float* in = static_cast<const float*>(input);
|
|
unsigned long sample_count = frame_count * self->channels_;
|
|
|
|
// === REAL-TIME DENOISING ===
|
|
// Process audio through RNNoise in real-time for meter display
|
|
std::vector<float> denoised_samples;
|
|
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
|
denoised_samples = self->noise_reducer_->processRealtime(in, sample_count);
|
|
}
|
|
|
|
// Calculate RMS and Peak from RAW audio (for VAD detection)
|
|
float raw_sum_squared = 0.0f;
|
|
float raw_max_amp = 0.0f;
|
|
for (unsigned long i = 0; i < sample_count; ++i) {
|
|
float sample = in[i];
|
|
raw_sum_squared += sample * sample;
|
|
if (std::abs(sample) > raw_max_amp) {
|
|
raw_max_amp = std::abs(sample);
|
|
}
|
|
}
|
|
float instant_rms = std::sqrt(raw_sum_squared / sample_count);
|
|
float instant_peak = raw_max_amp;
|
|
|
|
// Calculate RMS and Peak from DENOISED audio (for UI meter)
|
|
float denoised_rms = 0.0f;
|
|
float denoised_peak = 0.0f;
|
|
if (!denoised_samples.empty()) {
|
|
float sum_squared = 0.0f;
|
|
float max_amp = 0.0f;
|
|
for (const float& sample : denoised_samples) {
|
|
sum_squared += sample * sample;
|
|
if (std::abs(sample) > max_amp) {
|
|
max_amp = std::abs(sample);
|
|
}
|
|
}
|
|
denoised_rms = std::sqrt(sum_squared / denoised_samples.size());
|
|
denoised_peak = max_amp;
|
|
} else {
|
|
// Fallback to raw if no denoised output yet
|
|
denoised_rms = instant_rms;
|
|
denoised_peak = instant_peak;
|
|
}
|
|
|
|
// Smooth RMS/Peak for display (fast attack, slow decay) - USE DENOISED VALUES
|
|
constexpr float alpha_rise = 0.1f;
|
|
constexpr float alpha_fall = 0.02f;
|
|
|
|
float old_rms = self->current_rms_.load(std::memory_order_relaxed);
|
|
float old_peak = self->current_peak_.load(std::memory_order_relaxed);
|
|
|
|
float alpha_rms = (denoised_rms > old_rms) ? alpha_rise : alpha_fall;
|
|
float alpha_peak = (denoised_peak > old_peak) ? alpha_rise : alpha_fall;
|
|
|
|
self->current_rms_.store(alpha_rms * denoised_rms + (1.0f - alpha_rms) * old_rms, std::memory_order_relaxed);
|
|
self->current_peak_.store(alpha_peak * denoised_peak + (1.0f - alpha_peak) * old_peak, std::memory_order_relaxed);
|
|
|
|
// === VAD NOW WORKS ON FILTERED AUDIO ===
|
|
// This way, filtered noise/claps won't trigger VAD!
|
|
|
|
// Calculate Zero-Crossing Rate on DENOISED audio
|
|
float zcr = 0.0f;
|
|
if (!denoised_samples.empty() && denoised_samples.size() > 1) {
|
|
int zero_crossings = 0;
|
|
for (size_t i = 1; i < denoised_samples.size(); ++i) {
|
|
if ((denoised_samples[i] >= 0 && denoised_samples[i-1] < 0) ||
|
|
(denoised_samples[i] < 0 && denoised_samples[i-1] >= 0)) {
|
|
zero_crossings++;
|
|
}
|
|
}
|
|
zcr = static_cast<float>(zero_crossings) / denoised_samples.size();
|
|
}
|
|
self->last_zcr_ = zcr;
|
|
|
|
// Get user-adjustable thresholds
|
|
float rms_thresh = self->vad_rms_threshold_.load(std::memory_order_relaxed);
|
|
float peak_thresh = self->vad_peak_threshold_.load(std::memory_order_relaxed);
|
|
|
|
// Adaptive noise floor: track minimum DENOISED energy level
|
|
if (denoised_rms < self->noise_floor_ * 2.0f) {
|
|
self->noise_floor_ = self->noise_floor_ * (1.0f - self->noise_floor_alpha_) +
|
|
denoised_rms * self->noise_floor_alpha_;
|
|
}
|
|
|
|
// Dynamic threshold: noise floor + user threshold as margin
|
|
float adaptive_rms_thresh = self->noise_floor_ + rms_thresh;
|
|
|
|
// Speech detection on DENOISED audio:
|
|
// 1. Energy above adaptive threshold
|
|
// 2. ZCR in speech range (not too high = noise, not too low = silence)
|
|
bool energy_ok = (denoised_rms >= adaptive_rms_thresh) || (denoised_peak >= peak_thresh);
|
|
bool zcr_ok = (zcr >= self->speech_zcr_min_) && (zcr <= self->speech_zcr_max_);
|
|
|
|
// Speech = energy OK AND (ZCR OK or very high energy)
|
|
bool frame_has_speech = energy_ok && (zcr_ok || denoised_rms > adaptive_rms_thresh * 3.0f);
|
|
|
|
// Reset trailing silence counter when speech detected
|
|
if (frame_has_speech) {
|
|
self->consecutive_silence_frames_ = 0;
|
|
}
|
|
|
|
// Calculate durations in samples
|
|
int min_speech_samples = (self->min_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
|
|
int max_speech_samples = (self->max_speech_duration_ms_ * self->sample_rate_ * self->channels_) / 1000;
|
|
|
|
if (frame_has_speech) {
|
|
// Speech detected
|
|
self->is_speech_active_.store(true, std::memory_order_relaxed);
|
|
self->silence_samples_count_ = 0;
|
|
|
|
// Add DENOISED audio to buffer (already processed by processRealtime above)
|
|
if (!denoised_samples.empty()) {
|
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
|
denoised_samples.begin(), denoised_samples.end());
|
|
} else {
|
|
// Fallback to raw if denoising disabled
|
|
for (unsigned long i = 0; i < sample_count; ++i) {
|
|
self->speech_buffer_.push_back(in[i]);
|
|
}
|
|
}
|
|
self->speech_samples_count_ += sample_count;
|
|
|
|
// Force flush if too long
|
|
if (self->speech_samples_count_ >= max_speech_samples) {
|
|
std::cout << "[VAD] Max duration reached, forcing flush ("
|
|
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
|
|
|
|
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
|
|
// Flush any remaining samples from the denoiser
|
|
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
|
auto remaining = self->noise_reducer_->flushStream();
|
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
|
remaining.begin(), remaining.end());
|
|
// Save preview for listening
|
|
self->noise_reducer_->savePreview(self->speech_buffer_);
|
|
}
|
|
self->callback_(self->speech_buffer_);
|
|
}
|
|
self->speech_buffer_.clear();
|
|
self->speech_samples_count_ = 0;
|
|
self->consecutive_silence_frames_ = 0; // Reset after forced flush
|
|
// Reset stream for next segment
|
|
if (self->noise_reducer_) {
|
|
self->noise_reducer_->resetStream();
|
|
}
|
|
}
|
|
} else {
|
|
// Silence detected
|
|
self->silence_samples_count_ += sample_count;
|
|
|
|
// If we were speaking and now have silence, track consecutive silence frames
|
|
if (self->speech_buffer_.size() > 0) {
|
|
// Add trailing silence (denoised)
|
|
if (!denoised_samples.empty()) {
|
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
|
denoised_samples.begin(), denoised_samples.end());
|
|
} else {
|
|
for (unsigned long i = 0; i < sample_count; ++i) {
|
|
self->speech_buffer_.push_back(in[i]);
|
|
}
|
|
}
|
|
|
|
// Increment consecutive silence frame counter
|
|
self->consecutive_silence_frames_++;
|
|
|
|
// Calculate threshold in frames (callbacks)
|
|
// frames_per_buffer = frame_count from callback
|
|
int frames_per_buffer = static_cast<int>(frame_count);
|
|
int silence_threshold_frames = (self->silence_duration_ms_ * self->sample_rate_) / (1000 * frames_per_buffer);
|
|
|
|
// Flush when consecutive silence exceeds threshold
|
|
if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
|
|
self->is_speech_active_.store(false, std::memory_order_relaxed);
|
|
|
|
// Flush if we have enough speech
|
|
if (self->speech_samples_count_ >= min_speech_samples) {
|
|
// Flush any remaining samples from the denoiser
|
|
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
|
auto remaining = self->noise_reducer_->flushStream();
|
|
self->speech_buffer_.insert(self->speech_buffer_.end(),
|
|
remaining.begin(), remaining.end());
|
|
// Save preview for listening
|
|
self->noise_reducer_->savePreview(self->speech_buffer_);
|
|
}
|
|
|
|
float duration = static_cast<float>(self->speech_buffer_.size()) /
|
|
(self->sample_rate_ * self->channels_);
|
|
std::cout << "[VAD] Speech ended (trailing silence detected, "
|
|
<< self->consecutive_silence_frames_ << " frames, "
|
|
<< "noise_floor=" << self->noise_floor_
|
|
<< "), flushing " << duration << "s (denoised)" << std::endl;
|
|
|
|
if (self->callback_) {
|
|
self->callback_(self->speech_buffer_);
|
|
}
|
|
} else {
|
|
std::cout << "[VAD] Speech too short (" << self->speech_samples_count_
|
|
<< " samples), discarding" << std::endl;
|
|
}
|
|
|
|
self->speech_buffer_.clear();
|
|
self->speech_samples_count_ = 0;
|
|
self->consecutive_silence_frames_ = 0; // Reset after flush
|
|
// Reset stream for next segment
|
|
if (self->noise_reducer_) {
|
|
self->noise_reducer_->resetStream();
|
|
}
|
|
}
|
|
} else {
|
|
self->is_speech_active_.store(false, std::memory_order_relaxed);
|
|
}
|
|
}
|
|
|
|
return paContinue;
|
|
}
|
|
|
|
// Open the default input device and begin capturing. `callback` receives
// each flushed speech segment (denoised float samples). Returns false if
// already recording, no input device exists, or PortAudio fails to open or
// start the stream.
//
// Fix: consecutive_silence_frames_ is now reset alongside the other VAD
// counters — previously a stale value from an earlier session could trip
// the silence threshold immediately and flush the first segment too early.
bool AudioCapture::start(AudioCallback callback) {
    if (is_recording_) {
        return false;
    }

    // Reset per-session VAD state so a previous session can't leak into
    // this one.
    callback_ = callback;
    speech_buffer_.clear();
    silence_samples_count_ = 0;
    speech_samples_count_ = 0;
    consecutive_silence_frames_ = 0;

    PaStreamParameters input_params;
    input_params.device = Pa_GetDefaultInputDevice();
    if (input_params.device == paNoDevice) {
        std::cerr << "No default input device" << std::endl;
        return false;
    }

    input_params.channelCount = channels_;
    input_params.sampleFormat = paFloat32;
    input_params.suggestedLatency = Pa_GetDeviceInfo(input_params.device)->defaultLowInputLatency;
    input_params.hostApiSpecificStreamInfo = nullptr;

    // Input-only stream; buffer size left to the host API
    // (paFramesPerBufferUnspecified), clipping disabled.
    PaError err = Pa_OpenStream(
        &stream_,
        &input_params,
        nullptr,
        sample_rate_,
        paFramesPerBufferUnspecified,
        paClipOff,
        &AudioCapture::audioCallback,
        this
    );

    if (err != paNoError) {
        std::cerr << "PortAudio open stream error: " << Pa_GetErrorText(err) << std::endl;
        return false;
    }

    err = Pa_StartStream(stream_);
    if (err != paNoError) {
        std::cerr << "PortAudio start stream error: " << Pa_GetErrorText(err) << std::endl;
        return false;
    }

    is_recording_ = true;
    std::cout << "[VAD] Started with RMS threshold=" << vad_rms_threshold_
              << ", Peak threshold=" << vad_peak_threshold_
              << ", Silence duration=" << silence_duration_ms_ << "ms" << std::endl;
    return true;
}
|
|
|
|
void AudioCapture::stop() {
|
|
if (!is_recording_ || !stream_) {
|
|
return;
|
|
}
|
|
|
|
// Flush any remaining audio
|
|
if (speech_buffer_.size() > 0 && callback_) {
|
|
int min_speech_samples = (min_speech_duration_ms_ * sample_rate_ * channels_) / 1000;
|
|
if (speech_samples_count_ >= min_speech_samples) {
|
|
std::cout << "[VAD] Flushing remaining audio on stop (denoised)" << std::endl;
|
|
|
|
// Flush any remaining samples from the denoiser
|
|
if (noise_reducer_ && noise_reducer_->isEnabled()) {
|
|
auto remaining = noise_reducer_->flushStream();
|
|
speech_buffer_.insert(speech_buffer_.end(),
|
|
remaining.begin(), remaining.end());
|
|
// Save preview for listening
|
|
noise_reducer_->savePreview(speech_buffer_);
|
|
}
|
|
|
|
callback_(speech_buffer_);
|
|
}
|
|
}
|
|
|
|
// Reset the stream for next session
|
|
if (noise_reducer_) {
|
|
noise_reducer_->resetStream();
|
|
}
|
|
|
|
Pa_StopStream(stream_);
|
|
is_recording_ = false;
|
|
}
|
|
|
|
void AudioCapture::setDenoiseEnabled(bool enabled) {
|
|
if (noise_reducer_) {
|
|
noise_reducer_->setEnabled(enabled);
|
|
}
|
|
}
|
|
|
|
bool AudioCapture::isDenoiseEnabled() const {
|
|
return noise_reducer_ && noise_reducer_->isEnabled();
|
|
}
|
|
|
|
} // namespace secondvoice
|