feat: Add VAD metrics tracking to session logs

This commit is contained in:
StillHammer 2025-12-02 10:03:20 +08:00
parent a1b4e335c8
commit e8dd7f840e
5 changed files with 60 additions and 6 deletions

View File

@ -4,9 +4,15 @@
namespace secondvoice {
AudioCapture::AudioCapture(int sample_rate, int channels)
// Constructs an AudioCapture and stores its configuration.
//
// Parameters:
//   sample_rate             - stored in sample_rate_.
//   channels                - stored in channels_.
//   silence_duration_ms     - silence (ms) to wait before cutting a segment;
//                             the declaration defaults this to 700.
//   min_speech_duration_ms  - minimum speech (ms) for a segment to be sent;
//                             the declaration defaults this to 2000.
//   max_speech_duration_ms  - maximum speech (ms) before a forced flush;
//                             the declaration defaults this to 30000.
//
// The arguments are copied as-is; no range validation happens here.
// A NoiseReducer is always created and the "enabled" message is always
// printed at construction time.
// NOTE(review): whether denoising is actually active is queried elsewhere via
// noise_reducer_->isEnabled() — confirm the log line matches the real state.
AudioCapture::AudioCapture(int sample_rate, int channels,
int silence_duration_ms,
int min_speech_duration_ms,
int max_speech_duration_ms)
: sample_rate_(sample_rate)
, channels_(channels)
, silence_duration_ms_(silence_duration_ms)
, min_speech_duration_ms_(min_speech_duration_ms)
, max_speech_duration_ms_(max_speech_duration_ms)
, noise_reducer_(std::make_unique<NoiseReducer>()) {
std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
}
@ -166,6 +172,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
std::cout << "[VAD] Max duration reached, forcing flush ("
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
// Calculate metrics BEFORE flushing
self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
self->last_silence_duration_ms_ = 0; // No trailing silence in forced flush
self->last_flush_reason_ = "max_duration";
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
// Flush any remaining samples from the denoiser
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
@ -213,6 +224,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
self->is_speech_active_.store(false, std::memory_order_relaxed);
// Calculate metrics BEFORE flushing
self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
self->last_silence_duration_ms_ = (self->silence_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
self->last_flush_reason_ = "silence_threshold";
// Flush if we have enough speech
if (self->speech_samples_count_ >= min_speech_samples) {
// Flush any remaining samples from the denoiser

View File

@ -16,7 +16,10 @@ class AudioCapture {
public:
using AudioCallback = std::function<void(const std::vector<float>&)>;
AudioCapture(int sample_rate, int channels);
AudioCapture(int sample_rate, int channels,
int silence_duration_ms = 700,
int min_speech_duration_ms = 2000,
int max_speech_duration_ms = 30000);
~AudioCapture();
bool initialize();
@ -44,6 +47,11 @@ public:
void setDenoiseEnabled(bool enabled);
bool isDenoiseEnabled() const;
// Get metrics from last flushed segment.
// The values are written by audioCallback just before a segment is flushed and
// are initially 0 / 0 / empty string until the first flush.
// getLastFlushReason() returns a copy; the callback sets it to either
// "max_duration" (forced flush, silence reported as 0) or "silence_threshold".
// NOTE(review): the backing members are a plain int / std::string written in
// the audio callback and read from the processing thread; no synchronization
// is visible in this view — confirm reads cannot race with the callback.
int getLastSpeechDuration() const { return last_speech_duration_ms_; }
int getLastSilenceDuration() const { return last_silence_duration_ms_; }
std::string getLastFlushReason() const { return last_flush_reason_; }
private:
static int audioCallback(const void* input, void* output,
unsigned long frame_count,
@ -69,9 +77,9 @@ private:
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
int min_speech_duration_ms_ = 2000; // Minimum 2s speech to send (was 1000)
int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000)
int silence_duration_ms_; // Silence (ms) to wait before cutting; set by the constructor (default 700)
int min_speech_duration_ms_; // Minimum speech (ms) required to send; set by the constructor (default 2000)
int max_speech_duration_ms_; // Maximum speech (ms) before forced flush; set by the constructor (default 30000)
// Adaptive noise floor
float noise_floor_ = 0.005f; // Estimated background noise level
@ -80,6 +88,11 @@ private:
// Trailing silence detection - count consecutive silence frames after speech
int consecutive_silence_frames_ = 0;
// Metrics for last flushed segment (set in callback, read in processing thread)
int last_speech_duration_ms_ = 0;
int last_silence_duration_ms_ = 0;
std::string last_flush_reason_;
// Zero-crossing rate for speech vs noise discrimination
float last_zcr_ = 0.0f;
float speech_zcr_min_ = 0.01f; // More permissive lower bound

View File

@ -24,12 +24,23 @@ Pipeline::~Pipeline() {
bool Pipeline::initialize() {
auto& config = Config::getInstance();
// Load VAD parameters from config (with fallbacks if missing)
int silence_duration = config.getVadSilenceDurationMs();
int min_speech = config.getVadMinSpeechDurationMs();
int max_speech = config.getVadMaxSpeechDurationMs();
// Initialize audio capture with VAD-based segmentation
audio_capture_ = std::make_unique<AudioCapture>(
config.getAudioConfig().sample_rate,
config.getAudioConfig().channels
config.getAudioConfig().channels,
silence_duration,
min_speech,
max_speech
);
std::cout << "[Pipeline] VAD configured: silence=" << silence_duration
<< "ms, min_speech=" << min_speech
<< "ms, max_speech=" << max_speech << "ms" << std::endl;
std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;
if (!audio_capture_->initialize()) {
@ -395,6 +406,10 @@ void Pipeline::processingThread() {
seg.was_filtered = false;
seg.filter_reason = "";
seg.timestamp = ""; // Will be set by logger
// Add VAD metrics from AudioCapture
seg.speech_duration_ms = audio_capture_->getLastSpeechDuration();
seg.silence_duration_ms = audio_capture_->getLastSilenceDuration();
seg.flush_reason = audio_capture_->getLastFlushReason();
session_logger_.logSegment(seg);
std::cout << "CN: " << text << std::endl;

View File

@ -89,6 +89,11 @@ void SessionLogger::logSegment(const SegmentLog& segment) {
j["was_filtered"] = segment.was_filtered;
j["filter_reason"] = segment.filter_reason;
j["timestamp"] = segment.timestamp;
j["vad_metrics"] = {
{"speech_duration_ms", segment.speech_duration_ms},
{"silence_duration_ms", segment.silence_duration_ms},
{"flush_reason", segment.flush_reason}
};
std::ofstream file(filename.str());
if (file.is_open()) {

View File

@ -18,6 +18,11 @@ struct SegmentLog {
bool was_filtered;
std::string filter_reason;
std::string timestamp;
// VAD metrics (added for TASK8)
int speech_duration_ms = 0;
int silence_duration_ms = 0;
std::string flush_reason = "";
};
class SessionLogger {