feat: Add VAD metrics tracking to session logs

2025-12-02 10:03:20 +08:00 · 2025-12-02 10:03:20 +08:00 · e8dd7f840e
commit e8dd7f840e
parent a1b4e335c8
5 changed files with 60 additions and 6 deletions
--- a/src/audio/AudioCapture.cpp
+++ b/src/audio/AudioCapture.cpp
@ -4,9 +4,15 @@

 namespace secondvoice {

-AudioCapture::AudioCapture(int sample_rate, int channels)
+AudioCapture::AudioCapture(int sample_rate, int channels,
+                           int silence_duration_ms,
+                           int min_speech_duration_ms,
+                           int max_speech_duration_ms)
    : sample_rate_(sample_rate)
    , channels_(channels)
+    , silence_duration_ms_(silence_duration_ms)
+    , min_speech_duration_ms_(min_speech_duration_ms)
+    , max_speech_duration_ms_(max_speech_duration_ms)
    , noise_reducer_(std::make_unique<NoiseReducer>()) {
    std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
 }
@ -166,6 +172,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
            std::cout << "[VAD] Max duration reached, forcing flush ("
                      << self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;

+            // Calculate metrics BEFORE flushing
+            self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
+            self->last_silence_duration_ms_ = 0;  // No trailing silence in forced flush
+            self->last_flush_reason_ = "max_duration";
+
            if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
                // Flush any remaining samples from the denoiser
                if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
@ -213,6 +224,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
            if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
                self->is_speech_active_.store(false, std::memory_order_relaxed);

+                // Calculate metrics BEFORE flushing
+                self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
+                self->last_silence_duration_ms_ = (self->silence_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
+                self->last_flush_reason_ = "silence_threshold";
+
                // Flush if we have enough speech
                if (self->speech_samples_count_ >= min_speech_samples) {
                    // Flush any remaining samples from the denoiser
--- a/src/audio/AudioCapture.h
+++ b/src/audio/AudioCapture.h
@ -16,7 +16,10 @@ class AudioCapture {
 public:
    using AudioCallback = std::function<void(const std::vector<float>&)>;

-    AudioCapture(int sample_rate, int channels);
+    AudioCapture(int sample_rate, int channels,
+                 int silence_duration_ms = 700,
+                 int min_speech_duration_ms = 2000,
+                 int max_speech_duration_ms = 30000);
    ~AudioCapture();

    bool initialize();
@ -44,6 +47,11 @@ public:
    void setDenoiseEnabled(bool enabled);
    bool isDenoiseEnabled() const;

+    // Metrics of the most recently flushed segment; values are 0 / empty until the first flush has been recorded
+    int getLastSpeechDuration() const { return last_speech_duration_ms_; }    // Speech length, in milliseconds
+    int getLastSilenceDuration() const { return last_silence_duration_ms_; }  // Trailing silence, in milliseconds (0 for a forced max-duration flush)
+    std::string getLastFlushReason() const { return last_flush_reason_; }     // "max_duration" or "silence_threshold"; returned by copy
+
 private:
    static int audioCallback(const void* input, void* output,
                            unsigned long frame_count,
@ -69,9 +77,9 @@ private:
    // VAD parameters - Higher threshold to avoid false triggers on filtered noise
    std::atomic<float> vad_rms_threshold_{0.02f};   // Was 0.01f
    std::atomic<float> vad_peak_threshold_{0.08f};  // Was 0.04f
-    int silence_duration_ms_ = 700;      // Wait 700ms of silence before cutting (was 400)
-    int min_speech_duration_ms_ = 2000;  // Minimum 2s speech to send (was 1000)
-    int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000)
+    int silence_duration_ms_;      // Silence (ms) to wait before cutting a segment; set by the constructor (default 700)
+    int min_speech_duration_ms_;  // Minimum speech (ms) for a segment to be sent; set by the constructor (default 2000)
+    int max_speech_duration_ms_; // Maximum speech (ms) before a forced flush; set by the constructor (default 30000)

    // Adaptive noise floor
    float noise_floor_ = 0.005f;         // Estimated background noise level
@ -80,6 +88,11 @@ private:
    // Trailing silence detection - count consecutive silence frames after speech
    int consecutive_silence_frames_ = 0;

+    // Metrics for last flushed segment (set in callback, read in processing thread). NOTE(review): no synchronization is visible for these cross-thread accesses, std::string included — confirm
+    int last_speech_duration_ms_ = 0;   // Speech length of that segment, in milliseconds
+    int last_silence_duration_ms_ = 0;  // Trailing silence of that segment, in milliseconds; 0 for a forced max-duration flush
+    std::string last_flush_reason_;     // "max_duration" or "silence_threshold"; empty until the first flush
+
    // Zero-crossing rate for speech vs noise discrimination
    float last_zcr_ = 0.0f;
    float speech_zcr_min_ = 0.01f;       // More permissive lower bound
--- a/src/core/Pipeline.cpp
+++ b/src/core/Pipeline.cpp
@ -24,12 +24,23 @@ Pipeline::~Pipeline() {
 bool Pipeline::initialize() {
    auto& config = Config::getInstance();

+    // Load VAD parameters from config (with fallbacks if missing)
+    int silence_duration = config.getVadSilenceDurationMs();
+    int min_speech = config.getVadMinSpeechDurationMs();
+    int max_speech = config.getVadMaxSpeechDurationMs();
+
    // Initialize audio capture with VAD-based segmentation
    audio_capture_ = std::make_unique<AudioCapture>(
        config.getAudioConfig().sample_rate,
-        config.getAudioConfig().channels
+        config.getAudioConfig().channels,
+        silence_duration,
+        min_speech,
+        max_speech
    );

+    std::cout << "[Pipeline] VAD configured: silence=" << silence_duration
+              << "ms, min_speech=" << min_speech
+              << "ms, max_speech=" << max_speech << "ms" << std::endl;
    std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;

    if (!audio_capture_->initialize()) {
@ -395,6 +406,10 @@ void Pipeline::processingThread() {
        seg.was_filtered = false;
        seg.filter_reason = "";
        seg.timestamp = "";  // Will be set by logger
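+        // Attach VAD metrics of AudioCapture's most recent flush. NOTE(review): if another flush happened since this segment was captured, these describe that later flush — confirm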
+        seg.speech_duration_ms = audio_capture_->getLastSpeechDuration();
+        seg.silence_duration_ms = audio_capture_->getLastSilenceDuration();
+        seg.flush_reason = audio_capture_->getLastFlushReason();
        session_logger_.logSegment(seg);

        std::cout << "CN: " << text << std::endl;
--- a/src/utils/SessionLogger.cpp
+++ b/src/utils/SessionLogger.cpp
@ -89,6 +89,11 @@ void SessionLogger::logSegment(const SegmentLog& segment) {
    j["was_filtered"] = segment.was_filtered;
    j["filter_reason"] = segment.filter_reason;
    j["timestamp"] = segment.timestamp;
+    j["vad_metrics"] = {
+        {"speech_duration_ms", segment.speech_duration_ms},
+        {"silence_duration_ms", segment.silence_duration_ms},
+        {"flush_reason", segment.flush_reason}
+    };

    std::ofstream file(filename.str());
    if (file.is_open()) {
--- a/src/utils/SessionLogger.h
+++ b/src/utils/SessionLogger.h
@ -18,6 +18,11 @@ struct SegmentLog {
    bool was_filtered;
    std::string filter_reason;
    std::string timestamp;
+
+    // VAD metrics (added for TASK8)
+    int speech_duration_ms = 0;
+    int silence_duration_ms = 0;
+    std::string flush_reason = "";
 };

 class SessionLogger {