diff --git a/src/audio/AudioCapture.cpp b/src/audio/AudioCapture.cpp index 8d2c62c..33eb0a2 100644 --- a/src/audio/AudioCapture.cpp +++ b/src/audio/AudioCapture.cpp @@ -4,9 +4,15 @@ namespace secondvoice { -AudioCapture::AudioCapture(int sample_rate, int channels) +AudioCapture::AudioCapture(int sample_rate, int channels, + int silence_duration_ms, + int min_speech_duration_ms, + int max_speech_duration_ms) : sample_rate_(sample_rate) , channels_(channels) + , silence_duration_ms_(silence_duration_ms) + , min_speech_duration_ms_(min_speech_duration_ms) + , max_speech_duration_ms_(max_speech_duration_ms) , noise_reducer_(std::make_unique()) { std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl; } @@ -166,6 +172,11 @@ int AudioCapture::audioCallback(const void* input, void* output, std::cout << "[VAD] Max duration reached, forcing flush (" << self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl; + // Calculate metrics BEFORE flushing + self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_); + self->last_silence_duration_ms_ = 0; // No trailing silence in forced flush + self->last_flush_reason_ = "max_duration"; + if (self->callback_ && self->speech_buffer_.size() >= static_cast(min_speech_samples)) { // Flush any remaining samples from the denoiser if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) { @@ -213,6 +224,11 @@ int AudioCapture::audioCallback(const void* input, void* output, if (self->consecutive_silence_frames_ >= silence_threshold_frames) { self->is_speech_active_.store(false, std::memory_order_relaxed); + // Calculate metrics BEFORE flushing + self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_); + self->last_silence_duration_ms_ = (self->silence_samples_count_ * 1000) / (self->sample_rate_ * self->channels_); + self->last_flush_reason_ = "silence_threshold"; + // Flush if we have enough speech if (self->speech_samples_count_ >= min_speech_samples) { // Flush any remaining samples from the denoiser diff --git a/src/audio/AudioCapture.h b/src/audio/AudioCapture.h index 0b09c1d..0551c64 100644 --- a/src/audio/AudioCapture.h +++ b/src/audio/AudioCapture.h @@ -16,7 +16,10 @@ class AudioCapture { public: using AudioCallback = std::function&)>; - AudioCapture(int sample_rate, int channels); + AudioCapture(int sample_rate, int channels, + int silence_duration_ms = 700, + int min_speech_duration_ms = 2000, + int max_speech_duration_ms = 30000); ~AudioCapture(); bool initialize(); @@ -44,6 +47,11 @@ public: void setDenoiseEnabled(bool enabled); bool isDenoiseEnabled() const; + // Get metrics from last flushed segment + int getLastSpeechDuration() const { return last_speech_duration_ms_; } + int getLastSilenceDuration() const { return last_silence_duration_ms_; } + std::string getLastFlushReason() const { return last_flush_reason_; } + private: static int audioCallback(const void* input, void* output, unsigned long frame_count, @@ -69,9 +77,9 @@ private: // VAD parameters - Higher threshold to avoid false triggers on filtered noise std::atomic vad_rms_threshold_{0.02f}; // Was 0.01f std::atomic vad_peak_threshold_{0.08f}; // Was 0.04f - int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400) - int min_speech_duration_ms_ = 2000; // Minimum 2s speech to send (was 1000) - int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000) + int silence_duration_ms_; // Wait 700ms of silence before cutting (was 400) + int min_speech_duration_ms_; // Minimum 2s speech to send (was 1000) + int max_speech_duration_ms_; // 30s max before forced flush (was 25000) // Adaptive noise floor float noise_floor_ = 0.005f; // Estimated background noise level @@ -80,6 +88,11 @@ private: // Trailing silence detection - count consecutive silence frames after speech int consecutive_silence_frames_ = 0; + // Metrics for last flushed segment (set in callback, read in processing thread) + int last_speech_duration_ms_ = 0; + int last_silence_duration_ms_ = 0; + std::string last_flush_reason_; + // Zero-crossing rate for speech vs noise discrimination float last_zcr_ = 0.0f; float speech_zcr_min_ = 0.01f; // More permissive lower bound diff --git a/src/core/Pipeline.cpp b/src/core/Pipeline.cpp index b44af23..f397dca 100644 --- a/src/core/Pipeline.cpp +++ b/src/core/Pipeline.cpp @@ -24,12 +24,23 @@ Pipeline::~Pipeline() { bool Pipeline::initialize() { auto& config = Config::getInstance(); + // Load VAD parameters from config (with fallbacks if missing) + int silence_duration = config.getVadSilenceDurationMs(); + int min_speech = config.getVadMinSpeechDurationMs(); + int max_speech = config.getVadMaxSpeechDurationMs(); + // Initialize audio capture with VAD-based segmentation audio_capture_ = std::make_unique( config.getAudioConfig().sample_rate, - config.getAudioConfig().channels + config.getAudioConfig().channels, + silence_duration, + min_speech, + max_speech ); + std::cout << "[Pipeline] VAD configured: silence=" << silence_duration + << "ms, min_speech=" << min_speech + << "ms, max_speech=" << max_speech << "ms" << std::endl; std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl; if (!audio_capture_->initialize()) { @@ -395,6 +406,10 @@ void Pipeline::processingThread() { seg.was_filtered = false; seg.filter_reason = ""; seg.timestamp = ""; // Will be set by logger + // Add VAD metrics from AudioCapture + seg.speech_duration_ms = audio_capture_->getLastSpeechDuration(); + seg.silence_duration_ms = audio_capture_->getLastSilenceDuration(); + seg.flush_reason = audio_capture_->getLastFlushReason(); session_logger_.logSegment(seg); std::cout << "CN: " << text << std::endl; diff --git a/src/utils/SessionLogger.cpp b/src/utils/SessionLogger.cpp index ed371c0..56949a8 100644 --- a/src/utils/SessionLogger.cpp +++ b/src/utils/SessionLogger.cpp @@ -89,6 +89,11 @@ void SessionLogger::logSegment(const SegmentLog& segment) { j["was_filtered"] = segment.was_filtered; j["filter_reason"] = segment.filter_reason; j["timestamp"] = segment.timestamp; + j["vad_metrics"] = { + {"speech_duration_ms", segment.speech_duration_ms}, + {"silence_duration_ms", segment.silence_duration_ms}, + {"flush_reason", segment.flush_reason} + }; std::ofstream file(filename.str()); if (file.is_open()) { diff --git a/src/utils/SessionLogger.h b/src/utils/SessionLogger.h index c999967..8a1c7ce 100644 --- a/src/utils/SessionLogger.h +++ b/src/utils/SessionLogger.h @@ -18,6 +18,11 @@ struct SegmentLog { bool was_filtered; std::string filter_reason; std::string timestamp; + + // VAD metrics (added for TASK8) + int speech_duration_ms = 0; + int silence_duration_ms = 0; + std::string flush_reason = ""; }; class SessionLogger {