feat: Add VAD metrics tracking to session logs
This commit is contained in:
parent
a1b4e335c8
commit
e8dd7f840e
@ -4,9 +4,15 @@
|
||||
|
||||
namespace secondvoice {
|
||||
|
||||
AudioCapture::AudioCapture(int sample_rate, int channels)
|
||||
// Constructs an AudioCapture and stores its configuration.
//
// sample_rate / channels are stored as given; elsewhere in this file their
// product is used as the number of samples per second when converting sample
// counts to milliseconds.
// silence_duration_ms, min_speech_duration_ms and max_speech_duration_ms are the
// VAD timing parameters (milliseconds); they are copied into the corresponding
// members so callers can supply values loaded from configuration.
//
// A NoiseReducer is always created here and the "enabled" message is printed
// unconditionally.
// NOTE(review): no validation of the duration arguments is visible here (e.g.
// negative values or min > max) — confirm whether callers guarantee sane values.
AudioCapture::AudioCapture(int sample_rate, int channels,
|
||||
int silence_duration_ms,
|
||||
int min_speech_duration_ms,
|
||||
int max_speech_duration_ms)
|
||||
: sample_rate_(sample_rate)
|
||||
, channels_(channels)
|
||||
, silence_duration_ms_(silence_duration_ms)
|
||||
, min_speech_duration_ms_(min_speech_duration_ms)
|
||||
, max_speech_duration_ms_(max_speech_duration_ms)
|
||||
, noise_reducer_(std::make_unique<NoiseReducer>()) {
|
||||
std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
|
||||
}
|
||||
@ -166,6 +172,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
|
||||
std::cout << "[VAD] Max duration reached, forcing flush ("
|
||||
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
|
||||
|
||||
// Calculate metrics BEFORE flushing
|
||||
self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
|
||||
self->last_silence_duration_ms_ = 0; // No trailing silence in forced flush
|
||||
self->last_flush_reason_ = "max_duration";
|
||||
|
||||
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
|
||||
// Flush any remaining samples from the denoiser
|
||||
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
||||
@ -213,6 +224,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
|
||||
if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
|
||||
self->is_speech_active_.store(false, std::memory_order_relaxed);
|
||||
|
||||
// Calculate metrics BEFORE flushing
|
||||
self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
|
||||
self->last_silence_duration_ms_ = (self->silence_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
|
||||
self->last_flush_reason_ = "silence_threshold";
|
||||
|
||||
// Flush if we have enough speech
|
||||
if (self->speech_samples_count_ >= min_speech_samples) {
|
||||
// Flush any remaining samples from the denoiser
|
||||
|
||||
@ -16,7 +16,10 @@ class AudioCapture {
|
||||
public:
|
||||
using AudioCallback = std::function<void(const std::vector<float>&)>;
|
||||
|
||||
AudioCapture(int sample_rate, int channels);
|
||||
// Creates an audio capture object with configurable VAD timing.
//   sample_rate             - sample rate of the captured audio (presumably Hz — confirm)
//   channels                - number of audio channels
//   silence_duration_ms     - trailing silence to wait for before cutting a segment (default 700 ms)
//   min_speech_duration_ms  - minimum amount of speech required to send a segment (default 2000 ms)
//   max_speech_duration_ms  - speech length after which a flush is forced (default 30000 ms)
// The defaulted parameters keep existing two-argument call sites compiling.
AudioCapture(int sample_rate, int channels,
|
||||
int silence_duration_ms = 700,
|
||||
int min_speech_duration_ms = 2000,
|
||||
int max_speech_duration_ms = 30000);
|
||||
~AudioCapture();
|
||||
|
||||
bool initialize();
|
||||
@ -44,6 +47,11 @@ public:
|
||||
void setDenoiseEnabled(bool enabled);
|
||||
bool isDenoiseEnabled() const;
|
||||
|
||||
// Get metrics from last flushed segment
|
||||
// Returns the speech duration, in milliseconds, computed the last time a flush
// was triggered (0 if none has been triggered yet).
// NOTE(review): the backing field is a plain int that is written in the audio
// callback and read from the processing thread; no synchronization is visible
// in this view — confirm whether it should be atomic.
int getLastSpeechDuration() const { return last_speech_duration_ms_; }
|
||||
// Returns the trailing-silence duration, in milliseconds, computed the last time
// a flush was triggered. It is 0 before any flush and is set to 0 when the flush
// was forced by the maximum speech duration.
// NOTE(review): the backing field is a plain int that is written in the audio
// callback and read from the processing thread; no synchronization is visible
// in this view — confirm whether it should be atomic.
int getLastSilenceDuration() const { return last_silence_duration_ms_; }
|
||||
// Returns a copy of the reason recorded the last time a flush was triggered.
// Values assigned in the visible code are "max_duration" and "silence_threshold";
// the string is empty until one of them has been assigned.
// NOTE(review): the backing std::string is assigned in the audio callback and
// copied here from the processing thread with no lock visible in this view —
// concurrent access to a std::string is a data race; confirm and consider an
// enum/atomic or a mutex.
std::string getLastFlushReason() const { return last_flush_reason_; }
|
||||
|
||||
private:
|
||||
static int audioCallback(const void* input, void* output,
|
||||
unsigned long frame_count,
|
||||
@ -69,9 +77,9 @@ private:
|
||||
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
|
||||
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
|
||||
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
|
||||
int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
|
||||
int min_speech_duration_ms_ = 2000; // Minimum 2s speech to send (was 1000)
|
||||
int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000)
|
||||
int silence_duration_ms_; // Silence to wait for before cutting a segment, in ms; set by the constructor (declared default 700)
|
||||
int min_speech_duration_ms_; // Minimum speech required to send a segment, in ms; set by the constructor (declared default 2000)
|
||||
int max_speech_duration_ms_; // Maximum speech before a forced flush, in ms; set by the constructor (declared default 30000)
|
||||
|
||||
// Adaptive noise floor
|
||||
float noise_floor_ = 0.005f; // Estimated background noise level
|
||||
@ -80,6 +88,11 @@ private:
|
||||
// Trailing silence detection - count consecutive silence frames after speech
|
||||
int consecutive_silence_frames_ = 0;
|
||||
|
||||
// Metrics for last flushed segment (set in callback, read in processing thread)
|
||||
int last_speech_duration_ms_ = 0;
|
||||
int last_silence_duration_ms_ = 0;
|
||||
std::string last_flush_reason_;
|
||||
|
||||
// Zero-crossing rate for speech vs noise discrimination
|
||||
float last_zcr_ = 0.0f;
|
||||
float speech_zcr_min_ = 0.01f; // More permissive lower bound
|
||||
|
||||
@ -24,12 +24,23 @@ Pipeline::~Pipeline() {
|
||||
bool Pipeline::initialize() {
|
||||
auto& config = Config::getInstance();
|
||||
|
||||
// Load VAD parameters from config (with fallbacks if missing)
|
||||
int silence_duration = config.getVadSilenceDurationMs();
|
||||
int min_speech = config.getVadMinSpeechDurationMs();
|
||||
int max_speech = config.getVadMaxSpeechDurationMs();
|
||||
|
||||
// Initialize audio capture with VAD-based segmentation
|
||||
audio_capture_ = std::make_unique<AudioCapture>(
|
||||
config.getAudioConfig().sample_rate,
|
||||
config.getAudioConfig().channels
|
||||
config.getAudioConfig().channels,
|
||||
silence_duration,
|
||||
min_speech,
|
||||
max_speech
|
||||
);
|
||||
|
||||
std::cout << "[Pipeline] VAD configured: silence=" << silence_duration
|
||||
<< "ms, min_speech=" << min_speech
|
||||
<< "ms, max_speech=" << max_speech << "ms" << std::endl;
|
||||
std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;
|
||||
|
||||
if (!audio_capture_->initialize()) {
|
||||
@ -395,6 +406,10 @@ void Pipeline::processingThread() {
|
||||
seg.was_filtered = false;
|
||||
seg.filter_reason = "";
|
||||
seg.timestamp = ""; // Will be set by logger
|
||||
// Add VAD metrics from AudioCapture
|
||||
seg.speech_duration_ms = audio_capture_->getLastSpeechDuration();
|
||||
seg.silence_duration_ms = audio_capture_->getLastSilenceDuration();
|
||||
seg.flush_reason = audio_capture_->getLastFlushReason();
|
||||
session_logger_.logSegment(seg);
|
||||
|
||||
std::cout << "CN: " << text << std::endl;
|
||||
|
||||
@ -89,6 +89,11 @@ void SessionLogger::logSegment(const SegmentLog& segment) {
|
||||
j["was_filtered"] = segment.was_filtered;
|
||||
j["filter_reason"] = segment.filter_reason;
|
||||
j["timestamp"] = segment.timestamp;
|
||||
j["vad_metrics"] = {
|
||||
{"speech_duration_ms", segment.speech_duration_ms},
|
||||
{"silence_duration_ms", segment.silence_duration_ms},
|
||||
{"flush_reason", segment.flush_reason}
|
||||
};
|
||||
|
||||
std::ofstream file(filename.str());
|
||||
if (file.is_open()) {
|
||||
|
||||
@ -18,6 +18,11 @@ struct SegmentLog {
|
||||
bool was_filtered;
|
||||
std::string filter_reason;
|
||||
std::string timestamp;
|
||||
|
||||
// VAD metrics (added for TASK8)
|
||||
int speech_duration_ms = 0;
|
||||
int silence_duration_ms = 0;
|
||||
std::string flush_reason = "";
|
||||
};
|
||||
|
||||
class SessionLogger {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user