feat: Add VAD metrics tracking to session logs
This commit is contained in:
parent
a1b4e335c8
commit
e8dd7f840e
@ -4,9 +4,15 @@
|
|||||||
|
|
||||||
namespace secondvoice {
|
namespace secondvoice {
|
||||||
|
|
||||||
AudioCapture::AudioCapture(int sample_rate, int channels)
|
AudioCapture::AudioCapture(int sample_rate, int channels,
|
||||||
|
int silence_duration_ms,
|
||||||
|
int min_speech_duration_ms,
|
||||||
|
int max_speech_duration_ms)
|
||||||
: sample_rate_(sample_rate)
|
: sample_rate_(sample_rate)
|
||||||
, channels_(channels)
|
, channels_(channels)
|
||||||
|
, silence_duration_ms_(silence_duration_ms)
|
||||||
|
, min_speech_duration_ms_(min_speech_duration_ms)
|
||||||
|
, max_speech_duration_ms_(max_speech_duration_ms)
|
||||||
, noise_reducer_(std::make_unique<NoiseReducer>()) {
|
, noise_reducer_(std::make_unique<NoiseReducer>()) {
|
||||||
std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
|
std::cout << "[Audio] Noise reduction enabled (RNNoise)" << std::endl;
|
||||||
}
|
}
|
||||||
@ -166,6 +172,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
|
|||||||
std::cout << "[VAD] Max duration reached, forcing flush ("
|
std::cout << "[VAD] Max duration reached, forcing flush ("
|
||||||
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
|
<< self->speech_samples_count_ / (self->sample_rate_ * self->channels_) << "s)" << std::endl;
|
||||||
|
|
||||||
|
// Calculate metrics BEFORE flushing
|
||||||
|
self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
|
||||||
|
self->last_silence_duration_ms_ = 0; // No trailing silence in forced flush
|
||||||
|
self->last_flush_reason_ = "max_duration";
|
||||||
|
|
||||||
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
|
if (self->callback_ && self->speech_buffer_.size() >= static_cast<size_t>(min_speech_samples)) {
|
||||||
// Flush any remaining samples from the denoiser
|
// Flush any remaining samples from the denoiser
|
||||||
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
if (self->noise_reducer_ && self->noise_reducer_->isEnabled()) {
|
||||||
@ -213,6 +224,11 @@ int AudioCapture::audioCallback(const void* input, void* output,
|
|||||||
if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
|
if (self->consecutive_silence_frames_ >= silence_threshold_frames) {
|
||||||
self->is_speech_active_.store(false, std::memory_order_relaxed);
|
self->is_speech_active_.store(false, std::memory_order_relaxed);
|
||||||
|
|
||||||
|
// Calculate metrics BEFORE flushing
|
||||||
|
self->last_speech_duration_ms_ = (self->speech_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
|
||||||
|
self->last_silence_duration_ms_ = (self->silence_samples_count_ * 1000) / (self->sample_rate_ * self->channels_);
|
||||||
|
self->last_flush_reason_ = "silence_threshold";
|
||||||
|
|
||||||
// Flush if we have enough speech
|
// Flush if we have enough speech
|
||||||
if (self->speech_samples_count_ >= min_speech_samples) {
|
if (self->speech_samples_count_ >= min_speech_samples) {
|
||||||
// Flush any remaining samples from the denoiser
|
// Flush any remaining samples from the denoiser
|
||||||
|
|||||||
@ -16,7 +16,10 @@ class AudioCapture {
|
|||||||
public:
|
public:
|
||||||
using AudioCallback = std::function<void(const std::vector<float>&)>;
|
using AudioCallback = std::function<void(const std::vector<float>&)>;
|
||||||
|
|
||||||
AudioCapture(int sample_rate, int channels);
|
AudioCapture(int sample_rate, int channels,
|
||||||
|
int silence_duration_ms = 700,
|
||||||
|
int min_speech_duration_ms = 2000,
|
||||||
|
int max_speech_duration_ms = 30000);
|
||||||
~AudioCapture();
|
~AudioCapture();
|
||||||
|
|
||||||
bool initialize();
|
bool initialize();
|
||||||
@ -44,6 +47,11 @@ public:
|
|||||||
void setDenoiseEnabled(bool enabled);
|
void setDenoiseEnabled(bool enabled);
|
||||||
bool isDenoiseEnabled() const;
|
bool isDenoiseEnabled() const;
|
||||||
|
|
||||||
|
// Get metrics from last flushed segment
|
||||||
|
int getLastSpeechDuration() const { return last_speech_duration_ms_; }
|
||||||
|
int getLastSilenceDuration() const { return last_silence_duration_ms_; }
|
||||||
|
std::string getLastFlushReason() const { return last_flush_reason_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static int audioCallback(const void* input, void* output,
|
static int audioCallback(const void* input, void* output,
|
||||||
unsigned long frame_count,
|
unsigned long frame_count,
|
||||||
@ -69,9 +77,9 @@ private:
|
|||||||
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
|
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
|
||||||
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
|
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
|
||||||
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
|
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
|
||||||
int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
|
int silence_duration_ms_; // Wait 700ms of silence before cutting (was 400)
|
||||||
int min_speech_duration_ms_ = 2000; // Minimum 2s speech to send (was 1000)
|
int min_speech_duration_ms_; // Minimum 2s speech to send (was 1000)
|
||||||
int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000)
|
int max_speech_duration_ms_; // 30s max before forced flush (was 25000)
|
||||||
|
|
||||||
// Adaptive noise floor
|
// Adaptive noise floor
|
||||||
float noise_floor_ = 0.005f; // Estimated background noise level
|
float noise_floor_ = 0.005f; // Estimated background noise level
|
||||||
@ -80,6 +88,11 @@ private:
|
|||||||
// Trailing silence detection - count consecutive silence frames after speech
|
// Trailing silence detection - count consecutive silence frames after speech
|
||||||
int consecutive_silence_frames_ = 0;
|
int consecutive_silence_frames_ = 0;
|
||||||
|
|
||||||
|
// Metrics for last flushed segment (set in callback, read in processing thread)
|
||||||
|
int last_speech_duration_ms_ = 0;
|
||||||
|
int last_silence_duration_ms_ = 0;
|
||||||
|
std::string last_flush_reason_;
|
||||||
|
|
||||||
// Zero-crossing rate for speech vs noise discrimination
|
// Zero-crossing rate for speech vs noise discrimination
|
||||||
float last_zcr_ = 0.0f;
|
float last_zcr_ = 0.0f;
|
||||||
float speech_zcr_min_ = 0.01f; // More permissive lower bound
|
float speech_zcr_min_ = 0.01f; // More permissive lower bound
|
||||||
|
|||||||
@ -24,12 +24,23 @@ Pipeline::~Pipeline() {
|
|||||||
bool Pipeline::initialize() {
|
bool Pipeline::initialize() {
|
||||||
auto& config = Config::getInstance();
|
auto& config = Config::getInstance();
|
||||||
|
|
||||||
|
// Load VAD parameters from config (with fallbacks if missing)
|
||||||
|
int silence_duration = config.getVadSilenceDurationMs();
|
||||||
|
int min_speech = config.getVadMinSpeechDurationMs();
|
||||||
|
int max_speech = config.getVadMaxSpeechDurationMs();
|
||||||
|
|
||||||
// Initialize audio capture with VAD-based segmentation
|
// Initialize audio capture with VAD-based segmentation
|
||||||
audio_capture_ = std::make_unique<AudioCapture>(
|
audio_capture_ = std::make_unique<AudioCapture>(
|
||||||
config.getAudioConfig().sample_rate,
|
config.getAudioConfig().sample_rate,
|
||||||
config.getAudioConfig().channels
|
config.getAudioConfig().channels,
|
||||||
|
silence_duration,
|
||||||
|
min_speech,
|
||||||
|
max_speech
|
||||||
);
|
);
|
||||||
|
|
||||||
|
std::cout << "[Pipeline] VAD configured: silence=" << silence_duration
|
||||||
|
<< "ms, min_speech=" << min_speech
|
||||||
|
<< "ms, max_speech=" << max_speech << "ms" << std::endl;
|
||||||
std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;
|
std::cout << "[Pipeline] VAD-based audio segmentation enabled" << std::endl;
|
||||||
|
|
||||||
if (!audio_capture_->initialize()) {
|
if (!audio_capture_->initialize()) {
|
||||||
@ -395,6 +406,10 @@ void Pipeline::processingThread() {
|
|||||||
seg.was_filtered = false;
|
seg.was_filtered = false;
|
||||||
seg.filter_reason = "";
|
seg.filter_reason = "";
|
||||||
seg.timestamp = ""; // Will be set by logger
|
seg.timestamp = ""; // Will be set by logger
|
||||||
|
// Add VAD metrics from AudioCapture
|
||||||
|
seg.speech_duration_ms = audio_capture_->getLastSpeechDuration();
|
||||||
|
seg.silence_duration_ms = audio_capture_->getLastSilenceDuration();
|
||||||
|
seg.flush_reason = audio_capture_->getLastFlushReason();
|
||||||
session_logger_.logSegment(seg);
|
session_logger_.logSegment(seg);
|
||||||
|
|
||||||
std::cout << "CN: " << text << std::endl;
|
std::cout << "CN: " << text << std::endl;
|
||||||
|
|||||||
@ -89,6 +89,11 @@ void SessionLogger::logSegment(const SegmentLog& segment) {
|
|||||||
j["was_filtered"] = segment.was_filtered;
|
j["was_filtered"] = segment.was_filtered;
|
||||||
j["filter_reason"] = segment.filter_reason;
|
j["filter_reason"] = segment.filter_reason;
|
||||||
j["timestamp"] = segment.timestamp;
|
j["timestamp"] = segment.timestamp;
|
||||||
|
j["vad_metrics"] = {
|
||||||
|
{"speech_duration_ms", segment.speech_duration_ms},
|
||||||
|
{"silence_duration_ms", segment.silence_duration_ms},
|
||||||
|
{"flush_reason", segment.flush_reason}
|
||||||
|
};
|
||||||
|
|
||||||
std::ofstream file(filename.str());
|
std::ofstream file(filename.str());
|
||||||
if (file.is_open()) {
|
if (file.is_open()) {
|
||||||
|
|||||||
@ -18,6 +18,11 @@ struct SegmentLog {
|
|||||||
bool was_filtered;
|
bool was_filtered;
|
||||||
std::string filter_reason;
|
std::string filter_reason;
|
||||||
std::string timestamp;
|
std::string timestamp;
|
||||||
|
|
||||||
|
// VAD metrics (added for TASK8)
|
||||||
|
int speech_duration_ms = 0;
|
||||||
|
int silence_duration_ms = 0;
|
||||||
|
std::string flush_reason = "";
|
||||||
};
|
};
|
||||||
|
|
||||||
class SessionLogger {
|
class SessionLogger {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user