From 49f9cb906e08cb19295a62de1ac8f2c12f83d9fe Mon Sep 17 00:00:00 2001 From: StillHammer Date: Tue, 2 Dec 2025 09:48:44 +0800 Subject: [PATCH] tune: Extend VAD speech duration and improve context prompt formatting --- config.json | 5 +++++ src/audio/AudioCapture.h | 4 ++-- src/core/Pipeline.cpp | 6 +++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/config.json b/config.json index 6a706d2..e08ca2d 100644 --- a/config.json +++ b/config.json @@ -6,6 +6,11 @@ "chunk_step_seconds": 5, "format": "ogg" }, + "vad": { + "silence_duration_ms": 700, + "min_speech_duration_ms": 2000, + "max_speech_duration_ms": 30000 + }, "whisper": { "model": "gpt-4o-mini-transcribe", "language": "zh", diff --git a/src/audio/AudioCapture.h b/src/audio/AudioCapture.h index ff1eba8..0b09c1d 100644 --- a/src/audio/AudioCapture.h +++ b/src/audio/AudioCapture.h @@ -70,8 +70,8 @@ private: std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400) - int min_speech_duration_ms_ = 1000; // Minimum 1s speech to send (was 300) - int max_speech_duration_ms_ = 25000; // 25s max before forced flush + int min_speech_duration_ms_ = 2000; // Minimum 2s speech to send (was 1000) + int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000) // Adaptive noise floor float noise_floor_ = 0.005f; // Estimated background noise level diff --git a/src/core/Pipeline.cpp b/src/core/Pipeline.cpp index bec2224..b44af23 100644 --- a/src/core/Pipeline.cpp +++ b/src/core/Pipeline.cpp @@ -468,11 +468,11 @@ std::string Pipeline::buildDynamicPrompt() const { // Build context from recent transcriptions std::stringstream context; context << base_prompt; - context << "\n\nContexte des phrases précédentes: "; + context << "\n\nContexte des phrases précédentes:\n"; for (size_t i = 0; i < recent_transcriptions_.size(); ++i) { - if (i > 0) context << " "; - context << 
recent_transcriptions_[i]; + context << std::to_string(i + 1) << ". " + << recent_transcriptions_[i] << "\n"; } return context.str();