tune: Extend VAD speech duration and improve context prompt formatting

2025-12-02 09:48:44 +08:00 · 2025-12-02 09:48:44 +08:00 · 49f9cb906e
commit 49f9cb906e
parent db0f8e5990
3 changed files with 10 additions and 5 deletions
--- a/config.json
+++ b/config.json
@@ -6,6 +6,11 @@
    "chunk_step_seconds": 5,
    "format": "ogg"
  },
+  "vad": {
+    "silence_duration_ms": 700,
+    "min_speech_duration_ms": 2000,
+    "max_speech_duration_ms": 30000
+  },
  "whisper": {
    "model": "gpt-4o-mini-transcribe",
    "language": "zh",
--- a/src/audio/AudioCapture.h
+++ b/src/audio/AudioCapture.h
@@ -70,8 +70,8 @@ private:
    std::atomic<float> vad_rms_threshold_{0.02f};   // Was 0.01f
    std::atomic<float> vad_peak_threshold_{0.08f};  // Was 0.04f
    int silence_duration_ms_ = 700;      // Wait 700ms of silence before cutting (was 400)
-    int min_speech_duration_ms_ = 1000;  // Minimum 1s speech to send (was 300)
-    int max_speech_duration_ms_ = 25000; // 25s max before forced flush
+    int min_speech_duration_ms_ = 2000;  // Minimum 2s speech to send (was 1000)
+    int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000)

    // Adaptive noise floor
    float noise_floor_ = 0.005f;         // Estimated background noise level
--- a/src/core/Pipeline.cpp
+++ b/src/core/Pipeline.cpp
@@ -468,11 +468,11 @@ std::string Pipeline::buildDynamicPrompt() const {
    // Build context from recent transcriptions
    std::stringstream context;
    context << base_prompt;
-    context << "\n\nContexte des phrases précédentes: ";
+    context << "\n\nContexte des phrases précédentes:\n";

    for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
-        if (i > 0) context << " ";
-        context << recent_transcriptions_[i];
+        context << std::to_string(i + 1) << ". "
+                << recent_transcriptions_[i] << "\n";
    }

    return context.str();