From 49f9cb906e08cb19295a62de1ac8f2c12f83d9fe Mon Sep 17 00:00:00 2001
From: StillHammer <alexistrouve.pro@gmail.com>
Date: Tue, 2 Dec 2025 09:48:44 +0800
Subject: [PATCH] tune: Extend VAD speech duration and improve context prompt
 formatting

---
 config.json              | 5 +++++
 src/audio/AudioCapture.h | 4 ++--
 src/core/Pipeline.cpp    | 6 +++---
 3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/config.json b/config.json
index 6a706d2..e08ca2d 100644
--- a/config.json
+++ b/config.json
@@ -6,6 +6,11 @@
     "chunk_step_seconds": 5,
     "format": "ogg"
   },
+  "vad": {
+    "silence_duration_ms": 700,
+    "min_speech_duration_ms": 2000,
+    "max_speech_duration_ms": 30000
+  },
   "whisper": {
     "model": "gpt-4o-mini-transcribe",
     "language": "zh",
diff --git a/src/audio/AudioCapture.h b/src/audio/AudioCapture.h
index ff1eba8..0b09c1d 100644
--- a/src/audio/AudioCapture.h
+++ b/src/audio/AudioCapture.h
@@ -70,8 +70,8 @@ private:
     std::atomic<float> vad_rms_threshold_{0.02f};   // Was 0.01f
     std::atomic<float> vad_peak_threshold_{0.08f};  // Was 0.04f
     int silence_duration_ms_ = 700;      // Wait 700ms of silence before cutting (was 400)
-    int min_speech_duration_ms_ = 1000;  // Minimum 1s speech to send (was 300)
-    int max_speech_duration_ms_ = 25000; // 25s max before forced flush
+    int min_speech_duration_ms_ = 2000;  // Minimum 2s of speech to send (was 1000ms)
+    int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000ms)
 
     // Adaptive noise floor
     float noise_floor_ = 0.005f;         // Estimated background noise level
diff --git a/src/core/Pipeline.cpp b/src/core/Pipeline.cpp
index bec2224..b44af23 100644
--- a/src/core/Pipeline.cpp
+++ b/src/core/Pipeline.cpp
@@ -468,11 +468,11 @@ std::string Pipeline::buildDynamicPrompt() const {
     // Build context from recent transcriptions
     std::stringstream context;
     context << base_prompt;
-    context << "\n\nContexte des phrases précédentes: ";
+    context << "\n\nContexte des phrases précédentes:\n";
 
     for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
-        if (i > 0) context << " ";
-        context << recent_transcriptions_[i];
+        context << std::to_string(i + 1) << ". "
+                << recent_transcriptions_[i] << "\n";
     }
 
     return context.str();