tune: Extend VAD speech duration and improve context prompt formatting

This commit is contained in:
StillHammer 2025-12-02 09:48:44 +08:00
parent db0f8e5990
commit 49f9cb906e
3 changed files with 10 additions and 5 deletions

View File

@@ -6,6 +6,11 @@
"chunk_step_seconds": 5,
"format": "ogg"
},
"vad": {
"silence_duration_ms": 700,
"min_speech_duration_ms": 2000,
"max_speech_duration_ms": 30000
},
"whisper": {
"model": "gpt-4o-mini-transcribe",
"language": "zh",

View File

@@ -70,8 +70,8 @@ private:
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
int min_speech_duration_ms_ = 1000; // Minimum 1s speech to send (was 300)
int max_speech_duration_ms_ = 25000; // 25s max before forced flush
int min_speech_duration_ms_ = 2000; // Minimum 2s speech to send (was 1000)
int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000)
// Adaptive noise floor
float noise_floor_ = 0.005f; // Estimated background noise level

View File

@@ -468,11 +468,11 @@ std::string Pipeline::buildDynamicPrompt() const {
// Build context from recent transcriptions
std::stringstream context;
context << base_prompt;
context << "\n\nContexte des phrases précédentes: ";
context << "\n\nContexte des phrases précédentes:\n";
for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
if (i > 0) context << " ";
context << recent_transcriptions_[i];
context << std::to_string(i + 1) << ". "
<< recent_transcriptions_[i] << "\n";
}
return context.str();