tune: Extend VAD speech duration and improve context prompt formatting
This commit is contained in:
parent
db0f8e5990
commit
49f9cb906e
@@ -6,6 +6,11 @@
|
||||
"chunk_step_seconds": 5,
|
||||
"format": "ogg"
|
||||
},
|
||||
"vad": {
|
||||
"silence_duration_ms": 700,
|
||||
"min_speech_duration_ms": 2000,
|
||||
"max_speech_duration_ms": 30000
|
||||
},
|
||||
"whisper": {
|
||||
"model": "gpt-4o-mini-transcribe",
|
||||
"language": "zh",
|
||||
|
||||
@@ -70,8 +70,8 @@ private:
|
||||
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
|
||||
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
|
||||
int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
|
||||
int min_speech_duration_ms_ = 1000; // Minimum 1s speech to send (was 300)
|
||||
int max_speech_duration_ms_ = 25000; // 25s max before forced flush
|
||||
int min_speech_duration_ms_ = 2000; // Minimum 2s speech to send (was 1000)
|
||||
int max_speech_duration_ms_ = 30000; // 30s max before forced flush (was 25000)
|
||||
|
||||
// Adaptive noise floor
|
||||
float noise_floor_ = 0.005f; // Estimated background noise level
|
||||
|
||||
@@ -468,11 +468,11 @@ std::string Pipeline::buildDynamicPrompt() const {
|
||||
// Build context from recent transcriptions
|
||||
std::stringstream context;
|
||||
context << base_prompt;
|
||||
context << "\n\nContexte des phrases précédentes: ";
|
||||
context << "\n\nContexte des phrases précédentes:\n";
|
||||
|
||||
for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
|
||||
if (i > 0) context << " ";
|
||||
context << recent_transcriptions_[i];
|
||||
context << std::to_string(i + 1) << ". "
|
||||
<< recent_transcriptions_[i] << "\n";
|
||||
}
|
||||
|
||||
return context.str();
|
||||
|
||||
Loading…
Reference in New Issue
Block a user