feat: Add dynamic Whisper context and audio segment saving

- Pass last 3 transcriptions to Whisper for better context
- Add deduplication filter to skip identical consecutive segments
- Save each audio segment as .ogg in session directory for debugging
- Add queue debug logging to detect double-push issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Trouve Alexis 2025-11-23 22:02:49 +08:00
parent 9baa213a82
commit 53b21b94d6
2 changed files with 76 additions and 2 deletions

View File

@ -150,6 +150,8 @@ void Pipeline::audioThread() {
chunk.sample_rate = config.getAudioConfig().sample_rate;
chunk.channels = config.getAudioConfig().channels;
float push_duration = static_cast<float>(audio_data.size()) / (chunk.sample_rate * chunk.channels);
std::cout << "[Queue] Pushing " << push_duration << "s chunk, queue size: " << audio_queue_.size() << std::endl;
audio_queue_.push(std::move(chunk));
});
@ -166,6 +168,7 @@ void Pipeline::audioThread() {
void Pipeline::processingThread() {
auto& config = Config::getInstance();
int audio_segment_id = 0;
while (running_) {
auto chunk_opt = audio_queue_.wait_and_pop();
@ -176,6 +179,23 @@ void Pipeline::processingThread() {
auto& chunk = chunk_opt.value();
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
// Debug: log queue size to detect double-push
std::cout << "[Queue] Processing chunk, " << audio_queue_.size() << " remaining" << std::endl;
// Save audio segment to session directory for debugging
audio_segment_id++;
if (session_logger_.isActive()) {
std::stringstream audio_path;
audio_path << session_logger_.getSessionPath() << "/audio_"
<< std::setfill('0') << std::setw(3) << audio_segment_id << ".ogg";
AudioBuffer segment_buffer(chunk.sample_rate, chunk.channels);
segment_buffer.addSamples(chunk.data);
if (segment_buffer.saveToOpus(audio_path.str())) {
std::cout << "[Session] Saved audio segment: " << audio_path.str() << std::endl;
}
}
// Calculate audio RMS for logging
float audio_rms = 0.0f;
if (!chunk.data.empty()) {
@ -189,6 +209,12 @@ void Pipeline::processingThread() {
// Time Whisper
auto whisper_start = std::chrono::steady_clock::now();
// Build dynamic prompt with recent context
std::string dynamic_prompt = buildDynamicPrompt();
if (!recent_transcriptions_.empty()) {
std::cout << "[Context] Using " << recent_transcriptions_.size() << " previous segments" << std::endl;
}
// Transcribe with Whisper
auto whisper_result = whisper_client_->transcribe(
chunk.data,
@ -197,7 +223,7 @@ void Pipeline::processingThread() {
config.getWhisperConfig().model,
config.getWhisperConfig().language,
config.getWhisperConfig().temperature,
config.getWhisperConfig().prompt,
dynamic_prompt,
config.getWhisperConfig().response_format
);
@ -296,6 +322,20 @@ void Pipeline::processingThread() {
continue;
}
// Deduplication: skip if exact same as last transcription
if (text == last_transcription_) {
std::cout << "[Skip] Duplicate: " << text << std::endl;
session_logger_.logFilteredSegment(text, "duplicate", duration, audio_rms);
continue;
}
last_transcription_ = text;
// Update dynamic context for next Whisper call
recent_transcriptions_.push_back(text);
if (recent_transcriptions_.size() > MAX_CONTEXT_SEGMENTS) {
recent_transcriptions_.erase(recent_transcriptions_.begin());
}
// Track audio cost
if (ui_) {
ui_->addAudioCost(duration);
@ -408,10 +448,34 @@ bool Pipeline::shouldClose() const {
// Resets all per-conversation state: the accumulated Chinese/French text,
// the recent-transcription context fed to Whisper, and the deduplication
// reference. The UI's accumulated text is blanked when a UI is attached.
void Pipeline::clearAccumulated() {
    accumulated_chinese_.clear();
    accumulated_french_.clear();

    // Drop the Whisper context and the last-transcription marker as well, so
    // the next segment is neither prompted with nor deduplicated against
    // text that has just been cleared.
    recent_transcriptions_.clear();
    last_transcription_.clear();

    // ui_ is checked before use; nothing to update when it is not set.
    if (ui_) {
        ui_->setAccumulatedText("", "");
    }

    // Single log line: the older "Cleared accumulated text" message was
    // superseded by this one and must not be emitted alongside it.
    std::cout << "[Pipeline] Cleared accumulated text and context" << std::endl;
}
// Builds the prompt passed to Whisper for the next transcription.
//
// Returns the configured base prompt unchanged when there are no recent
// transcriptions. Otherwise returns the base prompt followed by a context
// section listing the recent transcriptions, oldest first, separated by
// single spaces. When the base prompt is empty the blank-line separator is
// omitted so the result does not start with stray newlines.
std::string Pipeline::buildDynamicPrompt() const {
    auto& config = Config::getInstance();
    std::string prompt = config.getWhisperConfig().prompt;

    // No context yet: the base prompt is used as-is.
    if (recent_transcriptions_.empty()) {
        return prompt;
    }

    // Separate the base prompt from the context section only if there is a
    // base prompt to separate from.
    if (!prompt.empty()) {
        prompt += "\n\n";
    }
    prompt += "Contexte des phrases précédentes: ";

    bool first = true;
    for (const auto& segment : recent_transcriptions_) {
        if (!first) {
            prompt += ' ';
        }
        prompt += segment;
        first = false;
    }

    return prompt;
}
} // namespace secondvoice

View File

@ -62,6 +62,16 @@ private:
std::string accumulated_chinese_;
std::string accumulated_french_;
// Dynamic context for Whisper: the most recent transcriptions that passed
// the filters, oldest first. processingThread() appends each new text and
// erases the oldest entry once the size exceeds MAX_CONTEXT_SEGMENTS;
// clearAccumulated() empties it.
std::vector<std::string> recent_transcriptions_;
static constexpr size_t MAX_CONTEXT_SEGMENTS = 3;
// Deduplication: text of the last transcription that was not skipped. A new
// segment whose text compares equal to it is logged as "duplicate" and
// skipped. Reset by clearAccumulated().
std::string last_transcription_;
// Build dynamic prompt with recent context: returns the configured Whisper
// prompt, followed by the contents of recent_transcriptions_ when it is
// not empty.
std::string buildDynamicPrompt() const;
// Session logging
SessionLogger session_logger_;
int segment_id_ = 0;