feat: Add dynamic Whisper context and audio segment saving
- Pass last 3 transcriptions to Whisper for better context
- Add deduplication filter to skip identical consecutive segments
- Save each audio segment as .ogg in session directory for debugging
- Add queue debug logging to detect double-push issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
9baa213a82
commit
53b21b94d6
@ -150,6 +150,8 @@ void Pipeline::audioThread() {
|
||||
chunk.sample_rate = config.getAudioConfig().sample_rate;
|
||||
chunk.channels = config.getAudioConfig().channels;
|
||||
|
||||
float push_duration = static_cast<float>(audio_data.size()) / (chunk.sample_rate * chunk.channels);
|
||||
std::cout << "[Queue] Pushing " << push_duration << "s chunk, queue size: " << audio_queue_.size() << std::endl;
|
||||
audio_queue_.push(std::move(chunk));
|
||||
});
|
||||
|
||||
@ -166,6 +168,7 @@ void Pipeline::audioThread() {
|
||||
|
||||
void Pipeline::processingThread() {
|
||||
auto& config = Config::getInstance();
|
||||
int audio_segment_id = 0;
|
||||
|
||||
while (running_) {
|
||||
auto chunk_opt = audio_queue_.wait_and_pop();
|
||||
@ -176,6 +179,23 @@ void Pipeline::processingThread() {
|
||||
auto& chunk = chunk_opt.value();
|
||||
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
|
||||
|
||||
// Debug: log queue size to detect double-push
|
||||
std::cout << "[Queue] Processing chunk, " << audio_queue_.size() << " remaining" << std::endl;
|
||||
|
||||
// Save audio segment to session directory for debugging
|
||||
audio_segment_id++;
|
||||
if (session_logger_.isActive()) {
|
||||
std::stringstream audio_path;
|
||||
audio_path << session_logger_.getSessionPath() << "/audio_"
|
||||
<< std::setfill('0') << std::setw(3) << audio_segment_id << ".ogg";
|
||||
|
||||
AudioBuffer segment_buffer(chunk.sample_rate, chunk.channels);
|
||||
segment_buffer.addSamples(chunk.data);
|
||||
if (segment_buffer.saveToOpus(audio_path.str())) {
|
||||
std::cout << "[Session] Saved audio segment: " << audio_path.str() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate audio RMS for logging
|
||||
float audio_rms = 0.0f;
|
||||
if (!chunk.data.empty()) {
|
||||
@ -189,6 +209,12 @@ void Pipeline::processingThread() {
|
||||
// Time Whisper
|
||||
auto whisper_start = std::chrono::steady_clock::now();
|
||||
|
||||
// Build dynamic prompt with recent context
|
||||
std::string dynamic_prompt = buildDynamicPrompt();
|
||||
if (!recent_transcriptions_.empty()) {
|
||||
std::cout << "[Context] Using " << recent_transcriptions_.size() << " previous segments" << std::endl;
|
||||
}
|
||||
|
||||
// Transcribe with Whisper
|
||||
auto whisper_result = whisper_client_->transcribe(
|
||||
chunk.data,
|
||||
@ -197,7 +223,7 @@ void Pipeline::processingThread() {
|
||||
config.getWhisperConfig().model,
|
||||
config.getWhisperConfig().language,
|
||||
config.getWhisperConfig().temperature,
|
||||
config.getWhisperConfig().prompt,
|
||||
dynamic_prompt,
|
||||
config.getWhisperConfig().response_format
|
||||
);
|
||||
|
||||
@ -296,6 +322,20 @@ void Pipeline::processingThread() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Deduplication: skip if exact same as last transcription
|
||||
if (text == last_transcription_) {
|
||||
std::cout << "[Skip] Duplicate: " << text << std::endl;
|
||||
session_logger_.logFilteredSegment(text, "duplicate", duration, audio_rms);
|
||||
continue;
|
||||
}
|
||||
last_transcription_ = text;
|
||||
|
||||
// Update dynamic context for next Whisper call
|
||||
recent_transcriptions_.push_back(text);
|
||||
if (recent_transcriptions_.size() > MAX_CONTEXT_SEGMENTS) {
|
||||
recent_transcriptions_.erase(recent_transcriptions_.begin());
|
||||
}
|
||||
|
||||
// Track audio cost
|
||||
if (ui_) {
|
||||
ui_->addAudioCost(duration);
|
||||
@ -408,10 +448,34 @@ bool Pipeline::shouldClose() const {
|
||||
void Pipeline::clearAccumulated() {
|
||||
accumulated_chinese_.clear();
|
||||
accumulated_french_.clear();
|
||||
recent_transcriptions_.clear();
|
||||
last_transcription_.clear();
|
||||
if (ui_) {
|
||||
ui_->setAccumulatedText("", "");
|
||||
}
|
||||
std::cout << "[Pipeline] Cleared accumulated text" << std::endl;
|
||||
std::cout << "[Pipeline] Cleared accumulated text and context" << std::endl;
|
||||
}
|
||||
|
||||
std::string Pipeline::buildDynamicPrompt() const {
|
||||
auto& config = Config::getInstance();
|
||||
std::string base_prompt = config.getWhisperConfig().prompt;
|
||||
|
||||
// If no recent transcriptions, just return base prompt
|
||||
if (recent_transcriptions_.empty()) {
|
||||
return base_prompt;
|
||||
}
|
||||
|
||||
// Build context from recent transcriptions
|
||||
std::stringstream context;
|
||||
context << base_prompt;
|
||||
context << "\n\nContexte des phrases précédentes: ";
|
||||
|
||||
for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
|
||||
if (i > 0) context << " ";
|
||||
context << recent_transcriptions_[i];
|
||||
}
|
||||
|
||||
return context.str();
|
||||
}
|
||||
|
||||
} // namespace secondvoice
|
||||
|
||||
@ -62,6 +62,16 @@ private:
|
||||
std::string accumulated_chinese_;
|
||||
std::string accumulated_french_;
|
||||
|
||||
// Dynamic context for Whisper (last N transcriptions).
// processingThread() appends each accepted transcription and erases the
// oldest entry once the size exceeds MAX_CONTEXT_SEGMENTS;
// clearAccumulated() empties it; buildDynamicPrompt() reads it.
// NOTE(review): written from processingThread() and cleared by
// clearAccumulated() with no visible locking — confirm both run on the
// same thread.
|
||||
std::vector<std::string> recent_transcriptions_;
|
||||
// Upper bound on the number of entries kept in recent_transcriptions_.
static constexpr size_t MAX_CONTEXT_SEGMENTS = 3;
|
||||
|
||||
// Deduplication: processingThread() skips (and logs as "duplicate") a
// segment whose text equals this value exactly, otherwise stores the new
// text here. Reset by clearAccumulated().
|
||||
std::string last_transcription_;
|
||||
|
||||
// Build dynamic prompt with recent context: returns the configured Whisper
// prompt, with the entries of recent_transcriptions_ appended after a
// context label when any are stored.
|
||||
std::string buildDynamicPrompt() const;
|
||||
|
||||
// Session logging
|
||||
SessionLogger session_logger_;
|
||||
int segment_id_ = 0;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user