feat: Add dynamic Whisper context and audio segment saving
- Pass last 3 transcriptions to Whisper for better context
- Add deduplication filter to skip identical consecutive segments
- Save each audio segment as .ogg in session directory for debugging
- Add queue debug logging to detect double-push issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
9baa213a82
commit
53b21b94d6
@ -150,6 +150,8 @@ void Pipeline::audioThread() {
|
|||||||
chunk.sample_rate = config.getAudioConfig().sample_rate;
|
chunk.sample_rate = config.getAudioConfig().sample_rate;
|
||||||
chunk.channels = config.getAudioConfig().channels;
|
chunk.channels = config.getAudioConfig().channels;
|
||||||
|
|
||||||
|
float push_duration = static_cast<float>(audio_data.size()) / (chunk.sample_rate * chunk.channels);
|
||||||
|
std::cout << "[Queue] Pushing " << push_duration << "s chunk, queue size: " << audio_queue_.size() << std::endl;
|
||||||
audio_queue_.push(std::move(chunk));
|
audio_queue_.push(std::move(chunk));
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -166,6 +168,7 @@ void Pipeline::audioThread() {
|
|||||||
|
|
||||||
void Pipeline::processingThread() {
|
void Pipeline::processingThread() {
|
||||||
auto& config = Config::getInstance();
|
auto& config = Config::getInstance();
|
||||||
|
int audio_segment_id = 0;
|
||||||
|
|
||||||
while (running_) {
|
while (running_) {
|
||||||
auto chunk_opt = audio_queue_.wait_and_pop();
|
auto chunk_opt = audio_queue_.wait_and_pop();
|
||||||
@ -176,6 +179,23 @@ void Pipeline::processingThread() {
|
|||||||
auto& chunk = chunk_opt.value();
|
auto& chunk = chunk_opt.value();
|
||||||
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
|
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
|
||||||
|
|
||||||
|
// Debug: log queue size to detect double-push
|
||||||
|
std::cout << "[Queue] Processing chunk, " << audio_queue_.size() << " remaining" << std::endl;
|
||||||
|
|
||||||
|
// Save audio segment to session directory for debugging
|
||||||
|
audio_segment_id++;
|
||||||
|
if (session_logger_.isActive()) {
|
||||||
|
std::stringstream audio_path;
|
||||||
|
audio_path << session_logger_.getSessionPath() << "/audio_"
|
||||||
|
<< std::setfill('0') << std::setw(3) << audio_segment_id << ".ogg";
|
||||||
|
|
||||||
|
AudioBuffer segment_buffer(chunk.sample_rate, chunk.channels);
|
||||||
|
segment_buffer.addSamples(chunk.data);
|
||||||
|
if (segment_buffer.saveToOpus(audio_path.str())) {
|
||||||
|
std::cout << "[Session] Saved audio segment: " << audio_path.str() << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Calculate audio RMS for logging
|
// Calculate audio RMS for logging
|
||||||
float audio_rms = 0.0f;
|
float audio_rms = 0.0f;
|
||||||
if (!chunk.data.empty()) {
|
if (!chunk.data.empty()) {
|
||||||
@ -189,6 +209,12 @@ void Pipeline::processingThread() {
|
|||||||
// Time Whisper
|
// Time Whisper
|
||||||
auto whisper_start = std::chrono::steady_clock::now();
|
auto whisper_start = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
// Build dynamic prompt with recent context
|
||||||
|
std::string dynamic_prompt = buildDynamicPrompt();
|
||||||
|
if (!recent_transcriptions_.empty()) {
|
||||||
|
std::cout << "[Context] Using " << recent_transcriptions_.size() << " previous segments" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
// Transcribe with Whisper
|
// Transcribe with Whisper
|
||||||
auto whisper_result = whisper_client_->transcribe(
|
auto whisper_result = whisper_client_->transcribe(
|
||||||
chunk.data,
|
chunk.data,
|
||||||
@ -197,7 +223,7 @@ void Pipeline::processingThread() {
|
|||||||
config.getWhisperConfig().model,
|
config.getWhisperConfig().model,
|
||||||
config.getWhisperConfig().language,
|
config.getWhisperConfig().language,
|
||||||
config.getWhisperConfig().temperature,
|
config.getWhisperConfig().temperature,
|
||||||
config.getWhisperConfig().prompt,
|
dynamic_prompt,
|
||||||
config.getWhisperConfig().response_format
|
config.getWhisperConfig().response_format
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -296,6 +322,20 @@ void Pipeline::processingThread() {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Deduplication: skip if exact same as last transcription
|
||||||
|
if (text == last_transcription_) {
|
||||||
|
std::cout << "[Skip] Duplicate: " << text << std::endl;
|
||||||
|
session_logger_.logFilteredSegment(text, "duplicate", duration, audio_rms);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
last_transcription_ = text;
|
||||||
|
|
||||||
|
// Update dynamic context for next Whisper call
|
||||||
|
recent_transcriptions_.push_back(text);
|
||||||
|
if (recent_transcriptions_.size() > MAX_CONTEXT_SEGMENTS) {
|
||||||
|
recent_transcriptions_.erase(recent_transcriptions_.begin());
|
||||||
|
}
|
||||||
|
|
||||||
// Track audio cost
|
// Track audio cost
|
||||||
if (ui_) {
|
if (ui_) {
|
||||||
ui_->addAudioCost(duration);
|
ui_->addAudioCost(duration);
|
||||||
@ -408,10 +448,34 @@ bool Pipeline::shouldClose() const {
|
|||||||
void Pipeline::clearAccumulated() {
|
void Pipeline::clearAccumulated() {
|
||||||
accumulated_chinese_.clear();
|
accumulated_chinese_.clear();
|
||||||
accumulated_french_.clear();
|
accumulated_french_.clear();
|
||||||
|
recent_transcriptions_.clear();
|
||||||
|
last_transcription_.clear();
|
||||||
if (ui_) {
|
if (ui_) {
|
||||||
ui_->setAccumulatedText("", "");
|
ui_->setAccumulatedText("", "");
|
||||||
}
|
}
|
||||||
std::cout << "[Pipeline] Cleared accumulated text" << std::endl;
|
std::cout << "[Pipeline] Cleared accumulated text and context" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Pipeline::buildDynamicPrompt() const {
|
||||||
|
auto& config = Config::getInstance();
|
||||||
|
std::string base_prompt = config.getWhisperConfig().prompt;
|
||||||
|
|
||||||
|
// If no recent transcriptions, just return base prompt
|
||||||
|
if (recent_transcriptions_.empty()) {
|
||||||
|
return base_prompt;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build context from recent transcriptions
|
||||||
|
std::stringstream context;
|
||||||
|
context << base_prompt;
|
||||||
|
context << "\n\nContexte des phrases précédentes: ";
|
||||||
|
|
||||||
|
for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
|
||||||
|
if (i > 0) context << " ";
|
||||||
|
context << recent_transcriptions_[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return context.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace secondvoice
|
} // namespace secondvoice
|
||||||
|
|||||||
@ -62,6 +62,16 @@ private:
|
|||||||
std::string accumulated_chinese_;
|
std::string accumulated_chinese_;
|
||||||
std::string accumulated_french_;
|
std::string accumulated_french_;
|
||||||
|
|
||||||
|
// Dynamic context for Whisper (last N transcriptions)
|
||||||
|
std::vector<std::string> recent_transcriptions_;
|
||||||
|
static constexpr size_t MAX_CONTEXT_SEGMENTS = 3;
|
||||||
|
|
||||||
|
// Deduplication: skip if same as last transcription
|
||||||
|
std::string last_transcription_;
|
||||||
|
|
||||||
|
// Build dynamic prompt with recent context
|
||||||
|
std::string buildDynamicPrompt() const;
|
||||||
|
|
||||||
// Session logging
|
// Session logging
|
||||||
SessionLogger session_logger_;
|
SessionLogger session_logger_;
|
||||||
int segment_id_ = 0;
|
int segment_id_ = 0;
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user