Compare commits

...

3 Commits

Author SHA1 Message Date
a28bb89913 tune: Adjust VAD parameters for longer segments
- min_speech_duration: 300ms → 1000ms (avoid tiny segments)
- silence_duration: 400ms → 700ms (wait longer before cutting)
- hang_frames_threshold: 20 → 35 (~350ms pause tolerance)

This should reduce mid-sentence cuts and give Whisper more context.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 22:08:01 +08:00
53b21b94d6 feat: Add dynamic Whisper context and audio segment saving
- Pass last 3 transcriptions to Whisper for better context
- Add deduplication filter to skip identical consecutive segments
- Save each audio segment as .ogg in session directory for debugging
- Add queue debug logging to detect double-push issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 22:02:49 +08:00
9baa213a82 feat: Add session logging system with per-segment metrics
- Add SessionLogger class for structured debug logging
- Log each segment with: chinese, french, audio duration, RMS, latency
- Track filtered segments with reasons (hallucination, empty, failed)
- Create session directories with JSON files per segment
- Update Whisper prompt with anti-hallucination rules
- Integrate timing measurements for Whisper and Claude calls

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 21:37:55 +08:00
8 changed files with 397 additions and 7 deletions

1
.gitignore vendored
View File

@ -64,6 +64,7 @@ imgui.ini
*.aac *.aac
*.m4a *.m4a
denoised/ denoised/
sessions/
# Claude Code local settings # Claude Code local settings
.claude/settings.local.json .claude/settings.local.json

View File

@ -108,6 +108,7 @@ set(SOURCES_UI
src/ui/TranslationUI.cpp src/ui/TranslationUI.cpp
# Utils # Utils
src/utils/Config.cpp src/utils/Config.cpp
src/utils/SessionLogger.cpp
# Core # Core
src/core/Pipeline.cpp src/core/Pipeline.cpp
) )

View File

@ -10,7 +10,7 @@
"model": "gpt-4o-mini-transcribe", "model": "gpt-4o-mini-transcribe",
"language": "zh", "language": "zh",
"temperature": 0.0, "temperature": 0.0,
"prompt": "The following is a conversation in Mandarin Chinese about business, family, and daily life. Common names: Tingting, Alexis.", "prompt": "Transcription en direct d'une conversation en chinois mandarin. Plusieurs interlocuteurs parlent, parfois en même temps. RÈGLES STRICTES: (1) Ne transcris QUE les paroles audibles en chinois. (2) Si l'audio est inaudible, du bruit, ou du silence, renvoie une chaîne vide. (3) NE GÉNÈRE JAMAIS ces phrases: 谢谢观看, 感谢收看, 订阅, 请订阅, 下期再见, Thank you, Subscribe, 字幕. (4) Ignore: musique, applaudissements, rires, bruits de fond, respirations.",
"stream": false, "stream": false,
"response_format": "text" "response_format": "text"
}, },

View File

@ -69,8 +69,8 @@ private:
// VAD parameters - Higher threshold to avoid false triggers on filtered noise // VAD parameters - Higher threshold to avoid false triggers on filtered noise
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
int silence_duration_ms_ = 400; // Wait 400ms of silence before cutting int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
int min_speech_duration_ms_ = 300; // Minimum speech to send int min_speech_duration_ms_ = 1000; // Minimum 1s speech to send (was 300)
int max_speech_duration_ms_ = 25000; // 25s max before forced flush int max_speech_duration_ms_ = 25000; // 25s max before forced flush
// Adaptive noise floor // Adaptive noise floor
@ -79,7 +79,7 @@ private:
// Hang time - wait before cutting to avoid mid-sentence cuts // Hang time - wait before cutting to avoid mid-sentence cuts
int hang_frames_ = 0; int hang_frames_ = 0;
int hang_frames_threshold_ = 20; // ~200ms tolerance for pauses int hang_frames_threshold_ = 35; // ~350ms tolerance for pauses (was 20)
// Zero-crossing rate for speech vs noise discrimination // Zero-crossing rate for speech vs noise discrimination
float last_zcr_ = 0.0f; float last_zcr_ = 0.0f;

View File

@ -70,6 +70,10 @@ bool Pipeline::start() {
} }
running_ = true; running_ = true;
segment_id_ = 0;
// Start session logging
session_logger_.startSession();
// Start background threads // Start background threads
audio_thread_ = std::thread(&Pipeline::audioThread, this); audio_thread_ = std::thread(&Pipeline::audioThread, this);
@ -126,6 +130,9 @@ void Pipeline::stop() {
transcript_ss << "transcripts/transcript_" << timestamp.str() << ".txt"; transcript_ss << "transcripts/transcript_" << timestamp.str() << ".txt";
ui_->exportTranscript(transcript_ss.str()); ui_->exportTranscript(transcript_ss.str());
} }
// End session logging
session_logger_.endSession();
} }
void Pipeline::audioThread() { void Pipeline::audioThread() {
@ -143,6 +150,8 @@ void Pipeline::audioThread() {
chunk.sample_rate = config.getAudioConfig().sample_rate; chunk.sample_rate = config.getAudioConfig().sample_rate;
chunk.channels = config.getAudioConfig().channels; chunk.channels = config.getAudioConfig().channels;
float push_duration = static_cast<float>(audio_data.size()) / (chunk.sample_rate * chunk.channels);
std::cout << "[Queue] Pushing " << push_duration << "s chunk, queue size: " << audio_queue_.size() << std::endl;
audio_queue_.push(std::move(chunk)); audio_queue_.push(std::move(chunk));
}); });
@ -159,6 +168,7 @@ void Pipeline::audioThread() {
void Pipeline::processingThread() { void Pipeline::processingThread() {
auto& config = Config::getInstance(); auto& config = Config::getInstance();
int audio_segment_id = 0;
while (running_) { while (running_) {
auto chunk_opt = audio_queue_.wait_and_pop(); auto chunk_opt = audio_queue_.wait_and_pop();
@ -168,7 +178,42 @@ void Pipeline::processingThread() {
auto& chunk = chunk_opt.value(); auto& chunk = chunk_opt.value();
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels); float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
std::cout << "[Processing] Speech segment: " << duration << "s" << std::endl;
// Debug: log queue size to detect double-push
std::cout << "[Queue] Processing chunk, " << audio_queue_.size() << " remaining" << std::endl;
// Save audio segment to session directory for debugging
audio_segment_id++;
if (session_logger_.isActive()) {
std::stringstream audio_path;
audio_path << session_logger_.getSessionPath() << "/audio_"
<< std::setfill('0') << std::setw(3) << audio_segment_id << ".ogg";
AudioBuffer segment_buffer(chunk.sample_rate, chunk.channels);
segment_buffer.addSamples(chunk.data);
if (segment_buffer.saveToOpus(audio_path.str())) {
std::cout << "[Session] Saved audio segment: " << audio_path.str() << std::endl;
}
}
// Calculate audio RMS for logging
float audio_rms = 0.0f;
if (!chunk.data.empty()) {
float sum_sq = 0.0f;
for (float s : chunk.data) sum_sq += s * s;
audio_rms = std::sqrt(sum_sq / chunk.data.size());
}
std::cout << "[Processing] Speech segment: " << duration << "s (RMS=" << audio_rms << ")" << std::endl;
// Time Whisper
auto whisper_start = std::chrono::steady_clock::now();
// Build dynamic prompt with recent context
std::string dynamic_prompt = buildDynamicPrompt();
if (!recent_transcriptions_.empty()) {
std::cout << "[Context] Using " << recent_transcriptions_.size() << " previous segments" << std::endl;
}
// Transcribe with Whisper // Transcribe with Whisper
auto whisper_result = whisper_client_->transcribe( auto whisper_result = whisper_client_->transcribe(
@ -178,12 +223,17 @@ void Pipeline::processingThread() {
config.getWhisperConfig().model, config.getWhisperConfig().model,
config.getWhisperConfig().language, config.getWhisperConfig().language,
config.getWhisperConfig().temperature, config.getWhisperConfig().temperature,
config.getWhisperConfig().prompt, dynamic_prompt,
config.getWhisperConfig().response_format config.getWhisperConfig().response_format
); );
auto whisper_end = std::chrono::steady_clock::now();
int64_t whisper_latency = std::chrono::duration_cast<std::chrono::milliseconds>(
whisper_end - whisper_start).count();
if (!whisper_result.has_value()) { if (!whisper_result.has_value()) {
std::cerr << "Whisper transcription failed" << std::endl; std::cerr << "Whisper transcription failed" << std::endl;
session_logger_.logFilteredSegment("", "whisper_failed", duration, audio_rms);
continue; continue;
} }
@ -195,6 +245,7 @@ void Pipeline::processingThread() {
size_t end = text.find_last_not_of(" \t\n\r"); size_t end = text.find_last_not_of(" \t\n\r");
if (start == std::string::npos) { if (start == std::string::npos) {
std::cout << "[Skip] Empty transcription" << std::endl; std::cout << "[Skip] Empty transcription" << std::endl;
session_logger_.logFilteredSegment("", "empty", duration, audio_rms);
continue; continue;
} }
text = text.substr(start, end - start + 1); text = text.substr(start, end - start + 1);
@ -267,14 +318,32 @@ void Pipeline::processingThread() {
if (is_garbage) { if (is_garbage) {
std::cout << "[Skip] Filtered: " << text << std::endl; std::cout << "[Skip] Filtered: " << text << std::endl;
session_logger_.logFilteredSegment(text, "hallucination", duration, audio_rms);
continue; continue;
} }
// Deduplication: skip if exact same as last transcription
if (text == last_transcription_) {
std::cout << "[Skip] Duplicate: " << text << std::endl;
session_logger_.logFilteredSegment(text, "duplicate", duration, audio_rms);
continue;
}
last_transcription_ = text;
// Update dynamic context for next Whisper call
recent_transcriptions_.push_back(text);
if (recent_transcriptions_.size() > MAX_CONTEXT_SEGMENTS) {
recent_transcriptions_.erase(recent_transcriptions_.begin());
}
// Track audio cost // Track audio cost
if (ui_) { if (ui_) {
ui_->addAudioCost(duration); ui_->addAudioCost(duration);
} }
// Time Claude
auto claude_start = std::chrono::steady_clock::now();
// Translate with Claude // Translate with Claude
auto claude_result = claude_client_->translate( auto claude_result = claude_client_->translate(
text, text,
@ -283,8 +352,13 @@ void Pipeline::processingThread() {
config.getClaudeConfig().temperature config.getClaudeConfig().temperature
); );
auto claude_end = std::chrono::steady_clock::now();
int64_t claude_latency = std::chrono::duration_cast<std::chrono::milliseconds>(
claude_end - claude_start).count();
if (!claude_result.has_value()) { if (!claude_result.has_value()) {
std::cerr << "Claude translation failed" << std::endl; std::cerr << "Claude translation failed" << std::endl;
session_logger_.logFilteredSegment(text, "claude_failed", duration, audio_rms);
continue; continue;
} }
@ -308,8 +382,24 @@ void Pipeline::processingThread() {
ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_); ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_);
ui_->addTranslation(text, claude_result->text); ui_->addTranslation(text, claude_result->text);
// Log successful segment
segment_id_++;
SegmentLog seg;
seg.id = segment_id_;
seg.chinese = text;
seg.french = claude_result->text;
seg.audio_duration_sec = duration;
seg.audio_rms = audio_rms;
seg.whisper_latency_ms = whisper_latency;
seg.claude_latency_ms = claude_latency;
seg.was_filtered = false;
seg.filter_reason = "";
seg.timestamp = ""; // Will be set by logger
session_logger_.logSegment(seg);
std::cout << "CN: " << text << std::endl; std::cout << "CN: " << text << std::endl;
std::cout << "FR: " << claude_result->text << std::endl; std::cout << "FR: " << claude_result->text << std::endl;
std::cout << "[Latency] Whisper: " << whisper_latency << "ms, Claude: " << claude_latency << "ms" << std::endl;
std::cout << "---" << std::endl; std::cout << "---" << std::endl;
} }
} }
@ -358,10 +448,34 @@ bool Pipeline::shouldClose() const {
void Pipeline::clearAccumulated() { void Pipeline::clearAccumulated() {
accumulated_chinese_.clear(); accumulated_chinese_.clear();
accumulated_french_.clear(); accumulated_french_.clear();
recent_transcriptions_.clear();
last_transcription_.clear();
if (ui_) { if (ui_) {
ui_->setAccumulatedText("", ""); ui_->setAccumulatedText("", "");
} }
std::cout << "[Pipeline] Cleared accumulated text" << std::endl; std::cout << "[Pipeline] Cleared accumulated text and context" << std::endl;
}
std::string Pipeline::buildDynamicPrompt() const {
auto& config = Config::getInstance();
std::string base_prompt = config.getWhisperConfig().prompt;
// If no recent transcriptions, just return base prompt
if (recent_transcriptions_.empty()) {
return base_prompt;
}
// Build context from recent transcriptions
std::stringstream context;
context << base_prompt;
context << "\n\nContexte des phrases précédentes: ";
for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
if (i > 0) context << " ";
context << recent_transcriptions_[i];
}
return context.str();
} }
} // namespace secondvoice } // namespace secondvoice

View File

@ -6,6 +6,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "../utils/ThreadSafeQueue.h" #include "../utils/ThreadSafeQueue.h"
#include "../utils/SessionLogger.h"
namespace secondvoice { namespace secondvoice {
@ -60,6 +61,20 @@ private:
// Simple accumulation // Simple accumulation
std::string accumulated_chinese_; std::string accumulated_chinese_;
std::string accumulated_french_; std::string accumulated_french_;
// Dynamic context for Whisper (last N transcriptions)
std::vector<std::string> recent_transcriptions_;
static constexpr size_t MAX_CONTEXT_SEGMENTS = 3;
// Deduplication: skip if same as last transcription
std::string last_transcription_;
// Build dynamic prompt with recent context
std::string buildDynamicPrompt() const;
// Session logging
SessionLogger session_logger_;
int segment_id_ = 0;
}; };
} // namespace secondvoice } // namespace secondvoice

196
src/utils/SessionLogger.cpp Normal file
View File

@ -0,0 +1,196 @@
#include "SessionLogger.h"
#include <nlohmann/json.hpp>
#include <filesystem>
#include <iostream>
#include <iomanip>
#include <sstream>
namespace secondvoice {
using json = nlohmann::json;
// Default construction only; a session begins when startSession() is called.
SessionLogger::SessionLogger() = default;

// Flush the summary of any session still open when the logger is destroyed,
// so logs are not lost if the owner forgets to call endSession().
SessionLogger::~SessionLogger() {
    if (is_active_) {
        endSession();
    }
}
// Returns the local wall-clock time formatted as "YYYY-MM-DD_HHMMSS",
// suitable for directory and file names.
// NOTE: second resolution only — two sessions started within the same
// second would map to the same directory name.
std::string SessionLogger::getCurrentTimestamp() const {
    auto now = std::chrono::system_clock::now();
    auto time_t = std::chrono::system_clock::to_time_t(now);

    // NOTE(review): std::localtime is not thread-safe; acceptable while all
    // logging happens on one thread — confirm if that ever changes.
    std::stringstream ss;
    ss << std::put_time(std::localtime(&time_t), "%Y-%m-%d_%H%M%S");
    return ss.str();
}
// Begin a new logging session: create ./sessions/<timestamp>/segments and
// reset all per-session counters. If a session is already active it is
// flushed first. On directory-creation failure the logger stays inactive
// (all log calls become no-ops) instead of throwing into the pipeline.
void SessionLogger::startSession() {
    if (is_active_) {
        endSession();
    }

    session_start_time_ = getCurrentTimestamp();
    session_path_ = "./sessions/" + session_start_time_;

    // Non-throwing overload: a read-only or missing sessions dir degrades to
    // "logging disabled" rather than crashing the app.
    std::error_code ec;
    std::filesystem::create_directories(session_path_ + "/segments", ec);
    if (ec) {
        std::cerr << "[Session] Failed to create " << session_path_
                  << ": " << ec.message() << std::endl;
        is_active_ = false;
        return;
    }

    is_active_ = true;
    segment_count_ = 0;
    filtered_count_ = 0;
    total_audio_sec_ = 0.0f;
    total_whisper_ms_ = 0;
    total_claude_ms_ = 0;
    segments_.clear();

    std::cout << "[Session] Started: " << session_path_ << std::endl;
}
// Close the active session: write session.json (plus transcript.txt) and
// print a one-line summary. Idempotent — a no-op when no session is active.
void SessionLogger::endSession() {
    if (!is_active_) {
        return;
    }

    writeSessionJson();
    is_active_ = false;

    std::cout << "[Session] Ended: " << segment_count_ << " segments, "
              << filtered_count_ << " filtered, "
              << total_audio_sec_ << "s audio" << std::endl;
}
void SessionLogger::logSegment(const SegmentLog& segment) {
if (!is_active_) return;
// Update counters
segment_count_++;
total_audio_sec_ += segment.audio_duration_sec;
total_whisper_ms_ += segment.whisper_latency_ms;
total_claude_ms_ += segment.claude_latency_ms;
// Store segment
segments_.push_back(segment);
// Write individual segment JSON
std::stringstream filename;
filename << session_path_ << "/segments/"
<< std::setfill('0') << std::setw(3) << segment.id << ".json";
json j;
j["id"] = segment.id;
j["chinese"] = segment.chinese;
j["french"] = segment.french;
j["audio_duration_sec"] = segment.audio_duration_sec;
j["audio_rms"] = segment.audio_rms;
j["whisper_latency_ms"] = segment.whisper_latency_ms;
j["claude_latency_ms"] = segment.claude_latency_ms;
j["was_filtered"] = segment.was_filtered;
j["filter_reason"] = segment.filter_reason;
j["timestamp"] = segment.timestamp;
std::ofstream file(filename.str());
if (file.is_open()) {
file << j.dump(2);
file.close();
}
std::cout << "[Session] Logged segment #" << segment.id
<< " (" << segment.audio_duration_sec << "s)" << std::endl;
}
void SessionLogger::logFilteredSegment(const std::string& chinese, const std::string& reason,
float audio_duration, float audio_rms) {
if (!is_active_) return;
filtered_count_++;
total_audio_sec_ += audio_duration;
// Log filtered segment with special marker
SegmentLog seg;
seg.id = segment_count_ + filtered_count_;
seg.chinese = chinese;
seg.french = "[FILTERED]";
seg.audio_duration_sec = audio_duration;
seg.audio_rms = audio_rms;
seg.whisper_latency_ms = 0;
seg.claude_latency_ms = 0;
seg.was_filtered = true;
seg.filter_reason = reason;
seg.timestamp = getCurrentTimestamp();
segments_.push_back(seg);
// Write filtered segment JSON
std::stringstream filename;
filename << session_path_ << "/segments/"
<< std::setfill('0') << std::setw(3) << seg.id << "_filtered.json";
json j;
j["id"] = seg.id;
j["chinese"] = seg.chinese;
j["filter_reason"] = reason;
j["audio_duration_sec"] = audio_duration;
j["audio_rms"] = audio_rms;
j["timestamp"] = seg.timestamp;
std::ofstream file(filename.str());
if (file.is_open()) {
file << j.dump(2);
file.close();
}
}
void SessionLogger::writeSessionJson() {
json session;
session["start_time"] = session_start_time_;
session["end_time"] = getCurrentTimestamp();
session["total_segments"] = segment_count_;
session["filtered_segments"] = filtered_count_;
session["total_audio_seconds"] = total_audio_sec_;
session["avg_whisper_latency_ms"] = segment_count_ > 0 ?
total_whisper_ms_ / segment_count_ : 0;
session["avg_claude_latency_ms"] = segment_count_ > 0 ?
total_claude_ms_ / segment_count_ : 0;
// Summary of all segments
json segments_summary = json::array();
for (const auto& seg : segments_) {
json s;
s["id"] = seg.id;
s["chinese"] = seg.chinese;
s["french"] = seg.french;
s["duration"] = seg.audio_duration_sec;
s["filtered"] = seg.was_filtered;
if (seg.was_filtered) {
s["filter_reason"] = seg.filter_reason;
}
segments_summary.push_back(s);
}
session["segments"] = segments_summary;
std::string filepath = session_path_ + "/session.json";
std::ofstream file(filepath);
if (file.is_open()) {
file << session.dump(2);
file.close();
std::cout << "[Session] Wrote " << filepath << std::endl;
}
// Also write plain text transcript
std::string transcript_path = session_path_ + "/transcript.txt";
std::ofstream transcript(transcript_path);
if (transcript.is_open()) {
transcript << "=== SecondVoice Session " << session_start_time_ << " ===\n\n";
for (const auto& seg : segments_) {
if (!seg.was_filtered) {
transcript << "CN: " << seg.chinese << "\n";
transcript << "FR: " << seg.french << "\n\n";
}
}
transcript.close();
}
}
} // namespace secondvoice

63
src/utils/SessionLogger.h Normal file
View File

@ -0,0 +1,63 @@
#pragma once
#include <string>
#include <vector>
#include <chrono>
#include <fstream>
namespace secondvoice {
// One pipeline segment's worth of debug data, persisted as JSON by
// SessionLogger. All scalars carry in-class defaults so a partially filled
// struct never exposes indeterminate values.
struct SegmentLog {
    int id = 0;                       // Sequential segment number within the session
    std::string chinese;              // Whisper transcription
    std::string french;               // Claude translation ("[FILTERED]" when skipped)
    float audio_duration_sec = 0.0f;  // Length of the audio chunk in seconds
    float audio_rms = 0.0f;           // RMS level of the chunk
    int64_t whisper_latency_ms = 0;   // Transcription round-trip time
    int64_t claude_latency_ms = 0;    // Translation round-trip time
    bool was_filtered = false;        // True if the segment was skipped
    std::string filter_reason;        // e.g. "hallucination", "empty", "duplicate"
    std::string timestamp;            // Stamped by the logger when left empty
};
// Writes structured per-session debug logs: one JSON file per segment plus a
// session.json summary and a plain-text transcript under ./sessions/<ts>/.
// Not thread-safe; intended to be driven from a single pipeline thread.
class SessionLogger {
public:
    SessionLogger();
    ~SessionLogger();  // Flushes any still-open session

    // Start a new session (creates the session directory tree).
    void startSession();

    // End session (writes session.json summary); no-op when inactive.
    void endSession();

    // Log a successfully transcribed/translated segment.
    void logSegment(const SegmentLog& segment);

    // Log a filtered/skipped segment with the reason it was dropped.
    void logFilteredSegment(const std::string& chinese, const std::string& reason,
                            float audio_duration, float audio_rms);

    // Get current session path
    std::string getSessionPath() const { return session_path_; }

    // Check if session is active
    bool isActive() const { return is_active_; }

private:
    std::string getCurrentTimestamp() const;
    void writeSessionJson();

    bool is_active_ = false;
    std::string session_path_;
    std::string session_start_time_;
    int segment_count_ = 0;
    int filtered_count_ = 0;
    float total_audio_sec_ = 0.0f;
    // 64-bit to match SegmentLog's latency fields: accumulating int64_t
    // latencies into int risked narrowing/overflow on long sessions.
    int64_t total_whisper_ms_ = 0;
    int64_t total_claude_ms_ = 0;
    std::vector<SegmentLog> segments_;
};
} // namespace secondvoice