Compare commits

...

3 Commits

Author SHA1 Message Date
a28bb89913 tune: Adjust VAD parameters for longer segments
- min_speech_duration: 300ms → 1000ms (avoid tiny segments)
- silence_duration: 400ms → 700ms (wait longer before cutting)
- hang_frames_threshold: 20 → 35 (~350ms pause tolerance)

This should reduce mid-sentence cuts and give Whisper more context.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 22:08:01 +08:00
53b21b94d6 feat: Add dynamic Whisper context and audio segment saving
- Pass last 3 transcriptions to Whisper for better context
- Add deduplication filter to skip identical consecutive segments
- Save each audio segment as .ogg in session directory for debugging
- Add queue debug logging to detect double-push issues

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 22:02:49 +08:00
9baa213a82 feat: Add session logging system with per-segment metrics
- Add SessionLogger class for structured debug logging
- Log each segment with: chinese, french, audio duration, RMS, latency
- Track filtered segments with reasons (hallucination, empty, failed)
- Create session directories with JSON files per segment
- Update Whisper prompt with anti-hallucination rules
- Integrate timing measurements for Whisper and Claude calls

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 21:37:55 +08:00
8 changed files with 397 additions and 7 deletions

1
.gitignore vendored
View File

@ -64,6 +64,7 @@ imgui.ini
*.aac *.aac
*.m4a *.m4a
denoised/ denoised/
sessions/
# Claude Code local settings # Claude Code local settings
.claude/settings.local.json .claude/settings.local.json

View File

@ -108,6 +108,7 @@ set(SOURCES_UI
src/ui/TranslationUI.cpp src/ui/TranslationUI.cpp
# Utils # Utils
src/utils/Config.cpp src/utils/Config.cpp
src/utils/SessionLogger.cpp
# Core # Core
src/core/Pipeline.cpp src/core/Pipeline.cpp
) )

View File

@ -10,7 +10,7 @@
"model": "gpt-4o-mini-transcribe", "model": "gpt-4o-mini-transcribe",
"language": "zh", "language": "zh",
"temperature": 0.0, "temperature": 0.0,
"prompt": "The following is a conversation in Mandarin Chinese about business, family, and daily life. Common names: Tingting, Alexis.", "prompt": "Transcription en direct d'une conversation en chinois mandarin. Plusieurs interlocuteurs parlent, parfois en même temps. RÈGLES STRICTES: (1) Ne transcris QUE les paroles audibles en chinois. (2) Si l'audio est inaudible, du bruit, ou du silence, renvoie une chaîne vide. (3) NE GÉNÈRE JAMAIS ces phrases: 谢谢观看, 感谢收看, 订阅, 请订阅, 下期再见, Thank you, Subscribe, 字幕. (4) Ignore: musique, applaudissements, rires, bruits de fond, respirations.",
"stream": false, "stream": false,
"response_format": "text" "response_format": "text"
}, },

View File

@ -69,8 +69,8 @@ private:
// VAD parameters - Higher threshold to avoid false triggers on filtered noise // VAD parameters - Higher threshold to avoid false triggers on filtered noise
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
int silence_duration_ms_ = 400; // Wait 400ms of silence before cutting int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
int min_speech_duration_ms_ = 300; // Minimum speech to send int min_speech_duration_ms_ = 1000; // Minimum 1s speech to send (was 300)
int max_speech_duration_ms_ = 25000; // 25s max before forced flush int max_speech_duration_ms_ = 25000; // 25s max before forced flush
// Adaptive noise floor // Adaptive noise floor
@ -79,7 +79,7 @@ private:
// Hang time - wait before cutting to avoid mid-sentence cuts // Hang time - wait before cutting to avoid mid-sentence cuts
int hang_frames_ = 0; int hang_frames_ = 0;
int hang_frames_threshold_ = 20; // ~200ms tolerance for pauses int hang_frames_threshold_ = 35; // ~350ms tolerance for pauses (was 20)
// Zero-crossing rate for speech vs noise discrimination // Zero-crossing rate for speech vs noise discrimination
float last_zcr_ = 0.0f; float last_zcr_ = 0.0f;

View File

@ -70,6 +70,10 @@ bool Pipeline::start() {
} }
running_ = true; running_ = true;
segment_id_ = 0;
// Start session logging
session_logger_.startSession();
// Start background threads // Start background threads
audio_thread_ = std::thread(&Pipeline::audioThread, this); audio_thread_ = std::thread(&Pipeline::audioThread, this);
@ -126,6 +130,9 @@ void Pipeline::stop() {
transcript_ss << "transcripts/transcript_" << timestamp.str() << ".txt"; transcript_ss << "transcripts/transcript_" << timestamp.str() << ".txt";
ui_->exportTranscript(transcript_ss.str()); ui_->exportTranscript(transcript_ss.str());
} }
// End session logging
session_logger_.endSession();
} }
void Pipeline::audioThread() { void Pipeline::audioThread() {
@ -143,6 +150,8 @@ void Pipeline::audioThread() {
chunk.sample_rate = config.getAudioConfig().sample_rate; chunk.sample_rate = config.getAudioConfig().sample_rate;
chunk.channels = config.getAudioConfig().channels; chunk.channels = config.getAudioConfig().channels;
float push_duration = static_cast<float>(audio_data.size()) / (chunk.sample_rate * chunk.channels);
std::cout << "[Queue] Pushing " << push_duration << "s chunk, queue size: " << audio_queue_.size() << std::endl;
audio_queue_.push(std::move(chunk)); audio_queue_.push(std::move(chunk));
}); });
@ -159,6 +168,7 @@ void Pipeline::audioThread() {
void Pipeline::processingThread() { void Pipeline::processingThread() {
auto& config = Config::getInstance(); auto& config = Config::getInstance();
int audio_segment_id = 0;
while (running_) { while (running_) {
auto chunk_opt = audio_queue_.wait_and_pop(); auto chunk_opt = audio_queue_.wait_and_pop();
@ -168,7 +178,42 @@ void Pipeline::processingThread() {
auto& chunk = chunk_opt.value(); auto& chunk = chunk_opt.value();
float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels); float duration = static_cast<float>(chunk.data.size()) / (chunk.sample_rate * chunk.channels);
std::cout << "[Processing] Speech segment: " << duration << "s" << std::endl;
// Debug: log queue size to detect double-push
std::cout << "[Queue] Processing chunk, " << audio_queue_.size() << " remaining" << std::endl;
// Save audio segment to session directory for debugging
audio_segment_id++;
if (session_logger_.isActive()) {
std::stringstream audio_path;
audio_path << session_logger_.getSessionPath() << "/audio_"
<< std::setfill('0') << std::setw(3) << audio_segment_id << ".ogg";
AudioBuffer segment_buffer(chunk.sample_rate, chunk.channels);
segment_buffer.addSamples(chunk.data);
if (segment_buffer.saveToOpus(audio_path.str())) {
std::cout << "[Session] Saved audio segment: " << audio_path.str() << std::endl;
}
}
// Calculate audio RMS for logging
float audio_rms = 0.0f;
if (!chunk.data.empty()) {
float sum_sq = 0.0f;
for (float s : chunk.data) sum_sq += s * s;
audio_rms = std::sqrt(sum_sq / chunk.data.size());
}
std::cout << "[Processing] Speech segment: " << duration << "s (RMS=" << audio_rms << ")" << std::endl;
// Time Whisper
auto whisper_start = std::chrono::steady_clock::now();
// Build dynamic prompt with recent context
std::string dynamic_prompt = buildDynamicPrompt();
if (!recent_transcriptions_.empty()) {
std::cout << "[Context] Using " << recent_transcriptions_.size() << " previous segments" << std::endl;
}
// Transcribe with Whisper // Transcribe with Whisper
auto whisper_result = whisper_client_->transcribe( auto whisper_result = whisper_client_->transcribe(
@ -178,12 +223,17 @@ void Pipeline::processingThread() {
config.getWhisperConfig().model, config.getWhisperConfig().model,
config.getWhisperConfig().language, config.getWhisperConfig().language,
config.getWhisperConfig().temperature, config.getWhisperConfig().temperature,
config.getWhisperConfig().prompt, dynamic_prompt,
config.getWhisperConfig().response_format config.getWhisperConfig().response_format
); );
auto whisper_end = std::chrono::steady_clock::now();
int64_t whisper_latency = std::chrono::duration_cast<std::chrono::milliseconds>(
whisper_end - whisper_start).count();
if (!whisper_result.has_value()) { if (!whisper_result.has_value()) {
std::cerr << "Whisper transcription failed" << std::endl; std::cerr << "Whisper transcription failed" << std::endl;
session_logger_.logFilteredSegment("", "whisper_failed", duration, audio_rms);
continue; continue;
} }
@ -195,6 +245,7 @@ void Pipeline::processingThread() {
size_t end = text.find_last_not_of(" \t\n\r"); size_t end = text.find_last_not_of(" \t\n\r");
if (start == std::string::npos) { if (start == std::string::npos) {
std::cout << "[Skip] Empty transcription" << std::endl; std::cout << "[Skip] Empty transcription" << std::endl;
session_logger_.logFilteredSegment("", "empty", duration, audio_rms);
continue; continue;
} }
text = text.substr(start, end - start + 1); text = text.substr(start, end - start + 1);
@ -267,14 +318,32 @@ void Pipeline::processingThread() {
if (is_garbage) { if (is_garbage) {
std::cout << "[Skip] Filtered: " << text << std::endl; std::cout << "[Skip] Filtered: " << text << std::endl;
session_logger_.logFilteredSegment(text, "hallucination", duration, audio_rms);
continue; continue;
} }
// Deduplication: skip if exact same as last transcription
if (text == last_transcription_) {
std::cout << "[Skip] Duplicate: " << text << std::endl;
session_logger_.logFilteredSegment(text, "duplicate", duration, audio_rms);
continue;
}
last_transcription_ = text;
// Update dynamic context for next Whisper call
recent_transcriptions_.push_back(text);
if (recent_transcriptions_.size() > MAX_CONTEXT_SEGMENTS) {
recent_transcriptions_.erase(recent_transcriptions_.begin());
}
// Track audio cost // Track audio cost
if (ui_) { if (ui_) {
ui_->addAudioCost(duration); ui_->addAudioCost(duration);
} }
// Time Claude
auto claude_start = std::chrono::steady_clock::now();
// Translate with Claude // Translate with Claude
auto claude_result = claude_client_->translate( auto claude_result = claude_client_->translate(
text, text,
@ -283,8 +352,13 @@ void Pipeline::processingThread() {
config.getClaudeConfig().temperature config.getClaudeConfig().temperature
); );
auto claude_end = std::chrono::steady_clock::now();
int64_t claude_latency = std::chrono::duration_cast<std::chrono::milliseconds>(
claude_end - claude_start).count();
if (!claude_result.has_value()) { if (!claude_result.has_value()) {
std::cerr << "Claude translation failed" << std::endl; std::cerr << "Claude translation failed" << std::endl;
session_logger_.logFilteredSegment(text, "claude_failed", duration, audio_rms);
continue; continue;
} }
@ -308,8 +382,24 @@ void Pipeline::processingThread() {
ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_); ui_->setAccumulatedText(accumulated_chinese_, accumulated_french_);
ui_->addTranslation(text, claude_result->text); ui_->addTranslation(text, claude_result->text);
// Log successful segment
segment_id_++;
SegmentLog seg;
seg.id = segment_id_;
seg.chinese = text;
seg.french = claude_result->text;
seg.audio_duration_sec = duration;
seg.audio_rms = audio_rms;
seg.whisper_latency_ms = whisper_latency;
seg.claude_latency_ms = claude_latency;
seg.was_filtered = false;
seg.filter_reason = "";
seg.timestamp = ""; // Will be set by logger
session_logger_.logSegment(seg);
std::cout << "CN: " << text << std::endl; std::cout << "CN: " << text << std::endl;
std::cout << "FR: " << claude_result->text << std::endl; std::cout << "FR: " << claude_result->text << std::endl;
std::cout << "[Latency] Whisper: " << whisper_latency << "ms, Claude: " << claude_latency << "ms" << std::endl;
std::cout << "---" << std::endl; std::cout << "---" << std::endl;
} }
} }
@ -358,10 +448,34 @@ bool Pipeline::shouldClose() const {
void Pipeline::clearAccumulated() { void Pipeline::clearAccumulated() {
accumulated_chinese_.clear(); accumulated_chinese_.clear();
accumulated_french_.clear(); accumulated_french_.clear();
recent_transcriptions_.clear();
last_transcription_.clear();
if (ui_) { if (ui_) {
ui_->setAccumulatedText("", ""); ui_->setAccumulatedText("", "");
} }
std::cout << "[Pipeline] Cleared accumulated text" << std::endl; std::cout << "[Pipeline] Cleared accumulated text and context" << std::endl;
}
std::string Pipeline::buildDynamicPrompt() const {
auto& config = Config::getInstance();
std::string base_prompt = config.getWhisperConfig().prompt;
// If no recent transcriptions, just return base prompt
if (recent_transcriptions_.empty()) {
return base_prompt;
}
// Build context from recent transcriptions
std::stringstream context;
context << base_prompt;
context << "\n\nContexte des phrases précédentes: ";
for (size_t i = 0; i < recent_transcriptions_.size(); ++i) {
if (i > 0) context << " ";
context << recent_transcriptions_[i];
}
return context.str();
} }
} // namespace secondvoice } // namespace secondvoice

View File

@ -6,6 +6,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "../utils/ThreadSafeQueue.h" #include "../utils/ThreadSafeQueue.h"
#include "../utils/SessionLogger.h"
namespace secondvoice { namespace secondvoice {
@ -60,6 +61,20 @@ private:
// Simple accumulation // Simple accumulation
std::string accumulated_chinese_; std::string accumulated_chinese_;
std::string accumulated_french_; std::string accumulated_french_;
// Dynamic context for Whisper (last N transcriptions)
std::vector<std::string> recent_transcriptions_;
static constexpr size_t MAX_CONTEXT_SEGMENTS = 3;
// Deduplication: skip if same as last transcription
std::string last_transcription_;
// Build dynamic prompt with recent context
std::string buildDynamicPrompt() const;
// Session logging
SessionLogger session_logger_;
int segment_id_ = 0;
}; };
} // namespace secondvoice } // namespace secondvoice

196
src/utils/SessionLogger.cpp Normal file
View File

@ -0,0 +1,196 @@
#include "SessionLogger.h"
#include <nlohmann/json.hpp>
#include <filesystem>
#include <iostream>
#include <iomanip>
#include <sstream>
namespace secondvoice {
using json = nlohmann::json;
// Default construction only; a session begins when startSession() is called.
SessionLogger::SessionLogger() = default;

// Flush the summary of any session still open when the logger is destroyed,
// so logs are not lost if the owner forgets to call endSession().
SessionLogger::~SessionLogger() {
    if (is_active_) {
        endSession();
    }
}
// Returns the local wall-clock time formatted as "YYYY-MM-DD_HHMMSS",
// suitable for directory and file names.
// NOTE: second resolution only — two sessions started within the same
// second would map to the same directory name.
std::string SessionLogger::getCurrentTimestamp() const {
    auto now = std::chrono::system_clock::now();
    auto time_t = std::chrono::system_clock::to_time_t(now);

    // NOTE(review): std::localtime is not thread-safe; acceptable while all
    // logging happens on one thread — confirm if that ever changes.
    std::stringstream ss;
    ss << std::put_time(std::localtime(&time_t), "%Y-%m-%d_%H%M%S");
    return ss.str();
}
// Begin a new logging session: create ./sessions/<timestamp>/segments and
// reset all per-session counters. If a session is already active it is
// flushed first. On directory-creation failure the logger stays inactive
// (all log calls become no-ops) instead of throwing into the pipeline.
void SessionLogger::startSession() {
    if (is_active_) {
        endSession();
    }

    session_start_time_ = getCurrentTimestamp();
    session_path_ = "./sessions/" + session_start_time_;

    // Non-throwing overload: a read-only or missing sessions dir degrades to
    // "logging disabled" rather than crashing the app.
    std::error_code ec;
    std::filesystem::create_directories(session_path_ + "/segments", ec);
    if (ec) {
        std::cerr << "[Session] Failed to create " << session_path_
                  << ": " << ec.message() << std::endl;
        is_active_ = false;
        return;
    }

    is_active_ = true;
    segment_count_ = 0;
    filtered_count_ = 0;
    total_audio_sec_ = 0.0f;
    total_whisper_ms_ = 0;
    total_claude_ms_ = 0;
    segments_.clear();

    std::cout << "[Session] Started: " << session_path_ << std::endl;
}
// Close the active session: write session.json (plus transcript.txt) and
// print a one-line summary. Idempotent — a no-op when no session is active.
void SessionLogger::endSession() {
    if (!is_active_) {
        return;
    }

    writeSessionJson();
    is_active_ = false;

    std::cout << "[Session] Ended: " << segment_count_ << " segments, "
              << filtered_count_ << " filtered, "
              << total_audio_sec_ << "s audio" << std::endl;
}
void SessionLogger::logSegment(const SegmentLog& segment) {
if (!is_active_) return;
// Update counters
segment_count_++;
total_audio_sec_ += segment.audio_duration_sec;
total_whisper_ms_ += segment.whisper_latency_ms;
total_claude_ms_ += segment.claude_latency_ms;
// Store segment
segments_.push_back(segment);
// Write individual segment JSON
std::stringstream filename;
filename << session_path_ << "/segments/"
<< std::setfill('0') << std::setw(3) << segment.id << ".json";
json j;
j["id"] = segment.id;
j["chinese"] = segment.chinese;
j["french"] = segment.french;
j["audio_duration_sec"] = segment.audio_duration_sec;
j["audio_rms"] = segment.audio_rms;
j["whisper_latency_ms"] = segment.whisper_latency_ms;
j["claude_latency_ms"] = segment.claude_latency_ms;
j["was_filtered"] = segment.was_filtered;
j["filter_reason"] = segment.filter_reason;
j["timestamp"] = segment.timestamp;
std::ofstream file(filename.str());
if (file.is_open()) {
file << j.dump(2);
file.close();
}
std::cout << "[Session] Logged segment #" << segment.id
<< " (" << segment.audio_duration_sec << "s)" << std::endl;
}
void SessionLogger::logFilteredSegment(const std::string& chinese, const std::string& reason,
float audio_duration, float audio_rms) {
if (!is_active_) return;
filtered_count_++;
total_audio_sec_ += audio_duration;
// Log filtered segment with special marker
SegmentLog seg;
seg.id = segment_count_ + filtered_count_;
seg.chinese = chinese;
seg.french = "[FILTERED]";
seg.audio_duration_sec = audio_duration;
seg.audio_rms = audio_rms;
seg.whisper_latency_ms = 0;
seg.claude_latency_ms = 0;
seg.was_filtered = true;
seg.filter_reason = reason;
seg.timestamp = getCurrentTimestamp();
segments_.push_back(seg);
// Write filtered segment JSON
std::stringstream filename;
filename << session_path_ << "/segments/"
<< std::setfill('0') << std::setw(3) << seg.id << "_filtered.json";
json j;
j["id"] = seg.id;
j["chinese"] = seg.chinese;
j["filter_reason"] = reason;
j["audio_duration_sec"] = audio_duration;
j["audio_rms"] = audio_rms;
j["timestamp"] = seg.timestamp;
std::ofstream file(filename.str());
if (file.is_open()) {
file << j.dump(2);
file.close();
}
}
void SessionLogger::writeSessionJson() {
json session;
session["start_time"] = session_start_time_;
session["end_time"] = getCurrentTimestamp();
session["total_segments"] = segment_count_;
session["filtered_segments"] = filtered_count_;
session["total_audio_seconds"] = total_audio_sec_;
session["avg_whisper_latency_ms"] = segment_count_ > 0 ?
total_whisper_ms_ / segment_count_ : 0;
session["avg_claude_latency_ms"] = segment_count_ > 0 ?
total_claude_ms_ / segment_count_ : 0;
// Summary of all segments
json segments_summary = json::array();
for (const auto& seg : segments_) {
json s;
s["id"] = seg.id;
s["chinese"] = seg.chinese;
s["french"] = seg.french;
s["duration"] = seg.audio_duration_sec;
s["filtered"] = seg.was_filtered;
if (seg.was_filtered) {
s["filter_reason"] = seg.filter_reason;
}
segments_summary.push_back(s);
}
session["segments"] = segments_summary;
std::string filepath = session_path_ + "/session.json";
std::ofstream file(filepath);
if (file.is_open()) {
file << session.dump(2);
file.close();
std::cout << "[Session] Wrote " << filepath << std::endl;
}
// Also write plain text transcript
std::string transcript_path = session_path_ + "/transcript.txt";
std::ofstream transcript(transcript_path);
if (transcript.is_open()) {
transcript << "=== SecondVoice Session " << session_start_time_ << " ===\n\n";
for (const auto& seg : segments_) {
if (!seg.was_filtered) {
transcript << "CN: " << seg.chinese << "\n";
transcript << "FR: " << seg.french << "\n\n";
}
}
transcript.close();
}
}
} // namespace secondvoice

63
src/utils/SessionLogger.h Normal file
View File

@ -0,0 +1,63 @@
#pragma once
#include <string>
#include <vector>
#include <chrono>
#include <fstream>
namespace secondvoice {
// One pipeline segment's worth of debug data, persisted as JSON by
// SessionLogger. All scalars carry in-class defaults so a partially filled
// struct never exposes indeterminate values.
struct SegmentLog {
    int id = 0;                       // Sequential segment number within the session
    std::string chinese;              // Whisper transcription
    std::string french;               // Claude translation ("[FILTERED]" when skipped)
    float audio_duration_sec = 0.0f;  // Length of the audio chunk in seconds
    float audio_rms = 0.0f;           // RMS level of the chunk
    int64_t whisper_latency_ms = 0;   // Transcription round-trip time
    int64_t claude_latency_ms = 0;    // Translation round-trip time
    bool was_filtered = false;        // True if the segment was skipped
    std::string filter_reason;        // e.g. "hallucination", "empty", "duplicate"
    std::string timestamp;            // Stamped by the logger when left empty
};
// Writes structured per-session debug logs: one JSON file per segment plus a
// session.json summary and a plain-text transcript under ./sessions/<ts>/.
// Not thread-safe; intended to be driven from a single pipeline thread.
class SessionLogger {
public:
    SessionLogger();
    ~SessionLogger();  // Flushes any still-open session

    // Start a new session (creates the session directory tree).
    void startSession();

    // End session (writes session.json summary); no-op when inactive.
    void endSession();

    // Log a successfully transcribed/translated segment.
    void logSegment(const SegmentLog& segment);

    // Log a filtered/skipped segment with the reason it was dropped.
    void logFilteredSegment(const std::string& chinese, const std::string& reason,
                            float audio_duration, float audio_rms);

    // Get current session path
    std::string getSessionPath() const { return session_path_; }

    // Check if session is active
    bool isActive() const { return is_active_; }

private:
    std::string getCurrentTimestamp() const;
    void writeSessionJson();

    bool is_active_ = false;
    std::string session_path_;
    std::string session_start_time_;
    int segment_count_ = 0;
    int filtered_count_ = 0;
    float total_audio_sec_ = 0.0f;
    // 64-bit to match SegmentLog's latency fields: accumulating int64_t
    // latencies into int risked narrowing/overflow on long sessions.
    int64_t total_whisper_ms_ = 0;
    int64_t total_claude_ms_ = 0;
    std::vector<SegmentLog> segments_;
};
} // namespace secondvoice