secondvoice/src/audio/AudioCapture.h
StillHammer db0f8e5990 refactor: Improve VAD trailing silence detection and update docs
- Replace hang time logic with consecutive silence frame counter for more precise speech end detection
- Update Whisper prompt to utilize previous context for better transcription coherence
- Expand README with comprehensive feature list, architecture details, debugging status, and session logging structure
- Add troubleshooting section for real-world testing conditions and known issues
2025-12-02 09:44:06 +08:00

97 lines
3.2 KiB
C++

#pragma once
#include <vector>
#include <string>
#include <functional>
#include <atomic>
#include <cmath>
#include <memory>
#include <portaudio.h>
namespace secondvoice {
class NoiseReducer;
// Captures audio through a PortAudio stream and hands float sample buffers to
// a user-supplied callback. Also exposes live level meters and voice-activity
// detection (VAD) tuning knobs intended to be driven from a UI.
//
// Threading: the accessors and setters defined inline below only touch
// std::atomic members, so they may be called while the stream is running.
// NOTE(review): the remaining non-atomic VAD bookkeeping members are presumably
// touched only from the PortAudio callback thread - confirm in the .cpp.
class AudioCapture {
public:
    // Receives a buffer of float samples.
    // NOTE(review): which thread invokes this is decided in the .cpp - confirm
    // before doing blocking work inside the callback.
    using AudioCallback = std::function<void(const std::vector<float>&)>;

    AudioCapture(int sample_rate, int channels);
    ~AudioCapture();

    // The object holds a raw PaStream handle and atomics, so it was already
    // implicitly non-copyable; state that explicitly.
    AudioCapture(const AudioCapture&) = delete;
    AudioCapture& operator=(const AudioCapture&) = delete;

    bool initialize();
    bool start(AudioCallback callback);
    void stop();

    bool isRecording() const { return is_recording_.load(); }

    // Real-time audio levels (updated every callback, ~10ms)
    float getCurrentRMS() const { return current_rms_.load(); }
    float getCurrentPeak() const { return current_peak_.load(); }

    // VAD settings (can be adjusted in real-time from UI).
    // Each value is stored atomically; the two thresholds are written one
    // after the other, so a concurrent reader may briefly observe a new RMS
    // threshold together with the old peak threshold.
    void setVadThresholds(float rms_threshold, float peak_threshold) {
        vad_rms_threshold_.store(rms_threshold);
        vad_peak_threshold_.store(peak_threshold);
    }
    void setSilenceDuration(int ms) { silence_duration_ms_.store(ms); }
    void setMinSpeechDuration(int ms) { min_speech_duration_ms_.store(ms); }
    void setMaxSpeechDuration(int ms) { max_speech_duration_ms_.store(ms); }

    // VAD state (for UI display)
    bool isSpeechDetected() const { return is_speech_active_.load(); }

    // Noise reduction
    void setDenoiseEnabled(bool enabled);
    bool isDenoiseEnabled() const;

private:
    // Static trampoline with the PortAudio stream-callback signature.
    // NOTE(review): user_data is presumably the owning AudioCapture* - confirm.
    static int audioCallback(const void* input, void* output,
                             unsigned long frame_count,
                             const PaStreamCallbackTimeInfo* time_info,
                             PaStreamCallbackFlags status_flags,
                             void* user_data);

    int sample_rate_;
    int channels_;

    // Atomic because isRecording() is a public inline accessor that may be
    // polled from a different thread than the one calling start()/stop().
    std::atomic<bool> is_recording_{false};

    PaStream* stream_ = nullptr;
    AudioCallback callback_;

    // Speech accumulation buffer
    std::vector<float> speech_buffer_;

    // VAD state
    std::atomic<bool> is_speech_active_{false};
    int silence_samples_count_ = 0;  // How many samples of silence
    int speech_samples_count_ = 0;   // How many samples of speech accumulated

    // VAD parameters - Higher threshold to avoid false triggers on filtered noise
    std::atomic<float> vad_rms_threshold_{0.02f};   // Was 0.01f
    std::atomic<float> vad_peak_threshold_{0.08f};  // Was 0.04f

    // Durations are atomic for the same reason as the thresholds above: the
    // setters are documented as adjustable in real-time from the UI.
    std::atomic<int> silence_duration_ms_{700};      // Wait 700ms of silence before cutting (was 400)
    std::atomic<int> min_speech_duration_ms_{1000};  // Minimum 1s speech to send (was 300)
    std::atomic<int> max_speech_duration_ms_{25000}; // 25s max before forced flush

    // Adaptive noise floor
    float noise_floor_ = 0.005f;        // Estimated background noise level
    float noise_floor_alpha_ = 0.001f;  // Slower adaptation

    // Trailing silence detection - count consecutive silence frames after speech
    int consecutive_silence_frames_ = 0;

    // Zero-crossing rate for speech vs noise discrimination
    float last_zcr_ = 0.0f;
    float speech_zcr_min_ = 0.01f;  // More permissive lower bound
    float speech_zcr_max_ = 0.25f;  // More permissive upper bound

    // Real-time audio levels (atomic for thread safety)
    std::atomic<float> current_rms_{0.0f};
    std::atomic<float> current_peak_{0.0f};

    // Noise reduction. NoiseReducer is only forward-declared in this header,
    // so the destructor must stay out-of-line where the type is complete.
    std::unique_ptr<NoiseReducer> noise_reducer_;
};
} // namespace secondvoice