- Replace hang time logic with consecutive silence frame counter for more precise speech end detection - Update Whisper prompt to utilize previous context for better transcription coherence - Expand README with comprehensive feature list, architecture details, debugging status, and session logging structure - Add troubleshooting section for real-world testing conditions and known issues
97 lines
3.2 KiB
C++
97 lines
3.2 KiB
C++
#pragma once
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include <functional>
|
|
#include <atomic>
|
|
#include <cmath>
|
|
#include <memory>
|
|
#include <portaudio.h>
|
|
|
|
namespace secondvoice {
|
|
|
|
class NoiseReducer;
|
|
|
|
class AudioCapture {
|
|
public:
|
|
using AudioCallback = std::function<void(const std::vector<float>&)>;
|
|
|
|
AudioCapture(int sample_rate, int channels);
|
|
~AudioCapture();
|
|
|
|
bool initialize();
|
|
bool start(AudioCallback callback);
|
|
void stop();
|
|
bool isRecording() const { return is_recording_; }
|
|
|
|
// Real-time audio levels (updated every callback, ~10ms)
|
|
float getCurrentRMS() const { return current_rms_; }
|
|
float getCurrentPeak() const { return current_peak_; }
|
|
|
|
// VAD settings (can be adjusted in real-time from UI)
|
|
void setVadThresholds(float rms_threshold, float peak_threshold) {
|
|
vad_rms_threshold_ = rms_threshold;
|
|
vad_peak_threshold_ = peak_threshold;
|
|
}
|
|
void setSilenceDuration(int ms) { silence_duration_ms_ = ms; }
|
|
void setMinSpeechDuration(int ms) { min_speech_duration_ms_ = ms; }
|
|
void setMaxSpeechDuration(int ms) { max_speech_duration_ms_ = ms; }
|
|
|
|
// VAD state (for UI display)
|
|
bool isSpeechDetected() const { return is_speech_active_; }
|
|
|
|
// Noise reduction
|
|
void setDenoiseEnabled(bool enabled);
|
|
bool isDenoiseEnabled() const;
|
|
|
|
private:
|
|
static int audioCallback(const void* input, void* output,
|
|
unsigned long frame_count,
|
|
const PaStreamCallbackTimeInfo* time_info,
|
|
PaStreamCallbackFlags status_flags,
|
|
void* user_data);
|
|
|
|
int sample_rate_;
|
|
int channels_;
|
|
bool is_recording_ = false;
|
|
|
|
PaStream* stream_ = nullptr;
|
|
AudioCallback callback_;
|
|
|
|
// Speech accumulation buffer
|
|
std::vector<float> speech_buffer_;
|
|
|
|
// VAD state
|
|
std::atomic<bool> is_speech_active_{false};
|
|
int silence_samples_count_ = 0; // How many samples of silence
|
|
int speech_samples_count_ = 0; // How many samples of speech accumulated
|
|
|
|
// VAD parameters - Higher threshold to avoid false triggers on filtered noise
|
|
std::atomic<float> vad_rms_threshold_{0.02f}; // Was 0.01f
|
|
std::atomic<float> vad_peak_threshold_{0.08f}; // Was 0.04f
|
|
int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400)
|
|
int min_speech_duration_ms_ = 1000; // Minimum 1s speech to send (was 300)
|
|
int max_speech_duration_ms_ = 25000; // 25s max before forced flush
|
|
|
|
// Adaptive noise floor
|
|
float noise_floor_ = 0.005f; // Estimated background noise level
|
|
float noise_floor_alpha_ = 0.001f; // Slower adaptation
|
|
|
|
// Trailing silence detection - count consecutive silence frames after speech
|
|
int consecutive_silence_frames_ = 0;
|
|
|
|
// Zero-crossing rate for speech vs noise discrimination
|
|
float last_zcr_ = 0.0f;
|
|
float speech_zcr_min_ = 0.01f; // More permissive lower bound
|
|
float speech_zcr_max_ = 0.25f; // More permissive upper bound
|
|
|
|
// Real-time audio levels (atomic for thread safety)
|
|
std::atomic<float> current_rms_{0.0f};
|
|
std::atomic<float> current_peak_{0.0f};
|
|
|
|
// Noise reduction
|
|
std::unique_ptr<NoiseReducer> noise_reducer_;
|
|
};
|
|
|
|
} // namespace secondvoice
|