#pragma once #include #include #include #include #include #include #include namespace secondvoice { class NoiseReducer; class AudioCapture { public: using AudioCallback = std::function&)>; AudioCapture(int sample_rate, int channels); ~AudioCapture(); bool initialize(); bool start(AudioCallback callback); void stop(); bool isRecording() const { return is_recording_; } // Real-time audio levels (updated every callback, ~10ms) float getCurrentRMS() const { return current_rms_; } float getCurrentPeak() const { return current_peak_; } // VAD settings (can be adjusted in real-time from UI) void setVadThresholds(float rms_threshold, float peak_threshold) { vad_rms_threshold_ = rms_threshold; vad_peak_threshold_ = peak_threshold; } void setSilenceDuration(int ms) { silence_duration_ms_ = ms; } void setMinSpeechDuration(int ms) { min_speech_duration_ms_ = ms; } void setMaxSpeechDuration(int ms) { max_speech_duration_ms_ = ms; } // VAD state (for UI display) bool isSpeechDetected() const { return is_speech_active_; } // Noise reduction void setDenoiseEnabled(bool enabled); bool isDenoiseEnabled() const; private: static int audioCallback(const void* input, void* output, unsigned long frame_count, const PaStreamCallbackTimeInfo* time_info, PaStreamCallbackFlags status_flags, void* user_data); int sample_rate_; int channels_; bool is_recording_ = false; PaStream* stream_ = nullptr; AudioCallback callback_; // Speech accumulation buffer std::vector speech_buffer_; // VAD state std::atomic is_speech_active_{false}; int silence_samples_count_ = 0; // How many samples of silence int speech_samples_count_ = 0; // How many samples of speech accumulated // VAD parameters - Higher threshold to avoid false triggers on filtered noise std::atomic vad_rms_threshold_{0.02f}; // Was 0.01f std::atomic vad_peak_threshold_{0.08f}; // Was 0.04f int silence_duration_ms_ = 700; // Wait 700ms of silence before cutting (was 400) int min_speech_duration_ms_ = 1000; // Minimum 1s speech to send (was 300) int max_speech_duration_ms_ = 25000; // 25s max before forced flush // Adaptive noise floor float noise_floor_ = 0.005f; // Estimated background noise level float noise_floor_alpha_ = 0.001f; // Slower adaptation // Trailing silence detection - count consecutive silence frames after speech int consecutive_silence_frames_ = 0; // Zero-crossing rate for speech vs noise discrimination float last_zcr_ = 0.0f; float speech_zcr_min_ = 0.01f; // More permissive lower bound float speech_zcr_max_ = 0.25f; // More permissive upper bound // Real-time audio levels (atomic for thread safety) std::atomic current_rms_{0.0f}; std::atomic current_peak_{0.0f}; // Noise reduction std::unique_ptr noise_reducer_; }; } // namespace secondvoice