// secondvoice/src/audio/AudioCapture.h

#pragma once

#include <vector>
#include <string>
#include <functional>
#include <atomic>
#include <cmath>
#include <memory>
#include <portaudio.h>

namespace secondvoice {

// Forward declaration only: this header refers to NoiseReducer solely through
// a std::unique_ptr member, so the complete type is not required here.
class NoiseReducer;

/// Audio input front-end built on a PortAudio stream.
///
/// The class holds a PortAudio stream handle, a user-supplied callback that
/// receives float sample buffers, and the state/parameters of a voice-activity
/// detector (VAD). All methods other than the inline accessors below are
/// defined in the implementation file.
///
/// Thread-safety: every value that is exposed through an inline getter or
/// setter is stored in a std::atomic, so those accessors may be called from a
/// UI thread while capture is running. The remaining private members are plain
/// values. NOTE(review): they are assumed to be touched only by the audio
/// callback / owning thread -- confirm in AudioCapture.cpp.
class AudioCapture {
public:
    /// Receives a buffer of float samples.
    /// NOTE(review): presumably one accumulated speech segment per call --
    /// confirm against the implementation.
    using AudioCallback = std::function<void(const std::vector<float>&)>;

    /// @param sample_rate  Sample rate stored for the stream (Hz, presumably).
    /// @param channels     Number of input channels.
    AudioCapture(int sample_rate, int channels);
    ~AudioCapture();

    // The object owns a raw PaStream handle and contains atomics, so it was
    // already implicitly non-copyable; state that explicitly.
    AudioCapture(const AudioCapture&) = delete;
    AudioCapture& operator=(const AudioCapture&) = delete;

    /// Prepares the capture backend. Returns false on failure.
    bool initialize();

    /// Starts capturing and registers @p callback as the sample consumer.
    /// Returns false on failure.
    bool start(AudioCallback callback);

    /// Stops capturing.
    void stop();

    /// True while the recording flag is set.
    bool isRecording() const { return is_recording_.load(); }

    // Real-time audio levels (updated every callback, ~10ms)
    float getCurrentRMS() const { return current_rms_.load(); }
    float getCurrentPeak() const { return current_peak_.load(); }

    // VAD settings (can be adjusted in real-time from UI).
    // All of them are atomic stores, so a concurrent reader never observes a
    // torn or racy value. The two thresholds are stored independently, not as
    // one atomic pair.
    void setVadThresholds(float rms_threshold, float peak_threshold) {
        vad_rms_threshold_.store(rms_threshold);
        vad_peak_threshold_.store(peak_threshold);
    }
    /// Length of silence, in milliseconds, required before a segment is cut.
    void setSilenceDuration(int ms) { silence_duration_ms_.store(ms); }
    /// Minimum speech length, in milliseconds, worth sending.
    void setMinSpeechDuration(int ms) { min_speech_duration_ms_.store(ms); }
    /// Maximum speech length, in milliseconds, before a forced flush.
    void setMaxSpeechDuration(int ms) { max_speech_duration_ms_.store(ms); }

    // VAD state (for UI display)
    bool isSpeechDetected() const { return is_speech_active_.load(); }

    // Noise reduction
    void setDenoiseEnabled(bool enabled);
    bool isDenoiseEnabled() const;

private:
    /// Static trampoline with the PortAudio stream-callback signature.
    /// NOTE(review): @p user_data is presumably the owning AudioCapture
    /// instance -- confirm where the stream is opened.
    static int audioCallback(const void* input, void* output,
                            unsigned long frame_count,
                            const PaStreamCallbackTimeInfo* time_info,
                            PaStreamCallbackFlags status_flags,
                            void* user_data);

    int sample_rate_;
    int channels_;
    // Atomic because isRecording() is a public, inline accessor that may be
    // polled while start()/stop() change the flag.
    std::atomic<bool> is_recording_{false};

    PaStream* stream_ = nullptr;
    AudioCallback callback_;

    // Speech accumulation buffer
    std::vector<float> speech_buffer_;

    // VAD state
    std::atomic<bool> is_speech_active_{false};
    int silence_samples_count_ = 0;  // How many samples of silence
    int speech_samples_count_ = 0;   // How many samples of speech accumulated

    // VAD parameters - Higher threshold to avoid false triggers on filtered noise
    std::atomic<float> vad_rms_threshold_{0.02f};   // Was 0.01f
    std::atomic<float> vad_peak_threshold_{0.08f};  // Was 0.04f
    // Atomic like the thresholds above: these are written by the public
    // setters, which are meant to be usable while capture is running.
    std::atomic<int> silence_duration_ms_{400};      // Wait 400ms of silence before cutting
    std::atomic<int> min_speech_duration_ms_{300};   // Minimum speech to send
    std::atomic<int> max_speech_duration_ms_{25000}; // 25s max before forced flush

    // Adaptive noise floor
    float noise_floor_ = 0.005f;         // Estimated background noise level
    float noise_floor_alpha_ = 0.001f;   // Slower adaptation

    // Hang time - wait before cutting to avoid mid-sentence cuts
    int hang_frames_ = 0;
    int hang_frames_threshold_ = 20;     // ~200ms tolerance for pauses

    // Zero-crossing rate for speech vs noise discrimination
    float last_zcr_ = 0.0f;
    float speech_zcr_min_ = 0.01f;       // More permissive lower bound
    float speech_zcr_max_ = 0.25f;       // More permissive upper bound

    // Real-time audio levels (atomic for thread safety)
    std::atomic<float> current_rms_{0.0f};
    std::atomic<float> current_peak_{0.0f};

    // Noise reduction. NoiseReducer is incomplete in this header, which is why
    // the destructor is declared here and defined out of line.
    std::unique_ptr<NoiseReducer> noise_reducer_;
};

} // namespace secondvoice