secondvoice/src/audio/AudioCapture.h
Trouve Alexis 741ca09663 feat: Add RNNoise denoising + transient suppressor + VAD improvements
- Add RNNoise neural network audio denoising (16kHz↔48kHz resampling)
- Add transient suppressor to filter claps/clicks/pops before RNNoise
- VAD now works on FILTERED audio (not raw) to avoid false triggers
- Real-time denoised audio level display in UI
- Save denoised audio previews in Opus format (.ogg)
- Add extensive Whisper hallucination filter (Tingting, music, etc.)
- Add "Clear" button to reset accumulated translations
- Double VAD thresholds (0.02/0.08) for less sensitivity
- Update Claude prompt to handle offensive content gracefully

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 16:46:38 +08:00

98 lines
3.2 KiB
C++

#pragma once
#include <vector>
#include <string>
#include <functional>
#include <atomic>
#include <cmath>
#include <memory>
#include <portaudio.h>
namespace secondvoice {
class NoiseReducer;
/// Microphone capture front-end built on a PortAudio stream, with voice
/// activity detection (VAD) tuning knobs and real-time level readouts.
///
/// Only declarations live in this header; the constructor, destructor,
/// initialize/start/stop, the denoise accessors and the stream callback are
/// defined out of line.
///
/// Threading: every value that has an inline accessor below and may be touched
/// by both the UI and the stream callback is a std::atomic, so the accessors
/// are free of data races. NOTE(review): is_recording_ is still a plain bool;
/// confirm it is only written and read from the controlling thread.
class AudioCapture {
public:
    /// Consumer callback passed to start(); receives a buffer of float samples.
    using AudioCallback = std::function<void(const std::vector<float>&)>;

    /// @param sample_rate Capture sample rate (presumably Hz - confirm in the .cpp).
    /// @param channels    Number of input channels.
    AudioCapture(int sample_rate, int channels);
    ~AudioCapture();

    // The object holds a raw PaStream* and a unique_ptr, so it was already
    // non-copyable; spell that out so the intent is visible in the interface.
    AudioCapture(const AudioCapture&) = delete;
    AudioCapture& operator=(const AudioCapture&) = delete;

    /// Prepares the capture backend. @return true on success.
    bool initialize();

    /// Begins capturing and registers @p callback as the audio consumer.
    /// @return true on success.
    bool start(AudioCallback callback);

    /// Stops capturing.
    void stop();

    /// @return the current value of the recording flag.
    bool isRecording() const { return is_recording_; }

    // Real-time audio levels (updated every callback, ~10ms)
    float getCurrentRMS() const { return current_rms_.load(); }
    float getCurrentPeak() const { return current_peak_.load(); }

    // VAD settings (can be adjusted in real-time from UI).
    // All of them are stored atomically so a setter call never races with a
    // concurrent reader.
    void setVadThresholds(float rms_threshold, float peak_threshold) {
        vad_rms_threshold_.store(rms_threshold);
        vad_peak_threshold_.store(peak_threshold);
    }
    void setSilenceDuration(int ms) { silence_duration_ms_.store(ms); }
    void setMinSpeechDuration(int ms) { min_speech_duration_ms_.store(ms); }
    void setMaxSpeechDuration(int ms) { max_speech_duration_ms_.store(ms); }

    // VAD state (for UI display)
    bool isSpeechDetected() const { return is_speech_active_.load(); }

    // Noise reduction
    void setDenoiseEnabled(bool enabled);
    bool isDenoiseEnabled() const;

private:
    /// Static trampoline with the PortAudio stream-callback signature.
    /// @p user_data presumably carries the AudioCapture instance - confirm in the .cpp.
    static int audioCallback(const void* input, void* output,
                             unsigned long frame_count,
                             const PaStreamCallbackTimeInfo* time_info,
                             PaStreamCallbackFlags status_flags,
                             void* user_data);

    int sample_rate_;
    int channels_;
    bool is_recording_ = false;
    PaStream* stream_ = nullptr;
    AudioCallback callback_;

    // Speech accumulation buffer
    std::vector<float> speech_buffer_;

    // VAD state
    std::atomic<bool> is_speech_active_{false};
    int silence_samples_count_ = 0;  // How many samples of silence
    int speech_samples_count_ = 0;   // How many samples of speech accumulated

    // VAD parameters - Higher threshold to avoid false triggers on filtered noise
    std::atomic<float> vad_rms_threshold_{0.02f};   // Was 0.01f
    std::atomic<float> vad_peak_threshold_{0.08f};  // Was 0.04f
    // Durations are atomic for the same reason as the thresholds: the setters
    // above may be called while another thread is reading them.
    std::atomic<int> silence_duration_ms_{400};       // Wait 400ms of silence before cutting
    std::atomic<int> min_speech_duration_ms_{300};    // Minimum speech to send
    std::atomic<int> max_speech_duration_ms_{25000};  // 25s max before forced flush

    // Adaptive noise floor
    float noise_floor_ = 0.005f;        // Estimated background noise level
    float noise_floor_alpha_ = 0.001f;  // Slower adaptation

    // Hang time - wait before cutting to avoid mid-sentence cuts
    int hang_frames_ = 0;
    int hang_frames_threshold_ = 20;  // ~200ms tolerance for pauses

    // Zero-crossing rate for speech vs noise discrimination
    float last_zcr_ = 0.0f;
    float speech_zcr_min_ = 0.01f;  // More permissive lower bound
    float speech_zcr_max_ = 0.25f;  // More permissive upper bound

    // Real-time audio levels (atomic for thread safety)
    std::atomic<float> current_rms_{0.0f};
    std::atomic<float> current_peak_{0.0f};

    // Noise reduction (NoiseReducer is only forward-declared in this header,
    // which is why the destructor must be defined out of line)
    std::unique_ptr<NoiseReducer> noise_reducer_;
};
} // namespace secondvoice