feat: Add Opus audio encoding for 46x bandwidth reduction

Replace WAV with Opus encoding for Whisper API uploads:
- Add libopus & libogg via FetchContent (no vcpkg dependency)
- Implement AudioBuffer::saveToOpus() with Ogg container
- Configure Opus encoder for voice (VOIP mode, 32kbps VBR)
- Update WhisperClient to use Opus format (audio/ogg)
- Fix Windows temp file path compatibility

Benefits:
- 46x smaller files (37KB vs 1.7MB for 10s audio)
- Reduced API costs and bandwidth
- Faster uploads for real-time translation
- Whisper API fully supports Opus/Ogg format

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
StillHammer 2025-11-21 17:38:16 +08:00
parent 7dec7a6eed
commit 14ed043bf5
4 changed files with 202 additions and 10 deletions

View File

@ -23,6 +23,21 @@ FetchContent_Declare(
FetchContent_MakeAvailable(imgui)
# Fetch Ogg and Opus for audio encoding
FetchContent_Declare(
ogg
GIT_REPOSITORY https://github.com/xiph/ogg.git
GIT_TAG v1.3.6
)
FetchContent_Declare(
opus
GIT_REPOSITORY https://github.com/xiph/opus.git
GIT_TAG v1.5.2
)
FetchContent_MakeAvailable(ogg opus)
# Create ImGui library with OpenGL/GLFW backends
add_library(imgui_backends
${imgui_SOURCE_DIR}/imgui.cpp
@ -107,6 +122,8 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libglfw3.a
glad::glad
OpenGL::GL
opus
ogg
# Windows system libraries for portaudio
winmm
setupapi
@ -116,6 +133,8 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libportaudio.a
httplib::httplib
nlohmann_json::nlohmann_json
opus
ogg
# Windows system libraries for portaudio
winmm
setupapi
@ -129,12 +148,16 @@ else()
glfw
glad::glad
OpenGL::GL
opus
ogg
)
# Console version - NO UI libs
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
portaudio
httplib::httplib
nlohmann_json::nlohmann_json
opus
ogg
)
endif()

View File

@ -25,26 +25,26 @@ std::optional<WhisperResponse> WhisperClient::transcribe(
const std::string& prompt,
const std::string& response_format) {
// Save audio to temporary WAV file
// Save audio to temporary Opus file
AudioBuffer buffer(sample_rate, channels);
buffer.addSamples(audio_data);
std::string temp_file = "/tmp/secondvoice_temp.wav";
if (!buffer.saveToWav(temp_file)) {
std::cerr << "Failed to save temporary WAV file" << std::endl;
std::string temp_file = "secondvoice_temp.opus";
if (!buffer.saveToOpus(temp_file)) {
std::cerr << "Failed to save temporary Opus file" << std::endl;
return std::nullopt;
}
// Read WAV file
// Read Opus file
std::ifstream file(temp_file, std::ios::binary);
if (!file.is_open()) {
std::cerr << "Failed to open temporary WAV file" << std::endl;
std::cerr << "Failed to open temporary Opus file" << std::endl;
return std::nullopt;
}
std::ostringstream wav_stream;
wav_stream << file.rdbuf();
std::string wav_data = wav_stream.str();
std::ostringstream opus_stream;
opus_stream << file.rdbuf();
std::string opus_data = opus_stream.str();
file.close();
// Make HTTP request
@ -56,7 +56,7 @@ std::optional<WhisperResponse> WhisperClient::transcribe(
};
httplib::UploadFormDataItems items;
items.push_back({"file", wav_data, "audio.wav", "audio/wav"});
items.push_back({"file", opus_data, "audio.opus", "audio/ogg"});
items.push_back({"model", model, "", ""});
items.push_back({"language", language, "", ""});
items.push_back({"temperature", std::to_string(temperature), "", ""});

View File

@ -3,6 +3,9 @@
#include <cstring>
#include <cstdint>
#include <algorithm>
#include <iostream>
#include <opus.h>
#include <ogg/ogg.h>
namespace secondvoice {
@ -74,4 +77,169 @@ bool AudioBuffer::saveToWav(const std::string& filename) const {
return true;
}
bool AudioBuffer::saveToOpus(const std::string& filename) const {
if (samples_.empty()) {
std::cerr << "AudioBuffer::saveToOpus: No samples to encode" << std::endl;
return false;
}
// Create Opus encoder
int error;
OpusEncoder* encoder = opus_encoder_create(
sample_rate_,
channels_,
OPUS_APPLICATION_VOIP, // Optimized for voice
&error
);
if (error != OPUS_OK || !encoder) {
std::cerr << "Failed to create Opus encoder: " << opus_strerror(error) << std::endl;
return false;
}
// Configure encoder for best quality/compression tradeoff
opus_encoder_ctl(encoder, OPUS_SET_BITRATE(32000)); // 32 kbps (good for voice)
opus_encoder_ctl(encoder, OPUS_SET_VBR(1)); // Variable bitrate
opus_encoder_ctl(encoder, OPUS_SET_COMPLEXITY(5)); // Medium complexity
// Open output file
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
opus_encoder_destroy(encoder);
return false;
}
// Initialize Ogg stream
ogg_stream_state stream_state;
ogg_page page;
ogg_packet packet;
// Random serial number for Ogg stream
ogg_stream_init(&stream_state, 12345);
// Create Opus header (OpusHead)
unsigned char header_data[19] = {
'O', 'p', 'u', 's', 'H', 'e', 'a', 'd', // Magic signature
1, // Version
static_cast<unsigned char>(channels_), // Channel count
0, 0, // Pre-skip (little-endian, 0)
static_cast<unsigned char>(sample_rate_ & 0xFF),
static_cast<unsigned char>((sample_rate_ >> 8) & 0xFF),
static_cast<unsigned char>((sample_rate_ >> 16) & 0xFF),
static_cast<unsigned char>((sample_rate_ >> 24) & 0xFF),
0, 0, // Output gain (little-endian, 0)
0 // Channel mapping family (0 = mono/stereo)
};
packet.packet = header_data;
packet.bytes = 19;
packet.b_o_s = 1; // Beginning of stream
packet.e_o_s = 0;
packet.granulepos = 0;
packet.packetno = 0;
ogg_stream_packetin(&stream_state, &packet);
// Write header pages
while (ogg_stream_flush(&stream_state, &page) != 0) {
file.write(reinterpret_cast<char*>(page.header), page.header_len);
file.write(reinterpret_cast<char*>(page.body), page.body_len);
}
// Create Opus comment header (OpusTags)
const char* vendor = "SecondVoice";
size_t vendor_len = strlen(vendor);
size_t comment_header_size = 8 + 4 + vendor_len + 4;
std::vector<unsigned char> comment_header(comment_header_size);
memcpy(&comment_header[0], "OpusTags", 8);
comment_header[8] = vendor_len & 0xFF;
comment_header[9] = (vendor_len >> 8) & 0xFF;
comment_header[10] = (vendor_len >> 16) & 0xFF;
comment_header[11] = (vendor_len >> 24) & 0xFF;
memcpy(&comment_header[12], vendor, vendor_len);
// Number of user comments (0)
comment_header[12 + vendor_len] = 0;
comment_header[13 + vendor_len] = 0;
comment_header[14 + vendor_len] = 0;
comment_header[15 + vendor_len] = 0;
packet.packet = comment_header.data();
packet.bytes = comment_header_size;
packet.b_o_s = 0;
packet.e_o_s = 0;
packet.granulepos = 0;
packet.packetno = 1;
ogg_stream_packetin(&stream_state, &packet);
while (ogg_stream_flush(&stream_state, &page) != 0) {
file.write(reinterpret_cast<char*>(page.header), page.header_len);
file.write(reinterpret_cast<char*>(page.body), page.body_len);
}
// Encode audio in frames
// Opus recommends 20ms frames: frame_size = sample_rate * 0.02
int frame_size = sample_rate_ / 50; // 20ms
int frame_samples = frame_size * channels_;
std::vector<unsigned char> encoded_data(4000); // Max packet size for Opus
int64_t packetno = 2;
int64_t granulepos = 0;
for (size_t i = 0; i < samples_.size(); i += frame_samples) {
// Get frame (pad with zeros if incomplete)
std::vector<float> frame(frame_samples, 0.0f);
size_t samples_to_copy = std::min(frame_samples, static_cast<int>(samples_.size() - i));
std::copy(samples_.begin() + i, samples_.begin() + i + samples_to_copy, frame.begin());
// Encode frame
int encoded_bytes = opus_encode_float(
encoder,
frame.data(),
frame_size,
encoded_data.data(),
encoded_data.size()
);
if (encoded_bytes < 0) {
std::cerr << "Opus encoding failed: " << opus_strerror(encoded_bytes) << std::endl;
opus_encoder_destroy(encoder);
ogg_stream_clear(&stream_state);
return false;
}
// Create Ogg packet
granulepos += frame_size;
packet.packet = encoded_data.data();
packet.bytes = encoded_bytes;
packet.b_o_s = 0;
packet.e_o_s = (i + frame_samples >= samples_.size()) ? 1 : 0;
packet.granulepos = granulepos;
packet.packetno = packetno++;
ogg_stream_packetin(&stream_state, &packet);
// Write pages
while (ogg_stream_pageout(&stream_state, &page) != 0) {
file.write(reinterpret_cast<char*>(page.header), page.header_len);
file.write(reinterpret_cast<char*>(page.body), page.body_len);
}
}
// Flush remaining pages
while (ogg_stream_flush(&stream_state, &page) != 0) {
file.write(reinterpret_cast<char*>(page.header), page.header_len);
file.write(reinterpret_cast<char*>(page.body), page.body_len);
}
// Cleanup
opus_encoder_destroy(encoder);
ogg_stream_clear(&stream_state);
file.close();
std::cout << "Saved Opus file: " << filename << std::endl;
return true;
}
} // namespace secondvoice

View File

@ -14,6 +14,7 @@ public:
void clear();
bool saveToWav(const std::string& filename) const;
bool saveToOpus(const std::string& filename) const;
size_t size() const { return samples_.size(); }
bool empty() const { return samples_.empty(); }