feat: WinHTTP transport, gpt-4o-mini-transcribe, Opus/OGG audio, sliding-window capture

- Replace cpp-httplib with native WinHTTP for HTTPS support
- Switch from whisper-1 to gpt-4o-mini-transcribe model
- Use Opus/OGG encoding instead of WAV (~10x smaller files)
- Implement sliding window audio capture with overlap
- Add transcription deduplication for overlapping segments
- Add Voice Activity Detection (VAD) to filter silence/noise
- Filter Whisper hallucinations (Amara.org, etc.)
- Add UTF-8 console support for Chinese characters
- Add Chinese font loading in ImGui
- Make Claude responses concise (translation only, no explanations)
- Configurable window size, font size, chunk duration/step

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Trouve Alexis 2025-11-23 12:17:41 +08:00
parent 089acbfff1
commit fa8ea2907b
17 changed files with 708 additions and 143 deletions

View File

@ -56,7 +56,6 @@ target_include_directories(imgui_backends PUBLIC
# Find packages (no more vcpkg imgui!)
find_package(portaudio CONFIG REQUIRED)
find_package(httplib CONFIG REQUIRED)
find_package(nlohmann_json CONFIG REQUIRED)
find_package(glfw3 CONFIG REQUIRED)
find_package(glad CONFIG REQUIRED)
@ -74,6 +73,7 @@ set(SOURCES_UI
# API clients
src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp
src/api/WinHttpClient.cpp
# UI
src/ui/TranslationUI.cpp
# Utils
@ -91,6 +91,7 @@ set(SOURCES_CONSOLE
# API clients
src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp
src/api/WinHttpClient.cpp
# Utils
src/utils/Config.cpp
# Core - WAIT, Pipeline uses UI!
@ -116,7 +117,6 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
# Static linking for MinGW - need to add Windows system libs for portaudio
target_link_libraries(${PROJECT_NAME} PRIVATE
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libportaudio.a
httplib::httplib
nlohmann_json::nlohmann_json
imgui_backends
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libglfw3.a
@ -124,25 +124,27 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
OpenGL::GL
opus
ogg
# Windows system libraries for portaudio
# Windows system libraries
winmm
setupapi
ws2_32
winhttp
)
# Console version - NO UI libs
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libportaudio.a
httplib::httplib
nlohmann_json::nlohmann_json
opus
ogg
# Windows system libraries for portaudio
# Windows system libraries
winmm
setupapi
ws2_32
winhttp
)
else()
target_link_libraries(${PROJECT_NAME} PRIVATE
portaudio
httplib::httplib
portaudio_static
nlohmann_json::nlohmann_json
imgui_backends
glfw
@ -153,8 +155,7 @@ else()
)
# Console version - NO UI libs
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
portaudio
httplib::httplib
portaudio_static
nlohmann_json::nlohmann_json
opus
ogg
@ -196,6 +197,12 @@ endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.json
${CMAKE_CURRENT_BINARY_DIR}/config.json COPYONLY)
# Copy .env file if it exists
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.env)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/.env
${CMAKE_CURRENT_BINARY_DIR}/.env COPYONLY)
endif()
# Install target
install(TARGETS ${PROJECT_NAME} DESTINATION bin)
install(FILES config.json DESTINATION bin)

View File

@ -3,26 +3,27 @@
"sample_rate": 16000,
"channels": 1,
"chunk_duration_seconds": 10,
"format": "wav"
"chunk_step_seconds": 5,
"format": "ogg"
},
"whisper": {
"model": "gpt-4o-mini-transcribe",
"language": "zh",
"temperature": 0.0,
"prompt": "The following is a conversation in Mandarin Chinese about business, family, and daily life. Common names: Tingting, Alexis.",
"stream": true,
"stream": false,
"response_format": "text"
},
"claude": {
"model": "claude-haiku-4-20250514",
"model": "claude-3-5-haiku-20241022",
"max_tokens": 1024,
"temperature": 0.3,
"system_prompt": "Tu es un traducteur professionnel chinois-français. Traduis le texte suivant de manière naturelle et contextuelle."
"system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre."
},
"ui": {
"window_width": 800,
"window_height": 600,
"font_size": 16,
"window_width": 1200,
"window_height": 800,
"font_size": 24,
"max_display_lines": 50
},
"recording": {

View File

@ -1,4 +1,5 @@
@echo off
setlocal enabledelayedexpansion
REM MinGW Setup Script for SecondVoice
REM This installs a lightweight compiler instead of Visual Studio
@ -7,18 +8,18 @@ echo SecondVoice - MinGW Setup
echo ========================================
echo.
echo This script will install:
echo - MinGW-w64 (GCC compiler for Windows)
echo - CMake (build system)
echo - Ninja (build tool)
echo - Git (if not installed)
echo - MinGW-w64 - GCC compiler for Windows
echo - CMake - build system
echo - Ninja - build tool
echo - Git if not installed
echo.
echo Total size: ~500MB (vs 10GB+ for Visual Studio!)
echo Total size: ~500MB vs 10GB+ for Visual Studio
echo.
pause
REM Check if running as admin
net session >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [WARNING] Not running as administrator
echo Some installations might fail. Recommended to run as admin.
echo.
@ -27,7 +28,7 @@ if %errorlevel% neq 0 (
REM Install chocolatey if not present
where choco >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing Chocolatey package manager...
echo.
@ -37,7 +38,7 @@ if %errorlevel% neq 0 (
set "PATH=%PATH%;C:\ProgramData\chocolatey\bin"
where choco >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo.
echo ========================================
echo [IMPORTANT] Chocolatey installed successfully!
@ -59,11 +60,11 @@ echo.
REM Install MinGW-w64
where gcc >nul 2>&1
if %errorlevel% neq 0 (
echo [INFO] Installing MinGW-w64 (GCC compiler)...
if !errorlevel! neq 0 (
echo [INFO] Installing MinGW-w64...
choco install mingw -y
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install MinGW
pause
exit /b 1
@ -79,11 +80,11 @@ if %errorlevel% neq 0 (
REM Install CMake
where cmake >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing CMake...
choco install cmake -y --installargs 'ADD_CMAKE_TO_PATH=System'
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install CMake
pause
exit /b 1
@ -99,11 +100,11 @@ if %errorlevel% neq 0 (
REM Install Ninja
where ninja >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing Ninja...
choco install ninja -y
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install Ninja
pause
exit /b 1
@ -119,11 +120,11 @@ if %errorlevel% neq 0 (
REM Install Git (if not present)
where git >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing Git...
choco install git -y
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install Git
pause
exit /b 1
@ -146,7 +147,7 @@ if not defined VCPKG_ROOT (
cd vcpkg
call bootstrap-vcpkg.bat
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to bootstrap vcpkg
pause
exit /b 1
@ -162,7 +163,7 @@ if not defined VCPKG_ROOT (
echo [SUCCESS] VCPKG_ROOT set to C:\vcpkg
echo.
) else (
echo [INFO] vcpkg already configured at: %VCPKG_ROOT%
echo [INFO] vcpkg already configured at: !VCPKG_ROOT!
echo.
)
@ -175,16 +176,16 @@ where gcc
where cmake
where ninja
where git
echo VCPKG_ROOT=%VCPKG_ROOT%
echo VCPKG_ROOT=!VCPKG_ROOT!
echo.
echo ========================================
echo Next Steps:
echo ========================================
echo 1. Close and reopen this terminal (to reload PATH)
echo 1. Close and reopen this terminal to reload PATH
echo 2. Run: build_mingw.bat --release
echo 3. Your .exe will be in: build\mingw-release\SecondVoice.exe
echo.
echo Total installation size: ~500MB
echo (vs 10GB+ for Visual Studio!)
echo vs 10GB+ for Visual Studio!
echo.
pause

View File

@ -1,6 +1,5 @@
#include "ClaudeClient.h"
#include "../mingw_compat.h"
#include <httplib.h>
#include "WinHttpClient.h"
#include <nlohmann/json.hpp>
#include <iostream>
@ -37,40 +36,37 @@ std::optional<ClaudeResponse> ClaudeClient::translate(
std::string request_body = request_json.dump();
// Make HTTP request
httplib::Client client("https://api.anthropic.com");
client.set_read_timeout(15, 0); // 15 seconds timeout
httplib::Headers headers = {
WinHttpClient client;
std::map<std::string, std::string> headers = {
{"x-api-key", api_key_},
{"anthropic-version", API_VERSION},
{"content-type", "application/json"}
};
auto res = client.Post("/v1/messages", headers, request_body, "application/json");
auto response = client.postJson("api.anthropic.com", "/v1/messages", request_body, headers);
if (!res) {
std::cerr << "Claude API request failed: " << httplib::to_string(res.error()) << std::endl;
if (!response) {
std::cerr << "Claude API request failed" << std::endl;
return std::nullopt;
}
if (res->status != 200) {
std::cerr << "Claude API error " << res->status << ": " << res->body << std::endl;
if (response->statusCode != 200) {
std::cerr << "Claude API error " << response->statusCode << ": " << response->body << std::endl;
return std::nullopt;
}
// Parse response
try {
json response_json = json::parse(res->body);
json response_json = json::parse(response->body);
if (!response_json.contains("content") || !response_json["content"].is_array()) {
std::cerr << "Invalid Claude API response format" << std::endl;
return std::nullopt;
}
ClaudeResponse response;
response.text = response_json["content"][0]["text"].get<std::string>();
return response;
ClaudeResponse result;
result.text = response_json["content"][0]["text"].get<std::string>();
return result;
} catch (const json::exception& e) {
std::cerr << "Failed to parse Claude response: " << e.what() << std::endl;
return std::nullopt;
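
The hunk above swaps the HTTP transport but never shows how request_json is assembled (that happens earlier in translate(), outside the hunk). As a rough sketch only, and purely as an assumption about what that code does, an Anthropic Messages API body could be built with nlohmann::json like this; buildClaudeRequestBody and its parameters are hypothetical names, not code from the repository:

#include <nlohmann/json.hpp>
#include <string>

// Hypothetical helper: one plausible way to build the body that
// ClaudeClient::translate() serializes into request_body.
std::string buildClaudeRequestBody(const std::string& model,
                                   const std::string& system_prompt,
                                   const std::string& user_text,
                                   int max_tokens,
                                   double temperature) {
    nlohmann::json message = {{"role", "user"}, {"content", user_text}};
    nlohmann::json body = {
        {"model", model},
        {"max_tokens", max_tokens},
        {"temperature", temperature},
        {"system", system_prompt},
        {"messages", nlohmann::json::array({message})}
    };
    return body.dump();
}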

View File

@ -23,7 +23,7 @@ public:
private:
std::string api_key_;
static constexpr const char* API_URL = "https://api.anthropic.com/v1/messages";
static constexpr const char* MODEL = "claude-haiku-4-20250514";
static constexpr const char* MODEL = "claude-3-5-haiku-20241022";
static constexpr const char* API_VERSION = "2023-06-01";
};

View File

@ -1,10 +1,8 @@
#include "WhisperClient.h"
#include "WinHttpClient.h"
#include "../audio/AudioBuffer.h"
#include "../mingw_compat.h"
#include <httplib.h>
#include <nlohmann/json.hpp>
#include <iostream>
#include <sstream>
#include <fstream>
using json = nlohmann::json;
@ -25,70 +23,93 @@ std::optional<WhisperResponse> WhisperClient::transcribe(
const std::string& prompt,
const std::string& response_format) {
// Save audio to temporary Opus file
// Save audio to temporary Opus/OGG file (smaller than WAV)
AudioBuffer buffer(sample_rate, channels);
buffer.addSamples(audio_data);
std::string temp_file = "secondvoice_temp.opus";
std::string temp_file = "secondvoice_temp.ogg";
if (!buffer.saveToOpus(temp_file)) {
std::cerr << "Failed to save temporary Opus file" << std::endl;
return std::nullopt;
}
// Read Opus file
std::ifstream file(temp_file, std::ios::binary);
std::cout << "Saved Opus file: " << temp_file << std::endl;
// Read file into memory
std::ifstream file(temp_file, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
std::cerr << "Failed to open temporary Opus file" << std::endl;
std::cerr << "Failed to open temporary WAV file" << std::endl;
return std::nullopt;
}
std::ostringstream opus_stream;
opus_stream << file.rdbuf();
std::string opus_data = opus_stream.str();
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> fileData(size);
if (!file.read(fileData.data(), size)) {
std::cerr << "Failed to read temporary WAV file" << std::endl;
return std::nullopt;
}
file.close();
// Make HTTP request
httplib::Client client("https://api.openai.com");
client.set_read_timeout(30, 0); // 30 seconds timeout
// Prepare form fields
std::map<std::string, std::string> fields = {
{"model", model},
{"language", language},
{"response_format", response_format}
};
httplib::Headers headers = {
// Temperature only supported by whisper-1
if (model == "whisper-1") {
fields["temperature"] = std::to_string(temperature);
}
if (!prompt.empty()) {
fields["prompt"] = prompt;
}
// Make request
WinHttpClient client;
std::map<std::string, std::string> headers = {
{"Authorization", "Bearer " + api_key_}
};
httplib::UploadFormDataItems items;
items.push_back({"file", opus_data, "audio.opus", "audio/ogg"});
items.push_back({"model", model, "", ""});
items.push_back({"language", language, "", ""});
items.push_back({"temperature", std::to_string(temperature), "", ""});
items.push_back({"response_format", response_format, "", ""});
auto response = client.postMultipart(
"api.openai.com",
"/v1/audio/transcriptions",
fields,
"file",
"audio.ogg",
"audio/ogg",
fileData,
headers
);
// Add prompt if provided
if (!prompt.empty()) {
items.push_back({"prompt", prompt, "", ""});
}
auto res = client.Post("/v1/audio/transcriptions", headers, items);
if (!res) {
std::cerr << "Whisper API request failed: " << httplib::to_string(res.error()) << std::endl;
if (!response) {
std::cerr << "Whisper API request failed" << std::endl;
return std::nullopt;
}
if (res->status != 200) {
std::cerr << "Whisper API error " << res->status << ": " << res->body << std::endl;
if (response->statusCode != 200) {
std::cerr << "Whisper API error " << response->statusCode << ": " << response->body << std::endl;
return std::nullopt;
}
// Parse response
try {
json response_json = json::parse(res->body);
WhisperResponse response;
response.text = response_json["text"].get<std::string>();
return response;
} catch (const json::exception& e) {
std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl;
return std::nullopt;
// Parse response - for "text" format, response is plain text
// For "json" format, it's JSON
WhisperResponse result;
if (response_format == "text") {
result.text = response->body;
} else {
try {
json response_json = json::parse(response->body);
result.text = response_json["text"].get<std::string>();
} catch (const json::exception& e) {
std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl;
return std::nullopt;
}
}
return result;
}
} // namespace secondvoice

src/api/WinHttpClient.cpp (new file, 301 lines added)
View File

@ -0,0 +1,301 @@
#include "WinHttpClient.h"
#include <iostream>
#include <sstream>
#include <random>
#include <iomanip>
#ifdef _WIN32
#pragma comment(lib, "winhttp.lib")
#endif
namespace secondvoice {
WinHttpClient::WinHttpClient() {
#ifdef _WIN32
hSession_ = WinHttpOpen(
L"SecondVoice/1.0",
WINHTTP_ACCESS_TYPE_DEFAULT_PROXY,
WINHTTP_NO_PROXY_NAME,
WINHTTP_NO_PROXY_BYPASS,
0
);
if (!hSession_) {
std::cerr << "WinHttpOpen failed: " << GetLastError() << std::endl;
}
#endif
}
WinHttpClient::~WinHttpClient() {
#ifdef _WIN32
if (hSession_) {
WinHttpCloseHandle(hSession_);
}
#endif
}
std::string WinHttpClient::generateBoundary() {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0, 15);
const char* hex = "0123456789abcdef";
std::string boundary = "----WebKitFormBoundary";
for (int i = 0; i < 16; i++) {
boundary += hex[dis(gen)];
}
return boundary;
}
#ifdef _WIN32
static std::wstring toWideString(const std::string& str) {
if (str.empty()) return L"";
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, nullptr, 0);
std::wstring result(size - 1, 0);
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, &result[0], size);
return result;
}
#endif
std::optional<HttpResponse> WinHttpClient::postJson(
const std::string& host,
const std::string& path,
const std::string& jsonBody,
const std::map<std::string, std::string>& headers) {
#ifdef _WIN32
if (!hSession_) {
return std::nullopt;
}
std::wstring wHost = toWideString(host);
HINTERNET hConnect = WinHttpConnect(hSession_, wHost.c_str(), INTERNET_DEFAULT_HTTPS_PORT, 0);
if (!hConnect) {
std::cerr << "WinHttpConnect failed: " << GetLastError() << std::endl;
return std::nullopt;
}
std::wstring wPath = toWideString(path);
HINTERNET hRequest = WinHttpOpenRequest(
hConnect,
L"POST",
wPath.c_str(),
nullptr,
WINHTTP_NO_REFERER,
WINHTTP_DEFAULT_ACCEPT_TYPES,
WINHTTP_FLAG_SECURE
);
if (!hRequest) {
std::cerr << "WinHttpOpenRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Add headers
for (const auto& [key, value] : headers) {
std::wstring header = toWideString(key + ": " + value);
WinHttpAddRequestHeaders(hRequest, header.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
}
// Send request
BOOL result = WinHttpSendRequest(
hRequest,
WINHTTP_NO_ADDITIONAL_HEADERS,
0,
(LPVOID)jsonBody.c_str(),
jsonBody.length(),
jsonBody.length(),
0
);
if (!result) {
std::cerr << "WinHttpSendRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
result = WinHttpReceiveResponse(hRequest, nullptr);
if (!result) {
std::cerr << "WinHttpReceiveResponse failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Get status code
DWORD statusCode = 0;
DWORD statusCodeSize = sizeof(statusCode);
WinHttpQueryHeaders(
hRequest,
WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER,
WINHTTP_HEADER_NAME_BY_INDEX,
&statusCode,
&statusCodeSize,
WINHTTP_NO_HEADER_INDEX
);
// Read response body
std::string responseBody;
DWORD bytesAvailable = 0;
do {
bytesAvailable = 0;
if (!WinHttpQueryDataAvailable(hRequest, &bytesAvailable)) {
break;
}
if (bytesAvailable == 0) break;
std::vector<char> buffer(bytesAvailable + 1);
DWORD bytesRead = 0;
if (WinHttpReadData(hRequest, buffer.data(), bytesAvailable, &bytesRead)) {
responseBody.append(buffer.data(), bytesRead);
}
} while (bytesAvailable > 0);
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return HttpResponse{static_cast<int>(statusCode), responseBody};
#else
return std::nullopt;
#endif
}
std::optional<HttpResponse> WinHttpClient::postMultipart(
const std::string& host,
const std::string& path,
const std::map<std::string, std::string>& fields,
const std::string& fileFieldName,
const std::string& fileName,
const std::string& fileContentType,
const std::vector<char>& fileData,
const std::map<std::string, std::string>& headers) {
#ifdef _WIN32
if (!hSession_) {
return std::nullopt;
}
std::string boundary = generateBoundary();
// Build multipart body
std::ostringstream body;
// Add fields
for (const auto& [key, value] : fields) {
body << "--" << boundary << "\r\n";
body << "Content-Disposition: form-data; name=\"" << key << "\"\r\n\r\n";
body << value << "\r\n";
}
// Add file
body << "--" << boundary << "\r\n";
body << "Content-Disposition: form-data; name=\"" << fileFieldName << "\"; filename=\"" << fileName << "\"\r\n";
body << "Content-Type: " << fileContentType << "\r\n\r\n";
std::string bodyPrefix = body.str();
std::string bodySuffix = "\r\n--" + boundary + "--\r\n";
// Combine all parts
std::vector<char> fullBody;
fullBody.insert(fullBody.end(), bodyPrefix.begin(), bodyPrefix.end());
fullBody.insert(fullBody.end(), fileData.begin(), fileData.end());
fullBody.insert(fullBody.end(), bodySuffix.begin(), bodySuffix.end());
std::wstring wHost = toWideString(host);
HINTERNET hConnect = WinHttpConnect(hSession_, wHost.c_str(), INTERNET_DEFAULT_HTTPS_PORT, 0);
if (!hConnect) {
std::cerr << "WinHttpConnect failed: " << GetLastError() << std::endl;
return std::nullopt;
}
std::wstring wPath = toWideString(path);
HINTERNET hRequest = WinHttpOpenRequest(
hConnect,
L"POST",
wPath.c_str(),
nullptr,
WINHTTP_NO_REFERER,
WINHTTP_DEFAULT_ACCEPT_TYPES,
WINHTTP_FLAG_SECURE
);
if (!hRequest) {
std::cerr << "WinHttpOpenRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Add headers
for (const auto& [key, value] : headers) {
std::wstring header = toWideString(key + ": " + value);
WinHttpAddRequestHeaders(hRequest, header.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
}
// Add content-type header
std::wstring contentType = toWideString("Content-Type: multipart/form-data; boundary=" + boundary);
WinHttpAddRequestHeaders(hRequest, contentType.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
// Send request
BOOL result = WinHttpSendRequest(
hRequest,
WINHTTP_NO_ADDITIONAL_HEADERS,
0,
fullBody.data(),
fullBody.size(),
fullBody.size(),
0
);
if (!result) {
std::cerr << "WinHttpSendRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
result = WinHttpReceiveResponse(hRequest, nullptr);
if (!result) {
std::cerr << "WinHttpReceiveResponse failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Get status code
DWORD statusCode = 0;
DWORD statusCodeSize = sizeof(statusCode);
WinHttpQueryHeaders(
hRequest,
WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER,
WINHTTP_HEADER_NAME_BY_INDEX,
&statusCode,
&statusCodeSize,
WINHTTP_NO_HEADER_INDEX
);
// Read response body
std::string responseBody;
DWORD bytesAvailable = 0;
do {
bytesAvailable = 0;
if (!WinHttpQueryDataAvailable(hRequest, &bytesAvailable)) {
break;
}
if (bytesAvailable == 0) break;
std::vector<char> buffer(bytesAvailable + 1);
DWORD bytesRead = 0;
if (WinHttpReadData(hRequest, buffer.data(), bytesAvailable, &bytesRead)) {
responseBody.append(buffer.data(), bytesRead);
}
} while (bytesAvailable > 0);
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return HttpResponse{static_cast<int>(statusCode), responseBody};
#else
return std::nullopt;
#endif
}
} // namespace secondvoice

src/api/WinHttpClient.h (new file, 52 lines added)
View File

@ -0,0 +1,52 @@
#pragma once
#ifdef _WIN32
#include <windows.h>
#include <winhttp.h>
#endif
#include <string>
#include <vector>
#include <map>
#include <optional>
namespace secondvoice {
struct HttpResponse {
int statusCode;
std::string body;
};
class WinHttpClient {
public:
WinHttpClient();
~WinHttpClient();
// POST JSON request
std::optional<HttpResponse> postJson(
const std::string& host,
const std::string& path,
const std::string& jsonBody,
const std::map<std::string, std::string>& headers
);
// POST multipart form data
std::optional<HttpResponse> postMultipart(
const std::string& host,
const std::string& path,
const std::map<std::string, std::string>& fields,
const std::string& fileFieldName,
const std::string& fileName,
const std::string& fileContentType,
const std::vector<char>& fileData,
const std::map<std::string, std::string>& headers
);
private:
#ifdef _WIN32
HINTERNET hSession_;
#endif
std::string generateBoundary();
};
} // namespace secondvoice
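
Putting the two new files together, a call site reduces to the pattern already visible in the ClaudeClient diff. A minimal, self-contained sketch of postJson usage follows; the host, path, body, and header values are placeholders rather than real endpoints:

#include "WinHttpClient.h"
#include <iostream>
#include <map>
#include <string>

// Sketch of driving the new client directly; endpoint and headers are dummies.
int demoPostJson() {
    secondvoice::WinHttpClient client;
    std::map<std::string, std::string> headers = {
        {"x-api-key", "PLACEHOLDER_KEY"},
        {"content-type", "application/json"}
    };
    auto response = client.postJson("example.com", "/v1/echo",
                                    R"({"ping":true})", headers);
    if (!response) {
        std::cerr << "transport-level failure" << std::endl;
        return 1;
    }
    std::cout << response->statusCode << ": " << response->body << std::endl;
    return 0;
}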

View File

@ -3,10 +3,12 @@
namespace secondvoice {
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds)
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds)
: sample_rate_(sample_rate)
, channels_(channels)
, chunk_duration_seconds_(chunk_duration_seconds) {
, chunk_duration_seconds_(chunk_duration_seconds)
, chunk_step_seconds_(chunk_step_seconds > 0 ? chunk_step_seconds : chunk_duration_seconds) {
// If step not specified, use duration (no overlap)
}
AudioCapture::~AudioCapture() {
@ -47,12 +49,17 @@ int AudioCapture::audioCallback(const void* input, void* output,
// Check if we have accumulated enough data for a chunk
size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_;
size_t step_samples = self->sample_rate_ * self->channels_ * self->chunk_step_seconds_;
if (self->buffer_.size() >= chunk_samples) {
// Call the callback with the chunk
// Call the callback with the full chunk
if (self->callback_) {
self->callback_(self->buffer_);
// Send exactly chunk_samples (the window)
std::vector<float> chunk(self->buffer_.begin(), self->buffer_.begin() + chunk_samples);
self->callback_(chunk);
}
self->buffer_.clear();
// Sliding window: remove only step_samples, keep overlap for next chunk
self->buffer_.erase(self->buffer_.begin(), self->buffer_.begin() + step_samples);
}
return paContinue;
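
The arithmetic behind this hunk: with the 16 kHz mono audio, 10 s window and 5 s step from config.json, chunk_samples is 160000 and step_samples is 80000, so consecutive chunks share the last 5 s of audio. A standalone sketch of that buffer behaviour (no PortAudio; the 0.5 s callback size is an arbitrary choice for the simulation):

#include <iostream>
#include <vector>

// Simulates the sliding window in audioCallback(): emit a full window once
// enough samples are buffered, then drop only one step so the tail overlaps.
int main() {
    const int sample_rate = 16000, channels = 1;
    const int window_s = 10, step_s = 5;                                    // config.json values
    const size_t chunk_samples = size_t(sample_rate) * channels * window_s; // 160000
    const size_t step_samples  = size_t(sample_rate) * channels * step_s;   //  80000

    std::vector<float> buffer;
    size_t chunks = 0;
    for (int cb = 0; cb < 60; ++cb) {                 // 30 s of audio, 0.5 s per callback
        buffer.insert(buffer.end(), sample_rate / 2, 0.0f);
        if (buffer.size() >= chunk_samples) {
            ++chunks;
            std::cout << "chunk " << chunks << " covers samples ["
                      << (chunks - 1) * step_samples << ", "
                      << (chunks - 1) * step_samples + chunk_samples << ")\n";
            buffer.erase(buffer.begin(), buffer.begin() + step_samples);
        }
    }
}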

View File

@ -11,7 +11,8 @@ class AudioCapture {
public:
using AudioCallback = std::function<void(const std::vector<float>&)>;
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds);
// chunk_duration = window size, chunk_step = how often to send (overlap = duration - step)
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds = 0);
~AudioCapture();
bool initialize();
@ -29,6 +30,7 @@ private:
int sample_rate_;
int channels_;
int chunk_duration_seconds_;
int chunk_step_seconds_; // How often to emit chunks (0 = same as duration, no overlap)
bool is_recording_ = false;
PaStream* stream_ = nullptr;

View File

@ -10,6 +10,8 @@
#include <sstream>
#include <chrono>
#include <filesystem>
#include <cmath>
#include <algorithm>
namespace secondvoice {
@ -22,13 +24,19 @@ Pipeline::~Pipeline() {
bool Pipeline::initialize() {
auto& config = Config::getInstance();
// Initialize audio capture
// Initialize audio capture with sliding window (overlap)
audio_capture_ = std::make_unique<AudioCapture>(
config.getAudioConfig().sample_rate,
config.getAudioConfig().channels,
config.getAudioConfig().chunk_duration_seconds
config.getAudioConfig().chunk_duration_seconds,
config.getAudioConfig().chunk_step_seconds
);
std::cout << "[Pipeline] Audio: " << config.getAudioConfig().chunk_duration_seconds
<< "s window, " << config.getAudioConfig().chunk_step_seconds
<< "s step (overlap: " << (config.getAudioConfig().chunk_duration_seconds - config.getAudioConfig().chunk_step_seconds)
<< "s)" << std::endl;
if (!audio_capture_->initialize()) {
std::cerr << "Failed to initialize audio capture" << std::endl;
return false;
@ -68,10 +76,9 @@ bool Pipeline::start() {
running_ = true;
// Start threads
// Start background threads (NOT UI - that runs in main thread)
audio_thread_ = std::thread(&Pipeline::audioThread, this);
processing_thread_ = std::thread(&Pipeline::processingThread, this);
ui_thread_ = std::thread(&Pipeline::uiThread, this);
return true;
}
@ -92,16 +99,13 @@ void Pipeline::stop() {
audio_queue_.shutdown();
transcription_queue_.shutdown();
// Wait for threads
// Wait for background threads
if (audio_thread_.joinable()) {
audio_thread_.join();
}
if (processing_thread_.joinable()) {
processing_thread_.join();
}
if (ui_thread_.joinable()) {
ui_thread_.join();
}
// Save full recording
auto& config = Config::getInstance();
@ -153,6 +157,11 @@ void Pipeline::audioThread() {
void Pipeline::processingThread() {
auto& config = Config::getInstance();
// VAD threshold - audio RMS must exceed this to be considered speech
// Higher values = more aggressive noise filtering
constexpr float VAD_THRESHOLD = 0.02f; // RMS energy threshold
constexpr float VAD_MIN_PEAK = 0.08f; // Minimum peak amplitude
while (running_) {
auto chunk_opt = audio_queue_.wait_and_pop();
if (!chunk_opt.has_value()) {
@ -161,6 +170,22 @@ void Pipeline::processingThread() {
auto& chunk = chunk_opt.value();
// Voice Activity Detection - skip silent/noise chunks
float sum_squared = 0.0f;
float max_amplitude = 0.0f;
for (const float sample : chunk.data) {
sum_squared += sample * sample;
max_amplitude = std::max(max_amplitude, std::abs(sample));
}
float rms = std::sqrt(sum_squared / chunk.data.size());
if (rms < VAD_THRESHOLD || max_amplitude < VAD_MIN_PEAK) {
std::cout << "[Skip] Silent chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
continue;
}
std::cout << "[Audio] Processing chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
// Transcribe with Whisper
auto whisper_result = whisper_client_->transcribe(
chunk.data,
@ -178,9 +203,71 @@ void Pipeline::processingThread() {
continue;
}
// Translate with Claude
// Filter out Whisper hallucinations (common when audio is silent/unclear)
std::string text = whisper_result->text;
// Trim whitespace
size_t start = text.find_first_not_of(" \t\n\r");
size_t end = text.find_last_not_of(" \t\n\r");
if (start == std::string::npos) {
std::cout << "[Skip] Empty transcription" << std::endl;
continue;
}
text = text.substr(start, end - start + 1);
// Skip known hallucinations and garbage
bool is_garbage = false;
// Too short to be meaningful
if (text.length() < 4) {
is_garbage = true;
}
// Known Whisper hallucinations
else if (text.find("Amara.org") != std::string::npos ||
text.find("amara.org") != std::string::npos ||
text.find("字幕") != std::string::npos ||
text.find("subtitle") != std::string::npos ||
text.find("Subtitle") != std::string::npos ||
text.find("Thank you") != std::string::npos ||
text.find("thanks for watching") != std::string::npos ||
text.find("Subscribe") != std::string::npos ||
text.find("谢谢观看") != std::string::npos ||
text.find("订阅") != std::string::npos ||
text.find("...") == 0) { // Starts with ellipsis
is_garbage = true;
}
// Only punctuation or whitespace
else {
bool has_content = false;
for (char c : text) {
if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) {
has_content = true;
break;
}
}
if (!has_content) {
is_garbage = true;
}
}
if (is_garbage) {
std::cout << "[Skip] Filtered: " << text << std::endl;
continue;
}
// Remove overlap with previous transcription
std::string deduplicated = removeOverlap(text, last_transcription_);
last_transcription_ = text; // Store full text for next comparison
if (deduplicated.empty() || deduplicated.length() < 2) {
std::cout << "[Skip] Fully overlapping transcription" << std::endl;
continue;
}
std::cout << "[New content] " << deduplicated << std::endl;
// Translate with Claude (only the new, deduplicated content)
auto claude_result = claude_client_->translate(
whisper_result->text,
deduplicated,
config.getClaudeConfig().system_prompt,
config.getClaudeConfig().max_tokens,
config.getClaudeConfig().temperature
@ -192,31 +279,69 @@ void Pipeline::processingThread() {
}
// Add to UI
ui_->addTranslation(whisper_result->text, claude_result->text);
ui_->addTranslation(deduplicated, claude_result->text);
std::cout << "CN: " << whisper_result->text << std::endl;
std::cout << "CN: " << deduplicated << std::endl;
std::cout << "FR: " << claude_result->text << std::endl;
std::cout << "---" << std::endl;
}
}
void Pipeline::uiThread() {
// CRITICAL: Make OpenGL context current in THIS thread (not the thread that created it)
ui_->makeContextCurrent();
void Pipeline::update() {
if (!ui_) return;
while (running_ && !ui_->shouldClose()) {
ui_->setRecordingDuration(recording_duration_);
ui_->setProcessingStatus("Processing...");
ui_->render();
ui_->setRecordingDuration(recording_duration_);
ui_->setProcessingStatus("Processing...");
ui_->render();
// Check if stop was requested
if (ui_->isStopRequested()) {
running_ = false;
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(16)); // ~60 FPS
// Check if stop was requested
if (ui_->isStopRequested()) {
running_ = false;
}
}
bool Pipeline::shouldClose() const {
return ui_ ? ui_->shouldClose() : true;
}
std::string Pipeline::removeOverlap(const std::string& current, const std::string& previous) {
if (previous.empty()) {
return current;
}
// Try to find overlap: check if current starts with end of previous
// Start scanning a third of the way into the previous text (the windows overlap by roughly 50%)
size_t min_overlap = 4; // Minimum characters to consider as overlap
size_t search_start = previous.length() / 3; // Start looking from 1/3 into previous
for (size_t i = search_start; i < previous.length() - min_overlap; ++i) {
std::string suffix = previous.substr(i);
if (current.find(suffix) == 0) {
// Found overlap - return only the new part
std::string new_part = current.substr(suffix.length());
if (!new_part.empty()) {
std::cout << "[Overlap] Removed: \"" << suffix << "\"" << std::endl;
return new_part;
}
}
}
// No overlap found - check for partial word overlap at start
// This handles cases where the same content appears but tokenized differently
size_t max_check = std::min(current.length(), previous.length()) / 2;
for (size_t len = max_check; len >= min_overlap; --len) {
std::string end_prev = previous.substr(previous.length() - len);
std::string start_curr = current.substr(0, len);
if (end_prev == start_curr) {
std::string new_part = current.substr(len);
if (!new_part.empty()) {
std::cout << "[Overlap] Partial match removed: \"" << end_prev << "\"" << std::endl;
return new_part;
}
}
}
return current; // No overlap detected
}
} // namespace secondvoice
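
The overlapping windows are exactly why removeOverlap() exists: without it the shared 5 s of audio would be transcribed and translated twice. The sketch below restates the suffix/prefix idea as a single pass (simplified compared to the two-phase search above), with invented sample strings:

#include <algorithm>
#include <iostream>
#include <string>

// Simplified restatement of Pipeline::removeOverlap(): drop the longest
// suffix of `previous` that `current` starts with.
static std::string dropOverlap(const std::string& current, const std::string& previous) {
    const size_t min_overlap = 4;
    size_t max_check = std::min(current.length(), previous.length());
    for (size_t len = max_check; len >= min_overlap; --len) {
        if (previous.compare(previous.length() - len, len, current, 0, len) == 0) {
            return current.substr(len);
        }
    }
    return current;  // no overlap detected
}

int main() {
    std::string prev = "we will meet tomorrow at";
    std::string curr = "tomorrow at nine in the office";
    std::cout << "[" << dropOverlap(curr, prev) << "]\n";  // prints [ nine in the office]
}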

View File

@ -34,12 +34,15 @@ public:
bool start();
void stop();
// Call this from main thread for UI updates
void update();
bool isRunning() const { return running_; }
bool shouldClose() const;
private:
void audioThread();
void processingThread();
void uiThread();
std::unique_ptr<AudioCapture> audio_capture_;
std::unique_ptr<WhisperClient> whisper_client_;
@ -52,10 +55,13 @@ private:
std::thread audio_thread_;
std::thread processing_thread_;
std::thread ui_thread_;
std::atomic<bool> running_{false};
std::atomic<int> recording_duration_{0};
// For overlap deduplication
std::string last_transcription_;
std::string removeOverlap(const std::string& current, const std::string& previous);
};
} // namespace secondvoice

View File

@ -4,6 +4,10 @@
#include "utils/Config.h"
#include "core/Pipeline.h"
#ifdef _WIN32
#include <windows.h>
#endif
// Force NVIDIA GPU on Optimus systems (instead of integrated AMD/Intel)
// These MUST be global and volatile to prevent linker optimization
#ifdef _WIN32
@ -38,6 +42,12 @@ int main(int argc, char** argv) {
(void)argc;
(void)argv;
#ifdef _WIN32
// Enable UTF-8 console output for Chinese characters
SetConsoleOutputCP(CP_UTF8);
SetConsoleCP(CP_UTF8);
#endif
log_msg("MAIN: Entry point reached");
log_msg("========================================");
log_msg("SecondVoice - Real-time Translation System");
@ -67,17 +77,18 @@ int main(int argc, char** argv) {
log_msg("Pipeline initialized successfully");
log_msg("Starting recording and translation...");
// Start pipeline
// Start pipeline (background threads for audio + API calls)
log_msg("Starting pipeline...");
if (!pipeline.start()) {
log_msg("ERROR: Failed to start pipeline");
return 1;
}
// Wait for pipeline to finish (user clicks Stop button)
log_msg("Pipeline running, waiting for user to stop...");
while (pipeline.isRunning()) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Main loop - UI runs in main thread (required by GLFW)
log_msg("Pipeline running, entering main loop...");
while (pipeline.isRunning() && !pipeline.shouldClose()) {
pipeline.update(); // Render one frame
std::this_thread::sleep_for(std::chrono::milliseconds(16)); // ~60 FPS
}
log_msg("");

View File

@ -1,6 +1,7 @@
#include <glad/glad.h> // MUST be FIRST! Provides OpenGL functions
#define GLFW_INCLUDE_NONE // Tell GLFW not to include OpenGL headers (GLAD does it)
#include "TranslationUI.h"
#include "../utils/Config.h"
#include <imgui.h>
#include <imgui_impl_glfw.h>
#include <imgui_impl_opengl3.h>
@ -86,6 +87,40 @@ bool TranslationUI::initialize() {
ImGuiIO& io = ImGui::GetIO();
io.ConfigFlags |= ImGuiConfigFlags_NavEnableKeyboard;
// Load Chinese font from Windows system fonts
float font_size = static_cast<float>(Config::getInstance().getUIConfig().font_size);
std::cout << "[UI] Loading Chinese font (size: " << font_size << ")..." << std::endl;
ImFontConfig font_config;
font_config.OversampleH = 2;
font_config.OversampleV = 2;
// Try common Chinese fonts on Windows
const char* chinese_fonts[] = {
"C:\\Windows\\Fonts\\msyh.ttc", // Microsoft YaHei
"C:\\Windows\\Fonts\\simhei.ttf", // SimHei
"C:\\Windows\\Fonts\\simsun.ttc", // SimSun
"C:\\Windows\\Fonts\\mingliub.ttc", // MingLiU
};
ImFont* font = nullptr;
for (const char* font_path : chinese_fonts) {
font = io.Fonts->AddFontFromFileTTF(
font_path,
font_size,
&font_config,
io.Fonts->GetGlyphRangesChineseFull()
);
if (font) {
std::cout << "[UI] Loaded font: " << font_path << std::endl;
break;
}
}
if (!font) {
std::cout << "[UI] Warning: No Chinese font found, using default (Chinese chars won't display)" << std::endl;
io.Fonts->AddFontDefault();
}
ImGui::StyleColorsDark();
ImGui_ImplGlfw_InitForOpenGL(window_, true);
@ -130,13 +165,12 @@ void main() {
return false;
}
// Let ImGui auto-detect the GLSL version
ImGui_ImplOpenGL3_Init(nullptr);
// Explicitly specify GLSL version for OpenGL 3.3 core profile
// On Windows, auto-detection (nullptr) can fail, so we specify "#version 330"
std::cout << "[UI] Initializing ImGui OpenGL3 backend with GLSL 330..." << std::endl;
ImGui_ImplOpenGL3_Init("#version 330");
// CRITICAL: Release the OpenGL context from this thread
// The UI rendering will happen in a separate thread which will call makeContextCurrent()
std::cout << "[UI] Releasing OpenGL context from initialization thread" << std::endl;
glfwMakeContextCurrent(nullptr);
// Context stays in main thread - no need to release
return true;
}

View File

@ -71,7 +71,8 @@ bool Config::load(const std::string& config_path, const std::string& env_path) {
audio_config_.sample_rate = audio.value("sample_rate", 16000);
audio_config_.channels = audio.value("channels", 1);
audio_config_.chunk_duration_seconds = audio.value("chunk_duration_seconds", 10);
audio_config_.format = audio.value("format", "wav");
audio_config_.chunk_step_seconds = audio.value("chunk_step_seconds", 0); // 0 = no overlap
audio_config_.format = audio.value("format", "ogg");
}
// Parse whisper config

View File

@ -8,6 +8,7 @@ struct AudioConfig {
int sample_rate;
int channels;
int chunk_duration_seconds;
int chunk_step_seconds; // How often to emit chunks (for overlap)
std::string format;
};

View File

@ -3,7 +3,6 @@
"version": "0.1.0",
"dependencies": [
"portaudio",
"cpp-httplib",
"nlohmann-json",
"glfw3",
"glad",