feat: Major improvements - WinHTTP, gpt-4o-mini, Opus, sliding window
- Replace cpp-httplib with native WinHTTP for HTTPS support - Switch from whisper-1 to gpt-4o-mini-transcribe model - Use Opus/OGG encoding instead of WAV (~10x smaller files) - Implement sliding window audio capture with overlap - Add transcription deduplication for overlapping segments - Add Voice Activity Detection (VAD) to filter silence/noise - Filter Whisper hallucinations (Amara.org, etc.) - Add UTF-8 console support for Chinese characters - Add Chinese font loading in ImGui - Make Claude responses concise (translation only, no explanations) - Configurable window size, font size, chunk duration/step 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
089acbfff1
commit
fa8ea2907b
@ -56,7 +56,6 @@ target_include_directories(imgui_backends PUBLIC
|
||||
|
||||
# Find packages (no more vcpkg imgui!)
|
||||
find_package(portaudio CONFIG REQUIRED)
|
||||
find_package(httplib CONFIG REQUIRED)
|
||||
find_package(nlohmann_json CONFIG REQUIRED)
|
||||
find_package(glfw3 CONFIG REQUIRED)
|
||||
find_package(glad CONFIG REQUIRED)
|
||||
@ -74,6 +73,7 @@ set(SOURCES_UI
|
||||
# API clients
|
||||
src/api/WhisperClient.cpp
|
||||
src/api/ClaudeClient.cpp
|
||||
src/api/WinHttpClient.cpp
|
||||
# UI
|
||||
src/ui/TranslationUI.cpp
|
||||
# Utils
|
||||
@ -91,6 +91,7 @@ set(SOURCES_CONSOLE
|
||||
# API clients
|
||||
src/api/WhisperClient.cpp
|
||||
src/api/ClaudeClient.cpp
|
||||
src/api/WinHttpClient.cpp
|
||||
# Utils
|
||||
src/utils/Config.cpp
|
||||
# Core - WAIT, Pipeline uses UI!
|
||||
@ -116,7 +117,6 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
|
||||
# Static linking for MinGW - need to add Windows system libs for portaudio
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE
|
||||
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libportaudio.a
|
||||
httplib::httplib
|
||||
nlohmann_json::nlohmann_json
|
||||
imgui_backends
|
||||
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libglfw3.a
|
||||
@ -124,25 +124,27 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
|
||||
OpenGL::GL
|
||||
opus
|
||||
ogg
|
||||
# Windows system libraries for portaudio
|
||||
# Windows system libraries
|
||||
winmm
|
||||
setupapi
|
||||
ws2_32
|
||||
winhttp
|
||||
)
|
||||
# Console version - NO UI libs
|
||||
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
|
||||
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libportaudio.a
|
||||
httplib::httplib
|
||||
nlohmann_json::nlohmann_json
|
||||
opus
|
||||
ogg
|
||||
# Windows system libraries for portaudio
|
||||
# Windows system libraries
|
||||
winmm
|
||||
setupapi
|
||||
ws2_32
|
||||
winhttp
|
||||
)
|
||||
else()
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE
|
||||
portaudio
|
||||
httplib::httplib
|
||||
portaudio_static
|
||||
nlohmann_json::nlohmann_json
|
||||
imgui_backends
|
||||
glfw
|
||||
@ -153,8 +155,7 @@ else()
|
||||
)
|
||||
# Console version - NO UI libs
|
||||
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
|
||||
portaudio
|
||||
httplib::httplib
|
||||
portaudio_static
|
||||
nlohmann_json::nlohmann_json
|
||||
opus
|
||||
ogg
|
||||
@ -196,6 +197,12 @@ endif()
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.json
|
||||
${CMAKE_CURRENT_BINARY_DIR}/config.json COPYONLY)
|
||||
|
||||
# Copy .env file if it exists
|
||||
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.env)
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/.env
|
||||
${CMAKE_CURRENT_BINARY_DIR}/.env COPYONLY)
|
||||
endif()
|
||||
|
||||
# Install target
|
||||
install(TARGETS ${PROJECT_NAME} DESTINATION bin)
|
||||
install(FILES config.json DESTINATION bin)
|
||||
|
||||
15
config.json
15
config.json
@ -3,26 +3,27 @@
|
||||
"sample_rate": 16000,
|
||||
"channels": 1,
|
||||
"chunk_duration_seconds": 10,
|
||||
"format": "wav"
|
||||
"chunk_step_seconds": 5,
|
||||
"format": "ogg"
|
||||
},
|
||||
"whisper": {
|
||||
"model": "gpt-4o-mini-transcribe",
|
||||
"language": "zh",
|
||||
"temperature": 0.0,
|
||||
"prompt": "The following is a conversation in Mandarin Chinese about business, family, and daily life. Common names: Tingting, Alexis.",
|
||||
"stream": true,
|
||||
"stream": false,
|
||||
"response_format": "text"
|
||||
},
|
||||
"claude": {
|
||||
"model": "claude-haiku-4-20250514",
|
||||
"model": "claude-3-5-haiku-20241022",
|
||||
"max_tokens": 1024,
|
||||
"temperature": 0.3,
|
||||
"system_prompt": "Tu es un traducteur professionnel chinois-français. Traduis le texte suivant de manière naturelle et contextuelle."
|
||||
"system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre."
|
||||
},
|
||||
"ui": {
|
||||
"window_width": 800,
|
||||
"window_height": 600,
|
||||
"font_size": 16,
|
||||
"window_width": 1200,
|
||||
"window_height": 800,
|
||||
"font_size": 24,
|
||||
"max_display_lines": 50
|
||||
},
|
||||
"recording": {
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
@echo off
|
||||
setlocal enabledelayedexpansion
|
||||
REM MinGW Setup Script for SecondVoice
|
||||
REM This installs a lightweight compiler instead of Visual Studio
|
||||
|
||||
@ -7,18 +8,18 @@ echo SecondVoice - MinGW Setup
|
||||
echo ========================================
|
||||
echo.
|
||||
echo This script will install:
|
||||
echo - MinGW-w64 (GCC compiler for Windows)
|
||||
echo - CMake (build system)
|
||||
echo - Ninja (build tool)
|
||||
echo - Git (if not installed)
|
||||
echo - MinGW-w64 - GCC compiler for Windows
|
||||
echo - CMake - build system
|
||||
echo - Ninja - build tool
|
||||
echo - Git if not installed
|
||||
echo.
|
||||
echo Total size: ~500MB (vs 10GB+ for Visual Studio!)
|
||||
echo Total size: ~500MB vs 10GB+ for Visual Studio
|
||||
echo.
|
||||
pause
|
||||
|
||||
REM Check if running as admin
|
||||
net session >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [WARNING] Not running as administrator
|
||||
echo Some installations might fail. Recommended to run as admin.
|
||||
echo.
|
||||
@ -27,7 +28,7 @@ if %errorlevel% neq 0 (
|
||||
|
||||
REM Install chocolatey if not present
|
||||
where choco >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [INFO] Installing Chocolatey package manager...
|
||||
echo.
|
||||
|
||||
@ -37,7 +38,7 @@ if %errorlevel% neq 0 (
|
||||
set "PATH=%PATH%;C:\ProgramData\chocolatey\bin"
|
||||
|
||||
where choco >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo.
|
||||
echo ========================================
|
||||
echo [IMPORTANT] Chocolatey installed successfully!
|
||||
@ -59,11 +60,11 @@ echo.
|
||||
|
||||
REM Install MinGW-w64
|
||||
where gcc >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
echo [INFO] Installing MinGW-w64 (GCC compiler)...
|
||||
if !errorlevel! neq 0 (
|
||||
echo [INFO] Installing MinGW-w64...
|
||||
choco install mingw -y
|
||||
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [ERROR] Failed to install MinGW
|
||||
pause
|
||||
exit /b 1
|
||||
@ -79,11 +80,11 @@ if %errorlevel% neq 0 (
|
||||
|
||||
REM Install CMake
|
||||
where cmake >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [INFO] Installing CMake...
|
||||
choco install cmake -y --installargs 'ADD_CMAKE_TO_PATH=System'
|
||||
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [ERROR] Failed to install CMake
|
||||
pause
|
||||
exit /b 1
|
||||
@ -99,11 +100,11 @@ if %errorlevel% neq 0 (
|
||||
|
||||
REM Install Ninja
|
||||
where ninja >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [INFO] Installing Ninja...
|
||||
choco install ninja -y
|
||||
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [ERROR] Failed to install Ninja
|
||||
pause
|
||||
exit /b 1
|
||||
@ -119,11 +120,11 @@ if %errorlevel% neq 0 (
|
||||
|
||||
REM Install Git (if not present)
|
||||
where git >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [INFO] Installing Git...
|
||||
choco install git -y
|
||||
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [ERROR] Failed to install Git
|
||||
pause
|
||||
exit /b 1
|
||||
@ -146,7 +147,7 @@ if not defined VCPKG_ROOT (
|
||||
cd vcpkg
|
||||
call bootstrap-vcpkg.bat
|
||||
|
||||
if %errorlevel% neq 0 (
|
||||
if !errorlevel! neq 0 (
|
||||
echo [ERROR] Failed to bootstrap vcpkg
|
||||
pause
|
||||
exit /b 1
|
||||
@ -162,7 +163,7 @@ if not defined VCPKG_ROOT (
|
||||
echo [SUCCESS] VCPKG_ROOT set to C:\vcpkg
|
||||
echo.
|
||||
) else (
|
||||
echo [INFO] vcpkg already configured at: %VCPKG_ROOT%
|
||||
echo [INFO] vcpkg already configured at: !VCPKG_ROOT!
|
||||
echo.
|
||||
)
|
||||
|
||||
@ -175,16 +176,16 @@ where gcc
|
||||
where cmake
|
||||
where ninja
|
||||
where git
|
||||
echo VCPKG_ROOT=%VCPKG_ROOT%
|
||||
echo VCPKG_ROOT=!VCPKG_ROOT!
|
||||
echo.
|
||||
echo ========================================
|
||||
echo Next Steps:
|
||||
echo ========================================
|
||||
echo 1. Close and reopen this terminal (to reload PATH)
|
||||
echo 1. Close and reopen this terminal to reload PATH
|
||||
echo 2. Run: build_mingw.bat --release
|
||||
echo 3. Your .exe will be in: build\mingw-release\SecondVoice.exe
|
||||
echo.
|
||||
echo Total installation size: ~500MB
|
||||
echo (vs 10GB+ for Visual Studio!)
|
||||
echo vs 10GB+ for Visual Studio!
|
||||
echo.
|
||||
pause
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#include "ClaudeClient.h"
|
||||
#include "../mingw_compat.h"
|
||||
#include <httplib.h>
|
||||
#include "WinHttpClient.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <iostream>
|
||||
|
||||
@ -37,40 +36,37 @@ std::optional<ClaudeResponse> ClaudeClient::translate(
|
||||
|
||||
std::string request_body = request_json.dump();
|
||||
|
||||
// Make HTTP request
|
||||
httplib::Client client("https://api.anthropic.com");
|
||||
client.set_read_timeout(15, 0); // 15 seconds timeout
|
||||
|
||||
httplib::Headers headers = {
|
||||
WinHttpClient client;
|
||||
std::map<std::string, std::string> headers = {
|
||||
{"x-api-key", api_key_},
|
||||
{"anthropic-version", API_VERSION},
|
||||
{"content-type", "application/json"}
|
||||
};
|
||||
|
||||
auto res = client.Post("/v1/messages", headers, request_body, "application/json");
|
||||
auto response = client.postJson("api.anthropic.com", "/v1/messages", request_body, headers);
|
||||
|
||||
if (!res) {
|
||||
std::cerr << "Claude API request failed: " << httplib::to_string(res.error()) << std::endl;
|
||||
if (!response) {
|
||||
std::cerr << "Claude API request failed" << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (res->status != 200) {
|
||||
std::cerr << "Claude API error " << res->status << ": " << res->body << std::endl;
|
||||
if (response->statusCode != 200) {
|
||||
std::cerr << "Claude API error " << response->statusCode << ": " << response->body << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Parse response
|
||||
try {
|
||||
json response_json = json::parse(res->body);
|
||||
json response_json = json::parse(response->body);
|
||||
|
||||
if (!response_json.contains("content") || !response_json["content"].is_array()) {
|
||||
std::cerr << "Invalid Claude API response format" << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
ClaudeResponse response;
|
||||
response.text = response_json["content"][0]["text"].get<std::string>();
|
||||
return response;
|
||||
ClaudeResponse result;
|
||||
result.text = response_json["content"][0]["text"].get<std::string>();
|
||||
return result;
|
||||
} catch (const json::exception& e) {
|
||||
std::cerr << "Failed to parse Claude response: " << e.what() << std::endl;
|
||||
return std::nullopt;
|
||||
|
||||
@ -23,7 +23,7 @@ public:
|
||||
private:
|
||||
std::string api_key_;
|
||||
static constexpr const char* API_URL = "https://api.anthropic.com/v1/messages";
|
||||
static constexpr const char* MODEL = "claude-haiku-4-20250514";
|
||||
static constexpr const char* MODEL = "claude-3-5-haiku-20241022";
|
||||
static constexpr const char* API_VERSION = "2023-06-01";
|
||||
};
|
||||
|
||||
|
||||
@ -1,10 +1,8 @@
|
||||
#include "WhisperClient.h"
|
||||
#include "WinHttpClient.h"
|
||||
#include "../audio/AudioBuffer.h"
|
||||
#include "../mingw_compat.h"
|
||||
#include <httplib.h>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
using json = nlohmann::json;
|
||||
@ -25,70 +23,93 @@ std::optional<WhisperResponse> WhisperClient::transcribe(
|
||||
const std::string& prompt,
|
||||
const std::string& response_format) {
|
||||
|
||||
// Save audio to temporary Opus file
|
||||
// Save audio to temporary Opus/OGG file (smaller than WAV)
|
||||
AudioBuffer buffer(sample_rate, channels);
|
||||
buffer.addSamples(audio_data);
|
||||
|
||||
std::string temp_file = "secondvoice_temp.opus";
|
||||
std::string temp_file = "secondvoice_temp.ogg";
|
||||
if (!buffer.saveToOpus(temp_file)) {
|
||||
std::cerr << "Failed to save temporary Opus file" << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Read Opus file
|
||||
std::ifstream file(temp_file, std::ios::binary);
|
||||
std::cout << "Saved Opus file: " << temp_file << std::endl;
|
||||
|
||||
// Read file into memory
|
||||
std::ifstream file(temp_file, std::ios::binary | std::ios::ate);
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Failed to open temporary Opus file" << std::endl;
|
||||
std::cerr << "Failed to open temporary WAV file" << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::ostringstream opus_stream;
|
||||
opus_stream << file.rdbuf();
|
||||
std::string opus_data = opus_stream.str();
|
||||
std::streamsize size = file.tellg();
|
||||
file.seekg(0, std::ios::beg);
|
||||
std::vector<char> fileData(size);
|
||||
if (!file.read(fileData.data(), size)) {
|
||||
std::cerr << "Failed to read temporary WAV file" << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
file.close();
|
||||
|
||||
// Make HTTP request
|
||||
httplib::Client client("https://api.openai.com");
|
||||
client.set_read_timeout(30, 0); // 30 seconds timeout
|
||||
// Prepare form fields
|
||||
std::map<std::string, std::string> fields = {
|
||||
{"model", model},
|
||||
{"language", language},
|
||||
{"response_format", response_format}
|
||||
};
|
||||
|
||||
httplib::Headers headers = {
|
||||
// Temperature only supported by whisper-1
|
||||
if (model == "whisper-1") {
|
||||
fields["temperature"] = std::to_string(temperature);
|
||||
}
|
||||
|
||||
if (!prompt.empty()) {
|
||||
fields["prompt"] = prompt;
|
||||
}
|
||||
|
||||
// Make request
|
||||
WinHttpClient client;
|
||||
std::map<std::string, std::string> headers = {
|
||||
{"Authorization", "Bearer " + api_key_}
|
||||
};
|
||||
|
||||
httplib::UploadFormDataItems items;
|
||||
items.push_back({"file", opus_data, "audio.opus", "audio/ogg"});
|
||||
items.push_back({"model", model, "", ""});
|
||||
items.push_back({"language", language, "", ""});
|
||||
items.push_back({"temperature", std::to_string(temperature), "", ""});
|
||||
items.push_back({"response_format", response_format, "", ""});
|
||||
auto response = client.postMultipart(
|
||||
"api.openai.com",
|
||||
"/v1/audio/transcriptions",
|
||||
fields,
|
||||
"file",
|
||||
"audio.ogg",
|
||||
"audio/ogg",
|
||||
fileData,
|
||||
headers
|
||||
);
|
||||
|
||||
// Add prompt if provided
|
||||
if (!prompt.empty()) {
|
||||
items.push_back({"prompt", prompt, "", ""});
|
||||
}
|
||||
|
||||
auto res = client.Post("/v1/audio/transcriptions", headers, items);
|
||||
|
||||
if (!res) {
|
||||
std::cerr << "Whisper API request failed: " << httplib::to_string(res.error()) << std::endl;
|
||||
if (!response) {
|
||||
std::cerr << "Whisper API request failed" << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (res->status != 200) {
|
||||
std::cerr << "Whisper API error " << res->status << ": " << res->body << std::endl;
|
||||
if (response->statusCode != 200) {
|
||||
std::cerr << "Whisper API error " << response->statusCode << ": " << response->body << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Parse response
|
||||
try {
|
||||
json response_json = json::parse(res->body);
|
||||
WhisperResponse response;
|
||||
response.text = response_json["text"].get<std::string>();
|
||||
return response;
|
||||
} catch (const json::exception& e) {
|
||||
std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl;
|
||||
return std::nullopt;
|
||||
// Parse response - for "text" format, response is plain text
|
||||
// For "json" format, it's JSON
|
||||
WhisperResponse result;
|
||||
if (response_format == "text") {
|
||||
result.text = response->body;
|
||||
} else {
|
||||
try {
|
||||
json response_json = json::parse(response->body);
|
||||
result.text = response_json["text"].get<std::string>();
|
||||
} catch (const json::exception& e) {
|
||||
std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace secondvoice
|
||||
|
||||
301
src/api/WinHttpClient.cpp
Normal file
301
src/api/WinHttpClient.cpp
Normal file
@ -0,0 +1,301 @@
|
||||
#include "WinHttpClient.h"
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <random>
|
||||
#include <iomanip>
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma comment(lib, "winhttp.lib")
|
||||
#endif
|
||||
|
||||
namespace secondvoice {
|
||||
|
||||
// Opens the WinHTTP session handle that this client's requests are issued on.
// If the session cannot be opened the error code is logged and hSession_ is
// left null; postJson()/postMultipart() then return std::nullopt.
WinHttpClient::WinHttpClient() {
#ifdef _WIN32
    const wchar_t* const userAgent = L"SecondVoice/1.0";

    hSession_ = WinHttpOpen(userAgent,
                            WINHTTP_ACCESS_TYPE_DEFAULT_PROXY,
                            WINHTTP_NO_PROXY_NAME,
                            WINHTTP_NO_PROXY_BYPASS,
                            0);

    if (hSession_ == nullptr) {
        std::cerr << "WinHttpOpen failed: " << GetLastError() << std::endl;
    }
#endif
}
|
||||
|
||||
// Releases the WinHTTP session handle, provided one was opened successfully.
WinHttpClient::~WinHttpClient() {
#ifdef _WIN32
    if (hSession_ != nullptr) {
        WinHttpCloseHandle(hSession_);
    }
#endif
}
|
||||
|
||||
// Produces a multipart boundary token: the fixed prefix "----WebKitFormBoundary"
// followed by 16 randomly chosen lowercase hexadecimal digits.
std::string WinHttpClient::generateBoundary() {
    static constexpr char kHexDigits[] = "0123456789abcdef";
    constexpr int kSuffixLength = 16;

    // A fresh engine is seeded from the random device on every call.
    std::random_device seedSource;
    std::mt19937 engine(seedSource());
    std::uniform_int_distribution<> pickDigit(0, 15);

    std::string boundary("----WebKitFormBoundary");
    boundary.reserve(boundary.size() + kSuffixLength);
    for (int remaining = kSuffixLength; remaining > 0; --remaining) {
        boundary.push_back(kHexDigits[pickDigit(engine)]);
    }
    return boundary;
}
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::wstring toWideString(const std::string& str) {
|
||||
if (str.empty()) return L"";
|
||||
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, nullptr, 0);
|
||||
std::wstring result(size - 1, 0);
|
||||
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, &result[0], size);
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
std::optional<HttpResponse> WinHttpClient::postJson(
|
||||
const std::string& host,
|
||||
const std::string& path,
|
||||
const std::string& jsonBody,
|
||||
const std::map<std::string, std::string>& headers) {
|
||||
|
||||
#ifdef _WIN32
|
||||
if (!hSession_) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::wstring wHost = toWideString(host);
|
||||
HINTERNET hConnect = WinHttpConnect(hSession_, wHost.c_str(), INTERNET_DEFAULT_HTTPS_PORT, 0);
|
||||
if (!hConnect) {
|
||||
std::cerr << "WinHttpConnect failed: " << GetLastError() << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::wstring wPath = toWideString(path);
|
||||
HINTERNET hRequest = WinHttpOpenRequest(
|
||||
hConnect,
|
||||
L"POST",
|
||||
wPath.c_str(),
|
||||
nullptr,
|
||||
WINHTTP_NO_REFERER,
|
||||
WINHTTP_DEFAULT_ACCEPT_TYPES,
|
||||
WINHTTP_FLAG_SECURE
|
||||
);
|
||||
if (!hRequest) {
|
||||
std::cerr << "WinHttpOpenRequest failed: " << GetLastError() << std::endl;
|
||||
WinHttpCloseHandle(hConnect);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Add headers
|
||||
for (const auto& [key, value] : headers) {
|
||||
std::wstring header = toWideString(key + ": " + value);
|
||||
WinHttpAddRequestHeaders(hRequest, header.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
|
||||
}
|
||||
|
||||
// Send request
|
||||
BOOL result = WinHttpSendRequest(
|
||||
hRequest,
|
||||
WINHTTP_NO_ADDITIONAL_HEADERS,
|
||||
0,
|
||||
(LPVOID)jsonBody.c_str(),
|
||||
jsonBody.length(),
|
||||
jsonBody.length(),
|
||||
0
|
||||
);
|
||||
|
||||
if (!result) {
|
||||
std::cerr << "WinHttpSendRequest failed: " << GetLastError() << std::endl;
|
||||
WinHttpCloseHandle(hRequest);
|
||||
WinHttpCloseHandle(hConnect);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
result = WinHttpReceiveResponse(hRequest, nullptr);
|
||||
if (!result) {
|
||||
std::cerr << "WinHttpReceiveResponse failed: " << GetLastError() << std::endl;
|
||||
WinHttpCloseHandle(hRequest);
|
||||
WinHttpCloseHandle(hConnect);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Get status code
|
||||
DWORD statusCode = 0;
|
||||
DWORD statusCodeSize = sizeof(statusCode);
|
||||
WinHttpQueryHeaders(
|
||||
hRequest,
|
||||
WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER,
|
||||
WINHTTP_HEADER_NAME_BY_INDEX,
|
||||
&statusCode,
|
||||
&statusCodeSize,
|
||||
WINHTTP_NO_HEADER_INDEX
|
||||
);
|
||||
|
||||
// Read response body
|
||||
std::string responseBody;
|
||||
DWORD bytesAvailable = 0;
|
||||
do {
|
||||
bytesAvailable = 0;
|
||||
if (!WinHttpQueryDataAvailable(hRequest, &bytesAvailable)) {
|
||||
break;
|
||||
}
|
||||
if (bytesAvailable == 0) break;
|
||||
|
||||
std::vector<char> buffer(bytesAvailable + 1);
|
||||
DWORD bytesRead = 0;
|
||||
if (WinHttpReadData(hRequest, buffer.data(), bytesAvailable, &bytesRead)) {
|
||||
responseBody.append(buffer.data(), bytesRead);
|
||||
}
|
||||
} while (bytesAvailable > 0);
|
||||
|
||||
WinHttpCloseHandle(hRequest);
|
||||
WinHttpCloseHandle(hConnect);
|
||||
|
||||
return HttpResponse{static_cast<int>(statusCode), responseBody};
|
||||
#else
|
||||
return std::nullopt;
|
||||
#endif
|
||||
}
|
||||
|
||||
std::optional<HttpResponse> WinHttpClient::postMultipart(
|
||||
const std::string& host,
|
||||
const std::string& path,
|
||||
const std::map<std::string, std::string>& fields,
|
||||
const std::string& fileFieldName,
|
||||
const std::string& fileName,
|
||||
const std::string& fileContentType,
|
||||
const std::vector<char>& fileData,
|
||||
const std::map<std::string, std::string>& headers) {
|
||||
|
||||
#ifdef _WIN32
|
||||
if (!hSession_) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::string boundary = generateBoundary();
|
||||
|
||||
// Build multipart body
|
||||
std::ostringstream body;
|
||||
|
||||
// Add fields
|
||||
for (const auto& [key, value] : fields) {
|
||||
body << "--" << boundary << "\r\n";
|
||||
body << "Content-Disposition: form-data; name=\"" << key << "\"\r\n\r\n";
|
||||
body << value << "\r\n";
|
||||
}
|
||||
|
||||
// Add file
|
||||
body << "--" << boundary << "\r\n";
|
||||
body << "Content-Disposition: form-data; name=\"" << fileFieldName << "\"; filename=\"" << fileName << "\"\r\n";
|
||||
body << "Content-Type: " << fileContentType << "\r\n\r\n";
|
||||
|
||||
std::string bodyPrefix = body.str();
|
||||
std::string bodySuffix = "\r\n--" + boundary + "--\r\n";
|
||||
|
||||
// Combine all parts
|
||||
std::vector<char> fullBody;
|
||||
fullBody.insert(fullBody.end(), bodyPrefix.begin(), bodyPrefix.end());
|
||||
fullBody.insert(fullBody.end(), fileData.begin(), fileData.end());
|
||||
fullBody.insert(fullBody.end(), bodySuffix.begin(), bodySuffix.end());
|
||||
|
||||
std::wstring wHost = toWideString(host);
|
||||
HINTERNET hConnect = WinHttpConnect(hSession_, wHost.c_str(), INTERNET_DEFAULT_HTTPS_PORT, 0);
|
||||
if (!hConnect) {
|
||||
std::cerr << "WinHttpConnect failed: " << GetLastError() << std::endl;
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::wstring wPath = toWideString(path);
|
||||
HINTERNET hRequest = WinHttpOpenRequest(
|
||||
hConnect,
|
||||
L"POST",
|
||||
wPath.c_str(),
|
||||
nullptr,
|
||||
WINHTTP_NO_REFERER,
|
||||
WINHTTP_DEFAULT_ACCEPT_TYPES,
|
||||
WINHTTP_FLAG_SECURE
|
||||
);
|
||||
if (!hRequest) {
|
||||
std::cerr << "WinHttpOpenRequest failed: " << GetLastError() << std::endl;
|
||||
WinHttpCloseHandle(hConnect);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Add headers
|
||||
for (const auto& [key, value] : headers) {
|
||||
std::wstring header = toWideString(key + ": " + value);
|
||||
WinHttpAddRequestHeaders(hRequest, header.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
|
||||
}
|
||||
|
||||
// Add content-type header
|
||||
std::wstring contentType = toWideString("Content-Type: multipart/form-data; boundary=" + boundary);
|
||||
WinHttpAddRequestHeaders(hRequest, contentType.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
|
||||
|
||||
// Send request
|
||||
BOOL result = WinHttpSendRequest(
|
||||
hRequest,
|
||||
WINHTTP_NO_ADDITIONAL_HEADERS,
|
||||
0,
|
||||
fullBody.data(),
|
||||
fullBody.size(),
|
||||
fullBody.size(),
|
||||
0
|
||||
);
|
||||
|
||||
if (!result) {
|
||||
std::cerr << "WinHttpSendRequest failed: " << GetLastError() << std::endl;
|
||||
WinHttpCloseHandle(hRequest);
|
||||
WinHttpCloseHandle(hConnect);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
result = WinHttpReceiveResponse(hRequest, nullptr);
|
||||
if (!result) {
|
||||
std::cerr << "WinHttpReceiveResponse failed: " << GetLastError() << std::endl;
|
||||
WinHttpCloseHandle(hRequest);
|
||||
WinHttpCloseHandle(hConnect);
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Get status code
|
||||
DWORD statusCode = 0;
|
||||
DWORD statusCodeSize = sizeof(statusCode);
|
||||
WinHttpQueryHeaders(
|
||||
hRequest,
|
||||
WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER,
|
||||
WINHTTP_HEADER_NAME_BY_INDEX,
|
||||
&statusCode,
|
||||
&statusCodeSize,
|
||||
WINHTTP_NO_HEADER_INDEX
|
||||
);
|
||||
|
||||
// Read response body
|
||||
std::string responseBody;
|
||||
DWORD bytesAvailable = 0;
|
||||
do {
|
||||
bytesAvailable = 0;
|
||||
if (!WinHttpQueryDataAvailable(hRequest, &bytesAvailable)) {
|
||||
break;
|
||||
}
|
||||
if (bytesAvailable == 0) break;
|
||||
|
||||
std::vector<char> buffer(bytesAvailable + 1);
|
||||
DWORD bytesRead = 0;
|
||||
if (WinHttpReadData(hRequest, buffer.data(), bytesAvailable, &bytesRead)) {
|
||||
responseBody.append(buffer.data(), bytesRead);
|
||||
}
|
||||
} while (bytesAvailable > 0);
|
||||
|
||||
WinHttpCloseHandle(hRequest);
|
||||
WinHttpCloseHandle(hConnect);
|
||||
|
||||
return HttpResponse{static_cast<int>(statusCode), responseBody};
|
||||
#else
|
||||
return std::nullopt;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace secondvoice
|
||||
52
src/api/WinHttpClient.h
Normal file
52
src/api/WinHttpClient.h
Normal file
@ -0,0 +1,52 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#include <winhttp.h>
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <optional>
|
||||
|
||||
namespace secondvoice {

// Result of a completed HTTP exchange.
struct HttpResponse {
    int statusCode = 0;  // HTTP status code; 0 until a response has been stored
    std::string body;    // response body exactly as received
};

// Minimal HTTPS POST client built on the native WinHTTP API.
//
// On non-Windows builds the class still compiles, but both request methods
// return std::nullopt.
class WinHttpClient {
public:
    WinHttpClient();
    ~WinHttpClient();

    // The client owns a raw WinHTTP session handle that the destructor closes,
    // so copying would close the same handle twice.
    WinHttpClient(const WinHttpClient&) = delete;
    WinHttpClient& operator=(const WinHttpClient&) = delete;

    // POST JSON request.
    // Sends `jsonBody` to https://<host><path> with the given headers.
    // Returns std::nullopt when the request could not be completed.
    std::optional<HttpResponse> postJson(
        const std::string& host,
        const std::string& path,
        const std::string& jsonBody,
        const std::map<std::string, std::string>& headers
    );

    // POST multipart form data.
    // Sends `fields` as form fields plus one file part (`fileFieldName`,
    // `fileName`, `fileContentType`, `fileData`) to https://<host><path>.
    // Returns std::nullopt when the request could not be completed.
    std::optional<HttpResponse> postMultipart(
        const std::string& host,
        const std::string& path,
        const std::map<std::string, std::string>& fields,
        const std::string& fileFieldName,
        const std::string& fileName,
        const std::string& fileContentType,
        const std::vector<char>& fileData,
        const std::map<std::string, std::string>& headers
    );

private:
#ifdef _WIN32
    HINTERNET hSession_ = nullptr;  // WinHTTP session handle; null if opening failed
#endif
    // Returns a fresh multipart boundary string.
    std::string generateBoundary();
};

} // namespace secondvoice
|
||||
@ -3,10 +3,12 @@
|
||||
|
||||
namespace secondvoice {
|
||||
|
||||
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds)
|
||||
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds)
|
||||
: sample_rate_(sample_rate)
|
||||
, channels_(channels)
|
||||
, chunk_duration_seconds_(chunk_duration_seconds) {
|
||||
, chunk_duration_seconds_(chunk_duration_seconds)
|
||||
, chunk_step_seconds_(chunk_step_seconds > 0 ? chunk_step_seconds : chunk_duration_seconds) {
|
||||
// If step not specified, use duration (no overlap)
|
||||
}
|
||||
|
||||
AudioCapture::~AudioCapture() {
|
||||
@ -47,12 +49,17 @@ int AudioCapture::audioCallback(const void* input, void* output,
|
||||
|
||||
// Check if we have accumulated enough data for a chunk
|
||||
size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_;
|
||||
size_t step_samples = self->sample_rate_ * self->channels_ * self->chunk_step_seconds_;
|
||||
|
||||
if (self->buffer_.size() >= chunk_samples) {
|
||||
// Call the callback with the chunk
|
||||
// Call the callback with the full chunk
|
||||
if (self->callback_) {
|
||||
self->callback_(self->buffer_);
|
||||
// Send exactly chunk_samples (the window)
|
||||
std::vector<float> chunk(self->buffer_.begin(), self->buffer_.begin() + chunk_samples);
|
||||
self->callback_(chunk);
|
||||
}
|
||||
self->buffer_.clear();
|
||||
// Sliding window: remove only step_samples, keep overlap for next chunk
|
||||
self->buffer_.erase(self->buffer_.begin(), self->buffer_.begin() + step_samples);
|
||||
}
|
||||
|
||||
return paContinue;
|
||||
|
||||
@ -11,7 +11,8 @@ class AudioCapture {
|
||||
public:
|
||||
using AudioCallback = std::function<void(const std::vector<float>&)>;
|
||||
|
||||
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds);
|
||||
// chunk_duration = window size, chunk_step = how often to send (overlap = duration - step)
|
||||
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds = 0);
|
||||
~AudioCapture();
|
||||
|
||||
bool initialize();
|
||||
@ -29,6 +30,7 @@ private:
|
||||
int sample_rate_;
|
||||
int channels_;
|
||||
int chunk_duration_seconds_;
|
||||
int chunk_step_seconds_; // How often to emit chunks (0 = same as duration, no overlap)
|
||||
bool is_recording_ = false;
|
||||
|
||||
PaStream* stream_ = nullptr;
|
||||
|
||||
@ -10,6 +10,8 @@
|
||||
#include <sstream>
|
||||
#include <chrono>
|
||||
#include <filesystem>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
namespace secondvoice {
|
||||
|
||||
@ -22,13 +24,19 @@ Pipeline::~Pipeline() {
|
||||
bool Pipeline::initialize() {
|
||||
auto& config = Config::getInstance();
|
||||
|
||||
// Initialize audio capture
|
||||
// Initialize audio capture with sliding window (overlap)
|
||||
audio_capture_ = std::make_unique<AudioCapture>(
|
||||
config.getAudioConfig().sample_rate,
|
||||
config.getAudioConfig().channels,
|
||||
config.getAudioConfig().chunk_duration_seconds
|
||||
config.getAudioConfig().chunk_duration_seconds,
|
||||
config.getAudioConfig().chunk_step_seconds
|
||||
);
|
||||
|
||||
std::cout << "[Pipeline] Audio: " << config.getAudioConfig().chunk_duration_seconds
|
||||
<< "s window, " << config.getAudioConfig().chunk_step_seconds
|
||||
<< "s step (overlap: " << (config.getAudioConfig().chunk_duration_seconds - config.getAudioConfig().chunk_step_seconds)
|
||||
<< "s)" << std::endl;
|
||||
|
||||
if (!audio_capture_->initialize()) {
|
||||
std::cerr << "Failed to initialize audio capture" << std::endl;
|
||||
return false;
|
||||
@ -68,10 +76,9 @@ bool Pipeline::start() {
|
||||
|
||||
running_ = true;
|
||||
|
||||
// Start threads
|
||||
// Start background threads (NOT UI - that runs in main thread)
|
||||
audio_thread_ = std::thread(&Pipeline::audioThread, this);
|
||||
processing_thread_ = std::thread(&Pipeline::processingThread, this);
|
||||
ui_thread_ = std::thread(&Pipeline::uiThread, this);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -92,16 +99,13 @@ void Pipeline::stop() {
|
||||
audio_queue_.shutdown();
|
||||
transcription_queue_.shutdown();
|
||||
|
||||
// Wait for threads
|
||||
// Wait for background threads
|
||||
if (audio_thread_.joinable()) {
|
||||
audio_thread_.join();
|
||||
}
|
||||
if (processing_thread_.joinable()) {
|
||||
processing_thread_.join();
|
||||
}
|
||||
if (ui_thread_.joinable()) {
|
||||
ui_thread_.join();
|
||||
}
|
||||
|
||||
// Save full recording
|
||||
auto& config = Config::getInstance();
|
||||
@ -153,6 +157,11 @@ void Pipeline::audioThread() {
|
||||
void Pipeline::processingThread() {
|
||||
auto& config = Config::getInstance();
|
||||
|
||||
// VAD threshold - audio RMS must exceed this to be considered speech
|
||||
// Higher values = more aggressive noise filtering
|
||||
constexpr float VAD_THRESHOLD = 0.02f; // RMS energy threshold
|
||||
constexpr float VAD_MIN_PEAK = 0.08f; // Minimum peak amplitude
|
||||
|
||||
while (running_) {
|
||||
auto chunk_opt = audio_queue_.wait_and_pop();
|
||||
if (!chunk_opt.has_value()) {
|
||||
@ -161,6 +170,22 @@ void Pipeline::processingThread() {
|
||||
|
||||
auto& chunk = chunk_opt.value();
|
||||
|
||||
// Voice Activity Detection - skip silent/noise chunks
|
||||
float sum_squared = 0.0f;
|
||||
float max_amplitude = 0.0f;
|
||||
for (const float sample : chunk.data) {
|
||||
sum_squared += sample * sample;
|
||||
max_amplitude = std::max(max_amplitude, std::abs(sample));
|
||||
}
|
||||
float rms = std::sqrt(sum_squared / chunk.data.size());
|
||||
|
||||
if (rms < VAD_THRESHOLD || max_amplitude < VAD_MIN_PEAK) {
|
||||
std::cout << "[Skip] Silent chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
std::cout << "[Audio] Processing chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
|
||||
|
||||
// Transcribe with Whisper
|
||||
auto whisper_result = whisper_client_->transcribe(
|
||||
chunk.data,
|
||||
@ -178,9 +203,71 @@ void Pipeline::processingThread() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Translate with Claude
|
||||
// Filter out Whisper hallucinations (common when audio is silent/unclear)
|
||||
std::string text = whisper_result->text;
|
||||
// Trim whitespace
|
||||
size_t start = text.find_first_not_of(" \t\n\r");
|
||||
size_t end = text.find_last_not_of(" \t\n\r");
|
||||
if (start == std::string::npos) {
|
||||
std::cout << "[Skip] Empty transcription" << std::endl;
|
||||
continue;
|
||||
}
|
||||
text = text.substr(start, end - start + 1);
|
||||
|
||||
// Skip known hallucinations and garbage
|
||||
bool is_garbage = false;
|
||||
|
||||
// Too short to be meaningful
|
||||
if (text.length() < 4) {
|
||||
is_garbage = true;
|
||||
}
|
||||
// Known Whisper hallucinations
|
||||
else if (text.find("Amara.org") != std::string::npos ||
|
||||
text.find("amara.org") != std::string::npos ||
|
||||
text.find("字幕") != std::string::npos ||
|
||||
text.find("subtitle") != std::string::npos ||
|
||||
text.find("Subtitle") != std::string::npos ||
|
||||
text.find("Thank you") != std::string::npos ||
|
||||
text.find("thanks for watching") != std::string::npos ||
|
||||
text.find("Subscribe") != std::string::npos ||
|
||||
text.find("谢谢观看") != std::string::npos ||
|
||||
text.find("订阅") != std::string::npos ||
|
||||
text.find("...") == 0) { // Starts with ellipsis
|
||||
is_garbage = true;
|
||||
}
|
||||
// Only punctuation or whitespace
|
||||
else {
|
||||
bool has_content = false;
|
||||
for (char c : text) {
|
||||
if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) {
|
||||
has_content = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!has_content) {
|
||||
is_garbage = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_garbage) {
|
||||
std::cout << "[Skip] Filtered: " << text << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Remove overlap with previous transcription
|
||||
std::string deduplicated = removeOverlap(text, last_transcription_);
|
||||
last_transcription_ = text; // Store full text for next comparison
|
||||
|
||||
if (deduplicated.empty() || deduplicated.length() < 2) {
|
||||
std::cout << "[Skip] Fully overlapping transcription" << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
std::cout << "[New content] " << deduplicated << std::endl;
|
||||
|
||||
// Translate with Claude (only the new, deduplicated content)
|
||||
auto claude_result = claude_client_->translate(
|
||||
whisper_result->text,
|
||||
deduplicated,
|
||||
config.getClaudeConfig().system_prompt,
|
||||
config.getClaudeConfig().max_tokens,
|
||||
config.getClaudeConfig().temperature
|
||||
@ -192,31 +279,69 @@ void Pipeline::processingThread() {
|
||||
}
|
||||
|
||||
// Add to UI
|
||||
ui_->addTranslation(whisper_result->text, claude_result->text);
|
||||
ui_->addTranslation(deduplicated, claude_result->text);
|
||||
|
||||
std::cout << "CN: " << whisper_result->text << std::endl;
|
||||
std::cout << "CN: " << deduplicated << std::endl;
|
||||
std::cout << "FR: " << claude_result->text << std::endl;
|
||||
std::cout << "---" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void Pipeline::uiThread() {
|
||||
// CRITICAL: Make OpenGL context current in THIS thread (not the thread that created it)
|
||||
ui_->makeContextCurrent();
|
||||
void Pipeline::update() {
|
||||
if (!ui_) return;
|
||||
|
||||
while (running_ && !ui_->shouldClose()) {
|
||||
ui_->setRecordingDuration(recording_duration_);
|
||||
ui_->setProcessingStatus("Processing...");
|
||||
ui_->render();
|
||||
ui_->setRecordingDuration(recording_duration_);
|
||||
ui_->setProcessingStatus("Processing...");
|
||||
ui_->render();
|
||||
|
||||
// Check if stop was requested
|
||||
if (ui_->isStopRequested()) {
|
||||
running_ = false;
|
||||
break;
|
||||
}
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(16)); // ~60 FPS
|
||||
// Check if stop was requested
|
||||
if (ui_->isStopRequested()) {
|
||||
running_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
bool Pipeline::shouldClose() const {
|
||||
return ui_ ? ui_->shouldClose() : true;
|
||||
}
|
||||
|
||||
// Strips from the front of `current` the text that repeats the tail of
// `previous`, so that only newly transcribed content is returned.
//
// Parameters:
//   current  - transcription of the newest audio window.
//   previous - transcription of the preceding window.
//
// Returns:
//   - `current` unchanged when `previous` is empty or no overlap is found;
//   - the part of `current` after the overlap otherwise. This is an empty
//     string when `current` consists entirely of text already present at the
//     end of `previous`, which lets the caller's empty-result check skip it.
//
// All lengths and offsets are in bytes. A match always begins at the first
// byte of `current`, so a successful match cannot split a multi-byte
// character as long as both inputs are well-formed UTF-8.
std::string Pipeline::removeOverlap(const std::string& current, const std::string& previous) {
    if (previous.empty()) {
        return current;
    }

    constexpr size_t min_overlap = 4;  // Minimum bytes to consider as overlap
    const size_t search_start = previous.length() / 3;  // Skip the first third of previous

    // Pass 1: find the longest suffix of `previous` (starting no earlier than
    // one third into it, and strictly longer than min_overlap) that is a
    // prefix of `current`.
    // The bound is written as `i + min_overlap < length` rather than
    // `i < length - min_overlap` so it cannot wrap around when `previous`
    // is shorter than min_overlap.
    for (size_t i = search_start; i + min_overlap < previous.length(); ++i) {
        const size_t suffix_len = previous.length() - i;

        // Prefix test without scanning the rest of `current` or allocating a
        // temporary; yields non-zero when `current` is shorter than the suffix.
        if (current.compare(0, suffix_len, previous, i, suffix_len) != 0) {
            continue;
        }

        std::cout << "[Overlap] Removed: \"" << previous.substr(i) << "\"" << std::endl;
        // May be empty: `current` was fully contained in the tail of `previous`.
        return current.substr(suffix_len);
    }

    // Pass 2: compare the last `len` bytes of `previous` with the first `len`
    // bytes of `current`, longest first, down to exactly min_overlap bytes
    // (a length pass 1 excludes). `len` never exceeds half of `current`, so
    // the remainder returned here is never empty.
    const size_t max_check = std::min(current.length(), previous.length()) / 2;
    for (size_t len = max_check; len >= min_overlap; --len) {
        if (current.compare(0, len, previous, previous.length() - len, len) == 0) {
            std::cout << "[Overlap] Partial match removed: \"" << current.substr(0, len) << "\"" << std::endl;
            return current.substr(len);
        }
    }

    return current;  // No overlap detected
}
|
||||
|
||||
} // namespace secondvoice
|
||||
|
||||
@ -34,12 +34,15 @@ public:
|
||||
bool start();
|
||||
void stop();
|
||||
|
||||
// Call this from main thread for UI updates
|
||||
void update();
|
||||
|
||||
bool isRunning() const { return running_; }
|
||||
bool shouldClose() const;
|
||||
|
||||
private:
|
||||
void audioThread();
|
||||
void processingThread();
|
||||
void uiThread();
|
||||
|
||||
std::unique_ptr<AudioCapture> audio_capture_;
|
||||
std::unique_ptr<WhisperClient> whisper_client_;
|
||||
@ -52,10 +55,13 @@ private:
|
||||
|
||||
std::thread audio_thread_;
|
||||
std::thread processing_thread_;
|
||||
std::thread ui_thread_;
|
||||
|
||||
std::atomic<bool> running_{false};
|
||||
std::atomic<int> recording_duration_{0};
|
||||
|
||||
// For overlap deduplication
|
||||
std::string last_transcription_;
|
||||
std::string removeOverlap(const std::string& current, const std::string& previous);
|
||||
};
|
||||
|
||||
} // namespace secondvoice
|
||||
|
||||
21
src/main.cpp
21
src/main.cpp
@ -4,6 +4,10 @@
|
||||
#include "utils/Config.h"
|
||||
#include "core/Pipeline.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Force NVIDIA GPU on Optimus systems (instead of integrated AMD/Intel)
|
||||
// These MUST be global and volatile to prevent linker optimization
|
||||
#ifdef _WIN32
|
||||
@ -38,6 +42,12 @@ int main(int argc, char** argv) {
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
#ifdef _WIN32
|
||||
// Enable UTF-8 console output for Chinese characters
|
||||
SetConsoleOutputCP(CP_UTF8);
|
||||
SetConsoleCP(CP_UTF8);
|
||||
#endif
|
||||
|
||||
log_msg("MAIN: Entry point reached");
|
||||
log_msg("========================================");
|
||||
log_msg("SecondVoice - Real-time Translation System");
|
||||
@ -67,17 +77,18 @@ int main(int argc, char** argv) {
|
||||
log_msg("Pipeline initialized successfully");
|
||||
log_msg("Starting recording and translation...");
|
||||
|
||||
// Start pipeline
|
||||
// Start pipeline (background threads for audio + API calls)
|
||||
log_msg("Starting pipeline...");
|
||||
if (!pipeline.start()) {
|
||||
log_msg("ERROR: Failed to start pipeline");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Wait for pipeline to finish (user clicks Stop button)
|
||||
log_msg("Pipeline running, waiting for user to stop...");
|
||||
while (pipeline.isRunning()) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
// Main loop - UI runs in main thread (required by GLFW)
|
||||
log_msg("Pipeline running, entering main loop...");
|
||||
while (pipeline.isRunning() && !pipeline.shouldClose()) {
|
||||
pipeline.update(); // Render one frame
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(16)); // ~60 FPS
|
||||
}
|
||||
|
||||
log_msg("");
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#include <glad/glad.h> // MUST be FIRST! Provides OpenGL functions
|
||||
#define GLFW_INCLUDE_NONE // Tell GLFW not to include OpenGL headers (GLAD does it)
|
||||
#include "TranslationUI.h"
|
||||
#include "../utils/Config.h"
|
||||
#include <imgui.h>
|
||||
#include <imgui_impl_glfw.h>
|
||||
#include <imgui_impl_opengl3.h>
|
||||
@ -86,6 +87,40 @@ bool TranslationUI::initialize() {
|
||||
ImGuiIO& io = ImGui::GetIO();
|
||||
io.ConfigFlags |= ImGuiConfigFlags_NavEnableKeyboard;
|
||||
|
||||
// Load Chinese font from Windows system fonts
|
||||
float font_size = static_cast<float>(Config::getInstance().getUIConfig().font_size);
|
||||
std::cout << "[UI] Loading Chinese font (size: " << font_size << ")..." << std::endl;
|
||||
ImFontConfig font_config;
|
||||
font_config.OversampleH = 2;
|
||||
font_config.OversampleV = 2;
|
||||
|
||||
// Try common Chinese fonts on Windows
|
||||
const char* chinese_fonts[] = {
|
||||
"C:\\Windows\\Fonts\\msyh.ttc", // Microsoft YaHei
|
||||
"C:\\Windows\\Fonts\\simhei.ttf", // SimHei
|
||||
"C:\\Windows\\Fonts\\simsun.ttc", // SimSun
|
||||
"C:\\Windows\\Fonts\\mingliub.ttc", // MingLiU
|
||||
};
|
||||
|
||||
ImFont* font = nullptr;
|
||||
for (const char* font_path : chinese_fonts) {
|
||||
font = io.Fonts->AddFontFromFileTTF(
|
||||
font_path,
|
||||
font_size,
|
||||
&font_config,
|
||||
io.Fonts->GetGlyphRangesChineseFull()
|
||||
);
|
||||
if (font) {
|
||||
std::cout << "[UI] Loaded font: " << font_path << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!font) {
|
||||
std::cout << "[UI] Warning: No Chinese font found, using default (Chinese chars won't display)" << std::endl;
|
||||
io.Fonts->AddFontDefault();
|
||||
}
|
||||
|
||||
ImGui::StyleColorsDark();
|
||||
|
||||
ImGui_ImplGlfw_InitForOpenGL(window_, true);
|
||||
@ -130,13 +165,12 @@ void main() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Let ImGui auto-detect the GLSL version
|
||||
ImGui_ImplOpenGL3_Init(nullptr);
|
||||
// Explicitly specify GLSL version for OpenGL 3.3 core profile
|
||||
// On Windows, auto-detection (nullptr) can fail, so we specify "#version 330"
|
||||
std::cout << "[UI] Initializing ImGui OpenGL3 backend with GLSL 330..." << std::endl;
|
||||
ImGui_ImplOpenGL3_Init("#version 330");
|
||||
|
||||
// CRITICAL: Release the OpenGL context from this thread
|
||||
// The UI rendering will happen in a separate thread which will call makeContextCurrent()
|
||||
std::cout << "[UI] Releasing OpenGL context from initialization thread" << std::endl;
|
||||
glfwMakeContextCurrent(nullptr);
|
||||
// Context stays in main thread - no need to release
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -71,7 +71,8 @@ bool Config::load(const std::string& config_path, const std::string& env_path) {
|
||||
audio_config_.sample_rate = audio.value("sample_rate", 16000);
|
||||
audio_config_.channels = audio.value("channels", 1);
|
||||
audio_config_.chunk_duration_seconds = audio.value("chunk_duration_seconds", 10);
|
||||
audio_config_.format = audio.value("format", "wav");
|
||||
audio_config_.chunk_step_seconds = audio.value("chunk_step_seconds", 0); // 0 = no overlap
|
||||
audio_config_.format = audio.value("format", "ogg");
|
||||
}
|
||||
|
||||
// Parse whisper config
|
||||
|
||||
@ -8,6 +8,7 @@ struct AudioConfig {
|
||||
int sample_rate;
|
||||
int channels;
|
||||
int chunk_duration_seconds;
|
||||
int chunk_step_seconds; // How often to emit chunks (for overlap)
|
||||
std::string format;
|
||||
};
|
||||
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
"version": "0.1.0",
|
||||
"dependencies": [
|
||||
"portaudio",
|
||||
"cpp-httplib",
|
||||
"nlohmann-json",
|
||||
"glfw3",
|
||||
"glad",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user