feat: WinHTTP transport, gpt-4o-mini-transcribe, Opus/OGG audio, sliding-window capture

- Replace cpp-httplib with native WinHTTP for HTTPS support
- Switch from whisper-1 to gpt-4o-mini-transcribe model
- Use Opus/OGG encoding instead of WAV (~10x smaller files)
- Implement sliding window audio capture with overlap
- Add transcription deduplication for overlapping segments
- Add Voice Activity Detection (VAD) to filter silence/noise
- Filter Whisper hallucinations (Amara.org, etc.)
- Add UTF-8 console support for Chinese characters
- Add Chinese font loading in ImGui
- Make Claude responses concise (translation only, no explanations)
- Configurable window size, font size, chunk duration/step

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Trouve Alexis 2025-11-23 12:17:41 +08:00
parent 089acbfff1
commit fa8ea2907b
17 changed files with 708 additions and 143 deletions

View File

@ -56,7 +56,6 @@ target_include_directories(imgui_backends PUBLIC
# Find packages (no more vcpkg imgui!)
find_package(portaudio CONFIG REQUIRED)
find_package(httplib CONFIG REQUIRED)
find_package(nlohmann_json CONFIG REQUIRED)
find_package(glfw3 CONFIG REQUIRED)
find_package(glad CONFIG REQUIRED)
@ -74,6 +73,7 @@ set(SOURCES_UI
# API clients
src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp
src/api/WinHttpClient.cpp
# UI
src/ui/TranslationUI.cpp
# Utils
@ -91,6 +91,7 @@ set(SOURCES_CONSOLE
# API clients
src/api/WhisperClient.cpp
src/api/ClaudeClient.cpp
src/api/WinHttpClient.cpp
# Utils
src/utils/Config.cpp
# Core - WAIT, Pipeline uses UI!
@ -116,7 +117,6 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
# Static linking for MinGW - need to add Windows system libs for portaudio
target_link_libraries(${PROJECT_NAME} PRIVATE
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libportaudio.a
httplib::httplib
nlohmann_json::nlohmann_json
imgui_backends
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libglfw3.a
@ -124,25 +124,27 @@ if(MINGW AND NOT BUILD_SHARED_LIBS)
OpenGL::GL
opus
ogg
# Windows system libraries for portaudio
# Windows system libraries
winmm
setupapi
ws2_32
winhttp
)
# Console version - NO UI libs
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed/x64-mingw-static/lib/libportaudio.a
httplib::httplib
nlohmann_json::nlohmann_json
opus
ogg
# Windows system libraries for portaudio
# Windows system libraries
winmm
setupapi
ws2_32
winhttp
)
else()
target_link_libraries(${PROJECT_NAME} PRIVATE
portaudio
httplib::httplib
portaudio_static
nlohmann_json::nlohmann_json
imgui_backends
glfw
@ -153,8 +155,7 @@ else()
)
# Console version - NO UI libs
target_link_libraries(${PROJECT_NAME}_Console PRIVATE
portaudio
httplib::httplib
portaudio_static
nlohmann_json::nlohmann_json
opus
ogg
@ -196,6 +197,12 @@ endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.json
${CMAKE_CURRENT_BINARY_DIR}/config.json COPYONLY)
# Copy .env file if it exists
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.env)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/.env
${CMAKE_CURRENT_BINARY_DIR}/.env COPYONLY)
endif()
# Install target
install(TARGETS ${PROJECT_NAME} DESTINATION bin)
install(FILES config.json DESTINATION bin)

View File

@ -3,26 +3,27 @@
"sample_rate": 16000,
"channels": 1,
"chunk_duration_seconds": 10,
"format": "wav"
"chunk_step_seconds": 5,
"format": "ogg"
},
"whisper": {
"model": "gpt-4o-mini-transcribe",
"language": "zh",
"temperature": 0.0,
"prompt": "The following is a conversation in Mandarin Chinese about business, family, and daily life. Common names: Tingting, Alexis.",
"stream": true,
"stream": false,
"response_format": "text"
},
"claude": {
"model": "claude-haiku-4-20250514",
"model": "claude-3-5-haiku-20241022",
"max_tokens": 1024,
"temperature": 0.3,
"system_prompt": "Tu es un traducteur professionnel chinois-français. Traduis le texte suivant de manière naturelle et contextuelle."
"system_prompt": "Tu es un traducteur chinois-français. Réponds UNIQUEMENT avec la traduction française, sans explications, notes, commentaires ou alternatives. Une seule phrase traduite, rien d'autre."
},
"ui": {
"window_width": 800,
"window_height": 600,
"font_size": 16,
"window_width": 1200,
"window_height": 800,
"font_size": 24,
"max_display_lines": 50
},
"recording": {

View File

@ -1,4 +1,5 @@
@echo off
setlocal enabledelayedexpansion
REM MinGW Setup Script for SecondVoice
REM This installs a lightweight compiler instead of Visual Studio
@ -7,18 +8,18 @@ echo SecondVoice - MinGW Setup
echo ========================================
echo.
echo This script will install:
echo - MinGW-w64 (GCC compiler for Windows)
echo - CMake (build system)
echo - Ninja (build tool)
echo - Git (if not installed)
echo - MinGW-w64 - GCC compiler for Windows
echo - CMake - build system
echo - Ninja - build tool
echo - Git if not installed
echo.
echo Total size: ~500MB (vs 10GB+ for Visual Studio!)
echo Total size: ~500MB vs 10GB+ for Visual Studio
echo.
pause
REM Check if running as admin
net session >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [WARNING] Not running as administrator
echo Some installations might fail. Recommended to run as admin.
echo.
@ -27,7 +28,7 @@ if %errorlevel% neq 0 (
REM Install chocolatey if not present
where choco >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing Chocolatey package manager...
echo.
@ -37,7 +38,7 @@ if %errorlevel% neq 0 (
set "PATH=%PATH%;C:\ProgramData\chocolatey\bin"
where choco >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo.
echo ========================================
echo [IMPORTANT] Chocolatey installed successfully!
@ -59,11 +60,11 @@ echo.
REM Install MinGW-w64
where gcc >nul 2>&1
if %errorlevel% neq 0 (
echo [INFO] Installing MinGW-w64 (GCC compiler)...
if !errorlevel! neq 0 (
echo [INFO] Installing MinGW-w64...
choco install mingw -y
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install MinGW
pause
exit /b 1
@ -79,11 +80,11 @@ if %errorlevel% neq 0 (
REM Install CMake
where cmake >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing CMake...
choco install cmake -y --installargs 'ADD_CMAKE_TO_PATH=System'
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install CMake
pause
exit /b 1
@ -99,11 +100,11 @@ if %errorlevel% neq 0 (
REM Install Ninja
where ninja >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing Ninja...
choco install ninja -y
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install Ninja
pause
exit /b 1
@ -119,11 +120,11 @@ if %errorlevel% neq 0 (
REM Install Git (if not present)
where git >nul 2>&1
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [INFO] Installing Git...
choco install git -y
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to install Git
pause
exit /b 1
@ -146,7 +147,7 @@ if not defined VCPKG_ROOT (
cd vcpkg
call bootstrap-vcpkg.bat
if %errorlevel% neq 0 (
if !errorlevel! neq 0 (
echo [ERROR] Failed to bootstrap vcpkg
pause
exit /b 1
@ -162,7 +163,7 @@ if not defined VCPKG_ROOT (
echo [SUCCESS] VCPKG_ROOT set to C:\vcpkg
echo.
) else (
echo [INFO] vcpkg already configured at: %VCPKG_ROOT%
echo [INFO] vcpkg already configured at: !VCPKG_ROOT!
echo.
)
@ -175,16 +176,16 @@ where gcc
where cmake
where ninja
where git
echo VCPKG_ROOT=%VCPKG_ROOT%
echo VCPKG_ROOT=!VCPKG_ROOT!
echo.
echo ========================================
echo Next Steps:
echo ========================================
echo 1. Close and reopen this terminal (to reload PATH)
echo 1. Close and reopen this terminal to reload PATH
echo 2. Run: build_mingw.bat --release
echo 3. Your .exe will be in: build\mingw-release\SecondVoice.exe
echo.
echo Total installation size: ~500MB
echo (vs 10GB+ for Visual Studio!)
echo vs 10GB+ for Visual Studio!
echo.
pause

View File

@ -1,6 +1,5 @@
#include "ClaudeClient.h"
#include "../mingw_compat.h"
#include <httplib.h>
#include "WinHttpClient.h"
#include <nlohmann/json.hpp>
#include <iostream>
@ -37,40 +36,37 @@ std::optional<ClaudeResponse> ClaudeClient::translate(
std::string request_body = request_json.dump();
// Make HTTP request
httplib::Client client("https://api.anthropic.com");
client.set_read_timeout(15, 0); // 15 seconds timeout
httplib::Headers headers = {
WinHttpClient client;
std::map<std::string, std::string> headers = {
{"x-api-key", api_key_},
{"anthropic-version", API_VERSION},
{"content-type", "application/json"}
};
auto res = client.Post("/v1/messages", headers, request_body, "application/json");
auto response = client.postJson("api.anthropic.com", "/v1/messages", request_body, headers);
if (!res) {
std::cerr << "Claude API request failed: " << httplib::to_string(res.error()) << std::endl;
if (!response) {
std::cerr << "Claude API request failed" << std::endl;
return std::nullopt;
}
if (res->status != 200) {
std::cerr << "Claude API error " << res->status << ": " << res->body << std::endl;
if (response->statusCode != 200) {
std::cerr << "Claude API error " << response->statusCode << ": " << response->body << std::endl;
return std::nullopt;
}
// Parse response
try {
json response_json = json::parse(res->body);
json response_json = json::parse(response->body);
if (!response_json.contains("content") || !response_json["content"].is_array()) {
std::cerr << "Invalid Claude API response format" << std::endl;
return std::nullopt;
}
ClaudeResponse response;
response.text = response_json["content"][0]["text"].get<std::string>();
return response;
ClaudeResponse result;
result.text = response_json["content"][0]["text"].get<std::string>();
return result;
} catch (const json::exception& e) {
std::cerr << "Failed to parse Claude response: " << e.what() << std::endl;
return std::nullopt;
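
The hunk above swaps the HTTP transport but never shows how request_json is assembled (that happens earlier in translate(), outside the hunk). As a rough sketch only, and purely as an assumption about what that code does, an Anthropic Messages API body could be built with nlohmann::json like this; buildClaudeRequestBody and its parameters are hypothetical names, not code from the repository:

#include <nlohmann/json.hpp>
#include <string>

// Hypothetical helper: one plausible way to build the body that
// ClaudeClient::translate() serializes into request_body.
std::string buildClaudeRequestBody(const std::string& model,
                                   const std::string& system_prompt,
                                   const std::string& user_text,
                                   int max_tokens,
                                   double temperature) {
    nlohmann::json message = {{"role", "user"}, {"content", user_text}};
    nlohmann::json body = {
        {"model", model},
        {"max_tokens", max_tokens},
        {"temperature", temperature},
        {"system", system_prompt},
        {"messages", nlohmann::json::array({message})}
    };
    return body.dump();
}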

View File

@ -23,7 +23,7 @@ public:
private:
std::string api_key_;
static constexpr const char* API_URL = "https://api.anthropic.com/v1/messages";
static constexpr const char* MODEL = "claude-haiku-4-20250514";
static constexpr const char* MODEL = "claude-3-5-haiku-20241022";
static constexpr const char* API_VERSION = "2023-06-01";
};

View File

@ -1,10 +1,8 @@
#include "WhisperClient.h"
#include "WinHttpClient.h"
#include "../audio/AudioBuffer.h"
#include "../mingw_compat.h"
#include <httplib.h>
#include <nlohmann/json.hpp>
#include <iostream>
#include <sstream>
#include <fstream>
using json = nlohmann::json;
@ -25,70 +23,93 @@ std::optional<WhisperResponse> WhisperClient::transcribe(
const std::string& prompt,
const std::string& response_format) {
// Save audio to temporary Opus file
// Save audio to temporary Opus/OGG file (smaller than WAV)
AudioBuffer buffer(sample_rate, channels);
buffer.addSamples(audio_data);
std::string temp_file = "secondvoice_temp.opus";
std::string temp_file = "secondvoice_temp.ogg";
if (!buffer.saveToOpus(temp_file)) {
std::cerr << "Failed to save temporary Opus file" << std::endl;
return std::nullopt;
}
// Read Opus file
std::ifstream file(temp_file, std::ios::binary);
std::cout << "Saved Opus file: " << temp_file << std::endl;
// Read file into memory
std::ifstream file(temp_file, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
std::cerr << "Failed to open temporary Opus file" << std::endl;
std::cerr << "Failed to open temporary WAV file" << std::endl;
return std::nullopt;
}
std::ostringstream opus_stream;
opus_stream << file.rdbuf();
std::string opus_data = opus_stream.str();
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> fileData(size);
if (!file.read(fileData.data(), size)) {
std::cerr << "Failed to read temporary WAV file" << std::endl;
return std::nullopt;
}
file.close();
// Make HTTP request
httplib::Client client("https://api.openai.com");
client.set_read_timeout(30, 0); // 30 seconds timeout
// Prepare form fields
std::map<std::string, std::string> fields = {
{"model", model},
{"language", language},
{"response_format", response_format}
};
httplib::Headers headers = {
// Temperature only supported by whisper-1
if (model == "whisper-1") {
fields["temperature"] = std::to_string(temperature);
}
if (!prompt.empty()) {
fields["prompt"] = prompt;
}
// Make request
WinHttpClient client;
std::map<std::string, std::string> headers = {
{"Authorization", "Bearer " + api_key_}
};
httplib::UploadFormDataItems items;
items.push_back({"file", opus_data, "audio.opus", "audio/ogg"});
items.push_back({"model", model, "", ""});
items.push_back({"language", language, "", ""});
items.push_back({"temperature", std::to_string(temperature), "", ""});
items.push_back({"response_format", response_format, "", ""});
auto response = client.postMultipart(
"api.openai.com",
"/v1/audio/transcriptions",
fields,
"file",
"audio.ogg",
"audio/ogg",
fileData,
headers
);
// Add prompt if provided
if (!prompt.empty()) {
items.push_back({"prompt", prompt, "", ""});
}
auto res = client.Post("/v1/audio/transcriptions", headers, items);
if (!res) {
std::cerr << "Whisper API request failed: " << httplib::to_string(res.error()) << std::endl;
if (!response) {
std::cerr << "Whisper API request failed" << std::endl;
return std::nullopt;
}
if (res->status != 200) {
std::cerr << "Whisper API error " << res->status << ": " << res->body << std::endl;
if (response->statusCode != 200) {
std::cerr << "Whisper API error " << response->statusCode << ": " << response->body << std::endl;
return std::nullopt;
}
// Parse response
try {
json response_json = json::parse(res->body);
WhisperResponse response;
response.text = response_json["text"].get<std::string>();
return response;
} catch (const json::exception& e) {
std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl;
return std::nullopt;
// Parse response - for "text" format, response is plain text
// For "json" format, it's JSON
WhisperResponse result;
if (response_format == "text") {
result.text = response->body;
} else {
try {
json response_json = json::parse(response->body);
result.text = response_json["text"].get<std::string>();
} catch (const json::exception& e) {
std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl;
return std::nullopt;
}
}
return result;
}
} // namespace secondvoice

src/api/WinHttpClient.cpp (new file, 301 lines added)
View File

@ -0,0 +1,301 @@
#include "WinHttpClient.h"
#include <iostream>
#include <sstream>
#include <random>
#include <iomanip>
#ifdef _WIN32
#pragma comment(lib, "winhttp.lib")
#endif
namespace secondvoice {
WinHttpClient::WinHttpClient() {
#ifdef _WIN32
hSession_ = WinHttpOpen(
L"SecondVoice/1.0",
WINHTTP_ACCESS_TYPE_DEFAULT_PROXY,
WINHTTP_NO_PROXY_NAME,
WINHTTP_NO_PROXY_BYPASS,
0
);
if (!hSession_) {
std::cerr << "WinHttpOpen failed: " << GetLastError() << std::endl;
}
#endif
}
WinHttpClient::~WinHttpClient() {
#ifdef _WIN32
if (hSession_) {
WinHttpCloseHandle(hSession_);
}
#endif
}
std::string WinHttpClient::generateBoundary() {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0, 15);
const char* hex = "0123456789abcdef";
std::string boundary = "----WebKitFormBoundary";
for (int i = 0; i < 16; i++) {
boundary += hex[dis(gen)];
}
return boundary;
}
#ifdef _WIN32
static std::wstring toWideString(const std::string& str) {
if (str.empty()) return L"";
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, nullptr, 0);
std::wstring result(size - 1, 0);
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, &result[0], size);
return result;
}
#endif
std::optional<HttpResponse> WinHttpClient::postJson(
const std::string& host,
const std::string& path,
const std::string& jsonBody,
const std::map<std::string, std::string>& headers) {
#ifdef _WIN32
if (!hSession_) {
return std::nullopt;
}
std::wstring wHost = toWideString(host);
HINTERNET hConnect = WinHttpConnect(hSession_, wHost.c_str(), INTERNET_DEFAULT_HTTPS_PORT, 0);
if (!hConnect) {
std::cerr << "WinHttpConnect failed: " << GetLastError() << std::endl;
return std::nullopt;
}
std::wstring wPath = toWideString(path);
HINTERNET hRequest = WinHttpOpenRequest(
hConnect,
L"POST",
wPath.c_str(),
nullptr,
WINHTTP_NO_REFERER,
WINHTTP_DEFAULT_ACCEPT_TYPES,
WINHTTP_FLAG_SECURE
);
if (!hRequest) {
std::cerr << "WinHttpOpenRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Add headers
for (const auto& [key, value] : headers) {
std::wstring header = toWideString(key + ": " + value);
WinHttpAddRequestHeaders(hRequest, header.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
}
// Send request
BOOL result = WinHttpSendRequest(
hRequest,
WINHTTP_NO_ADDITIONAL_HEADERS,
0,
(LPVOID)jsonBody.c_str(),
jsonBody.length(),
jsonBody.length(),
0
);
if (!result) {
std::cerr << "WinHttpSendRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
result = WinHttpReceiveResponse(hRequest, nullptr);
if (!result) {
std::cerr << "WinHttpReceiveResponse failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Get status code
DWORD statusCode = 0;
DWORD statusCodeSize = sizeof(statusCode);
WinHttpQueryHeaders(
hRequest,
WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER,
WINHTTP_HEADER_NAME_BY_INDEX,
&statusCode,
&statusCodeSize,
WINHTTP_NO_HEADER_INDEX
);
// Read response body
std::string responseBody;
DWORD bytesAvailable = 0;
do {
bytesAvailable = 0;
if (!WinHttpQueryDataAvailable(hRequest, &bytesAvailable)) {
break;
}
if (bytesAvailable == 0) break;
std::vector<char> buffer(bytesAvailable + 1);
DWORD bytesRead = 0;
if (WinHttpReadData(hRequest, buffer.data(), bytesAvailable, &bytesRead)) {
responseBody.append(buffer.data(), bytesRead);
}
} while (bytesAvailable > 0);
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return HttpResponse{static_cast<int>(statusCode), responseBody};
#else
return std::nullopt;
#endif
}
std::optional<HttpResponse> WinHttpClient::postMultipart(
const std::string& host,
const std::string& path,
const std::map<std::string, std::string>& fields,
const std::string& fileFieldName,
const std::string& fileName,
const std::string& fileContentType,
const std::vector<char>& fileData,
const std::map<std::string, std::string>& headers) {
#ifdef _WIN32
if (!hSession_) {
return std::nullopt;
}
std::string boundary = generateBoundary();
// Build multipart body
std::ostringstream body;
// Add fields
for (const auto& [key, value] : fields) {
body << "--" << boundary << "\r\n";
body << "Content-Disposition: form-data; name=\"" << key << "\"\r\n\r\n";
body << value << "\r\n";
}
// Add file
body << "--" << boundary << "\r\n";
body << "Content-Disposition: form-data; name=\"" << fileFieldName << "\"; filename=\"" << fileName << "\"\r\n";
body << "Content-Type: " << fileContentType << "\r\n\r\n";
std::string bodyPrefix = body.str();
std::string bodySuffix = "\r\n--" + boundary + "--\r\n";
// Combine all parts
std::vector<char> fullBody;
fullBody.insert(fullBody.end(), bodyPrefix.begin(), bodyPrefix.end());
fullBody.insert(fullBody.end(), fileData.begin(), fileData.end());
fullBody.insert(fullBody.end(), bodySuffix.begin(), bodySuffix.end());
std::wstring wHost = toWideString(host);
HINTERNET hConnect = WinHttpConnect(hSession_, wHost.c_str(), INTERNET_DEFAULT_HTTPS_PORT, 0);
if (!hConnect) {
std::cerr << "WinHttpConnect failed: " << GetLastError() << std::endl;
return std::nullopt;
}
std::wstring wPath = toWideString(path);
HINTERNET hRequest = WinHttpOpenRequest(
hConnect,
L"POST",
wPath.c_str(),
nullptr,
WINHTTP_NO_REFERER,
WINHTTP_DEFAULT_ACCEPT_TYPES,
WINHTTP_FLAG_SECURE
);
if (!hRequest) {
std::cerr << "WinHttpOpenRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Add headers
for (const auto& [key, value] : headers) {
std::wstring header = toWideString(key + ": " + value);
WinHttpAddRequestHeaders(hRequest, header.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
}
// Add content-type header
std::wstring contentType = toWideString("Content-Type: multipart/form-data; boundary=" + boundary);
WinHttpAddRequestHeaders(hRequest, contentType.c_str(), -1, WINHTTP_ADDREQ_FLAG_ADD);
// Send request
BOOL result = WinHttpSendRequest(
hRequest,
WINHTTP_NO_ADDITIONAL_HEADERS,
0,
fullBody.data(),
fullBody.size(),
fullBody.size(),
0
);
if (!result) {
std::cerr << "WinHttpSendRequest failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
result = WinHttpReceiveResponse(hRequest, nullptr);
if (!result) {
std::cerr << "WinHttpReceiveResponse failed: " << GetLastError() << std::endl;
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return std::nullopt;
}
// Get status code
DWORD statusCode = 0;
DWORD statusCodeSize = sizeof(statusCode);
WinHttpQueryHeaders(
hRequest,
WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER,
WINHTTP_HEADER_NAME_BY_INDEX,
&statusCode,
&statusCodeSize,
WINHTTP_NO_HEADER_INDEX
);
// Read response body
std::string responseBody;
DWORD bytesAvailable = 0;
do {
bytesAvailable = 0;
if (!WinHttpQueryDataAvailable(hRequest, &bytesAvailable)) {
break;
}
if (bytesAvailable == 0) break;
std::vector<char> buffer(bytesAvailable + 1);
DWORD bytesRead = 0;
if (WinHttpReadData(hRequest, buffer.data(), bytesAvailable, &bytesRead)) {
responseBody.append(buffer.data(), bytesRead);
}
} while (bytesAvailable > 0);
WinHttpCloseHandle(hRequest);
WinHttpCloseHandle(hConnect);
return HttpResponse{static_cast<int>(statusCode), responseBody};
#else
return std::nullopt;
#endif
}
} // namespace secondvoice

src/api/WinHttpClient.h (new file, 52 lines added)
View File

@ -0,0 +1,52 @@
#pragma once
#ifdef _WIN32
#include <windows.h>
#include <winhttp.h>
#endif
#include <string>
#include <vector>
#include <map>
#include <optional>
namespace secondvoice {
struct HttpResponse {
int statusCode;
std::string body;
};
class WinHttpClient {
public:
WinHttpClient();
~WinHttpClient();
// POST JSON request
std::optional<HttpResponse> postJson(
const std::string& host,
const std::string& path,
const std::string& jsonBody,
const std::map<std::string, std::string>& headers
);
// POST multipart form data
std::optional<HttpResponse> postMultipart(
const std::string& host,
const std::string& path,
const std::map<std::string, std::string>& fields,
const std::string& fileFieldName,
const std::string& fileName,
const std::string& fileContentType,
const std::vector<char>& fileData,
const std::map<std::string, std::string>& headers
);
private:
#ifdef _WIN32
HINTERNET hSession_;
#endif
std::string generateBoundary();
};
} // namespace secondvoice
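
Putting the two new files together, a call site reduces to the pattern already visible in the ClaudeClient diff. A minimal, self-contained sketch of postJson usage follows; the host, path, body, and header values are placeholders rather than real endpoints:

#include "WinHttpClient.h"
#include <iostream>
#include <map>
#include <string>

// Sketch of driving the new client directly; endpoint and headers are dummies.
int demoPostJson() {
    secondvoice::WinHttpClient client;
    std::map<std::string, std::string> headers = {
        {"x-api-key", "PLACEHOLDER_KEY"},
        {"content-type", "application/json"}
    };
    auto response = client.postJson("example.com", "/v1/echo",
                                    R"({"ping":true})", headers);
    if (!response) {
        std::cerr << "transport-level failure" << std::endl;
        return 1;
    }
    std::cout << response->statusCode << ": " << response->body << std::endl;
    return 0;
}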

View File

@ -3,10 +3,12 @@
namespace secondvoice {
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds)
AudioCapture::AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds)
: sample_rate_(sample_rate)
, channels_(channels)
, chunk_duration_seconds_(chunk_duration_seconds) {
, chunk_duration_seconds_(chunk_duration_seconds)
, chunk_step_seconds_(chunk_step_seconds > 0 ? chunk_step_seconds : chunk_duration_seconds) {
// If step not specified, use duration (no overlap)
}
AudioCapture::~AudioCapture() {
@ -47,12 +49,17 @@ int AudioCapture::audioCallback(const void* input, void* output,
// Check if we have accumulated enough data for a chunk
size_t chunk_samples = self->sample_rate_ * self->channels_ * self->chunk_duration_seconds_;
size_t step_samples = self->sample_rate_ * self->channels_ * self->chunk_step_seconds_;
if (self->buffer_.size() >= chunk_samples) {
// Call the callback with the chunk
// Call the callback with the full chunk
if (self->callback_) {
self->callback_(self->buffer_);
// Send exactly chunk_samples (the window)
std::vector<float> chunk(self->buffer_.begin(), self->buffer_.begin() + chunk_samples);
self->callback_(chunk);
}
self->buffer_.clear();
// Sliding window: remove only step_samples, keep overlap for next chunk
self->buffer_.erase(self->buffer_.begin(), self->buffer_.begin() + step_samples);
}
return paContinue;
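
The arithmetic behind this hunk: with the 16 kHz mono audio, 10 s window and 5 s step from config.json, chunk_samples is 160000 and step_samples is 80000, so consecutive chunks share the last 5 s of audio. A standalone sketch of that buffer behaviour (no PortAudio; the 0.5 s callback size is an arbitrary choice for the simulation):

#include <iostream>
#include <vector>

// Simulates the sliding window in audioCallback(): emit a full window once
// enough samples are buffered, then drop only one step so the tail overlaps.
int main() {
    const int sample_rate = 16000, channels = 1;
    const int window_s = 10, step_s = 5;                                    // config.json values
    const size_t chunk_samples = size_t(sample_rate) * channels * window_s; // 160000
    const size_t step_samples  = size_t(sample_rate) * channels * step_s;   //  80000

    std::vector<float> buffer;
    size_t chunks = 0;
    for (int cb = 0; cb < 60; ++cb) {                 // 30 s of audio, 0.5 s per callback
        buffer.insert(buffer.end(), sample_rate / 2, 0.0f);
        if (buffer.size() >= chunk_samples) {
            ++chunks;
            std::cout << "chunk " << chunks << " covers samples ["
                      << (chunks - 1) * step_samples << ", "
                      << (chunks - 1) * step_samples + chunk_samples << ")\n";
            buffer.erase(buffer.begin(), buffer.begin() + step_samples);
        }
    }
}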

View File

@ -11,7 +11,8 @@ class AudioCapture {
public:
using AudioCallback = std::function<void(const std::vector<float>&)>;
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds);
// chunk_duration = window size, chunk_step = how often to send (overlap = duration - step)
AudioCapture(int sample_rate, int channels, int chunk_duration_seconds, int chunk_step_seconds = 0);
~AudioCapture();
bool initialize();
@ -29,6 +30,7 @@ private:
int sample_rate_;
int channels_;
int chunk_duration_seconds_;
int chunk_step_seconds_; // How often to emit chunks (0 = same as duration, no overlap)
bool is_recording_ = false;
PaStream* stream_ = nullptr;

View File

@ -10,6 +10,8 @@
#include <sstream>
#include <chrono>
#include <filesystem>
#include <cmath>
#include <algorithm>
namespace secondvoice {
@ -22,13 +24,19 @@ Pipeline::~Pipeline() {
bool Pipeline::initialize() {
auto& config = Config::getInstance();
// Initialize audio capture
// Initialize audio capture with sliding window (overlap)
audio_capture_ = std::make_unique<AudioCapture>(
config.getAudioConfig().sample_rate,
config.getAudioConfig().channels,
config.getAudioConfig().chunk_duration_seconds
config.getAudioConfig().chunk_duration_seconds,
config.getAudioConfig().chunk_step_seconds
);
std::cout << "[Pipeline] Audio: " << config.getAudioConfig().chunk_duration_seconds
<< "s window, " << config.getAudioConfig().chunk_step_seconds
<< "s step (overlap: " << (config.getAudioConfig().chunk_duration_seconds - config.getAudioConfig().chunk_step_seconds)
<< "s)" << std::endl;
if (!audio_capture_->initialize()) {
std::cerr << "Failed to initialize audio capture" << std::endl;
return false;
@ -68,10 +76,9 @@ bool Pipeline::start() {
running_ = true;
// Start threads
// Start background threads (NOT UI - that runs in main thread)
audio_thread_ = std::thread(&Pipeline::audioThread, this);
processing_thread_ = std::thread(&Pipeline::processingThread, this);
ui_thread_ = std::thread(&Pipeline::uiThread, this);
return true;
}
@ -92,16 +99,13 @@ void Pipeline::stop() {
audio_queue_.shutdown();
transcription_queue_.shutdown();
// Wait for threads
// Wait for background threads
if (audio_thread_.joinable()) {
audio_thread_.join();
}
if (processing_thread_.joinable()) {
processing_thread_.join();
}
if (ui_thread_.joinable()) {
ui_thread_.join();
}
// Save full recording
auto& config = Config::getInstance();
@ -153,6 +157,11 @@ void Pipeline::audioThread() {
void Pipeline::processingThread() {
auto& config = Config::getInstance();
// VAD threshold - audio RMS must exceed this to be considered speech
// Higher values = more aggressive noise filtering
constexpr float VAD_THRESHOLD = 0.02f; // RMS energy threshold
constexpr float VAD_MIN_PEAK = 0.08f; // Minimum peak amplitude
while (running_) {
auto chunk_opt = audio_queue_.wait_and_pop();
if (!chunk_opt.has_value()) {
@ -161,6 +170,22 @@ void Pipeline::processingThread() {
auto& chunk = chunk_opt.value();
// Voice Activity Detection - skip silent/noise chunks
float sum_squared = 0.0f;
float max_amplitude = 0.0f;
for (const float sample : chunk.data) {
sum_squared += sample * sample;
max_amplitude = std::max(max_amplitude, std::abs(sample));
}
float rms = std::sqrt(sum_squared / chunk.data.size());
if (rms < VAD_THRESHOLD || max_amplitude < VAD_MIN_PEAK) {
std::cout << "[Skip] Silent chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
continue;
}
std::cout << "[Audio] Processing chunk (RMS: " << rms << ", Peak: " << max_amplitude << ")" << std::endl;
// Transcribe with Whisper
auto whisper_result = whisper_client_->transcribe(
chunk.data,
@ -178,9 +203,71 @@ void Pipeline::processingThread() {
continue;
}
// Translate with Claude
// Filter out Whisper hallucinations (common when audio is silent/unclear)
std::string text = whisper_result->text;
// Trim whitespace
size_t start = text.find_first_not_of(" \t\n\r");
size_t end = text.find_last_not_of(" \t\n\r");
if (start == std::string::npos) {
std::cout << "[Skip] Empty transcription" << std::endl;
continue;
}
text = text.substr(start, end - start + 1);
// Skip known hallucinations and garbage
bool is_garbage = false;
// Too short to be meaningful
if (text.length() < 4) {
is_garbage = true;
}
// Known Whisper hallucinations
else if (text.find("Amara.org") != std::string::npos ||
text.find("amara.org") != std::string::npos ||
text.find("字幕") != std::string::npos ||
text.find("subtitle") != std::string::npos ||
text.find("Subtitle") != std::string::npos ||
text.find("Thank you") != std::string::npos ||
text.find("thanks for watching") != std::string::npos ||
text.find("Subscribe") != std::string::npos ||
text.find("谢谢观看") != std::string::npos ||
text.find("订阅") != std::string::npos ||
text.find("...") == 0) { // Starts with ellipsis
is_garbage = true;
}
// Only punctuation or whitespace
else {
bool has_content = false;
for (char c : text) {
if (std::isalnum(static_cast<unsigned char>(c)) || (c & 0x80)) {
has_content = true;
break;
}
}
if (!has_content) {
is_garbage = true;
}
}
if (is_garbage) {
std::cout << "[Skip] Filtered: " << text << std::endl;
continue;
}
// Remove overlap with previous transcription
std::string deduplicated = removeOverlap(text, last_transcription_);
last_transcription_ = text; // Store full text for next comparison
if (deduplicated.empty() || deduplicated.length() < 2) {
std::cout << "[Skip] Fully overlapping transcription" << std::endl;
continue;
}
std::cout << "[New content] " << deduplicated << std::endl;
// Translate with Claude (only the new, deduplicated content)
auto claude_result = claude_client_->translate(
whisper_result->text,
deduplicated,
config.getClaudeConfig().system_prompt,
config.getClaudeConfig().max_tokens,
config.getClaudeConfig().temperature
@ -192,31 +279,69 @@ void Pipeline::processingThread() {
}
// Add to UI
ui_->addTranslation(whisper_result->text, claude_result->text);
ui_->addTranslation(deduplicated, claude_result->text);
std::cout << "CN: " << whisper_result->text << std::endl;
std::cout << "CN: " << deduplicated << std::endl;
std::cout << "FR: " << claude_result->text << std::endl;
std::cout << "---" << std::endl;
}
}
void Pipeline::uiThread() {
// CRITICAL: Make OpenGL context current in THIS thread (not the thread that created it)
ui_->makeContextCurrent();
void Pipeline::update() {
if (!ui_) return;
while (running_ && !ui_->shouldClose()) {
ui_->setRecordingDuration(recording_duration_);
ui_->setProcessingStatus("Processing...");
ui_->render();
ui_->setRecordingDuration(recording_duration_);
ui_->setProcessingStatus("Processing...");
ui_->render();
// Check if stop was requested
if (ui_->isStopRequested()) {
running_ = false;
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(16)); // ~60 FPS
// Check if stop was requested
if (ui_->isStopRequested()) {
running_ = false;
}
}
bool Pipeline::shouldClose() const {
return ui_ ? ui_->shouldClose() : true;
}
std::string Pipeline::removeOverlap(const std::string& current, const std::string& previous) {
if (previous.empty()) {
return current;
}
// Try to find overlap: check if current starts with end of previous
// Start scanning a third of the way into the previous text (the windows overlap by roughly 50%)
size_t min_overlap = 4; // Minimum characters to consider as overlap
size_t search_start = previous.length() / 3; // Start looking from 1/3 into previous
for (size_t i = search_start; i < previous.length() - min_overlap; ++i) {
std::string suffix = previous.substr(i);
if (current.find(suffix) == 0) {
// Found overlap - return only the new part
std::string new_part = current.substr(suffix.length());
if (!new_part.empty()) {
std::cout << "[Overlap] Removed: \"" << suffix << "\"" << std::endl;
return new_part;
}
}
}
// No overlap found - check for partial word overlap at start
// This handles cases where the same content appears but tokenized differently
size_t max_check = std::min(current.length(), previous.length()) / 2;
for (size_t len = max_check; len >= min_overlap; --len) {
std::string end_prev = previous.substr(previous.length() - len);
std::string start_curr = current.substr(0, len);
if (end_prev == start_curr) {
std::string new_part = current.substr(len);
if (!new_part.empty()) {
std::cout << "[Overlap] Partial match removed: \"" << end_prev << "\"" << std::endl;
return new_part;
}
}
}
return current; // No overlap detected
}
} // namespace secondvoice
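
The overlapping windows are exactly why removeOverlap() exists: without it the shared 5 s of audio would be transcribed and translated twice. The sketch below restates the suffix/prefix idea as a single pass (simplified compared to the two-phase search above), with invented sample strings:

#include <algorithm>
#include <iostream>
#include <string>

// Simplified restatement of Pipeline::removeOverlap(): drop the longest
// suffix of `previous` that `current` starts with.
static std::string dropOverlap(const std::string& current, const std::string& previous) {
    const size_t min_overlap = 4;
    size_t max_check = std::min(current.length(), previous.length());
    for (size_t len = max_check; len >= min_overlap; --len) {
        if (previous.compare(previous.length() - len, len, current, 0, len) == 0) {
            return current.substr(len);
        }
    }
    return current;  // no overlap detected
}

int main() {
    std::string prev = "we will meet tomorrow at";
    std::string curr = "tomorrow at nine in the office";
    std::cout << "[" << dropOverlap(curr, prev) << "]\n";  // prints [ nine in the office]
}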

View File

@ -34,12 +34,15 @@ public:
bool start();
void stop();
// Call this from main thread for UI updates
void update();
bool isRunning() const { return running_; }
bool shouldClose() const;
private:
void audioThread();
void processingThread();
void uiThread();
std::unique_ptr<AudioCapture> audio_capture_;
std::unique_ptr<WhisperClient> whisper_client_;
@ -52,10 +55,13 @@ private:
std::thread audio_thread_;
std::thread processing_thread_;
std::thread ui_thread_;
std::atomic<bool> running_{false};
std::atomic<int> recording_duration_{0};
// For overlap deduplication
std::string last_transcription_;
std::string removeOverlap(const std::string& current, const std::string& previous);
};
} // namespace secondvoice

View File

@ -4,6 +4,10 @@
#include "utils/Config.h"
#include "core/Pipeline.h"
#ifdef _WIN32
#include <windows.h>
#endif
// Force NVIDIA GPU on Optimus systems (instead of integrated AMD/Intel)
// These MUST be global and volatile to prevent linker optimization
#ifdef _WIN32
@ -38,6 +42,12 @@ int main(int argc, char** argv) {
(void)argc;
(void)argv;
#ifdef _WIN32
// Enable UTF-8 console output for Chinese characters
SetConsoleOutputCP(CP_UTF8);
SetConsoleCP(CP_UTF8);
#endif
log_msg("MAIN: Entry point reached");
log_msg("========================================");
log_msg("SecondVoice - Real-time Translation System");
@ -67,17 +77,18 @@ int main(int argc, char** argv) {
log_msg("Pipeline initialized successfully");
log_msg("Starting recording and translation...");
// Start pipeline
// Start pipeline (background threads for audio + API calls)
log_msg("Starting pipeline...");
if (!pipeline.start()) {
log_msg("ERROR: Failed to start pipeline");
return 1;
}
// Wait for pipeline to finish (user clicks Stop button)
log_msg("Pipeline running, waiting for user to stop...");
while (pipeline.isRunning()) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Main loop - UI runs in main thread (required by GLFW)
log_msg("Pipeline running, entering main loop...");
while (pipeline.isRunning() && !pipeline.shouldClose()) {
pipeline.update(); // Render one frame
std::this_thread::sleep_for(std::chrono::milliseconds(16)); // ~60 FPS
}
log_msg("");

View File

@ -1,6 +1,7 @@
#include <glad/glad.h> // MUST be FIRST! Provides OpenGL functions
#define GLFW_INCLUDE_NONE // Tell GLFW not to include OpenGL headers (GLAD does it)
#include "TranslationUI.h"
#include "../utils/Config.h"
#include <imgui.h>
#include <imgui_impl_glfw.h>
#include <imgui_impl_opengl3.h>
@ -86,6 +87,40 @@ bool TranslationUI::initialize() {
ImGuiIO& io = ImGui::GetIO();
io.ConfigFlags |= ImGuiConfigFlags_NavEnableKeyboard;
// Load Chinese font from Windows system fonts
float font_size = static_cast<float>(Config::getInstance().getUIConfig().font_size);
std::cout << "[UI] Loading Chinese font (size: " << font_size << ")..." << std::endl;
ImFontConfig font_config;
font_config.OversampleH = 2;
font_config.OversampleV = 2;
// Try common Chinese fonts on Windows
const char* chinese_fonts[] = {
"C:\\Windows\\Fonts\\msyh.ttc", // Microsoft YaHei
"C:\\Windows\\Fonts\\simhei.ttf", // SimHei
"C:\\Windows\\Fonts\\simsun.ttc", // SimSun
"C:\\Windows\\Fonts\\mingliub.ttc", // MingLiU
};
ImFont* font = nullptr;
for (const char* font_path : chinese_fonts) {
font = io.Fonts->AddFontFromFileTTF(
font_path,
font_size,
&font_config,
io.Fonts->GetGlyphRangesChineseFull()
);
if (font) {
std::cout << "[UI] Loaded font: " << font_path << std::endl;
break;
}
}
if (!font) {
std::cout << "[UI] Warning: No Chinese font found, using default (Chinese chars won't display)" << std::endl;
io.Fonts->AddFontDefault();
}
ImGui::StyleColorsDark();
ImGui_ImplGlfw_InitForOpenGL(window_, true);
@ -130,13 +165,12 @@ void main() {
return false;
}
// Let ImGui auto-detect the GLSL version
ImGui_ImplOpenGL3_Init(nullptr);
// Explicitly specify GLSL version for OpenGL 3.3 core profile
// On Windows, auto-detection (nullptr) can fail, so we specify "#version 330"
std::cout << "[UI] Initializing ImGui OpenGL3 backend with GLSL 330..." << std::endl;
ImGui_ImplOpenGL3_Init("#version 330");
// CRITICAL: Release the OpenGL context from this thread
// The UI rendering will happen in a separate thread which will call makeContextCurrent()
std::cout << "[UI] Releasing OpenGL context from initialization thread" << std::endl;
glfwMakeContextCurrent(nullptr);
// Context stays in main thread - no need to release
return true;
}

View File

@ -71,7 +71,8 @@ bool Config::load(const std::string& config_path, const std::string& env_path) {
audio_config_.sample_rate = audio.value("sample_rate", 16000);
audio_config_.channels = audio.value("channels", 1);
audio_config_.chunk_duration_seconds = audio.value("chunk_duration_seconds", 10);
audio_config_.format = audio.value("format", "wav");
audio_config_.chunk_step_seconds = audio.value("chunk_step_seconds", 0); // 0 = no overlap
audio_config_.format = audio.value("format", "ogg");
}
// Parse whisper config

View File

@ -8,6 +8,7 @@ struct AudioConfig {
int sample_rate;
int channels;
int chunk_duration_seconds;
int chunk_step_seconds; // How often to emit chunks (for overlap)
std::string format;
};

View File

@ -3,7 +3,6 @@
"version": "0.1.0",
"dependencies": [
"portaudio",
"cpp-httplib",
"nlohmann-json",
"glfw3",
"glad",