secondvoice/src/api/WhisperClient.cpp

#include "WhisperClient.h"
#include "../audio/AudioBuffer.h"
#include <httplib.h>
#include <nlohmann/json.hpp>
#include <iostream>
#include <sstream>
#include <fstream>

using json = nlohmann::json;

namespace secondvoice {

WhisperClient::WhisperClient(const std::string& api_key)
    : api_key_(api_key) {
}

std::optional<WhisperResponse> WhisperClient::transcribe(
    const std::vector<float>& audio_data,
    int sample_rate,
    int channels,
    const std::string& model,
    const std::string& language,
    float temperature,
    const std::string& prompt,
    const std::string& response_format) {

    // Save audio to temporary WAV file
    AudioBuffer buffer(sample_rate, channels);
    buffer.addSamples(audio_data);

    std::string temp_file = "/tmp/secondvoice_temp.wav";
    if (!buffer.saveToWav(temp_file)) {
        std::cerr << "Failed to save temporary WAV file" << std::endl;
        return std::nullopt;
    }

    // Read WAV file
    std::ifstream file(temp_file, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Failed to open temporary WAV file" << std::endl;
        return std::nullopt;
    }

    std::ostringstream wav_stream;
    wav_stream << file.rdbuf();
    std::string wav_data = wav_stream.str();
    file.close();

    // Make HTTP request
    httplib::Client client("https://api.openai.com");
    client.set_read_timeout(30, 0);  // 30 seconds timeout

    httplib::Headers headers = {
        {"Authorization", "Bearer " + api_key_}
    };

    httplib::UploadFormDataItems items;
    items.push_back({"file", wav_data, "audio.wav", "audio/wav"});
    items.push_back({"model", model, "", ""});
    items.push_back({"language", language, "", ""});
    items.push_back({"temperature", std::to_string(temperature), "", ""});
    items.push_back({"response_format", response_format, "", ""});

    // Add prompt if provided
    if (!prompt.empty()) {
        items.push_back({"prompt", prompt, "", ""});
    }

    auto res = client.Post("/v1/audio/transcriptions", headers, items);

    if (!res) {
        std::cerr << "Whisper API request failed: " << httplib::to_string(res.error()) << std::endl;
        return std::nullopt;
    }

    if (res->status != 200) {
        std::cerr << "Whisper API error " << res->status << ": " << res->body << std::endl;
        return std::nullopt;
    }

    // Parse response
    try {
        json response_json = json::parse(res->body);
        WhisperResponse response;
        response.text = response_json["text"].get<std::string>();
        return response;
    } catch (const json::exception& e) {
        std::cerr << "Failed to parse Whisper response: " << e.what() << std::endl;
        return std::nullopt;
    }
}

} // namespace secondvoice