238 lines
8.0 KiB
C++
238 lines
8.0 KiB
C++
/**
|
|
* @file test_stt_live.cpp
|
|
* @brief Live STT testing tool - Test all 5 engines
|
|
*/
|
|
|
|
#include "src/shared/audio/ISTTEngine.hpp"

#include <spdlog/spdlog.h>

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
|
|
|
|
using namespace aissia;
|
|
|
|
// Helper: Load .env file
|
|
void loadEnv(const std::string& path = ".env") {
|
|
std::ifstream file(path);
|
|
if (!file.is_open()) {
|
|
spdlog::warn("No .env file found at: {}", path);
|
|
return;
|
|
}
|
|
|
|
std::string line;
|
|
while (std::getline(file, line)) {
|
|
if (line.empty() || line[0] == '#') continue;
|
|
|
|
auto pos = line.find('=');
|
|
if (pos != std::string::npos) {
|
|
std::string key = line.substr(0, pos);
|
|
std::string value = line.substr(pos + 1);
|
|
|
|
// Remove quotes
|
|
if (!value.empty() && value.front() == '"' && value.back() == '"') {
|
|
value = value.substr(1, value.length() - 2);
|
|
}
|
|
|
|
#ifdef _WIN32
|
|
_putenv_s(key.c_str(), value.c_str());
|
|
#else
|
|
setenv(key.c_str(), value.c_str(), 1);
|
|
#endif
|
|
}
|
|
}
|
|
spdlog::info("Loaded environment from {}", path);
|
|
}
|
|
|
|
/**
 * @brief Reads an environment variable as a std::string.
 *
 * @param name Name of the variable to look up.
 * @return The variable's value, or an empty string when it is not defined.
 */
std::string getEnvVar(const std::string& name) {
    const char* raw = std::getenv(name.c_str());
    if (raw == nullptr) {
        return std::string();
    }
    return std::string(raw);
}
|
|
|
|
// Helper: Load audio file as WAV (simplified - assumes 16-bit PCM)
|
|
std::vector<float> loadWavFile(const std::string& path) {
|
|
std::ifstream file(path, std::ios::binary);
|
|
if (!file.is_open()) {
|
|
spdlog::error("Failed to open audio file: {}", path);
|
|
return {};
|
|
}
|
|
|
|
// Skip WAV header (44 bytes)
|
|
file.seekg(44);
|
|
|
|
// Read 16-bit PCM samples
|
|
std::vector<int16_t> samples;
|
|
int16_t sample;
|
|
while (file.read(reinterpret_cast<char*>(&sample), sizeof(sample))) {
|
|
samples.push_back(sample);
|
|
}
|
|
|
|
// Convert to float [-1.0, 1.0]
|
|
std::vector<float> audioData;
|
|
audioData.reserve(samples.size());
|
|
for (int16_t s : samples) {
|
|
audioData.push_back(static_cast<float>(s) / 32768.0f);
|
|
}
|
|
|
|
spdlog::info("Loaded {} samples from {}", audioData.size(), path);
|
|
return audioData;
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
spdlog::set_level(spdlog::level::info);
|
|
spdlog::info("=== AISSIA STT Live Test ===");
|
|
|
|
// Load environment variables
|
|
loadEnv();
|
|
|
|
// Check command line
|
|
if (argc < 2) {
|
|
std::cout << "Usage: " << argv[0] << " <audio.wav>\n";
|
|
std::cout << "\nAvailable engines:\n";
|
|
std::cout << " 1. Whisper.cpp (local, requires models/ggml-base.bin)\n";
|
|
std::cout << " 2. Whisper API (requires OPENAI_API_KEY)\n";
|
|
std::cout << " 3. Google Speech (requires GOOGLE_API_KEY)\n";
|
|
std::cout << " 4. Azure STT (requires AZURE_SPEECH_KEY + AZURE_SPEECH_REGION)\n";
|
|
std::cout << " 5. Deepgram (requires DEEPGRAM_API_KEY)\n";
|
|
return 1;
|
|
}
|
|
|
|
std::string audioFile = argv[1];
|
|
|
|
// Load audio
|
|
std::vector<float> audioData = loadWavFile(audioFile);
|
|
if (audioData.empty()) {
|
|
spdlog::error("Failed to load audio data");
|
|
return 1;
|
|
}
|
|
|
|
// Test each engine
|
|
std::cout << "\n========================================\n";
|
|
std::cout << "Testing STT Engines\n";
|
|
std::cout << "========================================\n\n";
|
|
|
|
// 1. Whisper.cpp (local)
|
|
{
|
|
std::cout << "[1/5] Whisper.cpp (local)\n";
|
|
std::cout << "----------------------------\n";
|
|
|
|
try {
|
|
auto engine = STTEngineFactory::create("whisper_cpp", "models/ggml-base.bin");
|
|
if (engine && engine->isAvailable()) {
|
|
engine->setLanguage("fr");
|
|
std::string result = engine->transcribe(audioData);
|
|
std::cout << "✅ Result: " << result << "\n\n";
|
|
} else {
|
|
std::cout << "❌ Not available (model missing?)\n\n";
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cout << "❌ Error: " << e.what() << "\n\n";
|
|
}
|
|
}
|
|
|
|
// 2. Whisper API
|
|
{
|
|
std::cout << "[2/5] OpenAI Whisper API\n";
|
|
std::cout << "----------------------------\n";
|
|
|
|
std::string apiKey = getEnvVar("OPENAI_API_KEY");
|
|
if (apiKey.empty()) {
|
|
std::cout << "❌ OPENAI_API_KEY not set\n\n";
|
|
} else {
|
|
try {
|
|
auto engine = STTEngineFactory::create("whisper_api", "", apiKey);
|
|
if (engine && engine->isAvailable()) {
|
|
engine->setLanguage("fr");
|
|
std::string result = engine->transcribeFile(audioFile);
|
|
std::cout << "✅ Result: " << result << "\n\n";
|
|
} else {
|
|
std::cout << "❌ Not available\n\n";
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cout << "❌ Error: " << e.what() << "\n\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
// 3. Google Speech
|
|
{
|
|
std::cout << "[3/5] Google Speech-to-Text\n";
|
|
std::cout << "----------------------------\n";
|
|
|
|
std::string apiKey = getEnvVar("GOOGLE_API_KEY");
|
|
if (apiKey.empty()) {
|
|
std::cout << "❌ GOOGLE_API_KEY not set\n\n";
|
|
} else {
|
|
try {
|
|
auto engine = STTEngineFactory::create("google", "", apiKey);
|
|
if (engine && engine->isAvailable()) {
|
|
engine->setLanguage("fr");
|
|
std::string result = engine->transcribeFile(audioFile);
|
|
std::cout << "✅ Result: " << result << "\n\n";
|
|
} else {
|
|
std::cout << "❌ Not available\n\n";
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cout << "❌ Error: " << e.what() << "\n\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
// 4. Azure Speech
|
|
{
|
|
std::cout << "[4/5] Azure Speech-to-Text\n";
|
|
std::cout << "----------------------------\n";
|
|
|
|
std::string apiKey = getEnvVar("AZURE_SPEECH_KEY");
|
|
std::string region = getEnvVar("AZURE_SPEECH_REGION");
|
|
|
|
if (apiKey.empty() || region.empty()) {
|
|
std::cout << "❌ AZURE_SPEECH_KEY or AZURE_SPEECH_REGION not set\n\n";
|
|
} else {
|
|
try {
|
|
auto engine = STTEngineFactory::create("azure", region, apiKey);
|
|
if (engine && engine->isAvailable()) {
|
|
engine->setLanguage("fr");
|
|
std::string result = engine->transcribeFile(audioFile);
|
|
std::cout << "✅ Result: " << result << "\n\n";
|
|
} else {
|
|
std::cout << "❌ Not available\n\n";
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cout << "❌ Error: " << e.what() << "\n\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
// 5. Deepgram
|
|
{
|
|
std::cout << "[5/5] Deepgram\n";
|
|
std::cout << "----------------------------\n";
|
|
|
|
std::string apiKey = getEnvVar("DEEPGRAM_API_KEY");
|
|
if (apiKey.empty()) {
|
|
std::cout << "❌ DEEPGRAM_API_KEY not set\n\n";
|
|
} else {
|
|
try {
|
|
auto engine = STTEngineFactory::create("deepgram", "", apiKey);
|
|
if (engine && engine->isAvailable()) {
|
|
engine->setLanguage("fr");
|
|
std::string result = engine->transcribeFile(audioFile);
|
|
std::cout << "✅ Result: " << result << "\n\n";
|
|
} else {
|
|
std::cout << "❌ Not available\n\n";
|
|
}
|
|
} catch (const std::exception& e) {
|
|
std::cout << "❌ Error: " << e.what() << "\n\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cout << "========================================\n";
|
|
std::cout << "Testing complete!\n";
|
|
std::cout << "========================================\n";
|
|
|
|
return 0;
|
|
}
|