Complete STT system with 4 engine options: - WhisperCpp: High-quality local STT (ggml-base model, 147MB) - WhisperAPI: Cloud STT via OpenAI - PocketSphinx: Lightweight keyword spotting (optional) - Vosk: Balanced local STT (optional) Changes: - Add whisper.cpp as git submodule - Link whisper library with conditional compilation - Create test_stt_engines manual test program - Add PocketSphinx and Whisper.cpp optional dependencies Test results: ✅ WhisperCpp: Compiled, model loaded successfully ✅ WhisperAPI: Compiled (requires API key) ⚠️ PocketSphinx: Compiled (model path needs config) ❌ Vosk: Library not available in Ubuntu repos 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
183 lines
7.2 KiB
C++
183 lines
7.2 KiB
C++
/**
 * @file test_stt_engines.cpp
 * @brief Manual test program for all 4 STT engines
 *
 * Runs each STT engine on the same audio file, then prints the per-engine
 * results side by side followed by a summary.
 *
 * Usage: ./test_stt_engines <audio_file.mp3>
 */
|
|
|
|
#include "../../src/shared/audio/ISTTEngine.hpp"
|
|
#include "../../src/shared/audio/PocketSphinxEngine.hpp"
|
|
#include "../../src/shared/audio/VoskSTTEngine.hpp"
|
|
#include "../../src/shared/audio/WhisperCppEngine.hpp"
|
|
#include "../../src/shared/audio/WhisperAPIEngine.hpp"
|
|
#include <spdlog/spdlog.h>
|
|
#include <spdlog/sinks/stdout_color_sinks.h>
|
|
#include <iostream>
|
|
#include <chrono>
|
|
#include <cstdlib>
|
|
|
|
using namespace aissia;
|
|
|
|
/**
 * @brief Outcome of running a single STT engine against the test audio file.
 *
 * Scalar members carry default initializers so a default-constructed result
 * never holds indeterminate values, even on paths that return early or bail
 * out through an exception before every field has been assigned.
 */
struct TestResult {
    std::string engineName;     ///< Name reported by the engine itself.
    bool available = false;     ///< Whether the engine reported itself as available.
    std::string transcription;  ///< Text returned by the engine; empty if none.
    double durationMs = 0.0;    ///< Wall time of the transcription call, in milliseconds.
    std::string error;          ///< Failure description; empty means no error.
};
|
|
|
|
/**
 * @brief Runs one STT engine against an audio file and records the outcome.
 *
 * @param engine    Engine under test (non-owning). A null pointer is reported
 *                  as an unavailable engine instead of being dereferenced.
 * @param audioFile Path passed unchanged to ISTTEngine::transcribeFile().
 * @return A TestResult where:
 *         - `available` mirrors engine->isAvailable() (false for a null engine);
 *         - `error` is non-empty when the engine is unavailable, returned an
 *           empty transcription, or threw a std::exception;
 *         - `durationMs` is the elapsed time of the transcribeFile() call, or
 *           0 when the call was never attempted.
 */
TestResult testEngine(ISTTEngine* engine, const std::string& audioFile) {
    // Value-initialize so `available` and `durationMs` are well-defined on
    // every return path, including early returns and the exception path.
    TestResult result{};

    if (engine == nullptr) {
        result.engineName = "<null>";
        result.error = "Engine pointer is null";
        return result;
    }

    result.engineName = engine->getEngineName();
    result.available = engine->isAvailable();

    if (!result.available) {
        result.error = "Engine not available";
        return result;
    }

    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments while the engine is running.
    using Clock = std::chrono::steady_clock;
    const auto start = Clock::now();
    const auto elapsedMs = [start] {
        return std::chrono::duration<double, std::milli>(Clock::now() - start).count();
    };

    try {
        result.transcription = engine->transcribeFile(audioFile);
        result.durationMs = elapsedMs();

        if (result.transcription.empty()) {
            result.error = "Empty transcription (file format not supported or processing failed)";
        }
    } catch (const std::exception& e) {
        // Still record how long the engine ran before it failed.
        result.durationMs = elapsedMs();
        result.error = std::string("Exception: ") + e.what();
    }

    return result;
}
|
|
|
|
void printResult(const TestResult& result) {
|
|
std::cout << "\n";
|
|
std::cout << "┌─────────────────────────────────────────────────────────\n";
|
|
std::cout << "│ Engine: " << result.engineName << "\n";
|
|
std::cout << "├─────────────────────────────────────────────────────────\n";
|
|
|
|
if (!result.available) {
|
|
std::cout << "│ Status: ❌ NOT AVAILABLE\n";
|
|
std::cout << "│ Reason: " << result.error << "\n";
|
|
} else if (!result.error.empty()) {
|
|
std::cout << "│ Status: ⚠️ ERROR\n";
|
|
std::cout << "│ Error: " << result.error << "\n";
|
|
} else {
|
|
std::cout << "│ Status: ✅ SUCCESS\n";
|
|
std::cout << "│ Duration: " << result.durationMs << " ms\n";
|
|
std::cout << "│ Transcription: \"" << result.transcription << "\"\n";
|
|
}
|
|
|
|
std::cout << "└─────────────────────────────────────────────────────────\n";
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
// Setup logging
|
|
auto logger = spdlog::stdout_color_mt("test");
|
|
spdlog::set_level(spdlog::level::info);
|
|
|
|
// Check arguments
|
|
if (argc < 2) {
|
|
std::cerr << "Usage: " << argv[0] << " <audio_file>\n";
|
|
std::cerr << "Example: " << argv[0] << " test_audio.mp3\n";
|
|
return 1;
|
|
}
|
|
|
|
std::string audioFile = argv[1];
|
|
|
|
std::cout << "\n";
|
|
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
|
std::cout << "║ STT ENGINES TEST - AISSIA Phase 7 ║\n";
|
|
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
|
std::cout << "\n";
|
|
std::cout << "Audio file: " << audioFile << "\n";
|
|
std::cout << "\n";
|
|
std::cout << "Testing 4 STT engines...\n";
|
|
|
|
// Prepare engines
|
|
std::vector<std::pair<std::string, std::unique_ptr<ISTTEngine>>> engines;
|
|
|
|
// 1. PocketSphinx
|
|
std::cout << "\n[1/4] Initializing PocketSphinx...\n";
|
|
engines.push_back({
|
|
"PocketSphinx",
|
|
std::make_unique<PocketSphinxEngine>("/usr/share/pocketsphinx/model/en-us")
|
|
});
|
|
|
|
// 2. Vosk
|
|
std::cout << "[2/4] Initializing Vosk...\n";
|
|
engines.push_back({
|
|
"Vosk",
|
|
std::make_unique<VoskSTTEngine>("./models/vosk-model-small-fr-0.22")
|
|
});
|
|
|
|
// 3. Whisper.cpp
|
|
std::cout << "[3/4] Initializing Whisper.cpp...\n";
|
|
engines.push_back({
|
|
"Whisper.cpp",
|
|
std::make_unique<WhisperCppEngine>("./models/ggml-base.bin")
|
|
});
|
|
|
|
// 4. Whisper API
|
|
std::cout << "[4/4] Initializing Whisper API...\n";
|
|
const char* apiKey = std::getenv("OPENAI_API_KEY");
|
|
engines.push_back({
|
|
"Whisper API",
|
|
std::make_unique<WhisperAPIEngine>(apiKey ? apiKey : "")
|
|
});
|
|
|
|
// Test each engine
|
|
std::vector<TestResult> results;
|
|
for (auto& [name, engine] : engines) {
|
|
std::cout << "\n▶ Testing " << name << "...\n";
|
|
results.push_back(testEngine(engine.get(), audioFile));
|
|
}
|
|
|
|
// Print results
|
|
std::cout << "\n\n";
|
|
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
|
std::cout << "║ RESULTS ║\n";
|
|
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
|
|
|
for (const auto& result : results) {
|
|
printResult(result);
|
|
}
|
|
|
|
// Summary
|
|
std::cout << "\n\n";
|
|
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
|
std::cout << "║ SUMMARY ║\n";
|
|
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
|
std::cout << "\n";
|
|
|
|
int available = 0;
|
|
int successful = 0;
|
|
|
|
for (const auto& result : results) {
|
|
if (result.available) available++;
|
|
if (result.available && result.error.empty()) successful++;
|
|
}
|
|
|
|
std::cout << "Total engines tested: " << results.size() << "\n";
|
|
std::cout << "Engines available: " << available << "/" << results.size() << "\n";
|
|
std::cout << "Successful transcriptions: " << successful << "/" << available << "\n";
|
|
std::cout << "\n";
|
|
|
|
if (successful > 0) {
|
|
std::cout << "✅ STT system is working!\n";
|
|
return 0;
|
|
} else if (available > 0) {
|
|
std::cout << "⚠️ Some engines available but all failed (check audio file format)\n";
|
|
return 1;
|
|
} else {
|
|
std::cout << "❌ No STT engines available (install models/libraries)\n";
|
|
return 1;
|
|
}
|
|
}
|