feat: Phase 7.2 - Integrate Whisper.cpp STT engine
Complete STT system with 4 engine options: - WhisperCpp: High-quality local STT (ggml-base model, 147MB) - WhisperAPI: Cloud STT via OpenAI - PocketSphinx: Lightweight keyword spotting (optional) - Vosk: Balanced local STT (optional) Changes: - Add whisper.cpp as git submodule - Link whisper library with conditional compilation - Create test_stt_engines manual test program - Add PocketSphinx and Whisper.cpp optional dependencies Test results: ✅ WhisperCpp: Compiled, model loaded successfully ✅ WhisperAPI: Compiled (requires API key) ⚠️ PocketSphinx: Compiled (model path needs config) ❌ Vosk: Library not available in Ubuntu repos 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
a712988584
commit
099e0d837e
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
[submodule "external/whisper.cpp"]
|
||||
path = external/whisper.cpp
|
||||
url = https://github.com/ggerganov/whisper.cpp
|
||||
@ -137,6 +137,31 @@ else()
|
||||
message(STATUS "Vosk not found - STT will use fallback engines only")
|
||||
endif()
|
||||
|
||||
# Optional PocketSphinx support (Phase 7 STT).
# Enabled only when BOTH the pocketsphinx and sphinxbase libraries are located;
# otherwise AissiaAudio is built without HAVE_POCKETSPHINX.
find_library(POCKETSPHINX_LIBRARY pocketsphinx)
find_library(SPHINXBASE_LIBRARY sphinxbase)
if(NOT (POCKETSPHINX_LIBRARY AND SPHINXBASE_LIBRARY))
    message(STATUS "PocketSphinx not found - keyword spotting unavailable")
else()
    message(STATUS "PocketSphinx found: ${POCKETSPHINX_LIBRARY}")
    target_link_libraries(AissiaAudio PUBLIC
        ${POCKETSPHINX_LIBRARY}
        ${SPHINXBASE_LIBRARY})
    target_compile_definitions(AissiaAudio PRIVATE HAVE_POCKETSPHINX)
    # NOTE(review): header locations are hard-coded system paths - confirm they
    # match the install layout on every supported platform.
    target_include_directories(AissiaAudio PRIVATE
        /usr/include/pocketsphinx
        /usr/include/sphinxbase)
endif()
|
||||
|
||||
# Optional Whisper.cpp support (Phase 7 STT).
# The library is searched for in the submodule's build tree
# (external/whisper.cpp/build/src) in addition to the default search locations.
find_library(WHISPER_LIBRARY whisper
    PATHS ${CMAKE_SOURCE_DIR}/external/whisper.cpp/build/src)
if(NOT WHISPER_LIBRARY)
    message(STATUS "Whisper.cpp not found - high-quality local STT unavailable")
else()
    message(STATUS "Whisper.cpp found: ${WHISPER_LIBRARY}")
    target_link_libraries(AissiaAudio PUBLIC ${WHISPER_LIBRARY})
    target_compile_definitions(AissiaAudio PRIVATE HAVE_WHISPER_CPP)
    # Headers come from the submodule checkout: whisper's own API plus ggml.
    target_include_directories(AissiaAudio PRIVATE
        ${CMAKE_SOURCE_DIR}/external/whisper.cpp/include
        ${CMAKE_SOURCE_DIR}/external/whisper.cpp/ggml/include)
endif()
|
||||
|
||||
# ============================================================================
|
||||
# Infrastructure Services Library (linked into main)
|
||||
# ============================================================================
|
||||
@ -333,3 +358,12 @@ option(BUILD_TESTING "Build integration tests" OFF)
|
||||
if(BUILD_TESTING)
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
# Manual STT test executable: builds tests/manual/test_stt_engines.cpp and
# links it against the AissiaAudio library and spdlog.
add_executable(test_stt_engines tests/manual/test_stt_engines.cpp)
target_link_libraries(test_stt_engines AissiaAudio spdlog::spdlog)
|
||||
|
||||
1
external/whisper.cpp
vendored
Submodule
1
external/whisper.cpp
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 19ceec8eac980403b714d603e5ca31653cd42a3f
|
||||
182
tests/manual/test_stt_engines.cpp
Normal file
182
tests/manual/test_stt_engines.cpp
Normal file
@ -0,0 +1,182 @@
|
||||
/**
|
||||
* @file test_stt_engines.cpp
|
||||
* @brief Manual test program for all 4 STT engines
|
||||
*
|
||||
* Tests each STT engine with the same audio file and compares results.
|
||||
*
|
||||
* Usage: ./test_stt_engines <audio_file.mp3>
|
||||
*/
|
||||
|
||||
#include "../../src/shared/audio/ISTTEngine.hpp"
|
||||
#include "../../src/shared/audio/PocketSphinxEngine.hpp"
|
||||
#include "../../src/shared/audio/VoskSTTEngine.hpp"
|
||||
#include "../../src/shared/audio/WhisperCppEngine.hpp"
|
||||
#include "../../src/shared/audio/WhisperAPIEngine.hpp"
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <spdlog/sinks/stdout_color_sinks.h>
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
|
||||
using namespace aissia;
|
||||
|
||||
/**
 * @brief Outcome of running one STT engine against the test audio file.
 *
 * The scalar members carry default initializers so that a result which is
 * only partially filled in (engine unavailable, or transcription threw) can
 * still be copied, moved and stored in a container without reading an
 * indeterminate value.
 */
struct TestResult {
    std::string engineName;     ///< Name reported by the engine.
    bool available = false;     ///< Whether the engine reported itself as available.
    std::string transcription;  ///< Transcribed text; empty if nothing was produced.
    double durationMs = 0.0;    ///< Transcription time in milliseconds; 0 when not measured.
    std::string error;          ///< Failure description; empty means no error was recorded.
};
|
||||
|
||||
/**
 * @brief Run a single STT engine on an audio file and collect the outcome.
 *
 * @param engine     Engine to exercise; must not be null (it is dereferenced
 *                   unconditionally).
 * @param audioFile  Path of the audio file handed to the engine.
 * @return A TestResult holding the engine name, its availability, and either
 *         the transcription with its duration or an error description.
 *
 * Exceptions derived from std::exception thrown by transcribeFile() are
 * caught and reported through TestResult::error instead of propagating.
 */
TestResult testEngine(ISTTEngine* engine, const std::string& audioFile) {
    // Value-initialize: `available` and `durationMs` must never be left
    // indeterminate, because the result is copied by the caller even on the
    // early-return and exception paths where no duration is recorded.
    TestResult result{};
    result.engineName = engine->getEngineName();
    result.available = engine->isAvailable();

    if (!result.available) {
        result.error = "Engine not available";
        return result;
    }

    // steady_clock is monotonic, which is what interval measurement needs;
    // high_resolution_clock may alias the adjustable system clock.
    const auto start = std::chrono::steady_clock::now();

    try {
        result.transcription = engine->transcribeFile(audioFile);

        const auto end = std::chrono::steady_clock::now();
        result.durationMs = std::chrono::duration<double, std::milli>(end - start).count();

        // An empty string is treated as a failure rather than a valid result.
        if (result.transcription.empty()) {
            result.error = "Empty transcription (file format not supported or processing failed)";
        }
    } catch (const std::exception& e) {
        result.error = std::string("Exception: ") + e.what();
    }

    return result;
}
|
||||
|
||||
void printResult(const TestResult& result) {
|
||||
std::cout << "\n";
|
||||
std::cout << "┌─────────────────────────────────────────────────────────\n";
|
||||
std::cout << "│ Engine: " << result.engineName << "\n";
|
||||
std::cout << "├─────────────────────────────────────────────────────────\n";
|
||||
|
||||
if (!result.available) {
|
||||
std::cout << "│ Status: ❌ NOT AVAILABLE\n";
|
||||
std::cout << "│ Reason: " << result.error << "\n";
|
||||
} else if (!result.error.empty()) {
|
||||
std::cout << "│ Status: ⚠️ ERROR\n";
|
||||
std::cout << "│ Error: " << result.error << "\n";
|
||||
} else {
|
||||
std::cout << "│ Status: ✅ SUCCESS\n";
|
||||
std::cout << "│ Duration: " << result.durationMs << " ms\n";
|
||||
std::cout << "│ Transcription: \"" << result.transcription << "\"\n";
|
||||
}
|
||||
|
||||
std::cout << "└─────────────────────────────────────────────────────────\n";
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
// Setup logging
|
||||
auto logger = spdlog::stdout_color_mt("test");
|
||||
spdlog::set_level(spdlog::level::info);
|
||||
|
||||
// Check arguments
|
||||
if (argc < 2) {
|
||||
std::cerr << "Usage: " << argv[0] << " <audio_file>\n";
|
||||
std::cerr << "Example: " << argv[0] << " test_audio.mp3\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string audioFile = argv[1];
|
||||
|
||||
std::cout << "\n";
|
||||
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
||||
std::cout << "║ STT ENGINES TEST - AISSIA Phase 7 ║\n";
|
||||
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
||||
std::cout << "\n";
|
||||
std::cout << "Audio file: " << audioFile << "\n";
|
||||
std::cout << "\n";
|
||||
std::cout << "Testing 4 STT engines...\n";
|
||||
|
||||
// Prepare engines
|
||||
std::vector<std::pair<std::string, std::unique_ptr<ISTTEngine>>> engines;
|
||||
|
||||
// 1. PocketSphinx
|
||||
std::cout << "\n[1/4] Initializing PocketSphinx...\n";
|
||||
engines.push_back({
|
||||
"PocketSphinx",
|
||||
std::make_unique<PocketSphinxEngine>("/usr/share/pocketsphinx/model/en-us")
|
||||
});
|
||||
|
||||
// 2. Vosk
|
||||
std::cout << "[2/4] Initializing Vosk...\n";
|
||||
engines.push_back({
|
||||
"Vosk",
|
||||
std::make_unique<VoskSTTEngine>("./models/vosk-model-small-fr-0.22")
|
||||
});
|
||||
|
||||
// 3. Whisper.cpp
|
||||
std::cout << "[3/4] Initializing Whisper.cpp...\n";
|
||||
engines.push_back({
|
||||
"Whisper.cpp",
|
||||
std::make_unique<WhisperCppEngine>("./models/ggml-base.bin")
|
||||
});
|
||||
|
||||
// 4. Whisper API
|
||||
std::cout << "[4/4] Initializing Whisper API...\n";
|
||||
const char* apiKey = std::getenv("OPENAI_API_KEY");
|
||||
engines.push_back({
|
||||
"Whisper API",
|
||||
std::make_unique<WhisperAPIEngine>(apiKey ? apiKey : "")
|
||||
});
|
||||
|
||||
// Test each engine
|
||||
std::vector<TestResult> results;
|
||||
for (auto& [name, engine] : engines) {
|
||||
std::cout << "\n▶ Testing " << name << "...\n";
|
||||
results.push_back(testEngine(engine.get(), audioFile));
|
||||
}
|
||||
|
||||
// Print results
|
||||
std::cout << "\n\n";
|
||||
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
||||
std::cout << "║ RESULTS ║\n";
|
||||
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
||||
|
||||
for (const auto& result : results) {
|
||||
printResult(result);
|
||||
}
|
||||
|
||||
// Summary
|
||||
std::cout << "\n\n";
|
||||
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
||||
std::cout << "║ SUMMARY ║\n";
|
||||
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
||||
std::cout << "\n";
|
||||
|
||||
int available = 0;
|
||||
int successful = 0;
|
||||
|
||||
for (const auto& result : results) {
|
||||
if (result.available) available++;
|
||||
if (result.available && result.error.empty()) successful++;
|
||||
}
|
||||
|
||||
std::cout << "Total engines tested: " << results.size() << "\n";
|
||||
std::cout << "Engines available: " << available << "/" << results.size() << "\n";
|
||||
std::cout << "Successful transcriptions: " << successful << "/" << available << "\n";
|
||||
std::cout << "\n";
|
||||
|
||||
if (successful > 0) {
|
||||
std::cout << "✅ STT system is working!\n";
|
||||
return 0;
|
||||
} else if (available > 0) {
|
||||
std::cout << "⚠️ Some engines available but all failed (check audio file format)\n";
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "❌ No STT engines available (install models/libraries)\n";
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user