feat: Phase 7.2 - Integrate Whisper.cpp STT engine

Complete STT system with 4 engine options:
- WhisperCpp: High-quality local STT (ggml-base model, 147MB)
- WhisperAPI: Cloud STT via OpenAI
- PocketSphinx: Lightweight keyword spotting (optional)
- Vosk: Balanced local STT (optional)

Changes:
- Add whisper.cpp as git submodule
- Link whisper library with conditional compilation
- Create test_stt_engines manual test program
- Add PocketSphinx and Whisper.cpp optional dependencies

Test results:
 WhisperCpp: Compiled, model loaded successfully
 WhisperAPI: Compiled (requires API key)
⚠️  PocketSphinx: Compiled (model path needs config)
 Vosk: Library not available in Ubuntu repos

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
StillHammer 2025-11-29 22:27:44 +08:00
parent a712988584
commit 099e0d837e
4 changed files with 220 additions and 0 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "external/whisper.cpp"]
path = external/whisper.cpp
url = https://github.com/ggerganov/whisper.cpp

View File

@ -137,6 +137,31 @@ else()
message(STATUS "Vosk not found - STT will use fallback engines only")
endif()
# Optional: Link PocketSphinx if available (Phase 7 STT)
find_library(POCKETSPHINX_LIBRARY pocketsphinx)
find_library(SPHINXBASE_LIBRARY sphinxbase)
if(POCKETSPHINX_LIBRARY AND SPHINXBASE_LIBRARY)
message(STATUS "PocketSphinx found: ${POCKETSPHINX_LIBRARY}")
target_link_libraries(AissiaAudio PUBLIC ${POCKETSPHINX_LIBRARY} ${SPHINXBASE_LIBRARY})
target_compile_definitions(AissiaAudio PRIVATE HAVE_POCKETSPHINX)
target_include_directories(AissiaAudio PRIVATE /usr/include/pocketsphinx /usr/include/sphinxbase)
else()
message(STATUS "PocketSphinx not found - keyword spotting unavailable")
endif()
# Optional: Link Whisper.cpp if available (Phase 7 STT)
find_library(WHISPER_LIBRARY whisper PATHS ${CMAKE_SOURCE_DIR}/external/whisper.cpp/build/src)
if(WHISPER_LIBRARY)
message(STATUS "Whisper.cpp found: ${WHISPER_LIBRARY}")
target_link_libraries(AissiaAudio PUBLIC ${WHISPER_LIBRARY})
target_compile_definitions(AissiaAudio PRIVATE HAVE_WHISPER_CPP)
target_include_directories(AissiaAudio PRIVATE
${CMAKE_SOURCE_DIR}/external/whisper.cpp/include
${CMAKE_SOURCE_DIR}/external/whisper.cpp/ggml/include)
else()
message(STATUS "Whisper.cpp not found - high-quality local STT unavailable")
endif()
# ============================================================================
# Infrastructure Services Library (linked into main)
# ============================================================================
@ -333,3 +358,12 @@ option(BUILD_TESTING "Build integration tests" OFF)
if(BUILD_TESTING)
add_subdirectory(tests)
endif()
# Manual STT test executable
add_executable(test_stt_engines
tests/manual/test_stt_engines.cpp
)
target_link_libraries(test_stt_engines
AissiaAudio
spdlog::spdlog
)

1
external/whisper.cpp vendored Submodule

@ -0,0 +1 @@
Subproject commit 19ceec8eac980403b714d603e5ca31653cd42a3f

View File

@ -0,0 +1,182 @@
/**
* @file test_stt_engines.cpp
* @brief Manual test program for all 4 STT engines
*
* Tests each STT engine with the same audio file and compares results.
*
* Usage: ./test_stt_engines <audio_file.mp3>
*/
#include "../../src/shared/audio/ISTTEngine.hpp"
#include "../../src/shared/audio/PocketSphinxEngine.hpp"
#include "../../src/shared/audio/VoskSTTEngine.hpp"
#include "../../src/shared/audio/WhisperCppEngine.hpp"
#include "../../src/shared/audio/WhisperAPIEngine.hpp"
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>
#include <iostream>
#include <chrono>
#include <cstdlib>
using namespace aissia;
struct TestResult {
std::string engineName;
bool available;
std::string transcription;
double durationMs;
std::string error;
};
TestResult testEngine(ISTTEngine* engine, const std::string& audioFile) {
TestResult result;
result.engineName = engine->getEngineName();
result.available = engine->isAvailable();
if (!result.available) {
result.error = "Engine not available";
return result;
}
auto start = std::chrono::high_resolution_clock::now();
try {
result.transcription = engine->transcribeFile(audioFile);
auto end = std::chrono::high_resolution_clock::now();
result.durationMs = std::chrono::duration<double, std::milli>(end - start).count();
if (result.transcription.empty()) {
result.error = "Empty transcription (file format not supported or processing failed)";
}
} catch (const std::exception& e) {
result.error = std::string("Exception: ") + e.what();
}
return result;
}
void printResult(const TestResult& result) {
std::cout << "\n";
std::cout << "┌─────────────────────────────────────────────────────────\n";
std::cout << "│ Engine: " << result.engineName << "\n";
std::cout << "├─────────────────────────────────────────────────────────\n";
if (!result.available) {
std::cout << "│ Status: ❌ NOT AVAILABLE\n";
std::cout << "│ Reason: " << result.error << "\n";
} else if (!result.error.empty()) {
std::cout << "│ Status: ⚠️ ERROR\n";
std::cout << "│ Error: " << result.error << "\n";
} else {
std::cout << "│ Status: ✅ SUCCESS\n";
std::cout << "│ Duration: " << result.durationMs << " ms\n";
std::cout << "│ Transcription: \"" << result.transcription << "\"\n";
}
std::cout << "└─────────────────────────────────────────────────────────\n";
}
int main(int argc, char* argv[]) {
// Setup logging
auto logger = spdlog::stdout_color_mt("test");
spdlog::set_level(spdlog::level::info);
// Check arguments
if (argc < 2) {
std::cerr << "Usage: " << argv[0] << " <audio_file>\n";
std::cerr << "Example: " << argv[0] << " test_audio.mp3\n";
return 1;
}
std::string audioFile = argv[1];
std::cout << "\n";
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
std::cout << "║ STT ENGINES TEST - AISSIA Phase 7 ║\n";
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
std::cout << "\n";
std::cout << "Audio file: " << audioFile << "\n";
std::cout << "\n";
std::cout << "Testing 4 STT engines...\n";
// Prepare engines
std::vector<std::pair<std::string, std::unique_ptr<ISTTEngine>>> engines;
// 1. PocketSphinx
std::cout << "\n[1/4] Initializing PocketSphinx...\n";
engines.push_back({
"PocketSphinx",
std::make_unique<PocketSphinxEngine>("/usr/share/pocketsphinx/model/en-us")
});
// 2. Vosk
std::cout << "[2/4] Initializing Vosk...\n";
engines.push_back({
"Vosk",
std::make_unique<VoskSTTEngine>("./models/vosk-model-small-fr-0.22")
});
// 3. Whisper.cpp
std::cout << "[3/4] Initializing Whisper.cpp...\n";
engines.push_back({
"Whisper.cpp",
std::make_unique<WhisperCppEngine>("./models/ggml-base.bin")
});
// 4. Whisper API
std::cout << "[4/4] Initializing Whisper API...\n";
const char* apiKey = std::getenv("OPENAI_API_KEY");
engines.push_back({
"Whisper API",
std::make_unique<WhisperAPIEngine>(apiKey ? apiKey : "")
});
// Test each engine
std::vector<TestResult> results;
for (auto& [name, engine] : engines) {
std::cout << "\n▶ Testing " << name << "...\n";
results.push_back(testEngine(engine.get(), audioFile));
}
// Print results
std::cout << "\n\n";
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
std::cout << "║ RESULTS ║\n";
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
for (const auto& result : results) {
printResult(result);
}
// Summary
std::cout << "\n\n";
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
std::cout << "║ SUMMARY ║\n";
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
std::cout << "\n";
int available = 0;
int successful = 0;
for (const auto& result : results) {
if (result.available) available++;
if (result.available && result.error.empty()) successful++;
}
std::cout << "Total engines tested: " << results.size() << "\n";
std::cout << "Engines available: " << available << "/" << results.size() << "\n";
std::cout << "Successful transcriptions: " << successful << "/" << available << "\n";
std::cout << "\n";
if (successful > 0) {
std::cout << "✅ STT system is working!\n";
return 0;
} else if (available > 0) {
std::cout << "⚠️ Some engines available but all failed (check audio file format)\n";
return 1;
} else {
std::cout << "❌ No STT engines available (install models/libraries)\n";
return 1;
}
}