feat: Phase 7.2 - Integrate Whisper.cpp STT engine
Complete STT system with 4 engine options: - WhisperCpp: High-quality local STT (ggml-base model, 147MB) - WhisperAPI: Cloud STT via OpenAI - PocketSphinx: Lightweight keyword spotting (optional) - Vosk: Balanced local STT (optional) Changes: - Add whisper.cpp as git submodule - Link whisper library with conditional compilation - Create test_stt_engines manual test program - Add PocketSphinx and Whisper.cpp optional dependencies Test results: ✅ WhisperCpp: Compiled, model loaded successfully ✅ WhisperAPI: Compiled (requires API key) ⚠️ PocketSphinx: Compiled (model path needs config) ❌ Vosk: Library not available in Ubuntu repos 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
a712988584
commit
099e0d837e
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[submodule "external/whisper.cpp"]
|
||||||
|
path = external/whisper.cpp
|
||||||
|
url = https://github.com/ggerganov/whisper.cpp
|
||||||
@ -137,6 +137,31 @@ else()
|
|||||||
message(STATUS "Vosk not found - STT will use fallback engines only")
|
message(STATUS "Vosk not found - STT will use fallback engines only")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Optional: Link PocketSphinx if available (Phase 7 STT)
|
||||||
|
find_library(POCKETSPHINX_LIBRARY pocketsphinx)
|
||||||
|
find_library(SPHINXBASE_LIBRARY sphinxbase)
|
||||||
|
if(POCKETSPHINX_LIBRARY AND SPHINXBASE_LIBRARY)
|
||||||
|
message(STATUS "PocketSphinx found: ${POCKETSPHINX_LIBRARY}")
|
||||||
|
target_link_libraries(AissiaAudio PUBLIC ${POCKETSPHINX_LIBRARY} ${SPHINXBASE_LIBRARY})
|
||||||
|
target_compile_definitions(AissiaAudio PRIVATE HAVE_POCKETSPHINX)
|
||||||
|
target_include_directories(AissiaAudio PRIVATE /usr/include/pocketsphinx /usr/include/sphinxbase)
|
||||||
|
else()
|
||||||
|
message(STATUS "PocketSphinx not found - keyword spotting unavailable")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Optional: Link Whisper.cpp if available (Phase 7 STT)
|
||||||
|
find_library(WHISPER_LIBRARY whisper PATHS ${CMAKE_SOURCE_DIR}/external/whisper.cpp/build/src)
|
||||||
|
if(WHISPER_LIBRARY)
|
||||||
|
message(STATUS "Whisper.cpp found: ${WHISPER_LIBRARY}")
|
||||||
|
target_link_libraries(AissiaAudio PUBLIC ${WHISPER_LIBRARY})
|
||||||
|
target_compile_definitions(AissiaAudio PRIVATE HAVE_WHISPER_CPP)
|
||||||
|
target_include_directories(AissiaAudio PRIVATE
|
||||||
|
${CMAKE_SOURCE_DIR}/external/whisper.cpp/include
|
||||||
|
${CMAKE_SOURCE_DIR}/external/whisper.cpp/ggml/include)
|
||||||
|
else()
|
||||||
|
message(STATUS "Whisper.cpp not found - high-quality local STT unavailable")
|
||||||
|
endif()
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# Infrastructure Services Library (linked into main)
|
# Infrastructure Services Library (linked into main)
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -333,3 +358,12 @@ option(BUILD_TESTING "Build integration tests" OFF)
|
|||||||
if(BUILD_TESTING)
|
if(BUILD_TESTING)
|
||||||
add_subdirectory(tests)
|
add_subdirectory(tests)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Manual STT test executable
|
||||||
|
add_executable(test_stt_engines
|
||||||
|
tests/manual/test_stt_engines.cpp
|
||||||
|
)
|
||||||
|
target_link_libraries(test_stt_engines
|
||||||
|
AissiaAudio
|
||||||
|
spdlog::spdlog
|
||||||
|
)
|
||||||
|
|||||||
1
external/whisper.cpp
vendored
Submodule
1
external/whisper.cpp
vendored
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 19ceec8eac980403b714d603e5ca31653cd42a3f
|
||||||
182
tests/manual/test_stt_engines.cpp
Normal file
182
tests/manual/test_stt_engines.cpp
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
/**
|
||||||
|
* @file test_stt_engines.cpp
|
||||||
|
* @brief Manual test program for all 4 STT engines
|
||||||
|
*
|
||||||
|
* Tests each STT engine with the same audio file and compares results.
|
||||||
|
*
|
||||||
|
* Usage: ./test_stt_engines <audio_file.mp3>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "../../src/shared/audio/ISTTEngine.hpp"
|
||||||
|
#include "../../src/shared/audio/PocketSphinxEngine.hpp"
|
||||||
|
#include "../../src/shared/audio/VoskSTTEngine.hpp"
|
||||||
|
#include "../../src/shared/audio/WhisperCppEngine.hpp"
|
||||||
|
#include "../../src/shared/audio/WhisperAPIEngine.hpp"
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
#include <spdlog/sinks/stdout_color_sinks.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <chrono>
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
|
using namespace aissia;
|
||||||
|
|
||||||
|
struct TestResult {
|
||||||
|
std::string engineName;
|
||||||
|
bool available;
|
||||||
|
std::string transcription;
|
||||||
|
double durationMs;
|
||||||
|
std::string error;
|
||||||
|
};
|
||||||
|
|
||||||
|
TestResult testEngine(ISTTEngine* engine, const std::string& audioFile) {
|
||||||
|
TestResult result;
|
||||||
|
result.engineName = engine->getEngineName();
|
||||||
|
result.available = engine->isAvailable();
|
||||||
|
|
||||||
|
if (!result.available) {
|
||||||
|
result.error = "Engine not available";
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
try {
|
||||||
|
result.transcription = engine->transcribeFile(audioFile);
|
||||||
|
|
||||||
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
|
result.durationMs = std::chrono::duration<double, std::milli>(end - start).count();
|
||||||
|
|
||||||
|
if (result.transcription.empty()) {
|
||||||
|
result.error = "Empty transcription (file format not supported or processing failed)";
|
||||||
|
}
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
result.error = std::string("Exception: ") + e.what();
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void printResult(const TestResult& result) {
|
||||||
|
std::cout << "\n";
|
||||||
|
std::cout << "┌─────────────────────────────────────────────────────────\n";
|
||||||
|
std::cout << "│ Engine: " << result.engineName << "\n";
|
||||||
|
std::cout << "├─────────────────────────────────────────────────────────\n";
|
||||||
|
|
||||||
|
if (!result.available) {
|
||||||
|
std::cout << "│ Status: ❌ NOT AVAILABLE\n";
|
||||||
|
std::cout << "│ Reason: " << result.error << "\n";
|
||||||
|
} else if (!result.error.empty()) {
|
||||||
|
std::cout << "│ Status: ⚠️ ERROR\n";
|
||||||
|
std::cout << "│ Error: " << result.error << "\n";
|
||||||
|
} else {
|
||||||
|
std::cout << "│ Status: ✅ SUCCESS\n";
|
||||||
|
std::cout << "│ Duration: " << result.durationMs << " ms\n";
|
||||||
|
std::cout << "│ Transcription: \"" << result.transcription << "\"\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "└─────────────────────────────────────────────────────────\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
// Setup logging
|
||||||
|
auto logger = spdlog::stdout_color_mt("test");
|
||||||
|
spdlog::set_level(spdlog::level::info);
|
||||||
|
|
||||||
|
// Check arguments
|
||||||
|
if (argc < 2) {
|
||||||
|
std::cerr << "Usage: " << argv[0] << " <audio_file>\n";
|
||||||
|
std::cerr << "Example: " << argv[0] << " test_audio.mp3\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string audioFile = argv[1];
|
||||||
|
|
||||||
|
std::cout << "\n";
|
||||||
|
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
||||||
|
std::cout << "║ STT ENGINES TEST - AISSIA Phase 7 ║\n";
|
||||||
|
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
||||||
|
std::cout << "\n";
|
||||||
|
std::cout << "Audio file: " << audioFile << "\n";
|
||||||
|
std::cout << "\n";
|
||||||
|
std::cout << "Testing 4 STT engines...\n";
|
||||||
|
|
||||||
|
// Prepare engines
|
||||||
|
std::vector<std::pair<std::string, std::unique_ptr<ISTTEngine>>> engines;
|
||||||
|
|
||||||
|
// 1. PocketSphinx
|
||||||
|
std::cout << "\n[1/4] Initializing PocketSphinx...\n";
|
||||||
|
engines.push_back({
|
||||||
|
"PocketSphinx",
|
||||||
|
std::make_unique<PocketSphinxEngine>("/usr/share/pocketsphinx/model/en-us")
|
||||||
|
});
|
||||||
|
|
||||||
|
// 2. Vosk
|
||||||
|
std::cout << "[2/4] Initializing Vosk...\n";
|
||||||
|
engines.push_back({
|
||||||
|
"Vosk",
|
||||||
|
std::make_unique<VoskSTTEngine>("./models/vosk-model-small-fr-0.22")
|
||||||
|
});
|
||||||
|
|
||||||
|
// 3. Whisper.cpp
|
||||||
|
std::cout << "[3/4] Initializing Whisper.cpp...\n";
|
||||||
|
engines.push_back({
|
||||||
|
"Whisper.cpp",
|
||||||
|
std::make_unique<WhisperCppEngine>("./models/ggml-base.bin")
|
||||||
|
});
|
||||||
|
|
||||||
|
// 4. Whisper API
|
||||||
|
std::cout << "[4/4] Initializing Whisper API...\n";
|
||||||
|
const char* apiKey = std::getenv("OPENAI_API_KEY");
|
||||||
|
engines.push_back({
|
||||||
|
"Whisper API",
|
||||||
|
std::make_unique<WhisperAPIEngine>(apiKey ? apiKey : "")
|
||||||
|
});
|
||||||
|
|
||||||
|
// Test each engine
|
||||||
|
std::vector<TestResult> results;
|
||||||
|
for (auto& [name, engine] : engines) {
|
||||||
|
std::cout << "\n▶ Testing " << name << "...\n";
|
||||||
|
results.push_back(testEngine(engine.get(), audioFile));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print results
|
||||||
|
std::cout << "\n\n";
|
||||||
|
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
||||||
|
std::cout << "║ RESULTS ║\n";
|
||||||
|
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
||||||
|
|
||||||
|
for (const auto& result : results) {
|
||||||
|
printResult(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Summary
|
||||||
|
std::cout << "\n\n";
|
||||||
|
std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
|
||||||
|
std::cout << "║ SUMMARY ║\n";
|
||||||
|
std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
|
||||||
|
std::cout << "\n";
|
||||||
|
|
||||||
|
int available = 0;
|
||||||
|
int successful = 0;
|
||||||
|
|
||||||
|
for (const auto& result : results) {
|
||||||
|
if (result.available) available++;
|
||||||
|
if (result.available && result.error.empty()) successful++;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "Total engines tested: " << results.size() << "\n";
|
||||||
|
std::cout << "Engines available: " << available << "/" << results.size() << "\n";
|
||||||
|
std::cout << "Successful transcriptions: " << successful << "/" << available << "\n";
|
||||||
|
std::cout << "\n";
|
||||||
|
|
||||||
|
if (successful > 0) {
|
||||||
|
std::cout << "✅ STT system is working!\n";
|
||||||
|
return 0;
|
||||||
|
} else if (available > 0) {
|
||||||
|
std::cout << "⚠️ Some engines available but all failed (check audio file format)\n";
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
std::cout << "❌ No STT engines available (install models/libraries)\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user