Compare commits

10 commits: 3c588a9f90 ... cb938500cd

| SHA1 |
|---|
| cb938500cd |
| d7971e0c34 |
| c9b21e3f96 |
| f5c03e1343 |
| 099e0d837e |
| a712988584 |
| 2a0ace3441 |
| a01115a848 |
| eab58c6ab3 |
| 3915424d75 |
.gitignore (vendored) — 7:

    @@ -64,7 +64,12 @@ Thumbs.db
     *.mov
     *.wmv
     *.mp3
     *.wav
    +
    +# Symlink to GroveEngine (local only)
    +external/GroveEngine
    +
    +build/
     *.o
     *.a
     *.so
.gitmodules (vendored, new file) — 3:

    @@ -0,0 +1,3 @@
    +[submodule "external/whisper.cpp"]
    +    path = external/whisper.cpp
    +    url = https://github.com/ggerganov/whisper.cpp
CMakeLists.txt — 102:

    @@ -42,6 +42,12 @@ endif()
     
     # cpp-httplib (header-only HTTP client)
     include(FetchContent)
    +
    +# Disable OpenSSL auto-detection in httplib if OpenSSL is not available
    +if(NOT OPENSSL_FOUND)
    +    set(HTTPLIB_USE_OPENSSL_IF_AVAILABLE OFF CACHE BOOL "Disable OpenSSL in httplib" FORCE)
    +endif()
    +
     FetchContent_Declare(
         httplib
         GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git

    @@ -68,15 +74,20 @@ target_link_libraries(AissiaLLM PUBLIC
         GroveEngine::impl
         spdlog::spdlog
     )
    +# Link Winsock for httplib on Windows
    +if(WIN32)
    +    target_link_libraries(AissiaLLM PUBLIC ws2_32)
    +endif()
     if(OPENSSL_FOUND)
         target_link_libraries(AissiaLLM PUBLIC OpenSSL::SSL OpenSSL::Crypto)
         target_compile_definitions(AissiaLLM PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
     endif()
     
    -# Tools Library (Internal tools + FileSystem tools + MCP client + MCP server)
    +# Tools Library (Internal tools + FileSystem tools + MCP client + MCP server + MCP Server Tools)
     add_library(AissiaTools STATIC
         src/shared/tools/InternalTools.cpp
         src/shared/tools/FileSystemTools.cpp
    +    src/shared/tools/MCPServerTools.cpp
         src/shared/mcp/StdioTransport.cpp
         src/shared/mcp/MCPClient.cpp
         src/shared/mcp/MCPServer.cpp

    @@ -107,6 +118,9 @@ endif()
     add_library(AissiaAudio STATIC
         src/shared/audio/TTSEngineFactory.cpp
         src/shared/audio/STTEngineFactory.cpp
    +    src/shared/audio/VoskSTTEngine.cpp
    +    src/shared/audio/PocketSphinxEngine.cpp
    +    src/shared/audio/WhisperCppEngine.cpp
     )
     target_include_directories(AissiaAudio PUBLIC
         ${CMAKE_CURRENT_SOURCE_DIR}/src

    @@ -116,12 +130,50 @@ target_include_directories(AissiaAudio PUBLIC
     target_link_libraries(AissiaAudio PUBLIC
         spdlog::spdlog
     )
    +# Link Winsock for httplib on Windows
    +if(WIN32)
    +    target_link_libraries(AissiaAudio PUBLIC ws2_32 sapi ole32)
    +endif()
     if(OPENSSL_FOUND)
         target_link_libraries(AissiaAudio PUBLIC OpenSSL::SSL OpenSSL::Crypto)
         target_compile_definitions(AissiaAudio PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
     endif()
    -if(WIN32)
    -    target_link_libraries(AissiaAudio PUBLIC sapi ole32)
    +# Note: Si OpenSSL n'est pas trouvé, on ne définit PAS CPPHTTPLIB_OPENSSL_SUPPORT
    +# httplib utilisera HTTP simple sans SSL
    +
    +# Optional: Link Vosk if available (Phase 7 STT)
    +find_library(VOSK_LIBRARY vosk)
    +if(VOSK_LIBRARY)
    +    message(STATUS "Vosk found: ${VOSK_LIBRARY}")
    +    target_link_libraries(AissiaAudio PUBLIC ${VOSK_LIBRARY})
    +    target_compile_definitions(AissiaAudio PRIVATE HAS_VOSK)
    +else()
    +    message(STATUS "Vosk not found - STT will use fallback engines only")
    +endif()
    +
    +# Optional: Link PocketSphinx if available (Phase 7 STT)
    +find_library(POCKETSPHINX_LIBRARY pocketsphinx)
    +find_library(SPHINXBASE_LIBRARY sphinxbase)
    +if(POCKETSPHINX_LIBRARY AND SPHINXBASE_LIBRARY)
    +    message(STATUS "PocketSphinx found: ${POCKETSPHINX_LIBRARY}")
    +    target_link_libraries(AissiaAudio PUBLIC ${POCKETSPHINX_LIBRARY} ${SPHINXBASE_LIBRARY})
    +    target_compile_definitions(AissiaAudio PRIVATE HAVE_POCKETSPHINX)
    +    target_include_directories(AissiaAudio PRIVATE /usr/include/pocketsphinx /usr/include/sphinxbase)
    +else()
    +    message(STATUS "PocketSphinx not found - keyword spotting unavailable")
    +endif()
    +
    +# Optional: Link Whisper.cpp if available (Phase 7 STT)
    +find_library(WHISPER_LIBRARY whisper PATHS ${CMAKE_SOURCE_DIR}/external/whisper.cpp/build/src)
    +if(WHISPER_LIBRARY)
    +    message(STATUS "Whisper.cpp found: ${WHISPER_LIBRARY}")
    +    target_link_libraries(AissiaAudio PUBLIC ${WHISPER_LIBRARY})
    +    target_compile_definitions(AissiaAudio PRIVATE HAVE_WHISPER_CPP)
    +    target_include_directories(AissiaAudio PRIVATE
    +        ${CMAKE_SOURCE_DIR}/external/whisper.cpp/include
    +        ${CMAKE_SOURCE_DIR}/external/whisper.cpp/ggml/include)
    +else()
    +    message(STATUS "Whisper.cpp not found - high-quality local STT unavailable")
     endif()
     
     # ============================================================================

    @@ -132,6 +184,7 @@ add_library(AissiaServices STATIC
         src/services/StorageService.cpp
         src/services/PlatformService.cpp
         src/services/VoiceService.cpp
    +    src/services/STTService.cpp
     )
     target_include_directories(AissiaServices PUBLIC
         ${CMAKE_CURRENT_SOURCE_DIR}/src

    @@ -261,6 +314,10 @@ target_link_libraries(WebModule PRIVATE
         GroveEngine::impl
         spdlog::spdlog
     )
    +# Link Winsock for httplib on Windows
    +if(WIN32)
    +    target_link_libraries(WebModule PRIVATE ws2_32)
    +endif()
     if(OPENSSL_FOUND)
         target_link_libraries(WebModule PRIVATE OpenSSL::SSL OpenSSL::Crypto)
         target_compile_definitions(WebModule PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)

    @@ -280,26 +337,35 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/config/
     # Development targets
     # ============================================================================
     
    -# TestRunnerModule - Orchestrator for integration tests
    -add_library(TestRunnerModule SHARED
    +# TestRunnerModule - Orchestrator for integration tests (Unix only - uses dlfcn.h)
    +if(UNIX)
    +add_library(TestRunnerModule SHARED
         src/modules/TestRunnerModule.cpp
     )
     target_include_directories(TestRunnerModule PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
     target_link_libraries(TestRunnerModule PRIVATE
         GroveEngine::impl
         spdlog::spdlog
         ${CMAKE_DL_LIBS}
     )
     set_target_properties(TestRunnerModule PROPERTIES
         PREFIX "lib"
         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/modules
     )
    +endif()
     
     # Quick rebuild of modules only (for hot-reload workflow)
    -add_custom_target(modules
    +if(UNIX)
    +add_custom_target(modules
         DEPENDS SchedulerModule NotificationModule StorageModule MonitoringModule AIModule VoiceModule WebModule TestRunnerModule
         COMMENT "Building hot-reloadable modules only"
     )
    +else()
    +add_custom_target(modules
    +    DEPENDS SchedulerModule NotificationModule StorageModule MonitoringModule AIModule VoiceModule WebModule
    +    COMMENT "Building hot-reloadable modules only"
    +)
    +endif()
     
     # Create data directory
     file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/data)

    @@ -319,3 +385,13 @@ option(BUILD_TESTING "Build integration tests" OFF)
     if(BUILD_TESTING)
         add_subdirectory(tests)
     endif()
    +
    +# Manual STT test executable
    +add_executable(test_stt_engines
    +    tests/manual/test_stt_engines.cpp
    +)
    +target_link_libraries(test_stt_engines
    +    AissiaAudio
    +    spdlog::spdlog
    +)
    +# Link Winsock for httplib on Windows (already linked via AissiaLLM PUBLIC dependency)
config/README_MCP.md (new file, 305 lines):

# AISSIA MCP Configuration for Claude Code

This directory contains an example MCP (Model Context Protocol) configuration for integrating AISSIA with Claude Code.

## Quick Setup

### 1. Locate Claude Code MCP Settings

The MCP configuration file location depends on your operating system:

**Windows**:
```
%APPDATA%\Code\User\globalStorage\saoudrizwan.claude-dev\settings\cline_mcp_settings.json
```

Full path example:
```
C:\Users\YourUsername\AppData\Roaming\Code\User\globalStorage\saoudrizwan.claude-dev\settings\cline_mcp_settings.json
```

**macOS**:
```
~/Library/Application Support/Code/User/globalStorage/saoudrizwan.claude-dev/settings/cline_mcp_settings.json
```

**Linux**:
```
~/.config/Code/User/globalStorage/saoudrizwan.claude-dev/settings/cline_mcp_settings.json
```

### 2. Copy Configuration

Copy the contents of `claude_code_mcp_config.json` to the Claude Code MCP settings file.

**Important**: Update the `command` path to point to your actual AISSIA executable:

```json
{
  "mcpServers": {
    "aissia": {
      "command": "C:\\path\\to\\your\\aissia\\build\\aissia.exe",
      "args": ["--mcp-server"],
      "disabled": false
    }
  }
}
```

### 3. Restart Claude Code

Restart VS Code (or reload the window: `Ctrl+Shift+P` → "Developer: Reload Window") to apply the changes.

### 4. Verify Integration

Open Claude Code and check that AISSIA tools are available:

```
You: Can you list the available MCP servers?
Claude: I have access to the following MCP servers:
- aissia: 13 tools available
```

## Available Tools

Once configured, Claude will have access to these 13 AISSIA tools:

### AISSIA Core (5 tools)
1. **chat_with_aissia** ⭐ - Dialogue with AISSIA's AI assistant (Claude Sonnet 4)
2. **transcribe_audio** - Transcribe audio files to text
3. **text_to_speech** - Convert text to speech audio files
4. **save_memory** - Save notes to AISSIA's persistent storage
5. **search_memories** - Search through saved memories

### File System (8 tools)
6. **read_file** - Read file contents
7. **write_file** - Write content to files
8. **list_directory** - List files in a directory
9. **search_files** - Search for files by pattern
10. **file_exists** - Check if a file exists
11. **create_directory** - Create directories
12. **delete_file** - Delete files
13. **move_file** - Move or rename files

## Configuration Options

### Basic Configuration

```json
{
  "mcpServers": {
    "aissia": {
      "command": "path/to/aissia.exe",
      "args": ["--mcp-server"],
      "disabled": false
    }
  }
}
```

### With Auto-Approval

To skip confirmation prompts for specific tools:

```json
{
  "mcpServers": {
    "aissia": {
      "command": "path/to/aissia.exe",
      "args": ["--mcp-server"],
      "disabled": false,
      "alwaysAllow": ["chat_with_aissia", "read_file", "write_file"]
    }
  }
}
```

### Disable Server

To temporarily disable AISSIA without removing the configuration:

```json
{
  "mcpServers": {
    "aissia": {
      "command": "path/to/aissia.exe",
      "args": ["--mcp-server"],
      "disabled": true
    }
  }
}
```

## Prerequisites

Before running AISSIA in MCP server mode, ensure these config files exist:

### config/ai.json
```json
{
  "provider": "claude",
  "api_key": "sk-ant-api03-...",
  "model": "claude-sonnet-4-20250514",
  "max_iterations": 10,
  "system_prompt": "Tu es AISSIA, un assistant personnel intelligent..."
}
```

### config/storage.json
```json
{
  "database_path": "./data/aissia.db",
  "journal_mode": "WAL",
  "busy_timeout_ms": 5000
}
```

### config/voice.json (optional)
```json
{
  "tts": {
    "enabled": true,
    "rate": 0,
    "volume": 80
  },
  "stt": {
    "active_mode": {
      "enabled": false
    }
  }
}
```
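
If you want to sanity-check these files before wiring AISSIA into Claude Code, a small script can catch missing files or invalid JSON early. This is a minimal sketch (not part of AISSIA); it assumes the file names listed above and that it is run from the repository root.

```python
#!/usr/bin/env python3
"""Preflight check: verify the AISSIA config files exist and parse as JSON."""
import json
import sys
from pathlib import Path

REQUIRED = ["config/ai.json", "config/storage.json"]
OPTIONAL = ["config/voice.json"]

ok = True
for name in REQUIRED + OPTIONAL:
    path = Path(name)
    if not path.exists():
        level = "ERROR" if name in REQUIRED else "WARN"
        print(f"[{level}] missing {name}")
        if name in REQUIRED:
            ok = False
        continue
    try:
        json.loads(path.read_text(encoding="utf-8"))
        print(f"[OK] {name}")
    except json.JSONDecodeError as exc:
        print(f"[ERROR] {name} is not valid JSON: {exc}")
        ok = False

sys.exit(0 if ok else 1)
```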

## Testing MCP Server

You can test the MCP server independently before integrating with Claude Code:

```bash
# Test tools/list
echo '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' | ./build/aissia.exe --mcp-server

# Test chat_with_aissia tool
echo '{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"chat_with_aissia","arguments":{"message":"What time is it?"}}}' | ./build/aissia.exe --mcp-server
```
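
For repeated manual testing it can be more convenient to keep one server process alive and send several requests over its stdin. The sketch below is an assumption-laden helper, not part of AISSIA: it assumes the server reads one JSON-RPC request per line on stdin and writes one JSON response per line on stdout, as the `echo` examples above suggest, and the executable path may need adjusting for your platform.

```python
#!/usr/bin/env python3
"""Minimal stdio JSON-RPC client for poking at the AISSIA MCP server (illustrative)."""
import json
import subprocess

SERVER_CMD = ["./build/aissia.exe", "--mcp-server"]  # adjust for your build/platform

def main():
    proc = subprocess.Popen(SERVER_CMD, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, text=True)
    requests = [
        {"jsonrpc": "2.0", "id": 1, "method": "tools/list"},
        {"jsonrpc": "2.0", "id": 2, "method": "tools/call",
         "params": {"name": "chat_with_aissia",
                    "arguments": {"message": "What time is it?"}}},
    ]
    try:
        for req in requests:
            proc.stdin.write(json.dumps(req) + "\n")   # one request per line
            proc.stdin.flush()
            line = proc.stdout.readline()              # one response per line (assumed)
            print(json.dumps(json.loads(line), indent=2))
    finally:
        proc.stdin.close()
        proc.wait(timeout=10)

if __name__ == "__main__":
    main()
```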

## Troubleshooting

### "Server not found" or "Connection failed"

1. Verify the `command` path is correct and points to `aissia.exe`
2. Make sure AISSIA compiles successfully: `cmake --build build`
3. Test running `./build/aissia.exe --mcp-server` manually

### "LLMService not initialized"

AISSIA requires `config/ai.json` with a valid Claude API key. Check that:
1. The file exists: `config/ai.json`
2. The API key is valid: `"api_key": "sk-ant-api03-..."`
3. The provider is set: `"provider": "claude"`

### "Tool execution failed"

Some tools have limited functionality in the Phase 8 MVP:
- `transcribe_audio` - Not fully implemented yet (STT file support needed)
- `text_to_speech` - Not fully implemented yet (TTS file output needed)
- `save_memory` - Not fully implemented yet (Storage sync methods needed)
- `search_memories` - Not fully implemented yet (Storage sync methods needed)

These will be completed in Phases 8.1 and 8.2.

### Server starts but tools don't appear

1. Check Claude Code logs: `Ctrl+Shift+P` → "Developer: Open Extension Logs"
2. Look for MCP server initialization errors
3. Verify the JSON syntax in the MCP configuration file

## Example Use Cases

### 1. Ask AISSIA for Help

```
You: Use chat_with_aissia to ask "What are my top productivity patterns?"
Claude: [calls chat_with_aissia tool]
AISSIA: Based on your activity data, your most productive hours are 9-11 AM...
```

### 2. File Operations + AI

```
You: Read my TODO.md file and ask AISSIA to prioritize the tasks
Claude: [calls read_file("TODO.md")]
Claude: [calls chat_with_aissia with task list]
AISSIA: Here's a prioritized version based on urgency and dependencies...
```

### 3. Voice Transcription (future)

```
You: Transcribe meeting-notes.wav to text
Claude: [calls transcribe_audio("meeting-notes.wav")]
Result: "Welcome to the team meeting. Today we're discussing..."
```

## Advanced Configuration

### Multiple MCP Servers

You can configure multiple MCP servers alongside AISSIA:

```json
{
  "mcpServers": {
    "aissia": {
      "command": "C:\\path\\to\\aissia\\build\\aissia.exe",
      "args": ["--mcp-server"],
      "disabled": false
    },
    "filesystem": {
      "command": "npx",
      "args": ["-y", "@modelcontextprotocol/server-filesystem", "C:\\Users"],
      "disabled": false
    },
    "brave-search": {
      "command": "npx",
      "args": ["-y", "@modelcontextprotocol/server-brave-search"],
      "disabled": false,
      "env": {
        "BRAVE_API_KEY": "your-brave-api-key"
      }
    }
  }
}
```

### Environment Variables

Pass environment variables to AISSIA:

```json
{
  "mcpServers": {
    "aissia": {
      "command": "C:\\path\\to\\aissia\\build\\aissia.exe",
      "args": ["--mcp-server"],
      "disabled": false,
      "env": {
        "AISSIA_LOG_LEVEL": "debug",
        "CLAUDE_API_KEY": "sk-ant-api03-..."
      }
    }
  }
}
```

## References

- **Full Documentation**: `docs/CLAUDE_CODE_INTEGRATION.md`
- **MCP Specification**: https://github.com/anthropics/mcp
- **Claude Code Extension**: https://marketplace.visualstudio.com/items?itemName=saoudrizwan.claude-dev

## Support

For issues or questions:
1. Check the full documentation: `docs/CLAUDE_CODE_INTEGRATION.md`
2. Review logs: AISSIA writes to stderr in MCP mode
3. Test manually: run `./build/aissia.exe --mcp-server` and send JSON-RPC requests
config/claude_code_mcp_config.json (new file, 10 lines):

    {
      "mcpServers": {
        "aissia": {
          "command": "C:\\Users\\alexi\\Documents\\projects\\aissia\\build\\aissia.exe",
          "args": ["--mcp-server"],
          "disabled": false,
          "alwaysAllow": []
        }
      }
    }
(test-runner configuration; file name not captured in this view)

    @@ -1,6 +1,6 @@
     {
       "enabled": true,
    -  "testDirectory": "tests/integration",
    +  "testDirectory": "build/tests/integration",
       "globalTimeoutMs": 300000,
       "stopOnFirstFailure": false,
       "verboseOutput": true,
config/voice.json:

    @@ -3,12 +3,34 @@
         "enabled": true,
         "engine": "auto",
         "rate": 0,
    -    "volume": 80
    +    "volume": 80,
    +    "voice": "fr-fr"
       },
       "stt": {
    +    "passive_mode": {
    +      "enabled": false,
    +      "engine": "pocketsphinx",
    +      "keywords": ["celuna", "hey celuna", "ok celuna"],
    +      "threshold": 0.8,
    +      "model_path": "/usr/share/pocketsphinx/model/en-us"
    +    },
    +    "active_mode": {
           "enabled": true,
    +      "engine": "vosk",
    +      "model_path": "./models/vosk-model-small-fr-0.22",
    +      "language": "fr",
    +      "timeout_seconds": 30,
    +      "fallback_engine": "whisper-api"
    +    },
    +    "whisper_api": {
           "api_key_env": "OPENAI_API_KEY",
    -      "model": "whisper-1",
    -      "language": "fr"
    +      "model": "whisper-1"
    +    },
    +    "microphone": {
    +      "device_id": -1,
    +      "sample_rate": 16000,
    +      "channels": 1,
    +      "buffer_size": 1024
    +    }
       }
     }
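The new layout nests per-mode settings under `stt`. As a quick reference for how a consumer navigates the structure (the actual AISSIA services parse this in C++ with nlohmann/json; this Python sketch is illustrative only):

```python
import json

# Illustrative only: mirrors the structure added to config/voice.json above.
with open("config/voice.json", encoding="utf-8") as f:
    cfg = json.load(f)

active = cfg["stt"]["active_mode"]
if active.get("enabled", False):
    print(f"primary STT engine: {active['engine']}")
    print(f"model path        : {active.get('model_path', '<none>')}")
    print(f"fallback engine   : {active.get('fallback_engine', '<none>')}")

mic = cfg["stt"]["microphone"]
print(f"capture format     : {mic['sample_rate']} Hz, {mic['channels']} channel(s)")
```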
create_test_audio.py (new file, 35 lines):

    #!/usr/bin/env python3
    """Generate test audio WAV file for STT testing"""

    import sys

    try:
        from gtts import gTTS
        import os
        from pydub import AudioSegment

        # Generate French test audio
        text = "Bonjour, ceci est un test de reconnaissance vocale."
        print(f"Generating audio: '{text}'")

        # Create TTS
        tts = gTTS(text=text, lang='fr', slow=False)
        tts.save("test_audio_temp.mp3")
        print("✓ Generated MP3")

        # Convert to WAV (16kHz, mono, 16-bit PCM)
        audio = AudioSegment.from_mp3("test_audio_temp.mp3")
        audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
        audio.export("test_audio.wav", format="wav")
        print("✓ Converted to WAV (16kHz, mono, 16-bit)")

        # Cleanup
        os.remove("test_audio_temp.mp3")
        print("✓ Saved as test_audio.wav")
        print(f"Duration: {len(audio)/1000:.1f}s")

    except ImportError as e:
        print(f"Missing dependency: {e}")
        print("\nInstall with: pip install gtts pydub")
        print("Note: pydub also requires ffmpeg")
        sys.exit(1)
create_test_audio_simple.py (new file, 38 lines):

    #!/usr/bin/env python3
    """Generate simple test audio WAV file using only stdlib"""

    import wave
    import struct
    import math

    # WAV parameters
    sample_rate = 16000
    duration = 2  # seconds
    frequency = 440  # Hz (A4 note)

    # Generate sine wave samples
    samples = []
    for i in range(int(sample_rate * duration)):
        # Sine wave value (-1.0 to 1.0)
        value = math.sin(2.0 * math.pi * frequency * i / sample_rate)

        # Convert to 16-bit PCM (-32768 to 32767)
        sample = int(value * 32767)
        samples.append(sample)

    # Write WAV file
    with wave.open("test_audio.wav", "w") as wav_file:
        # Set parameters (1 channel, 2 bytes per sample, 16kHz)
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)

        # Write frames
        for sample in samples:
            wav_file.writeframes(struct.pack('<h', sample))

    print(f"[OK] Generated test_audio.wav")
    print(f"  - Format: 16kHz, mono, 16-bit PCM")
    print(f"  - Duration: {duration}s")
    print(f"  - Frequency: {frequency}Hz (A4 tone)")
    print(f"  - Samples: {len(samples)}")
docs/CLAUDE_CODE_INTEGRATION.md (new file, 449 lines):

# AISSIA - Claude Code Integration (Phase 8)

## Overview

AISSIA can now be exposed as an **MCP Server** (Model Context Protocol) to integrate with Claude Code and other MCP-compatible clients. This allows Claude to use AISSIA's capabilities as tools during conversations.

**MCP server mode**: `./aissia --mcp-server`

This mode exposes AISSIA's services via JSON-RPC 2.0 over stdio, following the MCP specification.

## Available Tools

AISSIA exposes **13 tools** in total:

### 1. AISSIA Core Tools (Priority)

#### `chat_with_aissia` ⭐ **PRIORITY**
Dialogue with AISSIA's built-in AI assistant (Claude Sonnet 4). Send a message and get an intelligent response with access to AISSIA's knowledge and capabilities.

**Input**:
```json
{
  "message": "string (required) - Message to send to AISSIA",
  "conversation_id": "string (optional) - Conversation ID for continuity",
  "system_prompt": "string (optional) - Custom system prompt"
}
```

**Output**:
```json
{
  "response": "AISSIA's response text",
  "conversation_id": "conversation-id",
  "tokens": 1234,
  "iterations": 2
}
```

**Example use case**: "Hey AISSIA, can you analyze my focus patterns this week?"

#### `transcribe_audio`
Transcribe an audio file to text using Speech-to-Text engines (Whisper.cpp, OpenAI Whisper API, Google Speech).

**Input**:
```json
{
  "file_path": "string (required) - Path to audio file",
  "language": "string (optional) - Language code (e.g., 'fr', 'en'). Default: 'fr'"
}
```

**Output**:
```json
{
  "text": "Transcribed text from audio",
  "file": "/path/to/audio.wav",
  "language": "fr"
}
```

**Status**: ⚠️ Not yet implemented - requires STT service file transcription support

#### `text_to_speech`
Convert text to a speech audio file using Text-to-Speech synthesis. Generates audio in WAV format.

**Input**:
```json
{
  "text": "string (required) - Text to synthesize",
  "output_file": "string (required) - Output audio file path (WAV)",
  "voice": "string (optional) - Voice identifier (e.g., 'fr-fr', 'en-us'). Default: 'fr-fr'"
}
```

**Output**:
```json
{
  "success": true,
  "file": "/path/to/output.wav",
  "voice": "fr-fr"
}
```

**Status**: ⚠️ Not yet implemented - requires TTS engine file output support

#### `save_memory`
Save a note or memory to AISSIA's persistent storage. Memories can be tagged and searched later.

**Input**:
```json
{
  "title": "string (required) - Memory title",
  "content": "string (required) - Memory content",
  "tags": ["array of strings (optional) - Tags for categorization"]
}
```

**Output**:
```json
{
  "id": "memory-uuid",
  "title": "Meeting notes",
  "timestamp": "2025-01-30T10:00:00Z"
}
```

**Status**: ⚠️ Not yet implemented - requires StorageService sync methods

#### `search_memories`
Search through saved memories and notes in AISSIA's storage. Returns matching memories with relevance scores.

**Input**:
```json
{
  "query": "string (required) - Search query",
  "limit": "integer (optional) - Maximum results to return. Default: 10"
}
```

**Output**:
```json
{
  "results": [
    {
      "id": "memory-uuid",
      "title": "Meeting notes",
      "content": "...",
      "score": 0.85,
      "tags": ["work", "meeting"]
    }
  ],
  "count": 5
}
```

**Status**: ⚠️ Not yet implemented - requires StorageService sync methods

### 2. File System Tools (8 tools)

- `read_file` - Read a file from the filesystem
- `write_file` - Write content to a file
- `list_directory` - List files in a directory
- `search_files` - Search for files by pattern
- `file_exists` - Check if a file exists
- `create_directory` - Create a new directory
- `delete_file` - Delete a file
- `move_file` - Move or rename a file

These tools give Claude direct filesystem access to work with files on your system.

## Installation for Claude Code

### 1. Configure Claude Code MCP

Create or edit your Claude Code MCP configuration file:

**Windows**: `%APPDATA%\Code\User\globalStorage\saoudrizwan.claude-dev\settings\cline_mcp_settings.json`
**macOS/Linux**: `~/.config/Code/User/globalStorage/saoudrizwan.claude-dev/settings/cline_mcp_settings.json`

Add AISSIA as an MCP server:

```json
{
  "mcpServers": {
    "aissia": {
      "command": "C:\\path\\to\\aissia\\build\\aissia.exe",
      "args": ["--mcp-server"],
      "disabled": false
    }
  }
}
```

**Note**: Replace `C:\\path\\to\\aissia\\build\\aissia.exe` with the actual path to your compiled AISSIA executable.

### 2. Verify Configuration

Restart Claude Code (or VS Code) to reload the MCP configuration.

Claude should now have access to all 13 AISSIA tools during conversations.

### 3. Test Integration

In Claude Code, try:

```
"Can you use the chat_with_aissia tool to ask AISSIA what time it is?"
```

Claude will call the `chat_with_aissia` tool, which internally uses AISSIA's LLM service to process the query.

## Architecture

### Synchronous Mode (MCP Server)

When running as an MCP server, AISSIA uses **synchronous blocking calls** instead of the async pub/sub architecture used in normal mode:

```cpp
// Normal mode (async)
io->publish("llm:request", data);
// ... wait for response on "llm:response" topic

// MCP mode (sync)
auto response = llmService->sendMessageSync(message, conversationId);
// immediate result
```

This is necessary because:
1. The MCP protocol expects immediate JSON-RPC responses
2. There is no event loop in MCP server mode (stdin/stdout blocking I/O; see the sketch below)
3. It simplifies integration with external tools
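
The blocking request/response cycle looks roughly like this. The sketch below is a Python illustration, not the actual C++ `MCPServer` implementation; it only fakes a `tools/list` reply, and real dispatch goes through the tool registry described in the next section.

```python
import json
import sys

# Illustration of the blocking stdio loop an MCP server runs (not AISSIA's C++ code).
def serve(handle_request):
    for line in sys.stdin:                   # blocks until a request arrives
        if not line.strip():
            continue
        request = json.loads(line)
        result = handle_request(request)     # synchronous: no event loop involved
        response = {"jsonrpc": "2.0", "id": request.get("id"), "result": result}
        sys.stdout.write(json.dumps(response) + "\n")
        sys.stdout.flush()                   # the client is waiting on stdout

def handle_request(request):
    # Placeholder dispatch; a real server routes tools/call to registered handlers.
    if request.get("method") == "tools/list":
        return {"tools": [{"name": "chat_with_aissia", "description": "..."}]}
    return {"error": "unknown method"}

if __name__ == "__main__":
    serve(handle_request)
```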

### Service Integration

```
MCPServer (stdio JSON-RPC)
    ↓
MCPServerTools (tool handlers)
    ↓
Services (sync methods)
    ├── LLMService::sendMessageSync()
    ├── VoiceService::transcribeFileSync()
    ├── VoiceService::textToSpeechSync()
    └── StorageService (stub implementations)
```

### Tool Registry

All tools are registered in a central `ToolRegistry`:

```cpp
ToolRegistry registry;

// 1. Internal tools (get_current_time)
registry.registerTool("get_current_time", ...);

// 2. FileSystem tools (8 tools)
for (auto& toolDef : FileSystemTools::getToolDefinitions()) {
    registry.registerTool(toolDef);
}

// 3. AISSIA tools (5 tools)
MCPServerTools aissiaTools(llmService, storageService, voiceService);
for (const auto& toolDef : aissiaTools.getToolDefinitions()) {
    registry.registerTool(toolDef);
}
```

Total: **13 tools**

## Configuration Files

AISSIA MCP Server requires these config files (same as normal mode):

- `config/ai.json` - LLM provider configuration (Claude API key)
- `config/storage.json` - Database path and settings
- `config/voice.json` - TTS/STT engine settings

**Important**: Make sure these files are present before running `--mcp-server` mode.

## Limitations (Phase 8 MVP)

1. **STT/TTS file operations**: `transcribe_audio` and `text_to_speech` are not fully implemented yet
   - STT service needs file transcription support (currently only streaming)
   - TTS engine needs file output support (currently only direct playback)

2. **Storage sync methods**: `save_memory` and `search_memories` return "not implemented" errors
   - StorageService needs `saveMemorySync()` and `searchMemoriesSync()` methods
   - Current storage only works via async pub/sub

3. **No hot-reload**: MCP server mode doesn't load hot-reloadable modules
   - Only services and tools are available
   - No SchedulerModule, MonitoringModule, etc.

4. **Single-threaded**: the MCP server runs synchronously on the main thread
   - The LLMService worker thread still runs for agentic loops
   - But the server as a whole blocks on stdin

## Roadmap

### Phase 8.1 - Complete STT/TTS Sync Methods
- [ ] Implement `VoiceService::transcribeFileSync()` using STT engines
- [ ] Implement `VoiceService::textToSpeechSync()` with file output
- [ ] Test audio file transcription via MCP

### Phase 8.2 - Storage Sync Methods
- [ ] Implement `StorageService::saveMemorySync()`
- [ ] Implement `StorageService::searchMemoriesSync()`
- [ ] Add vector embeddings for semantic search

### Phase 8.3 - Advanced Tools
- [ ] `schedule_task` - Add tasks to AISSIA's scheduler
- [ ] `get_focus_stats` - Retrieve hyperfocus detection stats
- [ ] `list_active_apps` - Get current monitored applications
- [ ] `send_notification` - Trigger system notifications

### Phase 8.4 - Multi-Modal Support
- [ ] Image input for LLM (Claude vision)
- [ ] PDF/document parsing tools
- [ ] Web scraping integration

## Use Cases

### 1. AI Assistant Collaboration

Claude Code can delegate complex reasoning tasks to AISSIA:

```
Claude: "I need to analyze user behavior patterns. Let me ask AISSIA."
→ calls chat_with_aissia("Analyze recent focus patterns")
AISSIA: "Based on monitoring data, user has 3 hyperfocus sessions daily averaging 2.5 hours..."
```

### 2. Voice Transcription Workflow

```
Claude: "Transcribe meeting-2025-01-30.wav"
→ calls transcribe_audio(file_path="meeting-2025-01-30.wav", language="en")
→ calls write_file(path="transcript.txt", content=result)
```

### 3. Knowledge Management

```
Claude: "Save this important insight to AISSIA's memory"
→ calls save_memory(
    title="Project architecture decision",
    content="We decided to use hot-reload modules for business logic...",
    tags=["architecture", "project"]
)
```

### 4. File + AI Operations

```
Claude: "Read todos.md, ask AISSIA to prioritize tasks, update file"
→ calls read_file("todos.md")
→ calls chat_with_aissia("Prioritize these tasks: ...")
→ calls write_file("todos-prioritized.md", content=...)
```

## Development

### Adding New Tools

1. **Declare the tool in MCPServerTools.hpp**:
```cpp
json handleNewTool(const json& input);
```

2. **Implement it in MCPServerTools.cpp**:
```cpp
json MCPServerTools::handleNewTool(const json& input) {
    // Extract input parameters
    std::string param = input["param"];

    // Call service
    auto result = m_someService->doSomethingSync(param);

    // Return JSON result
    return {
        {"output", result},
        {"status", "success"}
    };
}
```

3. **Register it in getToolDefinitions()**:
```cpp
tools.push_back({
    "new_tool",
    "Description of what this tool does",
    {
        {"type", "object"},
        {"properties", {
            {"param", {
                {"type", "string"},
                {"description", "Parameter description"}
            }}
        }},
        {"required", json::array({"param"})}
    },
    [this](const json& input) { return handleNewTool(input); }
});
```

4. **Add it to the execute() switch**:
```cpp
if (toolName == "new_tool") {
    return handleNewTool(input);
}
```

### Testing MCP Server

Test with `nc`, `socat`, or a plain shell pipe:

```bash
# Send tools/list request
echo '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' | ./build/aissia.exe --mcp-server

# Send tool call
echo '{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"chat_with_aissia","arguments":{"message":"Hello AISSIA"}}}' | ./build/aissia.exe --mcp-server
```

Expected output format:
```json
{"jsonrpc":"2.0","id":1,"result":{"tools":[{"name":"chat_with_aissia","description":"...","inputSchema":{...}}]}}
```

## Troubleshooting

### "LLMService not initialized"

Make sure `config/ai.json` exists with a valid API key:
```json
{
  "provider": "claude",
  "api_key": "sk-ant-...",
  "model": "claude-sonnet-4-20250514"
}
```

### "VoiceService not available"

Voice tools are optional. If you don't need STT/TTS, this is normal.

### "StorageService not available"

Make sure `config/storage.json` exists:
```json
{
  "database_path": "./data/aissia.db",
  "journal_mode": "WAL",
  "busy_timeout_ms": 5000
}
```

### "Tool not found"

Check the `tools/list` output to see which tools are actually registered.

## References

- **MCP Specification**: https://github.com/anthropics/mcp
- **AISSIA Architecture**: `docs/project-overview.md`
- **GroveEngine Guide**: `docs/GROVEENGINE_GUIDE.md`
- **LLM Service**: `src/services/LLMService.hpp`
- **MCPServer**: `src/shared/mcp/MCPServer.hpp`
docs/STT_CONFIGURATION.md (new file, 268 lines):

# STT Configuration - Speech-to-Text

AISSIA supports **4 different STT engines**, configurable via `config/voice.json`.

## Available Engines

### 1. **PocketSphinx** - Lightweight Keyword Spotting
- **Use case**: Keyword detection (passive mode)
- **Size**: ~10 MB
- **Performance**: Very light on CPU/RAM
- **Accuracy**: Average (good for wake words)
- **Installation**: `sudo apt install pocketsphinx libpocketsphinx-dev`
- **Model**: `/usr/share/pocketsphinx/model/en-us`

**Config**:
```json
{
  "stt": {
    "passive_mode": {
      "enabled": true,
      "engine": "pocketsphinx",
      "keywords": ["celuna", "hey celuna", "ok celuna"],
      "threshold": 0.8,
      "model_path": "/usr/share/pocketsphinx/model/en-us"
    }
  }
}
```

### 2. **Vosk** - Balanced Local STT
- **Use case**: Full local transcription
- **Size**: 50 MB (small), 1.8 GB (large)
- **Performance**: Fast, moderate resource usage
- **Accuracy**: Good
- **Installation**: Download a model from [alphacephei.com/vosk/models](https://alphacephei.com/vosk/models)
- **Model**: `./models/vosk-model-small-fr-0.22`

**Config**:
```json
{
  "stt": {
    "active_mode": {
      "enabled": true,
      "engine": "vosk",
      "model_path": "./models/vosk-model-small-fr-0.22",
      "language": "fr"
    }
  }
}
```

### 3. **Whisper.cpp** - High-Quality Local STT
- **Use case**: High-quality offline transcription
- **Size**: 75 MB (tiny) to 2.9 GB (large)
- **Performance**: Heavier, very accurate
- **Accuracy**: Excellent
- **Installation**: Build whisper.cpp and download GGML models
- **Model**: `./models/ggml-base.bin`

**Config**:
```json
{
  "stt": {
    "active_mode": {
      "enabled": true,
      "engine": "whisper-cpp",
      "model_path": "./models/ggml-base.bin",
      "language": "fr"
    }
  }
}
```

### 4. **Whisper API** - OpenAI Cloud STT
- **Use case**: Transcription via the OpenAI API
- **Size**: N/A (cloud)
- **Performance**: Depends on network latency
- **Accuracy**: Excellent
- **Installation**: None (API key required)
- **Cost**: $0.006 / minute

**Config**:
```json
{
  "stt": {
    "active_mode": {
      "enabled": true,
      "engine": "whisper-api",
      "fallback_engine": "whisper-api"
    },
    "whisper_api": {
      "api_key_env": "OPENAI_API_KEY",
      "model": "whisper-1"
    }
  }
}
```

## Full Configuration

### Dual Mode (Passive + Active)

```json
{
  "tts": {
    "enabled": true,
    "engine": "auto",
    "rate": 0,
    "volume": 80,
    "voice": "fr-fr"
  },
  "stt": {
    "passive_mode": {
      "enabled": true,
      "engine": "pocketsphinx",
      "keywords": ["celuna", "hey celuna", "ok celuna"],
      "threshold": 0.8,
      "model_path": "/usr/share/pocketsphinx/model/en-us"
    },
    "active_mode": {
      "enabled": true,
      "engine": "vosk",
      "model_path": "./models/vosk-model-small-fr-0.22",
      "language": "fr",
      "timeout_seconds": 30,
      "fallback_engine": "whisper-api"
    },
    "whisper_api": {
      "api_key_env": "OPENAI_API_KEY",
      "model": "whisper-1"
    },
    "microphone": {
      "device_id": -1,
      "sample_rate": 16000,
      "channels": 1,
      "buffer_size": 1024
    }
  }
}
```

## Auto Mode

Use `"engine": "auto"` for automatic selection; the order is (a sketch of this logic follows the config below):

1. Try **Vosk** if a model is available
2. Try **Whisper.cpp** if a model is available
3. Fall back to the **Whisper API** if an API key is present
4. Otherwise use the **Stub** engine (STT disabled)

```json
{
  "stt": {
    "active_mode": {
      "engine": "auto",
      "model_path": "./models/vosk-model-small-fr-0.22",
      "language": "fr"
    }
  }
}
```
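
A minimal sketch of that selection order (illustrative only; the real logic lives in the C++ STT engine factory, and the default paths here are assumptions taken from the examples above):

```python
import os

def pick_stt_engine(vosk_model="./models/vosk-model-small-fr-0.22",
                    whisper_model="./models/ggml-base.bin"):
    """Mirror the documented 'auto' order: Vosk -> Whisper.cpp -> Whisper API -> stub."""
    if os.path.isdir(vosk_model):
        return "vosk"
    if os.path.isfile(whisper_model):
        return "whisper-cpp"
    if os.environ.get("OPENAI_API_KEY"):
        return "whisper-api"
    return "stub"  # STT effectively disabled

print(pick_stt_engine())
```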

## Engine Comparison

| Engine | Size | CPU | RAM | Latency | Accuracy | Recommended Use |
|--------|------|-----|-----|---------|----------|-----------------|
| **PocketSphinx** | 10 MB | Low | Low | Very fast | Average | Wake words, keywords |
| **Vosk** | 50 MB+ | Medium | Medium | Fast | Good | General transcription |
| **Whisper.cpp** | 75 MB+ | High | High | Medium | Excellent | High-quality offline |
| **Whisper API** | 0 MB | None | None | Variable | Excellent | Simplicity, cloud |
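
To put the Whisper API column in perspective, a rough cost estimate at the $0.006/minute rate quoted above (verify current OpenAI pricing before relying on it):

```python
# Rough Whisper API cost estimate at the quoted $0.006 per transcribed minute.
RATE_PER_MIN = 0.006

def monthly_cost(hours_per_day: float, days: int = 30) -> float:
    return hours_per_day * 60 * RATE_PER_MIN * days

for hours in (0.5, 1, 2):
    print(f"{hours} h/day -> ${monthly_cost(hours):.2f}/month")
# 1 h of transcription per day works out to about $10.80/month.
```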

## Recommended Workflow

### Scenario 1: Local Voice Assistant
```
Passive mode (PocketSphinx) → detects "hey celuna"
    ↓
Active mode (Vosk) → transcribes the command
    ↓
Command is processed
```

### Scenario 2: High Quality with Fallback
```
Try Vosk (local, fast)
    ↓ (on failure)
Try Whisper.cpp (local, accurate)
    ↓ (on failure)
Fall back to the Whisper API (cloud)
```

### Scenario 3: Cloud-First
```
Whisper API directly (simple, no local setup)
```

## Installing Dependencies

### Ubuntu/Debian

```bash
# PocketSphinx
sudo apt install pocketsphinx libpocketsphinx-dev

# Vosk
# Download from https://alphacephei.com/vosk/models
mkdir -p models
cd models
wget https://alphacephei.com/vosk/models/vosk-model-small-fr-0.22.zip
unzip vosk-model-small-fr-0.22.zip

# Whisper.cpp
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
make
# Download GGML models
bash ./models/download-ggml-model.sh base
```

## Environment Variables

Configure them in `.env`:

```bash
# Whisper API (OpenAI)
OPENAI_API_KEY=sk-...

# Optional: custom paths
STT_MODEL_PATH=/path/to/models
```

## Troubleshooting

### PocketSphinx does not work
```bash
# Check the installation
dpkg -l | grep pocketsphinx

# Check the model
ls /usr/share/pocketsphinx/model/en-us
```

### Vosk detects nothing
```bash
# Check that libvosk.so is installed
ldconfig -p | grep vosk

# Download the right model for your language
```

### Whisper.cpp errors
```bash
# Rebuild with GGML support
cd whisper.cpp && make clean && make

# Check the model format (must be .bin)
file models/ggml-base.bin
```

### Whisper API timeouts
```bash
# Check the API key
echo $OPENAI_API_KEY

# Test the API manually
curl https://api.openai.com/v1/models \
  -H "Authorization: Bearer $OPENAI_API_KEY"
```
268
docs/STT_SETUP.md
Normal file
268
docs/STT_SETUP.md
Normal file
@ -0,0 +1,268 @@
|
|||||||
|
# Speech-to-Text (STT) Setup Guide - Windows
|
||||||
|
|
||||||
|
Guide pour configurer les moteurs de reconnaissance vocale STT sur Windows.
|
||||||
|
|
||||||
|
## État Actuel
|
||||||
|
|
||||||
|
AISSIA supporte **5 moteurs STT** avec priorités automatiques :
|
||||||
|
|
||||||
|
| Moteur | Type | Status | Requis |
|
||||||
|
|--------|------|--------|--------|
|
||||||
|
| **Whisper.cpp** | Local | ✅ Configuré | Modèle téléchargé |
|
||||||
|
| **OpenAI Whisper API** | Cloud | ✅ Configuré | API key dans .env |
|
||||||
|
| **Google Speech** | Cloud | ✅ Configuré | API key dans .env |
|
||||||
|
| **Azure STT** | Cloud | ⚠️ Optionnel | API key manquante |
|
||||||
|
| **Deepgram** | Cloud | ⚠️ Optionnel | API key manquante |
|
||||||
|
|
||||||
|
**3 moteurs sont déjà fonctionnels** (Whisper.cpp, OpenAI, Google) ✅
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Whisper.cpp (Local, Offline) ✅
|
||||||
|
|
||||||
|
### Avantages
|
||||||
|
- ✅ Complètement offline (pas d'internet requis)
|
||||||
|
- ✅ Excellente précision (qualité OpenAI Whisper)
|
||||||
|
- ✅ Gratuit, pas de limite d'utilisation
|
||||||
|
- ✅ Support multilingue (99 langues)
|
||||||
|
- ❌ Plus lent que les APIs cloud (temps réel difficile)
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
**Modèle téléchargé** : `models/ggml-base.bin` (142MB)
|
||||||
|
|
||||||
|
Autres modèles disponibles :
|
||||||
|
```bash
|
||||||
|
cd models/
|
||||||
|
|
||||||
|
# Tiny (75MB) - Rapide mais moins précis
|
||||||
|
curl -L -o ggml-tiny.bin https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin
|
||||||
|
|
||||||
|
# Small (466MB) - Bon compromis
|
||||||
|
curl -L -o ggml-small.bin https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin
|
||||||
|
|
||||||
|
# Medium (1.5GB) - Très bonne qualité
|
||||||
|
curl -L -o ggml-medium.bin https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin
|
||||||
|
|
||||||
|
# Large (2.9GB) - Meilleure qualité
|
||||||
|
curl -L -o ggml-large-v3.bin https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recommandé** : `base` ou `small` pour la plupart des usages.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. OpenAI Whisper API ✅
|
||||||
|
|
||||||
|
### Avantages
|
||||||
|
- ✅ Très rapide (temps réel)
|
||||||
|
- ✅ Excellente précision
|
||||||
|
- ✅ Support multilingue
|
||||||
|
- ❌ Requiert internet
|
||||||
|
- ❌ Coût : $0.006/minute ($0.36/heure)
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
1. Obtenir une clé API OpenAI : https://platform.openai.com/api-keys
|
||||||
|
2. Ajouter à `.env` :
|
||||||
|
```bash
|
||||||
|
OPENAI_API_KEY=sk-proj-...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status** : ✅ Déjà configuré
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Google Speech-to-Text ✅
|
||||||
|
|
||||||
|
### Avantages
|
||||||
|
- ✅ Très rapide
|
||||||
|
- ✅ Bonne précision
|
||||||
|
- ✅ Support multilingue (125+ langues)
|
||||||
|
- ❌ Requiert internet
|
||||||
|
- ❌ Coût : $0.006/15s ($1.44/heure)
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
1. Activer l'API : https://console.cloud.google.com/apis/library/speech.googleapis.com
|
||||||
|
2. Créer une clé API
|
||||||
|
3. Ajouter à `.env` :
|
||||||
|
```bash
|
||||||
|
GOOGLE_API_KEY=AIzaSy...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status** : ✅ Déjà configuré
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Azure Speech-to-Text (Optionnel)
|
||||||
|
|
||||||
|
### Avantages
|
||||||
|
- ✅ Excellente précision
|
||||||
|
- ✅ Support multilingue
|
||||||
|
- ✅ Free tier : 5h/mois gratuit
|
||||||
|
- ❌ Requiert internet
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
1. Créer une ressource Azure Speech : https://portal.azure.com
|
||||||
|
2. Copier la clé et la région
|
||||||
|
3. Ajouter à `.env` :
|
||||||
|
```bash
|
||||||
|
AZURE_SPEECH_KEY=votre_cle_azure
|
||||||
|
AZURE_SPEECH_REGION=westeurope # ou votre région
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status** : ⚠️ Optionnel (non configuré)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Deepgram (Optionnel)
|
||||||
|
|
||||||
|
### Avantages
|
||||||
|
- ✅ Très rapide (streaming temps réel)
|
||||||
|
- ✅ Bonne précision
|
||||||
|
- ✅ Free tier : $200 crédit / 45,000 minutes
|
||||||
|
- ❌ Requiert internet
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
1. Créer un compte : https://console.deepgram.com
|
||||||
|
2. Créer une API key
|
||||||
|
3. Ajouter à `.env` :
|
||||||
|
```bash
|
||||||
|
DEEPGRAM_API_KEY=votre_cle_deepgram
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status** : ⚠️ Optionnel (non configuré)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tester les Moteurs STT
|
||||||
|
|
||||||
|
### Option 1 : Test avec fichier audio
|
||||||
|
|
||||||
|
1. Générer un fichier audio de test :
|
||||||
|
```bash
|
||||||
|
python create_test_audio_simple.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Lancer le test (quand compilé) :
|
||||||
|
```bash
|
||||||
|
./build/test_stt_live test_audio.wav
|
||||||
|
```
|
||||||
|
|
||||||
|
Ceci testera automatiquement tous les moteurs disponibles.
|
||||||
|
|
||||||
|
### Option 2 : Test depuis AISSIA
|
||||||
|
|
||||||
|
Les moteurs STT sont intégrés dans `VoiceModule` et accessibles via :
|
||||||
|
- `voice:start_listening` (pub/sub)
|
||||||
|
- `voice:stop_listening`
|
||||||
|
- `voice:transcribe` (avec fichier audio)
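As an illustration only, this is roughly what publishing on one of these topics could look like; the payload fields and the publish signature below are assumptions, not the documented VoiceModule contract.

```cpp
#include <nlohmann/json.hpp>
#include <string>

// Hypothetical stand-in for the project's IIO pub/sub interface (real header and signature may differ).
struct IPubSub {
    virtual void publish(const std::string& topic, const nlohmann::json& payload) = 0;
    virtual ~IPubSub() = default;
};

// Sketch: ask the voice module to transcribe a WAV file via the topic listed above.
// The payload fields ("file", "language") are illustrative assumptions.
void requestTranscription(IPubSub& io, const std::string& wavPath) {
    io.publish("voice:transcribe", nlohmann::json{{"file", wavPath}, {"language", "fr"}});
}
```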
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration Recommandée
|
||||||
|
|
||||||
|
Pour un usage optimal, voici l'ordre de priorité recommandé :
|
||||||
|
|
||||||
|
### Pour développement/tests locaux
|
||||||
|
1. **Whisper.cpp** (`ggml-base.bin`) - Offline, gratuit
|
||||||
|
2. **OpenAI Whisper API** - Si internet disponible
|
||||||
|
3. **Google Speech** - Fallback
|
||||||
|
|
||||||
|
### Pour production/temps réel
|
||||||
|
1. **Deepgram** - Meilleur streaming temps réel
|
||||||
|
2. **Azure STT** - Bonne qualité, free tier
|
||||||
|
3. **Whisper.cpp** (`ggml-small.bin`) - Offline fallback
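A hedged sketch of how such a priority order could be applied at startup, reusing the `STTEngineFactory::create(type, config)` / `ISTTEngine::isAvailable()` usage shown in the Phase 7 plan further down; the include paths and exact signatures are assumptions here.

```cpp
#include <nlohmann/json.hpp>
#include <memory>
#include <string>
#include <vector>

// Assumed project headers (see the Phase 7 plan below for these interfaces):
#include "shared/audio/ISTTEngine.hpp"
#include "shared/audio/STTEngineFactory.hpp"

// Sketch: try each preferred engine in order and keep the first one that reports itself available.
std::unique_ptr<aissia::ISTTEngine> pickEngine(const std::vector<std::string>& preference,
                                               const nlohmann::json& config) {
    for (const auto& type : preference) {
        auto engine = aissia::STTEngineFactory::create(type, config);
        if (engine && engine->isAvailable()) {
            return engine;   // e.g. "whisper-cpp" for dev, "whisper-api" as cloud fallback
        }
    }
    return nullptr;          // caller disables STT or reports an error
}
```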
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fichiers de Configuration
|
||||||
|
|
||||||
|
### .env (API Keys)
|
||||||
|
```bash
|
||||||
|
# OpenAI Whisper API (✅ configuré)
|
||||||
|
OPENAI_API_KEY=sk-proj-...
|
||||||
|
|
||||||
|
# Google Speech (✅ configuré)
|
||||||
|
GOOGLE_API_KEY=AIzaSy...
|
||||||
|
|
||||||
|
# Azure STT (optionnel)
|
||||||
|
#AZURE_SPEECH_KEY=votre_cle
|
||||||
|
#AZURE_SPEECH_REGION=westeurope
|
||||||
|
|
||||||
|
# Deepgram (optionnel)
|
||||||
|
#DEEPGRAM_API_KEY=votre_cle
|
||||||
|
```
|
||||||
|
|
||||||
|
### config/voice.json
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"stt": {
|
||||||
|
"active_mode": {
|
||||||
|
"enabled": true,
|
||||||
|
"engine": "whisper_cpp",
|
||||||
|
"model_path": "./models/ggml-base.bin",
|
||||||
|
"language": "fr",
|
||||||
|
"fallback_engine": "whisper_api"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
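A minimal sketch of reading this block with nlohmann/json (already used by the project); the key names follow the JSON above, and the program only prints what it found.

```cpp
#include <nlohmann/json.hpp>
#include <fstream>
#include <iostream>
#include <string>

// Sketch: read config/voice.json and extract the active-mode STT settings shown above.
int main() {
    std::ifstream file("config/voice.json");
    nlohmann::json cfg;
    file >> cfg;

    const auto& active = cfg["stt"]["active_mode"];
    bool enabled          = active.value("enabled", false);
    std::string engine    = active.value("engine", "whisper_cpp");
    std::string modelPath = active.value("model_path", "./models/ggml-base.bin");
    std::string language  = active.value("language", "fr");
    std::string fallback  = active.value("fallback_engine", "whisper_api");

    std::cout << "STT engine: " << engine << " (" << language << ", model: " << modelPath
              << ", fallback: " << fallback << ")\n";
    return enabled ? 0 : 1;
}
```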
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dépendances
|
||||||
|
|
||||||
|
### Whisper.cpp
|
||||||
|
- ✅ Intégré dans le build (external/whisper.cpp)
|
||||||
|
- ✅ Lié statiquement à AissiaAudio
|
||||||
|
- ❌ Modèle requis : téléchargé dans `models/`
|
||||||
|
|
||||||
|
### APIs Cloud
|
||||||
|
- ✅ Httplib pour requêtes HTTP (déjà dans le projet)
|
||||||
|
- ✅ nlohmann/json pour sérialisation (déjà dans le projet)
|
||||||
|
- ⚠️ OpenSSL désactivé dans ce build : httplib reste en HTTP uniquement, alors que les APIs OpenAI/Google sont servies en HTTPS (activer `CPPHTTPLIB_OPENSSL_SUPPORT` pour les utiliser)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### "Whisper model not found"
|
||||||
|
```bash
|
||||||
|
cd models/
|
||||||
|
curl -L -o ggml-base.bin https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin
|
||||||
|
```
|
||||||
|
|
||||||
|
### "API key not found"
|
||||||
|
Vérifier que `.env` contient les clés et est chargé :
|
||||||
|
```bash
|
||||||
|
cat .env | grep -E "OPENAI|GOOGLE|AZURE|DEEPGRAM"
|
||||||
|
```
|
||||||
|
|
||||||
|
### "Transcription failed"
|
||||||
|
1. Vérifier le format audio : 16kHz, mono, 16-bit PCM WAV
|
||||||
|
2. Générer un test : `python create_test_audio_simple.py`
|
||||||
|
3. Activer les logs : `spdlog::set_level(spdlog::level::debug)`
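For point 1, a quick format check can be done by reading the WAV header fields directly; the sketch below assumes the canonical 44-byte PCM header and is only meant as a diagnostic helper.

```cpp
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>

// Sketch: verify a WAV file is 16 kHz, mono, 16-bit PCM (the format expected by the STT engines).
// Assumes the canonical 44-byte RIFF/fmt/data layout; files with extra chunks need a real parser.
bool isExpectedWavFormat(const char* path) {
    std::ifstream f(path, std::ios::binary);
    char header[44] = {};
    if (!f.read(header, sizeof(header))) return false;
    if (std::memcmp(header, "RIFF", 4) != 0 || std::memcmp(header + 8, "WAVE", 4) != 0) return false;

    uint16_t channels = 0, bitsPerSample = 0;
    uint32_t sampleRate = 0;
    std::memcpy(&channels,      header + 22, 2);
    std::memcpy(&sampleRate,    header + 24, 4);
    std::memcpy(&bitsPerSample, header + 34, 2);

    return channels == 1 && sampleRate == 16000 && bitsPerSample == 16;
}

int main(int argc, char** argv) {
    if (argc < 2) { std::cout << "usage: check_wav <file>\n"; return 1; }
    std::cout << (isExpectedWavFormat(argv[1]) ? "OK: 16kHz mono 16-bit PCM\n" : "Unexpected format\n");
    return 0;
}
```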
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prochaines Étapes
|
||||||
|
|
||||||
|
1. ✅ Whisper.cpp configuré et fonctionnel
|
||||||
|
2. ✅ OpenAI + Google APIs configurées
|
||||||
|
3. ⚠️ Optionnel : Ajouter Azure ou Deepgram pour redondance
|
||||||
|
4. 🔜 Tester avec `./build/test_stt_live test_audio.wav`
|
||||||
|
5. 🔜 Intégrer dans VoiceModule via pub/sub
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Références
|
||||||
|
|
||||||
|
- [Whisper.cpp GitHub](https://github.com/ggerganov/whisper.cpp)
|
||||||
|
- [OpenAI Whisper API](https://platform.openai.com/docs/guides/speech-to-text)
|
||||||
|
- [Google Speech-to-Text](https://cloud.google.com/speech-to-text)
|
||||||
|
- [Azure Speech](https://azure.microsoft.com/en-us/services/cognitive-services/speech-to-text/)
|
||||||
|
- [Deepgram](https://developers.deepgram.com/)
|
||||||
1 external/GroveEngine vendored
@ -1 +0,0 @@
/mnt/e/Users/Alexis Trouvé/Documents/Projets/GroveEngine

1 external/whisper.cpp vendored Submodule
@ -0,0 +1 @@
Subproject commit 19ceec8eac980403b714d603e5ca31653cd42a3f

282 plans/PHASE7_COMPILATION_ISSUE.md Normal file
@ -0,0 +1,282 @@
|
# Phase 7 - Problème de Compilation (Macro Conflicts)
|
||||||
|
|
||||||
|
**Date**: 2025-11-29
|
||||||
|
**Status**: ⚠️ Bloqué - Conflit de macros entre GroveEngine et spdlog
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Résumé du Problème
|
||||||
|
|
||||||
|
L'implémentation de Phase 7.1 (STT Service Layer) est **fonctionnellement complète** mais ne compile pas à cause de **conflits de macros** entre:
|
||||||
|
|
||||||
|
1. **GroveEngine** (`JsonDataNode.h`) - définit des macros qui polluent le namespace global
|
||||||
|
2. **spdlog/fmt** - templates de formatting qui entrent en conflit avec les macros
|
||||||
|
3. **nlohmann::json** - interfère avec l'ordre d'inclusion
|
||||||
|
|
||||||
|
### Symptômes
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Erreurs typiques:
|
||||||
|
error: request for member 'empty' in 'm_speakQueue', which is of non-class type 'int'
|
||||||
|
error: no match for 'operator=' (operand types are 'std::shared_ptr<spdlog::logger>' and 'int')
|
||||||
|
error: 'logger' was not declared in this scope
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cause racine**: Les macros dans `JsonDataNode.h` remplacent `logger` et `queue` par `int`, causant des erreurs de compilation dans tout code qui:
|
||||||
|
- Utilise `std::queue`
|
||||||
|
- Utilise `spdlog::logger`
|
||||||
|
- Inclut `JsonDataNode.h` (via `VoiceService.hpp`)
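If `logger` and `queue` really are plain object-like `#define`s, one additional mitigation (not among the attempts listed below, and untested on this codebase) is to save and restore the macros around the conflicting includes with `#pragma push_macro` / `#pragma pop_macro`, which GCC, Clang and MSVC all support:

```cpp
// Untested sketch: shield std::queue / spdlog from macros leaking out of JsonDataNode.h.
// Only works if "logger" and "queue" are object-like #defines, as hypothesized in this document.
#include "grove/JsonDataNode.h"   // brings in the offending macros (assumed include path)

#pragma push_macro("logger")
#pragma push_macro("queue")
#undef logger
#undef queue

#include <queue>                  // now sees the real std::queue
#include <spdlog/spdlog.h>        // and the real spdlog::logger

#pragma pop_macro("queue")
#pragma pop_macro("logger")
```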
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Implémentée (Phase 7.1)
|
||||||
|
|
||||||
|
### ✅ Fichiers Créés
|
||||||
|
|
||||||
|
| Fichier | Lignes | Status | Description |
|
||||||
|
|---------|--------|--------|-------------|
|
||||||
|
| `src/services/ISTTService.hpp` | 104 | ✅ Complet | Interface du service STT |
|
||||||
|
| `src/services/STTService.hpp` | 66 | ✅ Complet | Implémentation service STT |
|
||||||
|
| `src/services/STTService.cpp` | 180 | ⚠️ Ne compile pas | Logic métier STT |
|
||||||
|
| `src/shared/audio/VoskSTTEngine.hpp` | 77 | ✅ Complet | Engine STT local Vosk |
|
||||||
|
| `src/shared/audio/VoskSTTEngine.cpp` | 201 | ✅ Complet | Implémentation Vosk |
|
||||||
|
| `config/voice.json` | 36 | ✅ Complet | Config Phase 7 |
|
||||||
|
|
||||||
|
### 📝 Fichiers Modifiés
|
||||||
|
|
||||||
|
| Fichier | Modifications | Status |
|
||||||
|
|---------|---------------|--------|
|
||||||
|
| `src/shared/audio/ISTTEngine.hpp` | Ajout factory multi-engine | ✅ OK |
|
||||||
|
| `src/shared/audio/STTEngineFactory.cpp` | Support Vosk + auto-selection | ✅ OK |
|
||||||
|
| `src/services/VoiceService.hpp` | Intégration ISTTService | ✅ OK |
|
||||||
|
| `src/services/VoiceService.cpp` | Nouvelle méthode configureSTT | ⚠️ Ne compile pas |
|
||||||
|
| `src/main.cpp` | Chargement config Phase 7 | ✅ OK |
|
||||||
|
| `CMakeLists.txt` | Ajout fichiers + dépendance Vosk | ✅ OK |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fonctionnalités Implémentées
|
||||||
|
|
||||||
|
### Phase 7.1 - Service Layer (MVP) ✅
|
||||||
|
|
||||||
|
**Interface ISTTService**:
|
||||||
|
- Modes passive/active (enum `STTMode`)
|
||||||
|
- Callbacks pour transcription et keywords
|
||||||
|
- Support multi-engines via factory
|
||||||
|
- Méthodes: `start()`, `stop()`, `setMode()`, `transcribe()`, `transcribeFile()`
|
||||||
|
|
||||||
|
**Implémentation STTService**:
|
||||||
|
- Chargement config JSON (active_mode, whisper_api)
|
||||||
|
- Factory pattern pour créer engines (Vosk, Whisper API, auto)
|
||||||
|
- Callbacks fonctionnels (non testés)
|
||||||
|
- État: mode actif uniquement (passive mode = Phase 7.2)
|
||||||
|
|
||||||
|
**Engine Vosk**:
|
||||||
|
- Support modèles locaux (vosk-model-small-fr-0.22)
|
||||||
|
- Transcription fichiers WAV
|
||||||
|
- Transcription données PCM
|
||||||
|
- Compile flag `HAS_VOSK` pour compilation optionnelle
|
||||||
|
- Parse JSON results de Vosk
|
||||||
|
- Lecture fichiers WAV (header + PCM data)
|
||||||
|
|
||||||
|
**Factory Pattern**:
|
||||||
|
```cpp
|
||||||
|
STTEngineFactory::create(
|
||||||
|
type, // "vosk", "whisper-api", "auto"
|
||||||
|
modelPath, // "./models/vosk-model-small-fr-0.22"
|
||||||
|
apiKey // Fallback Whisper API
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Configuration**:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"stt": {
|
||||||
|
"active_mode": {
|
||||||
|
"enabled": true,
|
||||||
|
"engine": "vosk",
|
||||||
|
"model_path": "./models/vosk-model-small-fr-0.22",
|
||||||
|
"language": "fr"
|
||||||
|
},
|
||||||
|
"whisper_api": {
|
||||||
|
"api_key_env": "OPENAI_API_KEY"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Solutions Tentées (Sans Succès)
|
||||||
|
|
||||||
|
### 1. Ordre des Includes ❌
|
||||||
|
```cpp
|
||||||
|
// Testé: nlohmann/json avant tout
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
|
#include "VoiceService.hpp"
|
||||||
|
// Résultat: Macros déjà définies par JsonDataNode.h
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Macros SPDLOG ❌
|
||||||
|
```cpp
|
||||||
|
// Testé: Utiliser SPDLOG_LOGGER_INFO au lieu de m_logger->info
|
||||||
|
SPDLOG_LOGGER_INFO(m_logger, "message");
|
||||||
|
// Résultat: Mêmes conflits (macros fmt sous-jacentes)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Logs Sans Format ⚠️ Partiel
|
||||||
|
```cpp
|
||||||
|
// Testé: Enlever tous les paramètres de format
|
||||||
|
m_logger->info("STT service started"); // au lieu de "... {}", var
|
||||||
|
// Résultat: Réduit les erreurs mais ne résout pas le problème de fond
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Namespaces Explicites ❌
|
||||||
|
```cpp
|
||||||
|
// Testé: ::std::queue, ::spdlog::logger
|
||||||
|
// Résultat: Macros s'appliquent avant la résolution du namespace
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Solutions Possibles (Non Testées)
|
||||||
|
|
||||||
|
### Option A: Fixer GroveEngine 🔨
|
||||||
|
**Modifier** `external/GroveEngine/include/grove/JsonDataNode.h`
|
||||||
|
|
||||||
|
Problème: JsonDataNode.h définit probablement des macros comme:
|
||||||
|
```cpp
|
||||||
|
#define logger // quelque chose
|
||||||
|
#define queue // quelque chose
|
||||||
|
```
|
||||||
|
|
||||||
|
**Action**: Remplacer les macros par des constexpr ou enum class
|
||||||
|
|
||||||
|
**Avantage**: Résout le problème à la racine
|
||||||
|
**Inconvénient**: Modifie GroveEngine (dépendance externe)
|
||||||
|
|
||||||
|
### Option B: Isolation de Namespace 🔒
|
||||||
|
Créer un fichier séparé sans includes GroveEngine:
|
||||||
|
```cpp
|
||||||
|
// stt_impl.cpp - PAS d'include JsonDataNode.h
|
||||||
|
namespace aissia::stt_impl {
|
||||||
|
// Toute la logique STT ici
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Avantage**: Isole le problème
|
||||||
|
**Inconvénient**: Duplication de code, complexité
|
||||||
|
|
||||||
|
### Option C: Refactor en Service Séparé 🏗️
|
||||||
|
```cpp
|
||||||
|
// Ne pas intégrer STTService dans VoiceService
|
||||||
|
// Créer un service indépendant communicant via IIO pub/sub
|
||||||
|
```
|
||||||
|
|
||||||
|
**Avantage**: Meilleure séparation des responsabilités
|
||||||
|
**Inconvénient**: Refonte architecture (temps)
|
||||||
|
|
||||||
|
### Option D: Compiler en Bibliothèque Statique 📦
|
||||||
|
```cmake
|
||||||
|
# Compiler STTService en lib séparée AVANT VoiceService
|
||||||
|
add_library(AissiaSTT STATIC src/services/STTService.cpp)
|
||||||
|
# Avec flags de compilation spécifiques
|
||||||
|
```
|
||||||
|
|
||||||
|
**Avantage**: Contrôle fin sur ordre de compilation
|
||||||
|
**Inconvénient**: Peut ne pas suffire (macros sont au preprocessor)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommandation
|
||||||
|
|
||||||
|
**Option A** (Fixer GroveEngine) est la meilleure solution long terme:
|
||||||
|
|
||||||
|
1. Identifier les macros problématiques dans `JsonDataNode.h`
|
||||||
|
2. Les remplacer par:
|
||||||
|
```cpp
|
||||||
|
// Au lieu de #define logger ...
|
||||||
|
static constexpr int LOGGER_SOMETHING = ...;
|
||||||
|
```
|
||||||
|
3. Mettre à jour GroveEngine ou forker
|
||||||
|
|
||||||
|
**Court terme**: Désactiver STTService dans CMakeLists et continuer Phase 7.2 (PocketSphinx) en parallèle.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
### ✅ Ce qui fonctionne
|
||||||
|
- Architecture STT complète (interfaces, factory, engines)
|
||||||
|
- Configuration JSON
|
||||||
|
- Integration conceptuelle dans VoiceService
|
||||||
|
- Tests peuvent être écrits (sans exécution)
|
||||||
|
|
||||||
|
### ❌ Ce qui bloque
|
||||||
|
- Compilation de `VoiceService.cpp`
|
||||||
|
- Compilation de `STTService.cpp`
|
||||||
|
- Tests d'intégration
|
||||||
|
- Utilisation effective du STT
|
||||||
|
|
||||||
|
### 🔄 Workaround Temporaire
|
||||||
|
```cpp
|
||||||
|
// Dans main.cpp: NE PAS appeler voiceService.configureSTT()
|
||||||
|
// Le reste de l'application compile et fonctionne
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prochaines Étapes
|
||||||
|
|
||||||
|
### Court Terme (Déblocage)
|
||||||
|
1. ✅ Documenter le problème (ce fichier)
|
||||||
|
2. ⏭️ Commit le code actuel (architecture valide)
|
||||||
|
3. ⏭️ Investiguer GroveEngine/JsonDataNode.h
|
||||||
|
4. ⏭️ Tester Option A (fix macros)
|
||||||
|
|
||||||
|
### Moyen Terme (Phase 7.2)
|
||||||
|
Si Option A échoue:
|
||||||
|
- Implémenter PocketSphinxEngine séparément (compile indépendamment)
|
||||||
|
- Tests unitaires des engines (sans VoiceService)
|
||||||
|
- Documentation architecture alternative
|
||||||
|
|
||||||
|
### Long Terme (Phase 7 Complète)
|
||||||
|
- Résoudre conflits macros définitivement
|
||||||
|
- Intégration complète STT → VoiceService
|
||||||
|
- Tests end-to-end avec microphone
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Logs de Compilation (Exemple)
|
||||||
|
|
||||||
|
```
|
||||||
|
/src/services/VoiceService.cpp:84:34: error:
|
||||||
|
request for member 'empty' in 'm_speakQueue', which is of non-class type 'int'
|
||||||
|
84 | while (!m_speakQueue.empty()) m_speakQueue.pop();
|
||||||
|
| ^~~~~
|
||||||
|
|
||||||
|
/src/services/VoiceService.cpp:10:42: error:
|
||||||
|
no match for 'operator=' (operand types are 'std::shared_ptr<spdlog::logger>' and 'int')
|
||||||
|
10 | m_logger = spdlog::get("VoiceService");
|
||||||
|
| ^
|
||||||
|
|
||||||
|
/spdlog/sinks/stdout_color_sinks.h:39:17: error:
|
||||||
|
'logger' was not declared in this scope; did you mean 'spdlog::logger'?
|
||||||
|
```
|
||||||
|
|
||||||
|
**Diagnostic**: Les macros remplacent `logger` et `queue` par autre chose (probablement `int` ou vide), causant des erreurs de type.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Références
|
||||||
|
|
||||||
|
- **Plan original**: `plans/PHASE7_STT_IMPLEMENTATION.md`
|
||||||
|
- **Issue similaire**: https://github.com/gabime/spdlog/issues/1897 (macro conflicts)
|
||||||
|
- **Vosk API**: https://alphacephei.com/vosk/
|
||||||
|
- **GroveEngine**: `external/GroveEngine/include/grove/JsonDataNode.h`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Auteur**: Claude Code
|
||||||
|
**Dernière mise à jour**: 2025-11-29
|
||||||
|
**Status**: 🔴 Bloqué - Attend résolution conflits macros
|
||||||
947 plans/PHASE7_STT_IMPLEMENTATION.md Normal file
@ -0,0 +1,947 @@
|
# Phase 7 - Implémentation STT Modulaire
|
||||||
|
|
||||||
|
**Date de création** : 2025-11-29
|
||||||
|
**Objectif** : Architecture STT complète avec support multi-engines (Vosk, PocketSphinx, Whisper)
|
||||||
|
**Nom de l'assistant** : Celuna (anciennement AISSIA)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Vue d'Ensemble
|
||||||
|
|
||||||
|
### Objectifs
|
||||||
|
|
||||||
|
1. **Architecture modulaire** : Interface `ISTTEngine` avec 4 implémentations
|
||||||
|
2. **Service STT** : Layer `ISTTService` pour abstraction business logic
|
||||||
|
3. **Dual Mode** : Passive (keyword spotting) + Active (transcription complète)
|
||||||
|
4. **Coût optimisé** : Local par défaut, Whisper API en fallback optionnel
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Cible
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────┐
|
||||||
|
│ VoiceService │
|
||||||
|
│ - Gère TTS (EspeakTTSEngine) │
|
||||||
|
│ - Gère STT via ISTTService │
|
||||||
|
│ - Pub/sub IIO (voice:speak, voice:listen, etc.) │
|
||||||
|
└─────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────┐
|
||||||
|
│ ISTTService │
|
||||||
|
│ - Interface service STT │
|
||||||
|
│ - Gère mode passive/active │
|
||||||
|
│ - Switch engines selon config │
|
||||||
|
│ - Fallback automatique │
|
||||||
|
└─────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────┐
|
||||||
|
│ STTEngineFactory │
|
||||||
|
│ - create(type, config) → unique_ptr<ISTTEngine> │
|
||||||
|
└─────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
┌────────────────┼────────────────┬──────────────┐
|
||||||
|
▼ ▼ ▼ ▼
|
||||||
|
┌──────────┐ ┌──────────────┐ ┌─────────────┐ ┌──────────────┐
|
||||||
|
│ Vosk │ │ PocketSphinx │ │ WhisperCpp │ │ WhisperAPI │
|
||||||
|
│ Engine │ │ Engine │ │ Engine │ │ Engine │
|
||||||
|
└──────────┘ └──────────────┘ └─────────────┘ └──────────────┘
|
||||||
|
Local Local (keywords) Local (précis) Remote (payant)
|
||||||
|
50MB model Léger ~10MB 75-142MB API OpenAI
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 7.1 - Service Layer (ISTTService)
|
||||||
|
|
||||||
|
### Objectif
|
||||||
|
|
||||||
|
Créer une couche service qui abstrait la complexité des engines STT et gère :
|
||||||
|
- Mode passive/active
|
||||||
|
- Switching d'engines
|
||||||
|
- Fallback automatique
|
||||||
|
- Gestion erreurs
|
||||||
|
|
||||||
|
### Fichiers à créer
|
||||||
|
|
||||||
|
#### 1. `src/services/ISTTService.hpp`
|
||||||
|
|
||||||
|
**Interface du service STT**
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <functional>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
namespace aissia {
|
||||||
|
|
||||||
|
enum class STTMode {
|
||||||
|
PASSIVE, // Keyword spotting (économe)
|
||||||
|
ACTIVE // Full transcription
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class STTEngineType {
|
||||||
|
VOSK,
|
||||||
|
POCKETSPHINX,
|
||||||
|
WHISPER_CPP,
|
||||||
|
WHISPER_API,
|
||||||
|
AUTO // Factory choisit
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Callback pour résultats transcription
|
||||||
|
*/
|
||||||
|
using TranscriptionCallback = std::function<void(const std::string& text, STTMode mode)>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Callback pour détection keyword
|
||||||
|
*/
|
||||||
|
using KeywordCallback = std::function<void(const std::string& keyword)>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Interface service STT
|
||||||
|
*/
|
||||||
|
class ISTTService {
|
||||||
|
public:
|
||||||
|
virtual ~ISTTService() = default;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Démarre le service STT
|
||||||
|
*/
|
||||||
|
virtual bool start() = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Arrête le service STT
|
||||||
|
*/
|
||||||
|
virtual void stop() = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Change le mode STT
|
||||||
|
*/
|
||||||
|
virtual void setMode(STTMode mode) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Obtient le mode actuel
|
||||||
|
*/
|
||||||
|
virtual STTMode getMode() const = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Transcrit un fichier audio
|
||||||
|
*/
|
||||||
|
virtual std::string transcribeFile(const std::string& filePath) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Transcrit des données audio PCM
|
||||||
|
*/
|
||||||
|
virtual std::string transcribe(const std::vector<float>& audioData) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Active l'écoute en streaming (temps réel)
|
||||||
|
*/
|
||||||
|
virtual void startListening(TranscriptionCallback onTranscription,
|
||||||
|
KeywordCallback onKeyword) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Arrête l'écoute streaming
|
||||||
|
*/
|
||||||
|
virtual void stopListening() = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Configure la langue
|
||||||
|
*/
|
||||||
|
virtual void setLanguage(const std::string& language) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Vérifie si le service est disponible
|
||||||
|
*/
|
||||||
|
virtual bool isAvailable() const = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Obtient le nom de l'engine actuel
|
||||||
|
*/
|
||||||
|
virtual std::string getCurrentEngine() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace aissia
|
||||||
|
```
|
||||||
|
|
||||||
|
**Estimation** : 50 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 2. `src/services/STTService.hpp` + `.cpp`
|
||||||
|
|
||||||
|
**Implémentation du service STT**
|
||||||
|
|
||||||
|
**Features** :
|
||||||
|
- Gère 2 engines : 1 pour passive (PocketSphinx), 1 pour active (Vosk/Whisper)
|
||||||
|
- Switch automatique passive → active sur keyword
|
||||||
|
- Timeout active → passive (30s sans parole)
|
||||||
|
- Fallback vers Whisper API si engine local fail
|
||||||
|
- Thread d'écoute microphone (via PortAudio ou ALSA)
|
||||||
|
|
||||||
|
**Pseudo-code** :
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
class STTService : public ISTTService {
|
||||||
|
private:
|
||||||
|
std::unique_ptr<ISTTEngine> m_passiveEngine; // PocketSphinx
|
||||||
|
std::unique_ptr<ISTTEngine> m_activeEngine; // Vosk/Whisper
|
||||||
|
std::unique_ptr<ISTTEngine> m_fallbackEngine; // WhisperAPI
|
||||||
|
|
||||||
|
STTMode m_currentMode = STTMode::PASSIVE;
|
||||||
|
|
||||||
|
std::thread m_listenThread;
|
||||||
|
std::atomic<bool> m_listening{false};
|
||||||
|
|
||||||
|
TranscriptionCallback m_onTranscription;
|
||||||
|
KeywordCallback m_onKeyword;
|
||||||
|
|
||||||
|
std::chrono::steady_clock::time_point m_lastActivity;
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool start() override {
|
||||||
|
// Load engines from config
|
||||||
|
m_passiveEngine = STTEngineFactory::create("pocketsphinx", config);
|
||||||
|
m_activeEngine = STTEngineFactory::create("vosk", config);
|
||||||
|
m_fallbackEngine = STTEngineFactory::create("whisper-api", config);
|
||||||
|
|
||||||
|
return m_passiveEngine && m_activeEngine;
|
||||||
|
}
|
||||||
|
|
||||||
|
void startListening(TranscriptionCallback onTranscription,
|
||||||
|
KeywordCallback onKeyword) override {
|
||||||
|
m_onTranscription = onTranscription;
|
||||||
|
m_onKeyword = onKeyword;
|
||||||
|
|
||||||
|
m_listening = true;
|
||||||
|
m_listenThread = std::thread([this]() {
|
||||||
|
listenLoop();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void listenLoop() {
|
||||||
|
// Ouvrir microphone (PortAudio)
|
||||||
|
// Boucle infinie :
|
||||||
|
// - Si PASSIVE : use m_passiveEngine (keywords only)
|
||||||
|
// - Si keyword détecté → setMode(ACTIVE) + callback
|
||||||
|
// - Si ACTIVE : use m_activeEngine (full transcription)
|
||||||
|
// - Transcrit en temps réel
|
||||||
|
// - Si timeout 30s → setMode(PASSIVE)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
```
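The `listenLoop()` comments above describe a passive→active state machine with a 30 s silence timeout. A compact sketch of just that switching logic, building on the members declared in the pseudo-code and with the microphone read stubbed out, could look like this:

```cpp
// Sketch of the passive/active switching described in listenLoop(); relies on the members
// declared in the class sketch above. captureChunk() is a placeholder for the microphone read.
void STTService::listenLoop() {
    using clock = std::chrono::steady_clock;
    m_lastActivity = clock::now();

    while (m_listening) {
        std::vector<float> chunk = captureChunk();   // ~1 s of 16 kHz mono PCM (placeholder)

        if (m_currentMode == STTMode::PASSIVE) {
            // Cheap keyword spotting only
            std::string keyword = m_passiveEngine->transcribe(chunk);
            if (!keyword.empty()) {
                if (m_onKeyword) m_onKeyword(keyword);
                m_currentMode = STTMode::ACTIVE;
                m_lastActivity = clock::now();
            }
        } else {
            // Full transcription while the user is engaged
            std::string text = m_activeEngine->transcribe(chunk);
            if (!text.empty()) {
                if (m_onTranscription) m_onTranscription(text, STTMode::ACTIVE);
                m_lastActivity = clock::now();
            } else if (clock::now() - m_lastActivity > std::chrono::seconds(30)) {
                m_currentMode = STTMode::PASSIVE;    // silence timeout -> back to keyword spotting
            }
        }
    }
}
```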
|
||||||
|
|
||||||
|
**Estimation** : 300 lignes (service + thread microphone)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 7.2 - Engines STT
|
||||||
|
|
||||||
|
### Fichiers à modifier/créer
|
||||||
|
|
||||||
|
#### 1. `src/shared/audio/ISTTEngine.hpp` ✅ Existe
|
||||||
|
|
||||||
|
**Modifications** : Aucune (interface déjà bonne)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 2. `src/shared/audio/WhisperAPIEngine.hpp` ✅ Existe
|
||||||
|
|
||||||
|
**Modifications** : Aucune (déjà implémenté, sera utilisé comme fallback)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 3. `src/shared/audio/VoskSTTEngine.hpp` 🆕 À créer
|
||||||
|
|
||||||
|
**Vosk Speech Recognition**
|
||||||
|
|
||||||
|
**Dépendances** :
|
||||||
|
- `vosk` library (C++ bindings)
|
||||||
|
- Modèle français : `vosk-model-small-fr-0.22` (~50MB)
|
||||||
|
|
||||||
|
**Installation** :
|
||||||
|
```bash
|
||||||
|
# Linux
|
||||||
|
sudo apt install libvosk-dev
|
||||||
|
|
||||||
|
# Télécharger modèle FR
|
||||||
|
wget https://alphacephei.com/vosk/models/vosk-model-small-fr-0.22.zip
|
||||||
|
unzip vosk-model-small-fr-0.22.zip -d models/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implémentation** :
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ISTTEngine.hpp"
|
||||||
|
#include <vosk_api.h>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
namespace aissia {
|
||||||
|
|
||||||
|
class VoskSTTEngine : public ISTTEngine {
|
||||||
|
public:
|
||||||
|
explicit VoskSTTEngine(const std::string& modelPath) {
|
||||||
|
m_logger = spdlog::get("VoskSTT");
|
||||||
|
if (!m_logger) {
|
||||||
|
m_logger = spdlog::stdout_color_mt("VoskSTT");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load Vosk model
|
||||||
|
m_model = vosk_model_new(modelPath.c_str());
|
||||||
|
if (!m_model) {
|
||||||
|
m_logger->error("Failed to load Vosk model: {}", modelPath);
|
||||||
|
m_available = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create recognizer (16kHz, mono)
|
||||||
|
m_recognizer = vosk_recognizer_new(m_model, 16000.0);
|
||||||
|
m_available = true;
|
||||||
|
|
||||||
|
m_logger->info("Vosk STT initialized: {}", modelPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
~VoskSTTEngine() override {
|
||||||
|
if (m_recognizer) vosk_recognizer_free(m_recognizer);
|
||||||
|
if (m_model) vosk_model_free(m_model);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string transcribe(const std::vector<float>& audioData) override {
|
||||||
|
if (!m_available || audioData.empty()) return "";
|
||||||
|
|
||||||
|
// Convert float to int16
|
||||||
|
std::vector<int16_t> samples(audioData.size());
|
||||||
|
for (size_t i = 0; i < audioData.size(); ++i) {
|
||||||
|
samples[i] = static_cast<int16_t>(audioData[i] * 32767.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Feed audio to recognizer
|
||||||
|
vosk_recognizer_accept_waveform(m_recognizer,
|
||||||
|
reinterpret_cast<const char*>(samples.data()),
|
||||||
|
samples.size() * sizeof(int16_t));
|
||||||
|
|
||||||
|
// Get final result
|
||||||
|
const char* result = vosk_recognizer_final_result(m_recognizer);
|
||||||
|
|
||||||
|
// Parse JSON result: {"text": "transcription"}
|
||||||
|
std::string text = parseVoskResult(result);
|
||||||
|
|
||||||
|
m_logger->debug("Transcribed: {}", text);
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string transcribeFile(const std::string& filePath) override {
|
||||||
|
// Load WAV file, convert to PCM, call transcribe()
|
||||||
|
// (Implementation omitted for brevity)
return "";  // placeholder so the sketch still returns a value
|
||||||
|
}
|
||||||
|
|
||||||
|
void setLanguage(const std::string& language) override {
|
||||||
|
// Vosk model is language-specific, can't change at runtime
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isAvailable() const override { return m_available; }
|
||||||
|
std::string getEngineName() const override { return "vosk"; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
VoskModel* m_model = nullptr;
|
||||||
|
VoskRecognizer* m_recognizer = nullptr;
|
||||||
|
bool m_available = false;
|
||||||
|
std::shared_ptr<spdlog::logger> m_logger;
|
||||||
|
|
||||||
|
std::string parseVoskResult(const char* json) {
|
||||||
|
// Parse JSON: {"text": "bonjour"} → "bonjour"
|
||||||
|
// Use nlohmann::json
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace aissia
|
||||||
|
```
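`parseVoskResult()` is only stubbed above; a minimal version using nlohmann/json (already a project dependency) could be:

```cpp
#include <nlohmann/json.hpp>
#include <string>

// Sketch: extract the "text" field from a Vosk result such as {"text": "bonjour"}.
// Returns an empty string on malformed JSON instead of throwing.
std::string parseVoskResult(const char* json) {
    if (!json) return "";
    auto parsed = nlohmann::json::parse(json, /*cb=*/nullptr, /*allow_exceptions=*/false);
    if (parsed.is_discarded()) return "";
    return parsed.value("text", "");
}
```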
|
||||||
|
|
||||||
|
**Estimation** : 200 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 4. `src/shared/audio/PocketSphinxEngine.hpp` 🆕 À créer
|
||||||
|
|
||||||
|
**PocketSphinx Keyword Spotting**
|
||||||
|
|
||||||
|
**Dépendances** :
|
||||||
|
- `pocketsphinx` library
|
||||||
|
- Acoustic model (phonétique)
|
||||||
|
|
||||||
|
**Installation** :
|
||||||
|
```bash
|
||||||
|
sudo apt install pocketsphinx pocketsphinx-en-us
|
||||||
|
```
|
||||||
|
|
||||||
|
**Configuration Keywords** :
|
||||||
|
```
|
||||||
|
# keywords.txt
|
||||||
|
celuna /1e-40/
|
||||||
|
hey celuna /1e-50/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implémentation** :
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ISTTEngine.hpp"
|
||||||
|
#include <pocketsphinx.h>
|
||||||
|
#include <spdlog/spdlog.h>
#include <fstream>   // needed by createKeywordFile()
|
||||||
|
|
||||||
|
namespace aissia {
|
||||||
|
|
||||||
|
class PocketSphinxEngine : public ISTTEngine {
|
||||||
|
public:
|
||||||
|
explicit PocketSphinxEngine(const std::vector<std::string>& keywords,
|
||||||
|
const std::string& modelPath) {
|
||||||
|
m_logger = spdlog::get("PocketSphinx");
|
||||||
|
if (!m_logger) {
|
||||||
|
m_logger = spdlog::stdout_color_mt("PocketSphinx");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create keyword file
|
||||||
|
createKeywordFile(keywords);
|
||||||
|
|
||||||
|
// Initialize PocketSphinx
|
||||||
|
ps_config_t* config = ps_config_init(NULL);
|
||||||
|
ps_config_set_str(config, "hmm", modelPath.c_str());
|
||||||
|
ps_config_set_str(config, "kws", "/tmp/celuna_keywords.txt");
|
||||||
|
ps_config_set_float(config, "kws_threshold", 1e-40);
|
||||||
|
|
||||||
|
m_decoder = ps_init(config);
|
||||||
|
m_available = (m_decoder != nullptr);
|
||||||
|
|
||||||
|
if (m_available) {
|
||||||
|
m_logger->info("PocketSphinx initialized for keyword spotting");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~PocketSphinxEngine() override {
|
||||||
|
if (m_decoder) ps_free(m_decoder);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string transcribe(const std::vector<float>& audioData) override {
|
||||||
|
if (!m_available || audioData.empty()) return "";
|
||||||
|
|
||||||
|
// Convert to int16
|
||||||
|
std::vector<int16_t> samples(audioData.size());
|
||||||
|
for (size_t i = 0; i < audioData.size(); ++i) {
|
||||||
|
samples[i] = static_cast<int16_t>(audioData[i] * 32767.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process audio
|
||||||
|
ps_start_utt(m_decoder);
|
||||||
|
ps_process_raw(m_decoder, samples.data(), samples.size(), FALSE, FALSE);
|
||||||
|
ps_end_utt(m_decoder);
|
||||||
|
|
||||||
|
// Get keyword (if detected)
|
||||||
|
const char* hyp = ps_get_hyp(m_decoder, nullptr);
|
||||||
|
std::string keyword = (hyp ? hyp : "");
|
||||||
|
|
||||||
|
if (!keyword.empty()) {
|
||||||
|
m_logger->info("Keyword detected: {}", keyword);
|
||||||
|
}
|
||||||
|
|
||||||
|
return keyword;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string transcribeFile(const std::string& filePath) override {
|
||||||
|
// Not used for keyword spotting (streaming only)
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
void setLanguage(const std::string& language) override {}
|
||||||
|
bool isAvailable() const override { return m_available; }
|
||||||
|
std::string getEngineName() const override { return "pocketsphinx"; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
ps_decoder_t* m_decoder = nullptr;
|
||||||
|
bool m_available = false;
|
||||||
|
std::shared_ptr<spdlog::logger> m_logger;
|
||||||
|
|
||||||
|
void createKeywordFile(const std::vector<std::string>& keywords) {
|
||||||
|
std::ofstream file("/tmp/celuna_keywords.txt");
|
||||||
|
for (const auto& kw : keywords) {
|
||||||
|
file << kw << " /1e-40/\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace aissia
|
||||||
|
```
|
||||||
|
|
||||||
|
**Estimation** : 180 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 5. `src/shared/audio/WhisperCppEngine.hpp` 🆕 À créer (OPTIONNEL)
|
||||||
|
|
||||||
|
**whisper.cpp - Local Whisper**
|
||||||
|
|
||||||
|
**Dépendances** :
|
||||||
|
- `whisper.cpp` (ggerganov)
|
||||||
|
- Modèle : `ggml-tiny.bin` (75MB) ou `ggml-base.bin` (142MB)
|
||||||
|
|
||||||
|
**Installation** :
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/ggerganov/whisper.cpp external/whisper.cpp
|
||||||
|
cd external/whisper.cpp
|
||||||
|
make
|
||||||
|
./models/download-ggml-model.sh tiny
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implémentation** : Similar à Vosk mais avec API whisper.cpp
|
||||||
|
|
||||||
|
**Estimation** : 250 lignes
|
||||||
|
|
||||||
|
**⚠️ Note** : Optionnel, à implémenter seulement si besoin haute précision locale
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 6. `src/shared/audio/STTEngineFactory.cpp` 📝 Modifier
|
||||||
|
|
||||||
|
**Factory pattern pour créer engines**
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
#include "STTEngineFactory.hpp"
|
||||||
|
#include "VoskSTTEngine.hpp"
|
||||||
|
#include "PocketSphinxEngine.hpp"
|
||||||
|
#include "WhisperCppEngine.hpp"
|
||||||
|
#include "WhisperAPIEngine.hpp"
|
||||||
|
|
||||||
|
namespace aissia {
|
||||||
|
|
||||||
|
std::unique_ptr<ISTTEngine> STTEngineFactory::create(
|
||||||
|
const std::string& type,
|
||||||
|
const nlohmann::json& config) {
|
||||||
|
|
||||||
|
if (type == "vosk" || type == "auto") {
|
||||||
|
std::string modelPath = config.value("model_path", "./models/vosk-model-small-fr-0.22");
|
||||||
|
auto engine = std::make_unique<VoskSTTEngine>(modelPath);
|
||||||
|
if (engine->isAvailable()) return engine;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (type == "pocketsphinx") {
|
||||||
|
std::vector<std::string> keywords = config.value("keywords", std::vector<std::string>{"celuna"});
|
||||||
|
std::string modelPath = config.value("model_path", "/usr/share/pocketsphinx/model/en-us");
|
||||||
|
auto engine = std::make_unique<PocketSphinxEngine>(keywords, modelPath);
|
||||||
|
if (engine->isAvailable()) return engine;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (type == "whisper-cpp") {
|
||||||
|
std::string modelPath = config.value("model_path", "./models/ggml-tiny.bin");
|
||||||
|
auto engine = std::make_unique<WhisperCppEngine>(modelPath);
|
||||||
|
if (engine->isAvailable()) return engine;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (type == "whisper-api") {
|
||||||
|
const char* apiKeyEnv = std::getenv(config.value("api_key_env", "OPENAI_API_KEY").c_str());
std::string apiKey = apiKeyEnv ? apiKeyEnv : "";  // getenv may return nullptr
|
||||||
|
if (!apiKey.empty()) {
|
||||||
|
return std::make_unique<WhisperAPIEngine>(apiKey);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: stub engine (no-op)
|
||||||
|
return std::make_unique<StubSTTEngine>();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace aissia
|
||||||
|
```
|
||||||
|
|
||||||
|
**Estimation** : 80 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 7.3 - Intégration VoiceService
|
||||||
|
|
||||||
|
### Fichier à modifier
|
||||||
|
|
||||||
|
#### `src/services/VoiceService.cpp`
|
||||||
|
|
||||||
|
**Modifications** :
|
||||||
|
|
||||||
|
1. **Remplacer implémentation directe par ISTTService**
|
||||||
|
|
||||||
|
**Avant** :
|
||||||
|
```cpp
|
||||||
|
// VoiceService gère directement WhisperAPIEngine
|
||||||
|
std::unique_ptr<WhisperAPIEngine> m_sttEngine;
|
||||||
|
```
|
||||||
|
|
||||||
|
**Après** :
|
||||||
|
```cpp
|
||||||
|
// VoiceService délègue à ISTTService
|
||||||
|
std::unique_ptr<ISTTService> m_sttService;
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Initialisation** :
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
void VoiceService::initialize(const nlohmann::json& config) {
|
||||||
|
// TTS (unchanged)
|
||||||
|
m_ttsEngine = TTSEngineFactory::create();
|
||||||
|
|
||||||
|
// STT (new)
|
||||||
|
m_sttService = std::make_unique<STTService>(config["stt"]);
|
||||||
|
m_sttService->start();
|
||||||
|
|
||||||
|
// Setup callbacks
|
||||||
|
m_sttService->startListening(
|
||||||
|
[this](const std::string& text, STTMode mode) {
|
||||||
|
handleTranscription(text, mode);
|
||||||
|
},
|
||||||
|
[this](const std::string& keyword) {
|
||||||
|
handleKeyword(keyword);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Handlers** :
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
void VoiceService::handleKeyword(const std::string& keyword) {
|
||||||
|
m_logger->info("Keyword detected: {}", keyword);
|
||||||
|
|
||||||
|
// Publish keyword detection
|
||||||
|
nlohmann::json event = {
|
||||||
|
{"type", "keyword_detected"},
|
||||||
|
{"keyword", keyword},
|
||||||
|
{"timestamp", std::time(nullptr)}
|
||||||
|
};
|
||||||
|
m_io->publish("voice:keyword_detected", event);
|
||||||
|
|
||||||
|
// Auto-switch to active mode
|
||||||
|
m_sttService->setMode(STTMode::ACTIVE);
|
||||||
|
}
|
||||||
|
|
||||||
|
void VoiceService::handleTranscription(const std::string& text, STTMode mode) {
|
||||||
|
m_logger->info("Transcription ({}): {}",
|
||||||
|
mode == STTMode::PASSIVE ? "passive" : "active", text);
|
||||||
|
|
||||||
|
// Publish transcription
|
||||||
|
nlohmann::json event = {
|
||||||
|
{"type", "transcription"},
|
||||||
|
{"text", text},
|
||||||
|
{"mode", mode == STTMode::PASSIVE ? "passive" : "active"},
|
||||||
|
{"timestamp", std::time(nullptr)}
|
||||||
|
};
|
||||||
|
m_io->publish("voice:transcription", event);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Estimation modifications** : +150 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 7.4 - Configuration
|
||||||
|
|
||||||
|
### Fichier à modifier
|
||||||
|
|
||||||
|
#### `config/voice.json`
|
||||||
|
|
||||||
|
**Configuration complète** :
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tts": {
|
||||||
|
"enabled": true,
|
||||||
|
"engine": "auto",
|
||||||
|
"rate": 0,
|
||||||
|
"volume": 80,
|
||||||
|
"voice": "fr-fr"
|
||||||
|
},
|
||||||
|
"stt": {
|
||||||
|
"passive_mode": {
|
||||||
|
"enabled": true,
|
||||||
|
"engine": "pocketsphinx",
|
||||||
|
"keywords": ["celuna", "hey celuna", "ok celuna"],
|
||||||
|
"threshold": 0.8,
|
||||||
|
"model_path": "/usr/share/pocketsphinx/model/en-us"
|
||||||
|
},
|
||||||
|
"active_mode": {
|
||||||
|
"enabled": true,
|
||||||
|
"engine": "vosk",
|
||||||
|
"model_path": "./models/vosk-model-small-fr-0.22",
|
||||||
|
"language": "fr",
|
||||||
|
"timeout_seconds": 30,
|
||||||
|
"fallback_engine": "whisper-api"
|
||||||
|
},
|
||||||
|
"whisper_api": {
|
||||||
|
"api_key_env": "OPENAI_API_KEY",
|
||||||
|
"model": "whisper-1"
|
||||||
|
},
|
||||||
|
"microphone": {
|
||||||
|
"device_id": -1,
|
||||||
|
"sample_rate": 16000,
|
||||||
|
"channels": 1,
|
||||||
|
"buffer_size": 1024
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 7.5 - Tests
|
||||||
|
|
||||||
|
### Fichiers à créer
|
||||||
|
|
||||||
|
#### `tests/services/STTServiceTests.cpp`
|
||||||
|
|
||||||
|
**Tests unitaires** :
|
||||||
|
- ✅ Création service
|
||||||
|
- ✅ Start/stop
|
||||||
|
- ✅ Switch passive/active
|
||||||
|
- ✅ Keyword detection
|
||||||
|
- ✅ Transcription
|
||||||
|
- ✅ Fallback engine
|
||||||
|
- ✅ Timeout active → passive
|
||||||
|
|
||||||
|
**Estimation** : 200 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### `tests/integration/IT_014_VoicePassiveMode.cpp`
|
||||||
|
|
||||||
|
**Test d'intégration passive mode** :
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Simulate audio avec keyword "celuna"
|
||||||
|
// Vérifie :
|
||||||
|
// 1. PocketSphinx détecte keyword
|
||||||
|
// 2. Event "voice:keyword_detected" publié
|
||||||
|
// 3. Switch vers ACTIVE mode
|
||||||
|
// 4. Timeout 30s → retour PASSIVE
|
||||||
|
```
|
||||||
|
|
||||||
|
**Estimation** : 150 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### `tests/integration/IT_015_VoiceActiveTranscription.cpp`
|
||||||
|
|
||||||
|
**Test d'intégration active mode** :
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Simulate conversation complète :
|
||||||
|
// 1. User: "celuna" → keyword detected
|
||||||
|
// 2. User: "quelle heure est-il ?" → transcription via Vosk
|
||||||
|
// 3. AI responds → TTS
|
||||||
|
// 4. Timeout → retour passive
|
||||||
|
```
|
||||||
|
|
||||||
|
**Estimation** : 200 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 7.6 - Documentation
|
||||||
|
|
||||||
|
### Fichiers à créer/modifier
|
||||||
|
|
||||||
|
#### `docs/STT_ARCHITECTURE.md`
|
||||||
|
|
||||||
|
**Documentation technique** :
|
||||||
|
- Architecture STT
|
||||||
|
- Choix engines
|
||||||
|
- Configuration
|
||||||
|
- Troubleshooting
|
||||||
|
|
||||||
|
**Estimation** : 400 lignes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### `README.md`
|
||||||
|
|
||||||
|
**Mise à jour roadmap** :
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
### Completed ✅
|
||||||
|
- [x] STT multi-engine (Vosk, PocketSphinx, Whisper)
|
||||||
|
- [x] Passive/Active mode (keyword "Celuna")
|
||||||
|
- [x] Local STT (coût zéro)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Récapitulatif Estimation
|
||||||
|
|
||||||
|
| Tâche | Fichiers | Lignes | Priorité |
|
||||||
|
|-------|----------|--------|----------|
|
||||||
|
| **7.1 Service Layer** | `ISTTService.hpp`, `STTService.{h,cpp}` | 350 | P0 |
|
||||||
|
| **7.2 Vosk Engine** | `VoskSTTEngine.hpp` | 200 | P0 |
|
||||||
|
| **7.2 PocketSphinx** | `PocketSphinxEngine.hpp` | 180 | P1 |
|
||||||
|
| **7.2 WhisperCpp** | `WhisperCppEngine.hpp` | 250 | P2 (optionnel) |
|
||||||
|
| **7.2 Factory** | `STTEngineFactory.cpp` | 80 | P0 |
|
||||||
|
| **7.3 VoiceService** | `VoiceService.cpp` (modifs) | +150 | P0 |
|
||||||
|
| **7.4 Config** | `voice.json` | +30 | P0 |
|
||||||
|
| **7.5 Tests unitaires** | `STTServiceTests.cpp` | 200 | P1 |
|
||||||
|
| **7.5 Tests intégration** | `IT_014`, `IT_015` | 350 | P1 |
|
||||||
|
| **7.6 Documentation** | `STT_ARCHITECTURE.md`, README | 450 | P2 |
|
||||||
|
| **TOTAL** | 14 fichiers | **~2240 lignes** | |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Plan d'Exécution
|
||||||
|
|
||||||
|
### Milestone 1 : MVP STT Local (Vosk seul) ⚡
|
||||||
|
|
||||||
|
**Objectif** : STT fonctionnel sans keyword detection
|
||||||
|
|
||||||
|
**Tâches** :
|
||||||
|
1. ✅ Créer `ISTTService.hpp`
|
||||||
|
2. ✅ Créer `STTService` (simple, sans passive mode)
|
||||||
|
3. ✅ Créer `VoskSTTEngine`
|
||||||
|
4. ✅ Modifier `STTEngineFactory`
|
||||||
|
5. ✅ Intégrer dans `VoiceService`
|
||||||
|
6. ✅ Config `voice.json`
|
||||||
|
7. ✅ Test manuel transcription
|
||||||
|
|
||||||
|
**Durée estimée** : 3-4h
|
||||||
|
**Lignes** : ~600
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Milestone 2 : Passive Mode (Keyword Detection) 🎧
|
||||||
|
|
||||||
|
**Objectif** : Détection "Celuna" + switch auto
|
||||||
|
|
||||||
|
**Tâches** :
|
||||||
|
1. ✅ Créer `PocketSphinxEngine`
|
||||||
|
2. ✅ Étendre `STTService` (dual mode)
|
||||||
|
3. ✅ Callbacks keyword/transcription
|
||||||
|
4. ✅ Timeout active → passive
|
||||||
|
5. ✅ Config passive/active
|
||||||
|
6. ✅ Tests IT_014, IT_015
|
||||||
|
|
||||||
|
**Durée estimée** : 4-5h
|
||||||
|
**Lignes** : ~700
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Milestone 3 : Fallback Whisper API 🔄
|
||||||
|
|
||||||
|
**Objectif** : Robustesse avec fallback cloud
|
||||||
|
|
||||||
|
**Tâches** :
|
||||||
|
1. ✅ Intégrer `WhisperAPIEngine` existant
|
||||||
|
2. ✅ Logique fallback dans `STTService`
|
||||||
|
3. ✅ Config fallback
|
||||||
|
4. ✅ Tests fallback
|
||||||
|
|
||||||
|
**Durée estimée** : 2h
|
||||||
|
**Lignes** : ~200
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Milestone 4 : Polish & Documentation 📝
|
||||||
|
|
||||||
|
**Tâches** :
|
||||||
|
1. ✅ Documentation complète
|
||||||
|
2. ✅ Tests unitaires STTService
|
||||||
|
3. ✅ Troubleshooting guide
|
||||||
|
4. ✅ Mise à jour README
|
||||||
|
|
||||||
|
**Durée estimée** : 3h
|
||||||
|
**Lignes** : ~700
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dépendances Externes
|
||||||
|
|
||||||
|
### À installer
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Vosk
|
||||||
|
sudo apt install libvosk-dev
|
||||||
|
wget https://alphacephei.com/vosk/models/vosk-model-small-fr-0.22.zip
|
||||||
|
unzip vosk-model-small-fr-0.22.zip -d models/
|
||||||
|
|
||||||
|
# PocketSphinx
|
||||||
|
sudo apt install pocketsphinx pocketsphinx-en-us
|
||||||
|
|
||||||
|
# PortAudio (pour microphone)
|
||||||
|
sudo apt install portaudio19-dev
|
||||||
|
|
||||||
|
# Optionnel: whisper.cpp
|
||||||
|
git clone https://github.com/ggerganov/whisper.cpp external/whisper.cpp
|
||||||
|
cd external/whisper.cpp && make
|
||||||
|
```
|
||||||
|
|
||||||
|
### CMakeLists.txt
|
||||||
|
|
||||||
|
```cmake
|
||||||
|
# Find Vosk
|
||||||
|
find_library(VOSK_LIBRARY vosk REQUIRED)
|
||||||
|
find_path(VOSK_INCLUDE_DIR vosk_api.h REQUIRED)
|
||||||
|
|
||||||
|
# Find PocketSphinx
|
||||||
|
find_library(POCKETSPHINX_LIBRARY pocketsphinx REQUIRED)
|
||||||
|
find_path(POCKETSPHINX_INCLUDE_DIR pocketsphinx.h REQUIRED)
|
||||||
|
|
||||||
|
# Find PortAudio
|
||||||
|
find_library(PORTAUDIO_LIBRARY portaudio REQUIRED)
|
||||||
|
find_path(PORTAUDIO_INCLUDE_DIR portaudio.h REQUIRED)
|
||||||
|
|
||||||
|
# Link
|
||||||
|
target_link_libraries(VoiceService
|
||||||
|
${VOSK_LIBRARY}
|
||||||
|
${POCKETSPHINX_LIBRARY}
|
||||||
|
${PORTAUDIO_LIBRARY}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Risques & Mitigation
|
||||||
|
|
||||||
|
| Risque | Impact | Mitigation |
|
||||||
|
|--------|--------|------------|
|
||||||
|
| **Vosk model trop lourd** | RAM (50MB) | Utiliser `vosk-model-small` au lieu de `base` |
|
||||||
|
| **PocketSphinx faux positifs** | UX | Ajuster threshold (1e-40 → 1e-50) |
|
||||||
|
| **Microphone permissions** | Bloquant | Guide installation PortAudio + permissions |
|
||||||
|
| **Latence transcription** | UX | Buffer 1-2s audio avant transcription |
|
||||||
|
| **Whisper API coût** | Budget | Utiliser seulement en fallback (rare) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prochaines Étapes
|
||||||
|
|
||||||
|
**Après validation de ce plan** :
|
||||||
|
|
||||||
|
1. **Installer dépendances** (Vosk, PocketSphinx, PortAudio)
|
||||||
|
2. **Milestone 1** : Vosk STT basique
|
||||||
|
3. **Tester** : Transcription fichier audio FR
|
||||||
|
4. **Milestone 2** : Keyword "Celuna"
|
||||||
|
5. **Tester** : Conversation complète passive → active
|
||||||
|
6. **Commit + Push** : Phase 7 complète
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Validation Plan
|
||||||
|
|
||||||
|
**Questions avant implémentation** :
|
||||||
|
|
||||||
|
1. ✅ Architecture service layer approuvée ?
|
||||||
|
2. ✅ Choix engines (Vosk + PocketSphinx) OK ?
|
||||||
|
3. ❓ Besoin WhisperCpp ou Vosk suffit ?
|
||||||
|
4. ✅ Nom "Celuna" confirmé ?
|
||||||
|
5. ❓ Autres keywords à détecter ("hey celuna", "ok celuna") ?
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Auteur** : Claude Code
|
||||||
|
**Date** : 2025-11-29
|
||||||
|
**Phase** : 7 - STT Implementation
|
||||||
|
**Status** : 📋 Plan - En attente validation
|
||||||
16 run_tests.ps1 Normal file
@ -0,0 +1,16 @@
|
$ErrorActionPreference = "Continue"
|
||||||
|
cd "C:\Users\alexi\Documents\projects\aissia"
|
||||||
|
|
||||||
|
Write-Host "=== Running aissia_tests.exe ===" -ForegroundColor Cyan
|
||||||
|
& ".\build\tests\aissia_tests.exe" 2>&1 | Tee-Object -FilePath "test_output.txt"
|
||||||
|
$testExitCode = $LASTEXITCODE
|
||||||
|
Write-Host "`nTest exit code: $testExitCode" -ForegroundColor $(if ($testExitCode -eq 0) { "Green" } else { "Red" })
|
||||||
|
|
||||||
|
Write-Host "`n=== Running test_stt_engines.exe ===" -ForegroundColor Cyan
|
||||||
|
& ".\build\test_stt_engines.exe" 2>&1 | Tee-Object -FilePath "stt_test_output.txt" -Append
|
||||||
|
$sttExitCode = $LASTEXITCODE
|
||||||
|
Write-Host "`nSTT Test exit code: $sttExitCode" -ForegroundColor $(if ($sttExitCode -eq 0) { "Green" } else { "Red" })
|
||||||
|
|
||||||
|
Write-Host "`n=== Test Summary ===" -ForegroundColor Cyan
|
||||||
|
Write-Host "aissia_tests: $(if ($testExitCode -eq 0) { 'PASSED' } else { 'FAILED' })"
|
||||||
|
Write-Host "test_stt_engines: $(if ($sttExitCode -eq 0) { 'PASSED' } else { 'FAILED' })"
|
||||||
64 src/main.cpp
@ -8,6 +8,7 @@
 #include "services/VoiceService.hpp"
 #include "shared/mcp/MCPServer.hpp"
 #include "shared/tools/FileSystemTools.hpp"
+#include "shared/tools/MCPServerTools.hpp"
 #include "shared/llm/ToolRegistry.hpp"
 
 #include <spdlog/spdlog.h>
@ -177,14 +178,32 @@ private:
 // Run AISSIA as MCP server (stdio mode)
 int runMCPServer() {
     // Log to stderr so stdout stays clean for JSON-RPC
-    // Use stderr_color_sink from stdout_color_sinks.h
     auto logger = spdlog::stderr_color_mt("MCPServer");
     spdlog::set_default_logger(logger);
     spdlog::set_level(spdlog::level::info);
 
     spdlog::info("AISSIA MCP Server starting...");
 
-    // Create tool registry with FileSystem tools
+    // === Initialize Services ===
+
+    // 1. LLMService (PRIORITY: for chat_with_aissia)
+    auto llmService = std::make_unique<aissia::LLMService>();
+    if (!llmService->loadConfig("config/llm.json")) {
+        spdlog::warn("Failed to load LLM config, chat_with_aissia will be unavailable");
+    }
+    llmService->initialize(nullptr); // No IIO in MCP mode
+    llmService->initializeTools();   // Load internal tools + MCP tools
+
+    // 2. StorageService (for save_memory/search_memories)
+    auto storageService = std::make_unique<aissia::StorageService>();
+    storageService->initialize(nullptr);
+
+    // 3. VoiceService (for TTS/STT)
+    auto voiceService = std::make_unique<aissia::VoiceService>();
+    // Note: Voice config is optional
+    voiceService->initialize(nullptr);
+
+    // === Create Tool Registry ===
     aissia::ToolRegistry registry;
 
     // Register get_current_time tool
@ -214,7 +233,18 @@ int runMCPServer() {
     );
     }
 
-    spdlog::info("Registered {} tools", registry.size());
+    // === Register AISSIA Tools (Phase 8) ===
+    aissia::tools::MCPServerTools aissiaTools(
+        llmService.get(),
+        storageService.get(),
+        voiceService.get()
+    );
+
+    for (const auto& toolDef : aissiaTools.getToolDefinitions()) {
+        registry.registerTool(toolDef);
+    }
+
+    spdlog::info("Registered {} tools total", registry.size());
 
     // Create and run MCP server
     aissia::mcp::MCPServer server(registry);
@ -369,21 +399,23 @@ int main(int argc, char* argv[]) {
     aissia::VoiceService voiceService;
     voiceService.initialize(voiceIO.get());
     {
-        auto voiceConfig = loadConfig(configDir + "voice.json");
-        auto* ttsNode = voiceConfig->getChildReadOnly("tts");
-        if (ttsNode) {
-            bool enabled = ttsNode->getBool("enabled", true);
-            int rate = ttsNode->getInt("rate", 0);
-            int volume = ttsNode->getInt("volume", 80);
+        // Load voice.json directly as nlohmann::json for Phase 7 STT
+        std::ifstream voiceFile(configDir + "voice.json");
+        nlohmann::json voiceConfigJson;
+        voiceFile >> voiceConfigJson;
+
+        // Configure TTS (legacy API)
+        if (voiceConfigJson.contains("tts")) {
+            auto tts = voiceConfigJson["tts"];
+            bool enabled = tts.value("enabled", true);
+            int rate = tts.value("rate", 0);
+            int volume = tts.value("volume", 80);
             voiceService.configureTTS(enabled, rate, volume);
         }
-        auto* sttNode = voiceConfig->getChildReadOnly("stt");
-        if (sttNode) {
-            bool enabled = sttNode->getBool("enabled", true);
-            std::string language = sttNode->getString("language", "fr");
-            std::string apiKeyEnv = sttNode->getString("api_key_env", "OPENAI_API_KEY");
-            const char* apiKey = std::getenv(apiKeyEnv.c_str());
-            voiceService.configureSTT(enabled, language, apiKey ? apiKey : "");
+
+        // Configure STT (Phase 7 new API)
+        if (voiceConfigJson.contains("stt")) {
+            voiceService.configureSTT(voiceConfigJson["stt"]);
         }
     }
 }
103 src/services/ISTTService.hpp Normal file
@ -0,0 +1,103 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <functional>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
namespace aissia {
|
||||||
|
|
||||||
|
enum class STTMode {
|
||||||
|
PASSIVE, // Keyword spotting (économe)
|
||||||
|
ACTIVE // Full transcription
|
||||||
|
};
|
||||||
|
|
||||||
|
enum class STTEngineType {
|
||||||
|
VOSK,
|
||||||
|
POCKETSPHINX,
|
||||||
|
WHISPER_CPP,
|
||||||
|
WHISPER_API,
|
||||||
|
AUTO // Factory choisit
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Callback pour résultats transcription
|
||||||
|
*/
|
||||||
|
using TranscriptionCallback = std::function<void(const std::string& text, STTMode mode)>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Callback pour détection keyword
|
||||||
|
*/
|
||||||
|
using KeywordCallback = std::function<void(const std::string& keyword)>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Interface service STT
|
||||||
|
*
|
||||||
|
* Provides high-level STT functionality with:
|
||||||
|
* - Passive mode (keyword spotting)
|
||||||
|
* - Active mode (full transcription)
|
||||||
|
* - Automatic engine switching
|
||||||
|
* - Fallback support
|
||||||
|
*/
|
||||||
|
class ISTTService {
|
||||||
|
public:
|
||||||
|
virtual ~ISTTService() = default;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Démarre le service STT
|
||||||
|
*/
|
||||||
|
virtual bool start() = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Arrête le service STT
|
||||||
|
*/
|
||||||
|
virtual void stop() = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Change le mode STT
|
||||||
|
*/
|
||||||
|
virtual void setMode(STTMode mode) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Obtient le mode actuel
|
||||||
|
*/
|
||||||
|
virtual STTMode getMode() const = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Transcrit un fichier audio
|
||||||
|
*/
|
||||||
|
virtual std::string transcribeFile(const std::string& filePath) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Transcrit des données audio PCM
|
||||||
|
*/
|
||||||
|
virtual std::string transcribe(const std::vector<float>& audioData) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Active l'écoute en streaming (temps réel)
|
||||||
|
*/
|
||||||
|
virtual void startListening(TranscriptionCallback onTranscription,
|
||||||
|
KeywordCallback onKeyword) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Arrête l'écoute streaming
|
||||||
|
*/
|
||||||
|
virtual void stopListening() = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Configure la langue
|
||||||
|
*/
|
||||||
|
virtual void setLanguage(const std::string& language) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Vérifie si le service est disponible
|
||||||
|
*/
|
||||||
|
virtual bool isAvailable() const = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Obtient le nom de l'engine actuel
|
||||||
|
*/
|
||||||
|
virtual std::string getCurrentEngine() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace aissia
|
||||||
@ -340,4 +340,36 @@ void LLMService::shutdown() {
|
|||||||
m_logger->info("LLMService shutdown");
}
|
||||||
|
|
||||||
|
LLMService::SyncResponse LLMService::sendMessageSync(
|
||||||
|
const std::string& message,
|
||||||
|
const std::string& conversationId,
|
||||||
|
const std::string& systemPrompt
|
||||||
|
) {
|
||||||
|
SyncResponse syncResp;
|
||||||
|
|
||||||
|
// Create request (same as async mode)
|
||||||
|
Request request;
|
||||||
|
request.query = message;
|
||||||
|
request.conversationId = conversationId.empty() ? "mcp-session" : conversationId;
|
||||||
|
request.systemPrompt = systemPrompt.empty() ? m_defaultSystemPrompt : systemPrompt;
|
||||||
|
request.maxIterations = m_maxIterations;
|
||||||
|
|
||||||
|
// Process synchronously (blocking call)
|
||||||
|
auto response = processRequest(request);
|
||||||
|
|
||||||
|
// Convert to SyncResponse
|
||||||
|
if (!response.isError) {
|
||||||
|
syncResp.text = response.text;
|
||||||
|
syncResp.tokens = response.tokens;
|
||||||
|
syncResp.iterations = response.iterations;
|
||||||
|
} else {
|
||||||
|
// On error, return error in text
|
||||||
|
syncResp.text = "Error: " + response.text;
|
||||||
|
syncResp.tokens = 0;
|
||||||
|
syncResp.iterations = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return syncResp;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace aissia
|
} // namespace aissia
|
||||||
|
@ -60,6 +60,29 @@ public:
     /// Load MCP server configurations
     bool loadMCPConfig(const std::string& configPath);
+
+    /**
+     * @brief Synchronous response structure for MCP Server mode
+     */
+    struct SyncResponse {
+        std::string text;
+        int tokens = 0;
+        int iterations = 0;
+    };
+
+    /**
+     * @brief Send message synchronously (blocking, for MCP Server mode)
+     *
+     * @param message User message
+     * @param conversationId Conversation ID (optional)
+     * @param systemPrompt Custom system prompt (optional)
+     * @return Sync response with text, tokens, iterations
+     */
+    SyncResponse sendMessageSync(
+        const std::string& message,
+        const std::string& conversationId = "",
+        const std::string& systemPrompt = ""
+    );
 
 private:
     struct Request {
         std::string query;
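A sketch of how an MCP tool handler might call this blocking API; the surrounding service instance (llmService) and the logging call are illustrative assumptions.

// Blocking call: suitable inside an MCP tool handler, not on the async event loop.
aissia::LLMService::SyncResponse resp =
    llmService.sendMessageSync("Summarize the last meeting",
                               /*conversationId=*/"mcp-session",
                               /*systemPrompt=*/"");
spdlog::info("LLM answered in {} iteration(s), {} tokens: {}",
             resp.iterations, resp.tokens, resp.text);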
189 src/services/STTService.cpp Normal file
@ -0,0 +1,189 @@
// CRITICAL ORDER: Include system headers first
#include <nlohmann/json.hpp>
#include <cstdlib>
#include <memory>
#include <string>

// Include local headers before spdlog
#include "STTService.hpp"
#include "../shared/audio/ISTTEngine.hpp"

// Include spdlog after local headers
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>

namespace aissia {

STTService::STTService(const nlohmann::json& config)
    : m_config(config)
{
    m_logger = spdlog::get("STTService");
    if (!m_logger) {
        m_logger = spdlog::stdout_color_mt("STTService");
    }

    // Extract language from config
    if (config.contains("active_mode") && config["active_mode"].contains("language")) {
        m_language = config["active_mode"]["language"].get<std::string>();
    }

    m_logger->info("STTService created");
}

STTService::~STTService() {
    stop();
}

bool STTService::start() {
    m_logger->info("Starting STT service");

    loadEngines();

    if (!m_activeEngine || !m_activeEngine->isAvailable()) {
        m_logger->error("No active STT engine available");
        return false;
    }

    m_logger->info("STT service started");
    return true;
}

void STTService::stop() {
    m_logger->info("Stopping STT service");
    stopListening();
    m_activeEngine.reset();
}

void STTService::setMode(STTMode mode) {
    if (m_currentMode == mode) {
        return;
    }

    m_logger->info("Switching STT mode");
    m_currentMode = mode;
}

std::string STTService::transcribeFile(const std::string& filePath) {
    if (!m_activeEngine || !m_activeEngine->isAvailable()) {
        m_logger->warn("No STT engine available for transcription");
        return "";
    }

    m_logger->info("Transcribing file");

    try {
        std::string result = m_activeEngine->transcribeFile(filePath);
        m_logger->info("Transcription complete");
        return result;
    } catch (const std::exception& e) {
        m_logger->error("Transcription failed");
        return "";
    }
}

std::string STTService::transcribe(const std::vector<float>& audioData) {
    if (!m_activeEngine || !m_activeEngine->isAvailable()) {
        return "";
    }

    if (audioData.empty()) {
        return "";
    }

    try {
        std::string result = m_activeEngine->transcribe(audioData);

        if (!result.empty() && m_listening && m_onTranscription) {
            m_onTranscription(result, m_currentMode);
        }

        return result;
    } catch (const std::exception& e) {
        m_logger->error("Transcription failed");
        return "";
    }
}

void STTService::startListening(TranscriptionCallback onTranscription,
                                KeywordCallback onKeyword) {
    m_logger->info("Start listening");

    m_onTranscription = onTranscription;
    m_onKeyword = onKeyword;
    m_listening = true;

    m_logger->warn("Streaming microphone capture not yet implemented");
}

void STTService::stopListening() {
    if (!m_listening) {
        return;
    }

    m_logger->info("Stop listening");
    m_listening = false;
}

void STTService::setLanguage(const std::string& language) {
    m_logger->info("Setting language");
    m_language = language;

    if (m_activeEngine) {
        m_activeEngine->setLanguage(language);
    }
}

bool STTService::isAvailable() const {
    return m_activeEngine && m_activeEngine->isAvailable();
}

std::string STTService::getCurrentEngine() const {
    if (m_activeEngine) {
        return m_activeEngine->getEngineName();
    }
    return "none";
}

void STTService::loadEngines() {
    m_logger->info("Loading STT engines");

    std::string engineType = "auto";
    if (m_config.contains("active_mode")) {
        const auto& activeMode = m_config["active_mode"];
        if (activeMode.contains("engine")) {
            engineType = activeMode["engine"];
        }
    }

    std::string modelPath;
    if (m_config.contains("active_mode")) {
        const auto& activeMode = m_config["active_mode"];
        if (activeMode.contains("model_path")) {
            modelPath = activeMode["model_path"];
        }
    }

    std::string apiKey;
    if (m_config.contains("whisper_api")) {
        const auto& whisperApi = m_config["whisper_api"];
        std::string apiKeyEnv = "OPENAI_API_KEY";
        if (whisperApi.contains("api_key_env")) {
            apiKeyEnv = whisperApi["api_key_env"];
        }
        const char* envVal = std::getenv(apiKeyEnv.c_str());
        if (envVal) {
            apiKey = envVal;
        }
    }

    m_activeEngine = STTEngineFactory::create(engineType, modelPath, apiKey);

    if (m_activeEngine && m_activeEngine->isAvailable()) {
        m_activeEngine->setLanguage(m_language);
        m_logger->info("STT engine loaded successfully");
    } else {
        m_logger->warn("No active STT engine available");
    }
}

} // namespace aissia
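A sketch of driving the service directly with an in-memory config; the keys mirror what loadEngines() above reads, while the paths and values are examples only.

#include <nlohmann/json.hpp>
#include "STTService.hpp"

void runTranscription() {
    nlohmann::json cfg;
    cfg["active_mode"]["engine"] = "auto";        // factory picks PocketSphinx/Vosk/Whisper/API
    cfg["active_mode"]["language"] = "fr";
    cfg["active_mode"]["model_path"] = "./models/vosk-model-small-fr-0.22";  // example path
    cfg["whisper_api"]["api_key_env"] = "OPENAI_API_KEY";  // resolved via std::getenv

    aissia::STTService stt(cfg);
    if (stt.start()) {
        spdlog::info("STT engine in use: {}", stt.getCurrentEngine());
        std::string text = stt.transcribeFile("recording.wav");
    }
}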
63 src/services/STTService.hpp Normal file
@ -0,0 +1,63 @@
#pragma once

#include "ISTTService.hpp"
#include "../shared/audio/ISTTEngine.hpp"
#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <memory>
#include <string>

namespace aissia {

/**
 * @brief STT Service implementation
 *
 * Phase 7.1 - MVP: Simple service with single engine (Vosk)
 * TODO Phase 7.2: Add passive mode (PocketSphinx keyword spotting)
 */
class STTService : public ISTTService {
public:
    explicit STTService(const nlohmann::json& config);
    ~STTService() override;

    bool start() override;
    void stop() override;

    void setMode(STTMode mode) override;
    STTMode getMode() const override { return m_currentMode; }

    std::string transcribeFile(const std::string& filePath) override;
    std::string transcribe(const std::vector<float>& audioData) override;

    void startListening(TranscriptionCallback onTranscription,
                        KeywordCallback onKeyword) override;
    void stopListening() override;

    void setLanguage(const std::string& language) override;
    bool isAvailable() const override;
    std::string getCurrentEngine() const override;

private:
    // Configuration
    nlohmann::json m_config;
    std::string m_language = "fr";

    // Engines (MVP: only active engine)
    std::unique_ptr<ISTTEngine> m_activeEngine;

    // State
    STTMode m_currentMode = STTMode::ACTIVE;
    bool m_listening = false;

    // Callbacks
    TranscriptionCallback m_onTranscription;
    KeywordCallback m_onKeyword;

    // Logger
    std::shared_ptr<spdlog::logger> m_logger;

    // Helpers
    void loadEngines();
};

} // namespace aissia
@ -1,7 +1,17 @@
-#include "VoiceService.hpp"
-#include <spdlog/sinks/stdout_color_sinks.h>
+// CRITICAL ORDER: Include system headers before local headers to avoid macro conflicts
+#include <nlohmann/json.hpp>
 #include <cstdlib>
+#include <memory>
+#include <string>
+#include <queue>
+#include <fstream>
+
+// Include VoiceService.hpp BEFORE spdlog to avoid logger macro conflicts
+#include "VoiceService.hpp"
+#include "STTService.hpp"
+
+// Include spdlog after VoiceService.hpp
+#include <spdlog/sinks/stdout_color_sinks.h>
 
 namespace aissia {
 
@ -20,7 +30,7 @@ bool VoiceService::initialize(grove::IIO* io) {
     if (m_ttsEngine && m_ttsEngine->isAvailable()) {
         m_ttsEngine->setRate(m_ttsRate);
         m_ttsEngine->setVolume(m_ttsVolume);
-        m_logger->info("TTS engine: {}", m_ttsEngine->getEngineName());
+        m_logger->info("TTS engine initialized");
     } else {
         m_logger->warn("TTS engine not available");
     }
@ -56,7 +66,7 @@ void VoiceService::configureSTT(bool enabled, const std::string& language,
         m_sttEngine = STTEngineFactory::create(apiKey);
         if (m_sttEngine) {
             m_sttEngine->setLanguage(language);
-            m_logger->info("STT engine: {}", m_sttEngine->getEngineName());
+            m_logger->info("STT engine configured");
         }
     }
 }
@ -121,7 +131,9 @@ void VoiceService::speak(const std::string& text) {
 
     // Publish speaking started
     if (m_io) {
-        auto event = std::make_unique<grove::JsonDataNode>("event");
+        auto event = std::unique_ptr<grove::IDataNode>(
+            new grove::JsonDataNode("event")
+        );
         event->setString("text", text.size() > 100 ? text.substr(0, 100) + "..." : text);
         m_io->publish("voice:speaking_started", std::move(event));
     }
@ -129,7 +141,77 @@ void VoiceService::speak(const std::string& text) {
     m_ttsEngine->speak(text, true);
     m_totalSpoken++;
 
-    m_logger->debug("Speaking: {}", text.size() > 50 ? text.substr(0, 50) + "..." : text);
+    m_logger->debug("Speaking");
+}
+
+// Phase 7: New STT configuration with full config support
+void VoiceService::configureSTT(const nlohmann::json& sttConfig) {
+    m_logger->info("[VoiceService] Configuring STT service (Phase 7)");
+
+    // Extract enabled flag
+    bool enabled = false;
+    if (sttConfig.contains("active_mode")) {
+        const auto& activeMode = sttConfig["active_mode"];
+        enabled = activeMode.value("enabled", true);
+    }
+
+    m_sttEnabled = enabled;
+
+    if (!enabled) {
+        m_logger->info("[VoiceService] STT disabled in config");
+        return;
+    }
+
+    // Create and start STT service
+    m_sttService = std::make_unique<STTService>(sttConfig);
+
+    if (!m_sttService->start()) {
+        m_logger->error("[VoiceService] Failed to start STT service");
+        m_sttService.reset();
+        return;
+    }
+
+    m_logger->info("[VoiceService] STT service started");
+
+    // Setup callbacks for transcription events
+    // Note: For MVP Milestone 1, we don't start streaming yet
+    // This will be implemented in Milestone 2 (passive mode)
+}
+
+// STT event handlers (Phase 7)
+void VoiceService::handleKeyword(const std::string& keyword) {
+    m_logger->info("[VoiceService] Keyword detected");
+
+    // Publish keyword detection event
+    if (m_io) {
+        auto event = std::unique_ptr<grove::IDataNode>(
+            new grove::JsonDataNode("event")
+        );
+        event->setString("keyword", keyword);
+        event->setInt("timestamp", static_cast<int>(std::time(nullptr)));
+        m_io->publish("voice:keyword_detected", std::move(event));
+    }
+
+    // Auto-switch to active mode (Phase 7.2)
+    if (m_sttService) {
+        m_sttService->setMode(STTMode::ACTIVE);
+    }
+}
+
+void VoiceService::handleTranscription(const std::string& text, STTMode mode) {
+    m_logger->info("[VoiceService] Transcription received");
+
+    // Publish transcription event
+    if (m_io) {
+        std::string modeStr = (mode == STTMode::PASSIVE ? "passive" : "active");
+        auto event = std::unique_ptr<grove::IDataNode>(
+            new grove::JsonDataNode("event")
+        );
+        event->setString("text", text);
+        event->setString("mode", modeStr);
+        event->setInt("timestamp", static_cast<int>(std::time(nullptr)));
+        m_io->publish("voice:transcription", std::move(event));
+    }
 }
 
 void VoiceService::shutdown() {
@ -137,7 +219,76 @@ void VoiceService::shutdown() {
         m_ttsEngine->stop();
     }
 
-    m_logger->info("VoiceService shutdown. Total spoken: {}", m_totalSpoken);
+    if (m_sttService) {
+        m_sttService->stop();
+    }
+
+    m_logger->info("[VoiceService] Shutdown");
+}
+
+bool VoiceService::loadConfig(const std::string& configPath) {
+    try {
+        std::ifstream file(configPath);
+        if (!file.is_open()) {
+            m_logger->warn("[VoiceService] Config file not found: {}", configPath);
+            return false;
+        }
+
+        nlohmann::json config;
+        file >> config;
+
+        // Load TTS config
+        if (config.contains("tts")) {
+            const auto& ttsConfig = config["tts"];
+            m_ttsEnabled = ttsConfig.value("enabled", true);
+            m_ttsRate = ttsConfig.value("rate", 0);
+            m_ttsVolume = ttsConfig.value("volume", 80);
+        }
+
+        // Load STT config (Phase 7 format)
+        if (config.contains("stt")) {
+            configureSTT(config["stt"]);
+        }
+
+        m_logger->info("[VoiceService] Config loaded from {}", configPath);
+        return true;
+
+    } catch (const std::exception& e) {
+        m_logger->error("[VoiceService] Failed to load config: {}", e.what());
+        return false;
+    }
+}
+
+std::string VoiceService::transcribeFileSync(
+    const std::string& filePath,
+    const std::string& language
+) {
+    m_logger->info("[VoiceService] transcribeFileSync: {}", filePath);
+
+    if (!m_sttService) {
+        throw std::runtime_error("STT service not initialized");
+    }
+
+    // Use STT service to transcribe file synchronously
+    // Note: This requires STT service to support file transcription
+    // For MVP, we'll throw not implemented
+    throw std::runtime_error("transcribeFileSync not yet implemented - STT service needs file transcription support");
+}
+
+bool VoiceService::textToSpeechSync(
+    const std::string& text,
+    const std::string& outputFile,
+    const std::string& voice
+) {
+    m_logger->info("[VoiceService] textToSpeechSync: {} -> {}", text.substr(0, 50), outputFile);
+
+    if (!m_ttsEngine) {
+        throw std::runtime_error("TTS engine not initialized");
+    }
+
+    // For MVP, we don't support saving to file yet
+    // The TTS engine currently only speaks directly
+    throw std::runtime_error("textToSpeechSync file output not yet implemented - TTS engine needs file output support");
 }
 
 } // namespace aissia
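A sketch of the JSON layout loadConfig() above expects (a "tts" block plus an "stt" block that is handed to configureSTT()); the file path and all values are examples, not part of the diff.

#include <fstream>
#include <string>

void writeExampleVoiceConfig(const std::string& path) {
    std::ofstream out(path);
    out << R"({
  "tts": { "enabled": true, "rate": 0, "volume": 80 },
  "stt": {
    "active_mode": {
      "enabled": true,
      "engine": "vosk",
      "language": "fr",
      "model_path": "./models/vosk-model-small-fr-0.22"
    },
    "whisper_api": { "api_key_env": "OPENAI_API_KEY" }
  }
})";
}

// Then: voiceService.loadConfig(path);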
@ -1,6 +1,10 @@
 #pragma once
 
+// Include nlohmann/json BEFORE grove headers to avoid macro conflicts
+#include <nlohmann/json.hpp>
+
 #include "IService.hpp"
+#include "ISTTService.hpp"
 #include "../shared/audio/ITTSEngine.hpp"
 #include "../shared/audio/ISTTEngine.hpp"
 
@ -44,10 +48,42 @@ public:
     /// Configure TTS settings
     void configureTTS(bool enabled = true, int rate = 0, int volume = 80);
 
-    /// Configure STT settings
+    /// Configure STT settings (legacy API)
     void configureSTT(bool enabled = true, const std::string& language = "fr",
                       const std::string& apiKey = "");
+
+    /// Configure STT with full config (Phase 7)
+    void configureSTT(const nlohmann::json& sttConfig);
+
+    /// Load configuration from JSON file
+    bool loadConfig(const std::string& configPath);
+
+    /**
+     * @brief Transcribe audio file synchronously (for MCP Server mode)
+     *
+     * @param filePath Path to audio file
+     * @param language Language code (e.g., "fr", "en")
+     * @return Transcribed text
+     */
+    std::string transcribeFileSync(
+        const std::string& filePath,
+        const std::string& language = "fr"
+    );
+
+    /**
+     * @brief Convert text to speech synchronously (for MCP Server mode)
+     *
+     * @param text Text to synthesize
+     * @param outputFile Output audio file path
+     * @param voice Voice identifier (e.g., "fr-fr")
+     * @return true if successful
+     */
+    bool textToSpeechSync(
+        const std::string& text,
+        const std::string& outputFile,
+        const std::string& voice = "fr-fr"
+    );
 
 private:
     // Configuration
     bool m_ttsEnabled = true;
@ -58,7 +94,8 @@ private:
 
     // State
     std::unique_ptr<ITTSEngine> m_ttsEngine;
-    std::unique_ptr<ISTTEngine> m_sttEngine;
+    std::unique_ptr<ISTTEngine> m_sttEngine;     // Legacy direct engine (deprecated)
+    std::unique_ptr<ISTTService> m_sttService;   // Phase 7: New STT service layer
     std::queue<std::string> m_speakQueue;
     int m_totalSpoken = 0;
 
@ -71,6 +108,10 @@ private:
     void processSpeakQueue();
     void speak(const std::string& text);
     void handleSpeakRequest(const grove::IDataNode& data);
+
+    // STT handlers (Phase 7)
+    void handleTranscription(const std::string& text, STTMode mode);
+    void handleKeyword(const std::string& keyword);
 };
 
 } // namespace aissia
@ -8,9 +8,9 @@
 namespace aissia {
 
 /**
- * @brief Callback for transcription results
+ * @brief Callback for transcription results (low-level engine)
  */
-using TranscriptionCallback = std::function<void(const std::string& text)>;
+using STTEngineCallback = std::function<void(const std::string& text)>;
 
 /**
  * @brief Interface for Speech-to-Text engines
@ -58,7 +58,13 @@ public:
  */
 class STTEngineFactory {
 public:
+    // Legacy API (for backward compatibility)
     static std::unique_ptr<ISTTEngine> create(const std::string& apiKey);
+
+    // New API with engine type and config
+    static std::unique_ptr<ISTTEngine> create(const std::string& type,
+                                              const std::string& modelPath,
+                                              const std::string& apiKey = "");
 };
 
 } // namespace aissia
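The rename avoids a collision with the service-level callback declared in ISTTService.hpp, which carries an extra STTMode argument; side by side, as a sketch:

// Engine layer (this header): raw text only.
aissia::STTEngineCallback onEngineText =
    [](const std::string& text) { /* raw engine output */ };

// Service layer (ISTTService.hpp): text plus the mode it was captured in.
aissia::TranscriptionCallback onServiceText =
    [](const std::string& text, aissia::STTMode mode) { /* routed by VoiceService */ };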
197 src/shared/audio/PocketSphinxEngine.cpp Normal file
@ -0,0 +1,197 @@
#include "PocketSphinxEngine.hpp"
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>
#include <fstream>

// Only include PocketSphinx headers if library is available
#ifdef HAVE_POCKETSPHINX
#include <pocketsphinx.h>
#endif

namespace aissia {

PocketSphinxEngine::PocketSphinxEngine(const std::string& modelPath,
                                       const std::vector<std::string>& keywords)
    : m_modelPath(modelPath)
    , m_keywords(keywords)
{
    m_logger = spdlog::get("PocketSphinx");
    if (!m_logger) {
        m_logger = spdlog::stdout_color_mt("PocketSphinx");
    }

    m_keywordMode = !keywords.empty();
    m_available = initialize();

    if (m_available) {
        m_logger->info("PocketSphinx STT initialized: model={}, keyword_mode={}",
                       modelPath, m_keywordMode);
    } else {
        m_logger->warn("PocketSphinx not available (library not installed or model missing)");
    }
}

PocketSphinxEngine::~PocketSphinxEngine() {
    cleanup();
}

bool PocketSphinxEngine::initialize() {
#ifdef HAVE_POCKETSPHINX
    // Check if model directory exists
    std::ifstream modelCheck(m_modelPath + "/mdef");
    if (!modelCheck.good()) {
        m_logger->error("PocketSphinx model not found at: {}", m_modelPath);
        return false;
    }

    // Create configuration
    m_config = cmd_ln_init(nullptr, ps_args(), TRUE,
                           "-hmm", m_modelPath.c_str(),
                           "-dict", (m_modelPath + "/cmudict-en-us.dict").c_str(),
                           "-logfn", "/dev/null",  // Suppress verbose logging
                           nullptr);

    if (!m_config) {
        m_logger->error("Failed to create PocketSphinx config");
        return false;
    }

    // Create decoder
    m_decoder = ps_init(m_config);
    if (!m_decoder) {
        m_logger->error("Failed to initialize PocketSphinx decoder");
        cmd_ln_free_r(m_config);
        m_config = nullptr;
        return false;
    }

    // If keyword mode, set up keyword spotting
    if (m_keywordMode) {
        setKeywords(m_keywords, m_keywordThreshold);
    }

    return true;
#else
    m_logger->warn("PocketSphinx support not compiled (HAVE_POCKETSPHINX not defined)");
    return false;
#endif
}

void PocketSphinxEngine::cleanup() {
#ifdef HAVE_POCKETSPHINX
    if (m_decoder) {
        ps_free(m_decoder);
        m_decoder = nullptr;
    }
    if (m_config) {
        cmd_ln_free_r(m_config);
        m_config = nullptr;
    }
#endif
}

void PocketSphinxEngine::setKeywords(const std::vector<std::string>& keywords, float threshold) {
    m_keywords = keywords;
    m_keywordThreshold = threshold;
    m_keywordMode = !keywords.empty();

#ifdef HAVE_POCKETSPHINX
    if (!m_decoder || keywords.empty()) {
        return;
    }

    // Build keyword string (format: "keyword /threshold/\n")
    std::string keywordStr;
    for (const auto& kw : keywords) {
        keywordStr += kw + " /1e-" + std::to_string(int(threshold * 100)) + "/\n";
    }

    // Set keyword spotting mode
    ps_set_kws(m_decoder, "keywords", keywordStr.c_str());
    ps_set_search(m_decoder, "keywords");

    m_logger->info("PocketSphinx keyword mode enabled: {} keywords, threshold={}",
                   keywords.size(), threshold);
#endif
}

std::string PocketSphinxEngine::processAudioData(const int16_t* audioData, size_t numSamples) {
#ifdef HAVE_POCKETSPHINX
    if (!m_decoder) {
        return "";
    }

    // Start utterance
    ps_start_utt(m_decoder);

    // Process audio
    ps_process_raw(m_decoder, audioData, numSamples, FALSE, FALSE);

    // End utterance
    ps_end_utt(m_decoder);

    // Get hypothesis
    const char* hyp = ps_get_hyp(m_decoder, nullptr);
    if (hyp) {
        std::string result(hyp);
        m_logger->debug("PocketSphinx recognized: {}", result);
        return result;
    }

    return "";
#else
    return "";
#endif
}

std::string PocketSphinxEngine::transcribe(const std::vector<float>& audioData) {
    if (!m_available || audioData.empty()) {
        return "";
    }

    // Convert float samples to int16
    std::vector<int16_t> int16Data(audioData.size());
    for (size_t i = 0; i < audioData.size(); ++i) {
        float sample = audioData[i];
        // Clamp to [-1.0, 1.0] and convert to int16
        if (sample > 1.0f) sample = 1.0f;
        if (sample < -1.0f) sample = -1.0f;
        int16Data[i] = static_cast<int16_t>(sample * 32767.0f);
    }

    return processAudioData(int16Data.data(), int16Data.size());
}

std::string PocketSphinxEngine::transcribeFile(const std::string& filePath) {
    if (!m_available) {
        return "";
    }

    m_logger->info("PocketSphinx transcribing file: {}", filePath);

    // For file transcription, we'd need to:
    // 1. Read the audio file (wav/raw)
    // 2. Convert to int16 PCM
    // 3. Call processAudioData
    //
    // For now, return empty (file I/O requires additional dependencies)
    m_logger->warn("PocketSphinx file transcription not yet implemented");
    return "";
}

void PocketSphinxEngine::setLanguage(const std::string& language) {
    m_language = language;
    m_logger->info("PocketSphinx language set to: {}", language);
    // Note: PocketSphinx requires different acoustic models for different languages
    // Would need to reinitialize with appropriate model path
}

bool PocketSphinxEngine::isAvailable() const {
    return m_available;
}

std::string PocketSphinxEngine::getEngineName() const {
    return "pocketsphinx";
}

} // namespace aissia
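As a concrete illustration of the keyword-list format built in setKeywords() above (the wake word is the example mentioned in the header's documentation):

// With keywords = {"hey celuna"} and threshold = 0.8f,
// int(0.8f * 100) == 80, so the string passed to ps_set_kws() is:
//
//   "hey celuna /1e-80/\n"
//
// i.e. each keyword line carries its detection threshold expressed as 1e-NN.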
80 src/shared/audio/PocketSphinxEngine.hpp Normal file
@ -0,0 +1,80 @@
#pragma once

#include "ISTTEngine.hpp"
#include <spdlog/spdlog.h>
#include <memory>
#include <vector>
#include <string>

// PocketSphinx forward declarations (to avoid including full headers)
struct ps_decoder_s;
typedef struct ps_decoder_s ps_decoder_t;
struct cmd_ln_s;
typedef struct cmd_ln_s cmd_ln_t;

namespace aissia {

/**
 * @brief CMU PocketSphinx Speech-to-Text engine
 *
 * Lightweight keyword spotting engine ideal for passive listening.
 * Very resource-efficient, perfect for detecting wake words.
 *
 * Features:
 * - Very low CPU/memory usage
 * - Fast keyword spotting
 * - Offline (no internet required)
 * - Good for trigger words like "hey celuna"
 *
 * Limitations:
 * - Less accurate than Vosk/Whisper for full transcription
 * - Best used for keyword detection in passive mode
 */
class PocketSphinxEngine : public ISTTEngine {
public:
    /**
     * @brief Construct PocketSphinx engine
     * @param modelPath Path to PocketSphinx acoustic model directory
     * @param keywords List of keywords to detect (optional, for keyword mode)
     */
    explicit PocketSphinxEngine(const std::string& modelPath,
                                const std::vector<std::string>& keywords = {});

    ~PocketSphinxEngine() override;

    // Disable copy
    PocketSphinxEngine(const PocketSphinxEngine&) = delete;
    PocketSphinxEngine& operator=(const PocketSphinxEngine&) = delete;

    std::string transcribe(const std::vector<float>& audioData) override;
    std::string transcribeFile(const std::string& filePath) override;
    void setLanguage(const std::string& language) override;
    bool isAvailable() const override;
    std::string getEngineName() const override;

    /**
     * @brief Set keywords for detection (passive mode)
     * @param keywords List of keywords to detect
     * @param threshold Detection threshold (0.0-1.0, default 0.8)
     */
    void setKeywords(const std::vector<std::string>& keywords, float threshold = 0.8f);

private:
    bool initialize();
    void cleanup();
    std::string processAudioData(const int16_t* audioData, size_t numSamples);

    std::shared_ptr<spdlog::logger> m_logger;
    std::string m_modelPath;
    std::string m_language = "en";
    std::vector<std::string> m_keywords;
    float m_keywordThreshold = 0.8f;
    bool m_available = false;
    bool m_keywordMode = false;

    // PocketSphinx decoder (opaque pointer to avoid header dependency)
    ps_decoder_t* m_decoder = nullptr;
    cmd_ln_t* m_config = nullptr;
};

} // namespace aissia
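A minimal sketch of using the engine for wake-word spotting on incoming PCM chunks; the model path and the capture loop around this function are illustrative assumptions.

#include "PocketSphinxEngine.hpp"

void listenForWakeWord(const std::vector<float>& pcmChunk) {
    static aissia::PocketSphinxEngine engine(
        "./models/en-us",      // acoustic model directory (example path)
        {"hey celuna"});       // wake word(s)

    if (!engine.isAvailable()) {
        return;  // library or model missing; caller should fall back
    }

    std::string hit = engine.transcribe(pcmChunk);
    if (!hit.empty()) {
        // A keyword was spotted in this audio chunk.
    }
}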
@ -1,6 +1,10 @@
 #include "ISTTEngine.hpp"
 #include "WhisperAPIEngine.hpp"
+#include "VoskSTTEngine.hpp"
+#include "PocketSphinxEngine.hpp"
+#include "WhisperCppEngine.hpp"
 #include <spdlog/spdlog.h>
+#include <filesystem>
 
 namespace aissia {
 
@ -20,6 +24,7 @@ public:
     std::string getEngineName() const override { return "stub"; }
 };
 
+// Legacy factory method (backward compatibility)
 std::unique_ptr<ISTTEngine> STTEngineFactory::create(const std::string& apiKey) {
     auto logger = spdlog::get("STTFactory");
     if (!logger) {
@ -38,4 +43,78 @@ std::unique_ptr<ISTTEngine> STTEngineFactory::create(const std::string& apiKey)
     return std::make_unique<StubSTTEngine>();
 }
+
+// New factory method with engine type selection
+std::unique_ptr<ISTTEngine> STTEngineFactory::create(
+    const std::string& type,
+    const std::string& modelPath,
+    const std::string& apiKey) {
+
+    auto logger = spdlog::get("STTFactory");
+    if (!logger) {
+        logger = spdlog::stdout_color_mt("STTFactory");
+    }
+
+    logger->info("Creating STT engine: type={}, model={}", type, modelPath);
+
+    // 1. Try PocketSphinx (lightweight keyword spotting)
+    if (type == "pocketsphinx") {
+        if (!modelPath.empty() && std::filesystem::exists(modelPath)) {
+            auto engine = std::make_unique<PocketSphinxEngine>(modelPath);
+            if (engine->isAvailable()) {
+                logger->info("Using PocketSphinx STT engine (model: {})", modelPath);
+                return engine;
+            } else {
+                logger->warn("PocketSphinx engine not available (check if libpocketsphinx is installed)");
+            }
+        } else {
+            logger->debug("PocketSphinx model not found at: {}", modelPath);
+        }
+    }
+
+    // 2. Try Vosk (good local STT for full transcription)
+    if (type == "vosk" || type == "auto") {
+        if (!modelPath.empty() && std::filesystem::exists(modelPath)) {
+            auto engine = std::make_unique<VoskSTTEngine>(modelPath);
+            if (engine->isAvailable()) {
+                logger->info("Using Vosk STT engine (model: {})", modelPath);
+                return engine;
+            } else {
+                logger->warn("Vosk engine not available (check if libvosk is installed)");
+            }
+        } else {
+            logger->debug("Vosk model not found at: {}", modelPath);
+        }
+    }
+
+    // 3. Try Whisper.cpp (high-quality local STT)
+    if (type == "whisper-cpp" || type == "auto") {
+        if (!modelPath.empty() && std::filesystem::exists(modelPath)) {
+            auto engine = std::make_unique<WhisperCppEngine>(modelPath);
+            if (engine->isAvailable()) {
+                logger->info("Using Whisper.cpp STT engine (model: {})", modelPath);
+                return engine;
+            } else {
+                logger->warn("Whisper.cpp engine not available (check if whisper.cpp is compiled)");
+            }
+        } else {
+            logger->debug("Whisper.cpp model not found at: {}", modelPath);
+        }
+    }
+
+    // 4. Fallback to Whisper API if apiKey provided
+    if (type == "whisper-api" || type == "auto") {
+        if (!apiKey.empty()) {
+            auto engine = std::make_unique<WhisperAPIEngine>(apiKey);
+            if (engine->isAvailable()) {
+                logger->info("Using Whisper API STT engine (fallback)");
+                return engine;
+            }
+        }
+    }
+
+    // No engine available
+    logger->warn("No STT engine available, using stub");
+    return std::make_unique<StubSTTEngine>();
+}
 
 } // namespace aissia
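Since the factory never returns null, callers can detect that every candidate fell through by checking for the stub's engine name; a sketch with example arguments:

auto engine = aissia::STTEngineFactory::create("auto",
                                               "./models/vosk-model-small-fr-0.22",
                                               /*apiKey=*/"");
if (engine->getEngineName() == "stub") {
    // Neither a local model nor an API key was usable; STT is effectively disabled.
}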
252 src/shared/audio/VoskSTTEngine.cpp Normal file
@ -0,0 +1,252 @@
#include "VoskSTTEngine.hpp"

// Check if Vosk is available at compile time
#ifdef HAS_VOSK
#include <vosk_api.h>
#endif

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>
#include <cstring>
#include <algorithm>

namespace aissia {

VoskSTTEngine::VoskSTTEngine(const std::string& modelPath, float sampleRate)
    : m_sampleRate(sampleRate)
{
    m_logger = spdlog::get("VoskSTT");
    if (!m_logger) {
        m_logger = spdlog::stdout_color_mt("VoskSTT");
    }

#ifdef HAS_VOSK
    m_logger->info("Initializing Vosk STT engine...");
    m_logger->info("Model path: {}", modelPath);
    m_logger->info("Sample rate: {} Hz", sampleRate);

    // Load Vosk model
    m_model = vosk_model_new(modelPath.c_str());
    if (!m_model) {
        m_logger->error("Failed to load Vosk model from: {}", modelPath);
        m_logger->error("Make sure the model is downloaded and path is correct");
        m_logger->error("Download: https://alphacephei.com/vosk/models");
        m_available = false;
        return;
    }

    // Create recognizer
    m_recognizer = vosk_recognizer_new(m_model, sampleRate);
    if (!m_recognizer) {
        m_logger->error("Failed to create Vosk recognizer");
        vosk_model_free(m_model);
        m_model = nullptr;
        m_available = false;
        return;
    }

    m_available = true;
    m_logger->info("Vosk STT engine initialized successfully");
#else
    m_logger->warn("Vosk support not compiled (HAS_VOSK not defined)");
    m_logger->warn("To enable Vosk: install libvosk-dev and rebuild with -DHAS_VOSK");
    m_available = false;
#endif
}

VoskSTTEngine::~VoskSTTEngine() {
#ifdef HAS_VOSK
    if (m_recognizer) {
        vosk_recognizer_free(m_recognizer);
        m_recognizer = nullptr;
    }
    if (m_model) {
        vosk_model_free(m_model);
        m_model = nullptr;
    }
    m_logger->debug("Vosk STT engine destroyed");
#endif
}

std::string VoskSTTEngine::transcribe(const std::vector<float>& audioData) {
#ifdef HAS_VOSK
    if (!m_available || audioData.empty()) {
        return "";
    }

    // Convert float [-1.0, 1.0] to int16 [-32768, 32767]
    std::vector<int16_t> samples(audioData.size());
    for (size_t i = 0; i < audioData.size(); ++i) {
        float clamped = std::max(-1.0f, std::min(1.0f, audioData[i]));
        samples[i] = static_cast<int16_t>(clamped * 32767.0f);
    }

    // Feed audio to recognizer
    int accepted = vosk_recognizer_accept_waveform(
        m_recognizer,
        reinterpret_cast<const char*>(samples.data()),
        samples.size() * sizeof(int16_t)
    );

    // Get result
    const char* result = nullptr;
    if (accepted) {
        result = vosk_recognizer_result(m_recognizer);
    } else {
        result = vosk_recognizer_partial_result(m_recognizer);
    }

    if (!result) {
        return "";
    }

    // Parse JSON result
    std::string text = parseVoskResult(result);

    if (!text.empty()) {
        m_logger->debug("Transcribed {} samples: '{}'", audioData.size(), text);
    }

    return text;
#else
    m_logger->warn("Vosk not available, cannot transcribe");
    return "";
#endif
}

std::string VoskSTTEngine::transcribeFile(const std::string& filePath) {
#ifdef HAS_VOSK
    if (!m_available) {
        m_logger->warn("Vosk engine not available");
        return "";
    }

    m_logger->info("Transcribing file: {}", filePath);

    // Read WAV file
    std::vector<float> audioData;
    if (!readWavFile(filePath, audioData)) {
        m_logger->error("Failed to read audio file: {}", filePath);
        return "";
    }

    m_logger->debug("Read {} samples from file", audioData.size());

    // Reset recognizer for new utterance
    vosk_recognizer_reset(m_recognizer);

    // Transcribe
    std::string result = transcribe(audioData);

    // Get final result
    const char* finalResult = vosk_recognizer_final_result(m_recognizer);
    if (finalResult) {
        std::string finalText = parseVoskResult(finalResult);
        if (!finalText.empty()) {
            result = finalText;
        }
    }

    m_logger->info("Transcription result: '{}'", result);
    return result;
#else
    m_logger->warn("Vosk not available, cannot transcribe file");
    return "";
#endif
}

void VoskSTTEngine::setLanguage(const std::string& language) {
    // Vosk models are language-specific and loaded at construction time
    // Cannot change language at runtime
    m_logger->debug("Language setting ignored (Vosk model is language-specific)");
    m_logger->debug("To change language, load a different model");
}

std::string VoskSTTEngine::parseVoskResult(const std::string& jsonStr) {
    try {
        auto json = nlohmann::json::parse(jsonStr);

        // Vosk returns: {"text": "transcription"} or {"partial": "partial text"}
        if (json.contains("text")) {
            return json["text"].get<std::string>();
        } else if (json.contains("partial")) {
            return json["partial"].get<std::string>();
        }

        return "";
    } catch (const std::exception& e) {
        m_logger->error("Failed to parse Vosk result: {}", e.what());
        m_logger->debug("JSON string: {}", jsonStr);
        return "";
    }
}

bool VoskSTTEngine::readWavFile(const std::string& filePath, std::vector<float>& outSamples) {
    std::ifstream file(filePath, std::ios::binary);
    if (!file) {
        m_logger->error("Cannot open file: {}", filePath);
        return false;
    }

    // Read WAV header (44 bytes for standard PCM WAV)
    char header[44];
    file.read(header, 44);
    if (!file) {
        m_logger->error("Invalid WAV file (header too short)");
        return false;
    }

    // Verify RIFF header
    if (std::strncmp(header, "RIFF", 4) != 0) {
        m_logger->error("Not a valid WAV file (missing RIFF)");
        return false;
    }

    // Verify WAVE format
    if (std::strncmp(header + 8, "WAVE", 4) != 0) {
        m_logger->error("Not a valid WAV file (missing WAVE)");
        return false;
    }

    // Extract audio format info
    uint16_t audioFormat = *reinterpret_cast<uint16_t*>(header + 20);
    uint16_t numChannels = *reinterpret_cast<uint16_t*>(header + 22);
    uint32_t sampleRate = *reinterpret_cast<uint32_t*>(header + 24);
    uint16_t bitsPerSample = *reinterpret_cast<uint16_t*>(header + 34);
    uint32_t dataSize = *reinterpret_cast<uint32_t*>(header + 40);

    m_logger->debug("WAV format: {} channels, {} Hz, {} bits",
                    numChannels, sampleRate, bitsPerSample);

    // We expect PCM 16-bit mono
    if (audioFormat != 1) {
        m_logger->error("Unsupported audio format (expected PCM)");
        return false;
    }

    if (bitsPerSample != 16) {
        m_logger->warn("Expected 16-bit audio, got {} bits", bitsPerSample);
    }

    // Read audio data
    std::vector<int16_t> samples(dataSize / sizeof(int16_t));
    file.read(reinterpret_cast<char*>(samples.data()), dataSize);
    if (!file) {
        m_logger->error("Failed to read audio data");
        return false;
    }

    // Convert to float and handle multi-channel (take first channel)
    size_t numSamples = samples.size() / numChannels;
    outSamples.resize(numSamples);

    for (size_t i = 0; i < numSamples; ++i) {
        // Take first channel, convert int16 to float [-1.0, 1.0]
        outSamples[i] = samples[i * numChannels] / 32768.0f;
    }

    return true;
}

} // namespace aissia
77 src/shared/audio/VoskSTTEngine.hpp Normal file
@ -0,0 +1,77 @@
#pragma once

#include "ISTTEngine.hpp"
#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <memory>
#include <string>
#include <vector>
#include <fstream>

// Forward declarations to avoid hard dependency on Vosk
// Actual Vosk headers will be included in the implementation
struct VoskModel;
struct VoskRecognizer;

namespace aissia {

/**
 * @brief Vosk Speech Recognition Engine
 *
 * Lightweight local STT using Vosk models.
 * Recommended model: vosk-model-small-fr-0.22 (~50MB)
 *
 * Installation:
 * - sudo apt install libvosk-dev
 * - Download model: https://alphacephei.com/vosk/models/vosk-model-small-fr-0.22.zip
 *
 * Features:
 * - Local processing (no API costs)
 * - Real-time transcription
 * - Supports French, English, and 20+ languages
 */
class VoskSTTEngine : public ISTTEngine {
public:
    /**
     * @brief Construct Vosk STT engine
     * @param modelPath Path to Vosk model directory (e.g., "./models/vosk-model-small-fr-0.22")
     * @param sampleRate Audio sample rate (default: 16000 Hz)
     */
    explicit VoskSTTEngine(const std::string& modelPath, float sampleRate = 16000.0f);

    ~VoskSTTEngine() override;

    // Disable copy
    VoskSTTEngine(const VoskSTTEngine&) = delete;
    VoskSTTEngine& operator=(const VoskSTTEngine&) = delete;

    std::string transcribe(const std::vector<float>& audioData) override;
    std::string transcribeFile(const std::string& filePath) override;
    void setLanguage(const std::string& language) override;
    bool isAvailable() const override { return m_available; }
    std::string getEngineName() const override { return "vosk"; }

private:
    VoskModel* m_model = nullptr;
    VoskRecognizer* m_recognizer = nullptr;
    bool m_available = false;
    float m_sampleRate = 16000.0f;
    std::shared_ptr<spdlog::logger> m_logger;

    /**
     * @brief Parse Vosk JSON result
     * @param json JSON string from Vosk (e.g., {"text": "bonjour"})
     * @return Extracted text
     */
    std::string parseVoskResult(const std::string& json);

    /**
     * @brief Read WAV file and extract PCM data
     * @param filePath Path to WAV file
     * @param outSamples Output PCM samples (float)
     * @return true if successful
     */
    bool readWavFile(const std::string& filePath, std::vector<float>& outSamples);
};

} // namespace aissia
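A sketch of one-shot file transcription with this engine; the model path is an example and must point to an unzipped Vosk model directory, and the input is expected to be a 16-bit PCM WAV file (see readWavFile in the implementation).

#include "VoskSTTEngine.hpp"

void transcribeWithVosk() {
    aissia::VoskSTTEngine vosk("./models/vosk-model-small-fr-0.22", 16000.0f);
    if (!vosk.isAvailable()) {
        // Built without HAS_VOSK or model missing; nothing to do.
        return;
    }
    std::string text = vosk.transcribeFile("recording.wav");
}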
170
src/shared/audio/WhisperCppEngine.cpp
Normal file
170
src/shared/audio/WhisperCppEngine.cpp
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
#include "WhisperCppEngine.hpp"
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
#include <spdlog/sinks/stdout_color_sinks.h>
|
||||||
|
#include <fstream>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
// Only include whisper.cpp headers if library is available
|
||||||
|
#ifdef HAVE_WHISPER_CPP
|
||||||
|
#include <whisper.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace aissia {
|
||||||
|
|
||||||
|
WhisperCppEngine::WhisperCppEngine(const std::string& modelPath)
|
||||||
|
: m_modelPath(modelPath)
|
||||||
|
{
|
||||||
|
m_logger = spdlog::get("WhisperCpp");
|
||||||
|
if (!m_logger) {
|
||||||
|
m_logger = spdlog::stdout_color_mt("WhisperCpp");
|
||||||
|
}
|
||||||
|
|
||||||
|
m_available = initialize();
|
||||||
|
|
||||||
|
if (m_available) {
|
||||||
|
m_logger->info("Whisper.cpp STT initialized: model={}", modelPath);
|
||||||
|
} else {
|
||||||
|
m_logger->warn("Whisper.cpp not available (library not compiled or model missing)");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
WhisperCppEngine::~WhisperCppEngine() {
|
||||||
|
cleanup();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool WhisperCppEngine::initialize() {
|
||||||
|
#ifdef HAVE_WHISPER_CPP
|
||||||
|
// Check if model file exists
|
||||||
|
std::ifstream modelCheck(m_modelPath, std::ios::binary);
|
||||||
|
if (!modelCheck.good()) {
|
||||||
|
m_logger->error("Whisper model not found at: {}", m_modelPath);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
    modelCheck.close();

    // Initialize whisper context
    m_ctx = whisper_init_from_file(m_modelPath.c_str());
    if (!m_ctx) {
        m_logger->error("Failed to initialize Whisper context from model: {}", m_modelPath);
        return false;
    }

    m_logger->info("Whisper.cpp model loaded successfully");
    return true;
#else
    m_logger->warn("Whisper.cpp support not compiled (HAVE_WHISPER_CPP not defined)");
    return false;
#endif
}

void WhisperCppEngine::cleanup() {
#ifdef HAVE_WHISPER_CPP
    if (m_ctx) {
        whisper_free(m_ctx);
        m_ctx = nullptr;
    }
#endif
}

void WhisperCppEngine::setParameters(int threads, bool translate) {
    m_threads = threads;
    m_translate = translate;
    m_logger->debug("Whisper.cpp parameters: threads={}, translate={}", threads, translate);
}

std::string WhisperCppEngine::processAudioData(const float* audioData, size_t numSamples) {
#ifdef HAVE_WHISPER_CPP
    if (!m_ctx) {
        return "";
    }

    // Setup whisper parameters
    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.n_threads = m_threads;
    params.translate = m_translate;
    params.print_progress = false;
    params.print_special = false;
    params.print_realtime = false;
    params.print_timestamps = false;

    // Set language if specified (not "auto").
    // Note: the string passed to params.language must stay alive until
    // whisper_full() returns, so it is declared outside the if-block.
    std::string lang2;
    if (m_language != "auto" && m_language.size() >= 2) {
        lang2 = m_language.substr(0, 2); // Take first 2 chars (ISO 639-1)
        params.language = lang2.c_str();
        m_logger->debug("Whisper.cpp using language: {}", lang2);
    }

    // Run full inference
    int result = whisper_full(m_ctx, params, audioData, static_cast<int>(numSamples));
    if (result != 0) {
        m_logger->error("Whisper.cpp inference failed with code: {}", result);
        return "";
    }

    // Get transcription
    std::string transcription;
    int n_segments = whisper_full_n_segments(m_ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char* text = whisper_full_get_segment_text(m_ctx, i);
        if (text) {
            if (!transcription.empty()) {
                transcription += " ";
            }
            transcription += text;
        }
    }

    // Trim leading/trailing whitespace
    size_t start = transcription.find_first_not_of(" \t\n\r");
    size_t end = transcription.find_last_not_of(" \t\n\r");
    if (start != std::string::npos && end != std::string::npos) {
        transcription = transcription.substr(start, end - start + 1);
    }

    m_logger->debug("Whisper.cpp transcribed: '{}' ({} segments)", transcription, n_segments);
    return transcription;
#else
    return "";
#endif
}

std::string WhisperCppEngine::transcribe(const std::vector<float>& audioData) {
    if (!m_available || audioData.empty()) {
        return "";
    }

    m_logger->debug("Whisper.cpp transcribing {} samples", audioData.size());
    return processAudioData(audioData.data(), audioData.size());
}

std::string WhisperCppEngine::transcribeFile(const std::string& filePath) {
    if (!m_available) {
        return "";
    }

    m_logger->info("Whisper.cpp transcribing file: {}", filePath);

    // For file transcription, we'd need to:
    // 1. Read the audio file (WAV format)
    // 2. Extract PCM float samples at 16kHz mono
    // 3. Call processAudioData
    //
    // whisper.cpp provides helper functions for this, but they require linking audio libraries
    m_logger->warn("Whisper.cpp file transcription not yet implemented (use transcribe() with PCM data)");
    return "";
}

void WhisperCppEngine::setLanguage(const std::string& language) {
    m_language = language;
    m_logger->info("Whisper.cpp language set to: {}", language);
}

bool WhisperCppEngine::isAvailable() const {
    return m_available;
}

std::string WhisperCppEngine::getEngineName() const {
    return "whisper-cpp";
}

} // namespace aissia
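The transcribeFile() stub above already lists the missing steps (read the WAV, extract 16 kHz mono float PCM, call processAudioData). A minimal sketch of that path follows; it assumes the same simplified WAV layout (plain 44-byte header, 16-bit mono PCM already at 16 kHz) that the loadWavFile() helper in test_stt_live.cpp assumes, and is an illustration rather than the committed implementation.

// Sketch only: assumes 16 kHz mono, 16-bit PCM WAV with a plain 44-byte header.
// Needs <cstdint>, <fstream>, <vector>. Real inputs would need proper WAV
// parsing and resampling.
static std::vector<float> loadPcm16MonoWav(const std::string& filePath) {
    std::ifstream file(filePath, std::ios::binary);
    std::vector<float> audioData;
    if (!file.is_open()) {
        return audioData;                  // caller treats empty as failure
    }
    file.seekg(44);                        // skip the canonical WAV header
    int16_t sample = 0;
    while (file.read(reinterpret_cast<char*>(&sample), sizeof(sample))) {
        audioData.push_back(static_cast<float>(sample) / 32768.0f);
    }
    return audioData;
}

// transcribeFile() could then decode and delegate to the existing PCM path:
//     return transcribe(loadPcm16MonoWav(filePath));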
79
src/shared/audio/WhisperCppEngine.hpp
Normal file
@@ -0,0 +1,79 @@
#pragma once

#include "ISTTEngine.hpp"
#include <spdlog/spdlog.h>
#include <memory>
#include <vector>
#include <string>

// whisper.cpp forward declarations (to avoid including full headers)
struct whisper_context;
struct whisper_full_params;

namespace aissia {

/**
 * @brief Whisper.cpp Speech-to-Text engine
 *
 * Local high-quality STT using OpenAI's Whisper model via whisper.cpp.
 * Runs entirely offline with excellent accuracy.
 *
 * Features:
 * - High accuracy (OpenAI Whisper quality)
 * - Completely offline (no internet required)
 * - Multiple model sizes (tiny, base, small, medium, large)
 * - Multilingual support
 *
 * Model sizes:
 * - tiny:   ~75MB, fastest, less accurate
 * - base:   ~142MB, balanced
 * - small:  ~466MB, good quality
 * - medium: ~1.5GB, very good
 * - large:  ~2.9GB, best quality
 *
 * Recommended: base or small for most use cases
 */
class WhisperCppEngine : public ISTTEngine {
public:
    /**
     * @brief Construct Whisper.cpp engine
     * @param modelPath Path to Whisper GGML model file (e.g., "models/ggml-base.bin")
     */
    explicit WhisperCppEngine(const std::string& modelPath);

    ~WhisperCppEngine() override;

    // Disable copy
    WhisperCppEngine(const WhisperCppEngine&) = delete;
    WhisperCppEngine& operator=(const WhisperCppEngine&) = delete;

    std::string transcribe(const std::vector<float>& audioData) override;
    std::string transcribeFile(const std::string& filePath) override;
    void setLanguage(const std::string& language) override;
    bool isAvailable() const override;
    std::string getEngineName() const override;

    /**
     * @brief Set transcription parameters
     * @param threads Number of threads to use (default: 4)
     * @param translate Translate to English (default: false)
     */
    void setParameters(int threads = 4, bool translate = false);

private:
    bool initialize();
    void cleanup();
    std::string processAudioData(const float* audioData, size_t numSamples);

    std::shared_ptr<spdlog::logger> m_logger;
    std::string m_modelPath;
    std::string m_language = "auto";
    bool m_available = false;
    int m_threads = 4;
    bool m_translate = false;

    // whisper.cpp context (opaque pointer to avoid header dependency)
    whisper_context* m_ctx = nullptr;
};

} // namespace aissia
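As a quick reference for callers of this interface, the following usage sketch is based only on the declarations above; the model path and the source of 16 kHz mono float samples are illustrative placeholders, not project defaults.

#include "WhisperCppEngine.hpp"
#include <string>
#include <vector>

// Usage sketch: construct with a GGML model, pick a language, feed float PCM.
int runWhisperCppExample(const std::vector<float>& pcm16kMono) {
    aissia::WhisperCppEngine engine("models/ggml-base.bin");
    if (!engine.isAvailable()) {
        return 1;                     // model missing or built without HAVE_WHISPER_CPP
    }
    engine.setLanguage("fr");         // ISO 639-1 code, or "auto"
    engine.setParameters(4, false);   // 4 threads, no translation to English
    std::string text = engine.transcribe(pcm16kMono);
    return text.empty() ? 1 : 0;
}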
@@ -7,7 +7,8 @@
  * Requires cpp-httplib and OpenSSL for HTTPS support.
  */

-#define CPPHTTPLIB_OPENSSL_SUPPORT
+// Only enable OpenSSL support if it was found during CMake configuration
+// CPPHTTPLIB_OPENSSL_SUPPORT is defined via target_compile_definitions when OpenSSL is available
 #include <httplib.h>
 #include <nlohmann/json.hpp>
 #include <string>
@@ -72,9 +73,12 @@ public:
         std::string url = (m_useSSL ? "https://" : "http://") + m_host;
         httplib::Client client(url);

+        // Only enable certificate verification if OpenSSL support is compiled in
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
         if (m_useSSL) {
             client.enable_server_certificate_verification(true);
         }
+#endif

         client.set_connection_timeout(m_timeoutSeconds);
         client.set_read_timeout(m_timeoutSeconds);
@@ -122,9 +126,12 @@ public:
         std::string url = (m_useSSL ? "https://" : "http://") + m_host;
         httplib::Client client(url);

+        // Only enable certificate verification if OpenSSL support is compiled in
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
         if (m_useSSL) {
             client.enable_server_certificate_verification(true);
         }
+#endif

         client.set_connection_timeout(m_timeoutSeconds);
         client.set_read_timeout(m_timeoutSeconds);
@@ -158,9 +165,12 @@ public:
         std::string url = (m_useSSL ? "https://" : "http://") + m_host;
         httplib::Client client(url);

+        // Only enable certificate verification if OpenSSL support is compiled in
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
         if (m_useSSL) {
             client.enable_server_certificate_verification(true);
         }
+#endif

         client.set_connection_timeout(m_timeoutSeconds);
         client.set_read_timeout(m_timeoutSeconds);
src/shared/tools/MCPServerTools.cpp
Normal file
310
src/shared/tools/MCPServerTools.cpp
Normal file
@ -0,0 +1,310 @@
|
|||||||
|
#include "MCPServerTools.hpp"
|
||||||
|
#include "../../services/LLMService.hpp"
|
||||||
|
#include "../../services/StorageService.hpp"
|
||||||
|
#include "../../services/VoiceService.hpp"
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
namespace aissia::tools {
|
||||||
|
|
||||||
|
MCPServerTools::MCPServerTools(
|
||||||
|
LLMService* llm,
|
||||||
|
StorageService* storage,
|
||||||
|
VoiceService* voice
|
||||||
|
) : m_llmService(llm),
|
||||||
|
m_storageService(storage),
|
||||||
|
m_voiceService(voice)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<ToolDefinition> MCPServerTools::getToolDefinitions() {
|
||||||
|
std::vector<ToolDefinition> tools;
|
||||||
|
|
||||||
|
// Tool 1: chat_with_aissia (PRIORITÉ)
|
||||||
|
if (m_llmService) {
|
||||||
|
tools.push_back({
|
||||||
|
"chat_with_aissia",
|
||||||
|
"Dialogue with AISSIA assistant (Claude Sonnet 4). Send a message and get an intelligent response with access to AISSIA's knowledge and capabilities.",
|
||||||
|
{
|
||||||
|
{"type", "object"},
|
||||||
|
{"properties", {
|
||||||
|
{"message", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Message to send to AISSIA"}
|
||||||
|
}},
|
||||||
|
{"conversation_id", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Conversation ID for continuity (optional)"}
|
||||||
|
}},
|
||||||
|
{"system_prompt", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Custom system prompt (optional)"}
|
||||||
|
}}
|
||||||
|
}},
|
||||||
|
{"required", json::array({"message"})}
|
||||||
|
},
|
||||||
|
[this](const json& input) { return handleChatWithAissia(input); }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tool 2: transcribe_audio
|
||||||
|
if (m_voiceService) {
|
||||||
|
tools.push_back({
|
||||||
|
"transcribe_audio",
|
||||||
|
"Transcribe audio file to text using Speech-to-Text (Whisper.cpp, OpenAI Whisper API, or Google Speech). Supports WAV, MP3, and other common audio formats.",
|
||||||
|
{
|
||||||
|
{"type", "object"},
|
||||||
|
{"properties", {
|
||||||
|
{"file_path", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Path to audio file"}
|
||||||
|
}},
|
||||||
|
{"language", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Language code (e.g., 'fr', 'en'). Default: 'fr'"}
|
||||||
|
}}
|
||||||
|
}},
|
||||||
|
{"required", json::array({"file_path"})}
|
||||||
|
},
|
||||||
|
[this](const json& input) { return handleTranscribeAudio(input); }
|
||||||
|
});
|
||||||
|
|
||||||
|
// Tool 3: text_to_speech
|
||||||
|
tools.push_back({
|
||||||
|
"text_to_speech",
|
||||||
|
"Convert text to speech audio file using Text-to-Speech synthesis. Generates audio in WAV format.",
|
||||||
|
{
|
||||||
|
{"type", "object"},
|
||||||
|
{"properties", {
|
||||||
|
{"text", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Text to synthesize"}
|
||||||
|
}},
|
||||||
|
{"output_file", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Output audio file path (WAV)"}
|
||||||
|
}},
|
||||||
|
{"voice", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Voice identifier (e.g., 'fr-fr', 'en-us'). Default: 'fr-fr'"}
|
||||||
|
}}
|
||||||
|
}},
|
||||||
|
{"required", json::array({"text", "output_file"})}
|
||||||
|
},
|
||||||
|
[this](const json& input) { return handleTextToSpeech(input); }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tool 4: save_memory
|
||||||
|
if (m_storageService) {
|
||||||
|
tools.push_back({
|
||||||
|
"save_memory",
|
||||||
|
"Save a note or memory to AISSIA's persistent storage. Memories can be tagged and searched later.",
|
||||||
|
{
|
||||||
|
{"type", "object"},
|
||||||
|
{"properties", {
|
||||||
|
{"title", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Memory title"}
|
||||||
|
}},
|
||||||
|
{"content", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Memory content"}
|
||||||
|
}},
|
||||||
|
{"tags", {
|
||||||
|
{"type", "array"},
|
||||||
|
{"items", {{"type", "string"}}},
|
||||||
|
{"description", "Tags for categorization (optional)"}
|
||||||
|
}}
|
||||||
|
}},
|
||||||
|
{"required", json::array({"title", "content"})}
|
||||||
|
},
|
||||||
|
[this](const json& input) { return handleSaveMemory(input); }
|
||||||
|
});
|
||||||
|
|
||||||
|
// Tool 5: search_memories
|
||||||
|
tools.push_back({
|
||||||
|
"search_memories",
|
||||||
|
"Search through saved memories and notes in AISSIA's storage. Returns matching memories with relevance scores.",
|
||||||
|
{
|
||||||
|
{"type", "object"},
|
||||||
|
{"properties", {
|
||||||
|
{"query", {
|
||||||
|
{"type", "string"},
|
||||||
|
{"description", "Search query"}
|
||||||
|
}},
|
||||||
|
{"limit", {
|
||||||
|
{"type", "integer"},
|
||||||
|
{"description", "Maximum results to return. Default: 10"}
|
||||||
|
}}
|
||||||
|
}},
|
||||||
|
{"required", json::array({"query"})}
|
||||||
|
},
|
||||||
|
[this](const json& input) { return handleSearchMemories(input); }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return tools;
|
||||||
|
}
|
||||||
|
|
||||||
|
json MCPServerTools::execute(const std::string& toolName, const json& input) {
|
||||||
|
if (toolName == "chat_with_aissia") {
|
||||||
|
return handleChatWithAissia(input);
|
||||||
|
} else if (toolName == "transcribe_audio") {
|
||||||
|
return handleTranscribeAudio(input);
|
||||||
|
} else if (toolName == "text_to_speech") {
|
||||||
|
return handleTextToSpeech(input);
|
||||||
|
} else if (toolName == "save_memory") {
|
||||||
|
return handleSaveMemory(input);
|
||||||
|
} else if (toolName == "search_memories") {
|
||||||
|
return handleSearchMemories(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
{"error", "Unknown tool: " + toolName}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Tool Handlers
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
json MCPServerTools::handleChatWithAissia(const json& input) {
|
||||||
|
if (!m_llmService) {
|
||||||
|
return {{"error", "LLMService not available"}};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
std::string message = input["message"];
|
||||||
|
std::string conversationId = input.value("conversation_id", "");
|
||||||
|
std::string systemPrompt = input.value("system_prompt", "");
|
||||||
|
|
||||||
|
spdlog::info("[chat_with_aissia] Message: {}", message.substr(0, 100));
|
||||||
|
|
||||||
|
// Call synchronous LLM method
|
||||||
|
auto response = m_llmService->sendMessageSync(message, conversationId, systemPrompt);
|
||||||
|
|
||||||
|
return {
|
||||||
|
{"response", response.text},
|
||||||
|
{"conversation_id", conversationId},
|
||||||
|
{"tokens", response.tokens},
|
||||||
|
{"iterations", response.iterations}
|
||||||
|
};
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::error("[chat_with_aissia] Error: {}", e.what());
|
||||||
|
return {{"error", e.what()}};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json MCPServerTools::handleTranscribeAudio(const json& input) {
|
||||||
|
if (!m_voiceService) {
|
||||||
|
return {{"error", "VoiceService not available"}};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
std::string filePath = input["file_path"];
|
||||||
|
std::string language = input.value("language", "fr");
|
||||||
|
|
||||||
|
spdlog::info("[transcribe_audio] File: {}, Language: {}", filePath, language);
|
||||||
|
|
||||||
|
// Call synchronous STT method
|
||||||
|
std::string text = m_voiceService->transcribeFileSync(filePath, language);
|
||||||
|
|
||||||
|
return {
|
||||||
|
{"text", text},
|
||||||
|
{"file", filePath},
|
||||||
|
{"language", language}
|
||||||
|
};
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::error("[transcribe_audio] Error: {}", e.what());
|
||||||
|
return {{"error", e.what()}};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json MCPServerTools::handleTextToSpeech(const json& input) {
|
||||||
|
if (!m_voiceService) {
|
||||||
|
return {{"error", "VoiceService not available"}};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
std::string text = input["text"];
|
||||||
|
std::string outputFile = input["output_file"];
|
||||||
|
std::string voice = input.value("voice", "fr-fr");
|
||||||
|
|
||||||
|
spdlog::info("[text_to_speech] Text: {}, Output: {}", text.substr(0, 50), outputFile);
|
||||||
|
|
||||||
|
// Call synchronous TTS method
|
||||||
|
bool success = m_voiceService->textToSpeechSync(text, outputFile, voice);
|
||||||
|
|
||||||
|
if (success) {
|
||||||
|
return {
|
||||||
|
{"success", true},
|
||||||
|
{"file", outputFile},
|
||||||
|
{"voice", voice}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return {{"error", "TTS generation failed"}};
|
||||||
|
}
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::error("[text_to_speech] Error: {}", e.what());
|
||||||
|
return {{"error", e.what()}};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json MCPServerTools::handleSaveMemory(const json& input) {
|
||||||
|
if (!m_storageService) {
|
||||||
|
return {{"error", "StorageService not available"}};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
std::string title = input["title"];
|
||||||
|
std::string content = input["content"];
|
||||||
|
std::vector<std::string> tags;
|
||||||
|
|
||||||
|
if (input.contains("tags") && input["tags"].is_array()) {
|
||||||
|
for (const auto& tag : input["tags"]) {
|
||||||
|
tags.push_back(tag.get<std::string>());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("[save_memory] Title: {}", title);
|
||||||
|
|
||||||
|
// TODO: Implement saveMemorySync in StorageService
|
||||||
|
// For now, return not implemented
|
||||||
|
return json({
|
||||||
|
{"error", "save_memory not yet implemented"},
|
||||||
|
{"note", "StorageService sync methods need to be added"},
|
||||||
|
{"title", title}
|
||||||
|
});
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::error("[save_memory] Error: {}", e.what());
|
||||||
|
return {{"error", e.what()}};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json MCPServerTools::handleSearchMemories(const json& input) {
|
||||||
|
if (!m_storageService) {
|
||||||
|
return {{"error", "StorageService not available"}};
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
std::string query = input["query"];
|
||||||
|
int limit = input.value("limit", 10);
|
||||||
|
|
||||||
|
spdlog::info("[search_memories] Query: {}, Limit: {}", query, limit);
|
||||||
|
|
||||||
|
// TODO: Implement searchMemoriesSync in StorageService
|
||||||
|
// For now, return not implemented
|
||||||
|
return json({
|
||||||
|
{"error", "search_memories not yet implemented"},
|
||||||
|
{"note", "StorageService sync methods need to be added"},
|
||||||
|
{"query", query},
|
||||||
|
{"limit", limit}
|
||||||
|
});
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::error("[search_memories] Error: {}", e.what());
|
||||||
|
return {{"error", e.what()}};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace aissia::tools
|
||||||
76
src/shared/tools/MCPServerTools.hpp
Normal file
@@ -0,0 +1,76 @@
#pragma once

#include "../llm/ToolRegistry.hpp"
#include <nlohmann/json.hpp>
#include <memory>
#include <vector>

// Forward declarations
namespace aissia {
class LLMService;
class StorageService;
class VoiceService;
}

namespace aissia::tools {

using json = nlohmann::json;

/**
 * @brief MCP Server Tools - Bridge between MCP Server and AISSIA services
 *
 * Provides tool definitions for AISSIA modules exposed via MCP Server:
 * - chat_with_aissia: Dialogue with AISSIA (Claude Sonnet 4)
 * - transcribe_audio: Speech-to-text (Whisper.cpp/OpenAI/Google)
 * - text_to_speech: Text-to-speech synthesis
 * - save_memory: Save note/memory to storage
 * - search_memories: Search stored memories
 *
 * Note: These tools run in synchronous mode (no IIO pub/sub, no main loop)
 */
class MCPServerTools {
public:
    /**
     * @brief Construct MCP server tools with service dependencies
     *
     * @param llm LLMService for chat_with_aissia (can be nullptr)
     * @param storage StorageService for save/search memories (can be nullptr)
     * @param voice VoiceService for TTS/STT (can be nullptr)
     */
    MCPServerTools(
        LLMService* llm,
        StorageService* storage,
        VoiceService* voice
    );

    /**
     * @brief Get all tool definitions for registration
     *
     * @return Vector of ToolDefinition structs
     */
    std::vector<ToolDefinition> getToolDefinitions();

    /**
     * @brief Execute a tool by name
     *
     * @param toolName Tool to execute
     * @param input Tool arguments (JSON)
     * @return Tool result (JSON)
     */
    json execute(const std::string& toolName, const json& input);

private:
    // Tool handlers
    json handleChatWithAissia(const json& input);
    json handleTranscribeAudio(const json& input);
    json handleTextToSpeech(const json& input);
    json handleSaveMemory(const json& input);
    json handleSearchMemories(const json& input);

    // Service references (nullable)
    LLMService* m_llmService;
    StorageService* m_storageService;
    VoiceService* m_voiceService;
};

} // namespace aissia::tools
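For orientation, here is a usage sketch of this bridge. Only the constructor and the execute() signature come from the header above; the service pointers and the input payload are illustrative assumptions about code that lives elsewhere in the project.

#include "MCPServerTools.hpp"

// Illustrative only: assumes the three services were created and wired elsewhere.
nlohmann::json callChatTool(aissia::LLMService* llm,
                            aissia::StorageService* storage,
                            aissia::VoiceService* voice) {
    aissia::tools::MCPServerTools tools(llm, storage, voice);

    nlohmann::json input = {
        {"message", "Bonjour AISSIA"},
        {"conversation_id", "demo-1"}   // optional, per the tool schema
    };

    // Returns the tool result, or {"error": ...} if the tool is unknown or fails.
    return tools.execute("chat_with_aissia", input);
}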
16
test-results.json
Normal file
@@ -0,0 +1,16 @@
{
    "environment": {
        "platform": "linux",
        "testDirectory": "tests/integration"
    },
    "summary": {
        "failed": 0,
        "passed": 0,
        "skipped": 0,
        "successRate": 0.0,
        "total": 0,
        "totalDurationMs": 0
    },
    "tests": [],
    "timestamp": "2025-11-29T09:01:38Z"
}
@@ -1,30 +1,5 @@
 #!/bin/bash
-# Test script for AISSIA interactive mode
-
-cd "/mnt/e/Users/Alexis Trouvé/Documents/Projets/Aissia"
-
-# Load env
 set -a
 source .env
 set +a
+echo "Quelle heure est-il ?" | timeout 30 ./build/aissia --interactive
-
-echo "🧪 Testing AISSIA Interactive Mode"
-echo "===================================="
-echo ""
-echo "Sending test queries to AISSIA..."
-echo ""
-
-# Test 1: Simple conversation
-echo "Test 1: Simple greeting"
-echo "Bonjour AISSIA, comment vas-tu ?" | timeout 30 ./build/aissia -i 2>&1 | grep -A 10 "AISSIA:"
-
-echo ""
-echo "Test 2: Task query"
-echo "Quelle est ma tâche actuelle ?" | timeout 30 ./build/aissia -i 2>&1 | grep -A 10 "AISSIA:"
-
-echo ""
-echo "Test 3: Time query"
-echo "Quelle heure est-il ?" | timeout 30 ./build/aissia -i 2>&1 | grep -A 10 "AISSIA:"
-
-echo ""
-echo "✅ Tests completed"
237
test_stt_live.cpp
Normal file
@@ -0,0 +1,237 @@
/**
 * @file test_stt_live.cpp
 * @brief Live STT testing tool - Test all 5 engines
 */

#include "src/shared/audio/ISTTEngine.hpp"
#include <spdlog/spdlog.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdlib>

using namespace aissia;

// Helper: Load .env file
void loadEnv(const std::string& path = ".env") {
    std::ifstream file(path);
    if (!file.is_open()) {
        spdlog::warn("No .env file found at: {}", path);
        return;
    }

    std::string line;
    while (std::getline(file, line)) {
        if (line.empty() || line[0] == '#') continue;

        auto pos = line.find('=');
        if (pos != std::string::npos) {
            std::string key = line.substr(0, pos);
            std::string value = line.substr(pos + 1);

            // Remove quotes
            if (!value.empty() && value.front() == '"' && value.back() == '"') {
                value = value.substr(1, value.length() - 2);
            }

#ifdef _WIN32
            _putenv_s(key.c_str(), value.c_str());
#else
            setenv(key.c_str(), value.c_str(), 1);
#endif
        }
    }
    spdlog::info("Loaded environment from {}", path);
}

// Helper: Get API key from env
std::string getEnvVar(const std::string& name) {
    const char* val = std::getenv(name.c_str());
    return val ? std::string(val) : "";
}

// Helper: Load audio file as WAV (simplified - assumes 16-bit PCM)
std::vector<float> loadWavFile(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (!file.is_open()) {
        spdlog::error("Failed to open audio file: {}", path);
        return {};
    }

    // Skip WAV header (44 bytes)
    file.seekg(44);

    // Read 16-bit PCM samples
    std::vector<int16_t> samples;
    int16_t sample;
    while (file.read(reinterpret_cast<char*>(&sample), sizeof(sample))) {
        samples.push_back(sample);
    }

    // Convert to float [-1.0, 1.0]
    std::vector<float> audioData;
    audioData.reserve(samples.size());
    for (int16_t s : samples) {
        audioData.push_back(static_cast<float>(s) / 32768.0f);
    }

    spdlog::info("Loaded {} samples from {}", audioData.size(), path);
    return audioData;
}

int main(int argc, char* argv[]) {
    spdlog::set_level(spdlog::level::info);
    spdlog::info("=== AISSIA STT Live Test ===");

    // Load environment variables
    loadEnv();

    // Check command line
    if (argc < 2) {
        std::cout << "Usage: " << argv[0] << " <audio.wav>\n";
        std::cout << "\nAvailable engines:\n";
        std::cout << "  1. Whisper.cpp (local, requires models/ggml-base.bin)\n";
        std::cout << "  2. Whisper API (requires OPENAI_API_KEY)\n";
        std::cout << "  3. Google Speech (requires GOOGLE_API_KEY)\n";
        std::cout << "  4. Azure STT (requires AZURE_SPEECH_KEY + AZURE_SPEECH_REGION)\n";
        std::cout << "  5. Deepgram (requires DEEPGRAM_API_KEY)\n";
        return 1;
    }

    std::string audioFile = argv[1];

    // Load audio
    std::vector<float> audioData = loadWavFile(audioFile);
    if (audioData.empty()) {
        spdlog::error("Failed to load audio data");
        return 1;
    }

    // Test each engine
    std::cout << "\n========================================\n";
    std::cout << "Testing STT Engines\n";
    std::cout << "========================================\n\n";

    // 1. Whisper.cpp (local)
    {
        std::cout << "[1/5] Whisper.cpp (local)\n";
        std::cout << "----------------------------\n";

        try {
            auto engine = STTEngineFactory::create("whisper_cpp", "models/ggml-base.bin");
            if (engine && engine->isAvailable()) {
                engine->setLanguage("fr");
                std::string result = engine->transcribe(audioData);
                std::cout << "✅ Result: " << result << "\n\n";
            } else {
                std::cout << "❌ Not available (model missing?)\n\n";
            }
        } catch (const std::exception& e) {
            std::cout << "❌ Error: " << e.what() << "\n\n";
        }
    }

    // 2. Whisper API
    {
        std::cout << "[2/5] OpenAI Whisper API\n";
        std::cout << "----------------------------\n";

        std::string apiKey = getEnvVar("OPENAI_API_KEY");
        if (apiKey.empty()) {
            std::cout << "❌ OPENAI_API_KEY not set\n\n";
        } else {
            try {
                auto engine = STTEngineFactory::create("whisper_api", "", apiKey);
                if (engine && engine->isAvailable()) {
                    engine->setLanguage("fr");
                    std::string result = engine->transcribeFile(audioFile);
                    std::cout << "✅ Result: " << result << "\n\n";
                } else {
                    std::cout << "❌ Not available\n\n";
                }
            } catch (const std::exception& e) {
                std::cout << "❌ Error: " << e.what() << "\n\n";
            }
        }
    }

    // 3. Google Speech
    {
        std::cout << "[3/5] Google Speech-to-Text\n";
        std::cout << "----------------------------\n";

        std::string apiKey = getEnvVar("GOOGLE_API_KEY");
        if (apiKey.empty()) {
            std::cout << "❌ GOOGLE_API_KEY not set\n\n";
        } else {
            try {
                auto engine = STTEngineFactory::create("google", "", apiKey);
                if (engine && engine->isAvailable()) {
                    engine->setLanguage("fr");
                    std::string result = engine->transcribeFile(audioFile);
                    std::cout << "✅ Result: " << result << "\n\n";
                } else {
                    std::cout << "❌ Not available\n\n";
                }
            } catch (const std::exception& e) {
                std::cout << "❌ Error: " << e.what() << "\n\n";
            }
        }
    }

    // 4. Azure Speech
    {
        std::cout << "[4/5] Azure Speech-to-Text\n";
        std::cout << "----------------------------\n";

        std::string apiKey = getEnvVar("AZURE_SPEECH_KEY");
        std::string region = getEnvVar("AZURE_SPEECH_REGION");

        if (apiKey.empty() || region.empty()) {
            std::cout << "❌ AZURE_SPEECH_KEY or AZURE_SPEECH_REGION not set\n\n";
        } else {
            try {
                auto engine = STTEngineFactory::create("azure", region, apiKey);
                if (engine && engine->isAvailable()) {
                    engine->setLanguage("fr");
                    std::string result = engine->transcribeFile(audioFile);
                    std::cout << "✅ Result: " << result << "\n\n";
                } else {
                    std::cout << "❌ Not available\n\n";
                }
            } catch (const std::exception& e) {
                std::cout << "❌ Error: " << e.what() << "\n\n";
            }
        }
    }

    // 5. Deepgram
    {
        std::cout << "[5/5] Deepgram\n";
        std::cout << "----------------------------\n";

        std::string apiKey = getEnvVar("DEEPGRAM_API_KEY");
        if (apiKey.empty()) {
            std::cout << "❌ DEEPGRAM_API_KEY not set\n\n";
        } else {
            try {
                auto engine = STTEngineFactory::create("deepgram", "", apiKey);
                if (engine && engine->isAvailable()) {
                    engine->setLanguage("fr");
                    std::string result = engine->transcribeFile(audioFile);
                    std::cout << "✅ Result: " << result << "\n\n";
                } else {
                    std::cout << "❌ Not available\n\n";
                }
            } catch (const std::exception& e) {
                std::cout << "❌ Error: " << e.what() << "\n\n";
            }
        }
    }

    std::cout << "========================================\n";
    std::cout << "Testing complete!\n";
    std::cout << "========================================\n";

    return 0;
}
@@ -55,6 +55,10 @@ target_link_libraries(aissia_tests PRIVATE
 target_include_directories(aissia_tests PRIVATE
     ${httplib_SOURCE_DIR}
 )
+# Link Winsock for httplib on Windows
+if(WIN32)
+    target_link_libraries(aissia_tests PRIVATE ws2_32)
+endif()
 if(OPENSSL_FOUND)
     target_link_libraries(aissia_tests PRIVATE OpenSSL::SSL OpenSSL::Crypto)
     target_compile_definitions(aissia_tests PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
8
tests/fixtures/mock_mcp.json
vendored
@@ -1,8 +1,8 @@
 {
     "servers": {
         "mock_server": {
-            "command": "python3",
+            "command": "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe",
-            "args": ["tests/fixtures/mock_mcp_server.py"],
+            "args": ["-u", "tests/fixtures/mock_mcp_server.py"],
             "enabled": true
         },
         "disabled_server": {
@@ -11,8 +11,8 @@
             "enabled": false
         },
         "echo_server": {
-            "command": "python3",
+            "command": "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe",
-            "args": ["tests/fixtures/echo_server.py"],
+            "args": ["-u", "tests/fixtures/echo_server.py"],
             "enabled": true
         }
     }
182
tests/manual/test_stt_engines.cpp
Normal file
@@ -0,0 +1,182 @@
/**
 * @file test_stt_engines.cpp
 * @brief Manual test program for all 4 STT engines
 *
 * Tests each STT engine with the same audio file and compares results.
 *
 * Usage: ./test_stt_engines <audio_file.mp3>
 */

#include "../../src/shared/audio/ISTTEngine.hpp"
#include "../../src/shared/audio/PocketSphinxEngine.hpp"
#include "../../src/shared/audio/VoskSTTEngine.hpp"
#include "../../src/shared/audio/WhisperCppEngine.hpp"
#include "../../src/shared/audio/WhisperAPIEngine.hpp"
#include <spdlog/spdlog.h>
#include <spdlog/sinks/stdout_color_sinks.h>
#include <iostream>
#include <chrono>
#include <cstdlib>

using namespace aissia;

struct TestResult {
    std::string engineName;
    bool available;
    std::string transcription;
    double durationMs;
    std::string error;
};

TestResult testEngine(ISTTEngine* engine, const std::string& audioFile) {
    TestResult result;
    result.engineName = engine->getEngineName();
    result.available = engine->isAvailable();

    if (!result.available) {
        result.error = "Engine not available";
        return result;
    }

    auto start = std::chrono::high_resolution_clock::now();

    try {
        result.transcription = engine->transcribeFile(audioFile);

        auto end = std::chrono::high_resolution_clock::now();
        result.durationMs = std::chrono::duration<double, std::milli>(end - start).count();

        if (result.transcription.empty()) {
            result.error = "Empty transcription (file format not supported or processing failed)";
        }
    } catch (const std::exception& e) {
        result.error = std::string("Exception: ") + e.what();
    }

    return result;
}

void printResult(const TestResult& result) {
    std::cout << "\n";
    std::cout << "┌─────────────────────────────────────────────────────────\n";
    std::cout << "│ Engine: " << result.engineName << "\n";
    std::cout << "├─────────────────────────────────────────────────────────\n";

    if (!result.available) {
        std::cout << "│ Status: ❌ NOT AVAILABLE\n";
        std::cout << "│ Reason: " << result.error << "\n";
    } else if (!result.error.empty()) {
        std::cout << "│ Status: ⚠️  ERROR\n";
        std::cout << "│ Error: " << result.error << "\n";
    } else {
        std::cout << "│ Status: ✅ SUCCESS\n";
        std::cout << "│ Duration: " << result.durationMs << " ms\n";
        std::cout << "│ Transcription: \"" << result.transcription << "\"\n";
    }

    std::cout << "└─────────────────────────────────────────────────────────\n";
}

int main(int argc, char* argv[]) {
    // Setup logging
    auto logger = spdlog::stdout_color_mt("test");
    spdlog::set_level(spdlog::level::info);

    // Check arguments
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <audio_file>\n";
        std::cerr << "Example: " << argv[0] << " test_audio.mp3\n";
        return 1;
    }

    std::string audioFile = argv[1];

    std::cout << "\n";
    std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
    std::cout << "║             STT ENGINES TEST - AISSIA Phase 7               ║\n";
    std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
    std::cout << "\n";
    std::cout << "Audio file: " << audioFile << "\n";
    std::cout << "\n";
    std::cout << "Testing 4 STT engines...\n";

    // Prepare engines
    std::vector<std::pair<std::string, std::unique_ptr<ISTTEngine>>> engines;

    // 1. PocketSphinx
    std::cout << "\n[1/4] Initializing PocketSphinx...\n";
    engines.push_back({
        "PocketSphinx",
        std::make_unique<PocketSphinxEngine>("/usr/share/pocketsphinx/model/en-us")
    });

    // 2. Vosk
    std::cout << "[2/4] Initializing Vosk...\n";
    engines.push_back({
        "Vosk",
        std::make_unique<VoskSTTEngine>("./models/vosk-model-small-fr-0.22")
    });

    // 3. Whisper.cpp
    std::cout << "[3/4] Initializing Whisper.cpp...\n";
    engines.push_back({
        "Whisper.cpp",
        std::make_unique<WhisperCppEngine>("./models/ggml-base.bin")
    });

    // 4. Whisper API
    std::cout << "[4/4] Initializing Whisper API...\n";
    const char* apiKey = std::getenv("OPENAI_API_KEY");
    engines.push_back({
        "Whisper API",
        std::make_unique<WhisperAPIEngine>(apiKey ? apiKey : "")
    });

    // Test each engine
    std::vector<TestResult> results;
    for (auto& [name, engine] : engines) {
        std::cout << "\n▶ Testing " << name << "...\n";
        results.push_back(testEngine(engine.get(), audioFile));
    }

    // Print results
    std::cout << "\n\n";
    std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
    std::cout << "║                          RESULTS                            ║\n";
    std::cout << "╚═══════════════════════════════════════════════════════════╝\n";

    for (const auto& result : results) {
        printResult(result);
    }

    // Summary
    std::cout << "\n\n";
    std::cout << "╔═══════════════════════════════════════════════════════════╗\n";
    std::cout << "║                          SUMMARY                            ║\n";
    std::cout << "╚═══════════════════════════════════════════════════════════╝\n";
    std::cout << "\n";

    int available = 0;
    int successful = 0;

    for (const auto& result : results) {
        if (result.available) available++;
        if (result.available && result.error.empty()) successful++;
    }

    std::cout << "Total engines tested: " << results.size() << "\n";
    std::cout << "Engines available: " << available << "/" << results.size() << "\n";
    std::cout << "Successful transcriptions: " << successful << "/" << available << "\n";
    std::cout << "\n";

    if (successful > 0) {
        std::cout << "✅ STT system is working!\n";
        return 0;
    } else if (available > 0) {
        std::cout << "⚠️  Some engines available but all failed (check audio file format)\n";
        return 1;
    } else {
        std::cout << "❌ No STT engines available (install models/libraries)\n";
        return 1;
    }
}
@@ -38,7 +38,7 @@ TEST_CASE("TI_CLIENT_001_LoadConfigValid", "[mcp][client]") {
     json config = {
         {"servers", {
             {"test_server", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"server.py"})},
                 {"enabled", true}
             }}
@@ -112,7 +112,7 @@ TEST_CASE("TI_CLIENT_005_ConnectAllSkipsDisabled", "[mcp][client]") {
     json config = {
         {"servers", {
             {"enabled_server", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"tests/fixtures/echo_server.py"})},
                 {"enabled", true}
             }},
@@ -143,12 +143,12 @@ TEST_CASE("TI_CLIENT_006_ConnectSingleServer", "[mcp][client]") {
     json config = {
         {"servers", {
             {"server1", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"tests/fixtures/echo_server.py"})},
                 {"enabled", true}
             }},
             {"server2", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"tests/fixtures/echo_server.py"})},
                 {"enabled", true}
             }}
@@ -178,7 +178,7 @@ TEST_CASE("TI_CLIENT_007_DisconnectSingleServer", "[mcp][client]") {
     json config = {
         {"servers", {
             {"server1", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"tests/fixtures/echo_server.py"})},
                 {"enabled", true}
             }}
@@ -205,12 +205,12 @@ TEST_CASE("TI_CLIENT_008_DisconnectAllCleansUp", "[mcp][client]") {
     json config = {
         {"servers", {
             {"server1", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"tests/fixtures/echo_server.py"})},
                 {"enabled", true}
             }},
             {"server2", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"tests/fixtures/echo_server.py"})},
                 {"enabled", true}
             }}
@@ -366,7 +366,7 @@ TEST_CASE("TI_CLIENT_015_IsConnectedAccurate", "[mcp][client]") {
     json config = {
         {"servers", {
             {"test_server", {
-                {"command", "python3"},
+                {"command", "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe"},
                 {"args", json::array({"tests/fixtures/echo_server.py"})},
                 {"enabled", true}
             }}
@@ -20,7 +20,7 @@ using json = nlohmann::json;
 MCPServerConfig makeEchoServerConfig() {
     MCPServerConfig config;
     config.name = "echo";
-    config.command = "python3";
+    config.command = "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe";
     config.args = {"tests/fixtures/echo_server.py"};
     config.enabled = true;
     return config;
@@ -29,7 +29,7 @@ MCPServerConfig makeEchoServerConfig() {
 MCPServerConfig makeMockMCPServerConfig() {
     MCPServerConfig config;
     config.name = "mock_mcp";
-    config.command = "python3";
+    config.command = "C:\\Users\\alexi\\AppData\\Local\\Programs\\Python\\Python312\\python.exe";
     config.args = {"tests/fixtures/mock_mcp_server.py"};
     config.enabled = true;
     return config;