From a61a32b57fa52ec09841b158ce33d30ba5559792 Mon Sep 17 00:00:00 2001 From: StillHammer Date: Mon, 27 Oct 2025 23:28:39 +0800 Subject: [PATCH] Reorganize repository structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move all Python scripts to tools/ directory - Move documentation files to docs/ directory - Create exams/ and homework/ directories for future use - Remove temporary test file (page1_preview.png) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../Projet-Outil-Apprentissage.md | 0 README-OCR.md => docs/README-OCR.md | 0 docs/chinese_audio_tts_pipeline.md | 229 ++++++++++++++++++ .../debug_pdf_placement.py | 0 .../diagnose_alignment.py | 0 .../download_audio_resources.py | 0 .../json_to_pdf_QUADRILATERAL.py | 0 .../json_to_pdf_SIMPLE.py | 0 .../json_to_searchable_pdf.py | 0 ocr_pipeline.py => tools/ocr_pipeline.py | 0 .../requirements-ocr.txt | 0 .../test_center_scaling.py | 0 test_y_offset.py => tools/test_y_offset.py | 0 test_y_scaling.py => tools/test_y_scaling.py | 0 14 files changed, 229 insertions(+) rename Projet-Outil-Apprentissage.md => docs/Projet-Outil-Apprentissage.md (100%) rename README-OCR.md => docs/README-OCR.md (100%) create mode 100644 docs/chinese_audio_tts_pipeline.md rename debug_pdf_placement.py => tools/debug_pdf_placement.py (100%) rename diagnose_alignment.py => tools/diagnose_alignment.py (100%) rename download_audio_resources.py => tools/download_audio_resources.py (100%) rename json_to_pdf_QUADRILATERAL.py => tools/json_to_pdf_QUADRILATERAL.py (100%) rename json_to_pdf_SIMPLE.py => tools/json_to_pdf_SIMPLE.py (100%) rename json_to_searchable_pdf.py => tools/json_to_searchable_pdf.py (100%) rename ocr_pipeline.py => tools/ocr_pipeline.py (100%) rename requirements-ocr.txt => tools/requirements-ocr.txt (100%) rename test_center_scaling.py => tools/test_center_scaling.py (100%) rename test_y_offset.py => tools/test_y_offset.py (100%) 
rename test_y_scaling.py => tools/test_y_scaling.py (100%) diff --git a/Projet-Outil-Apprentissage.md b/docs/Projet-Outil-Apprentissage.md similarity index 100% rename from Projet-Outil-Apprentissage.md rename to docs/Projet-Outil-Apprentissage.md diff --git a/README-OCR.md b/docs/README-OCR.md similarity index 100% rename from README-OCR.md rename to docs/README-OCR.md diff --git a/docs/chinese_audio_tts_pipeline.md b/docs/chinese_audio_tts_pipeline.md new file mode 100644 index 0000000..17a147e --- /dev/null +++ b/docs/chinese_audio_tts_pipeline.md @@ -0,0 +1,229 @@ +# Chinese Audio to Text Extractor - Simple Transcription + +## Objectif + +Extraire le texte de fichiers MP3 de cours de chinois en utilisant Whisper. + +### Problรจme rรฉsolu +- Besoin de rรฉcupรฉrer le contenu textuel des cours audio +- Conversion MP3 โ†’ Texte simple et rapide + +### Solution +Pipeline minimaliste : MP3 โ†’ Whisper โ†’ Texte brut + +--- + +## Architecture Pipeline + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ INPUT: cours_chinois.mp3 (45min) โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + โ–ผ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Transcription (Whisper) โ”‚ +โ”‚ โ”œโ”€ Model: whisper-1 (OpenAI API) โ”‚ +โ”‚ โ”œโ”€ Language: zh (mandarin) โ”‚ +โ”‚ โ””โ”€ Output: transcript.txt โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + โ–ผ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ OUTPUT: cours_chinois.txt โ”‚ +โ”‚ ไฝ ๅฅฝใ€‚ๆˆ‘ๅซAlexisใ€‚ไปŠๅคฉๆˆ‘ไปฌๅญฆไน ... 
│ +└─────────────────────────────────────────┘ +``` + +--- + +## Plan d'Implémentation Python + +### Structure du projet + +``` +chinese-transcriber/ +├── transcribe.py # Script principal +├── input/ # MP3 source +├── output/ # Fichiers .txt générés +├── .env # API key +└── requirements.txt +``` + +### Dépendances (requirements.txt) + +```txt +openai>=1.0.0 # Whisper API +python-dotenv>=1.0.0 # Env variables +``` + +### Script Principal (transcribe.py) + +```python +""" +Transcription simple MP3 → TXT avec Whisper +""" +import openai +from pathlib import Path +from dotenv import load_dotenv +import os + +def transcribe_audio(audio_path: Path, api_key: str) -> str: + """ + Transcrit un fichier MP3 en chinois + + Args: + audio_path: Chemin vers MP3 + api_key: Clé API OpenAI + + Returns: + Texte transcrit + """ + client = openai.OpenAI(api_key=api_key) + + with open(audio_path, "rb") as audio_file: + transcript = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + language="zh", # Force mandarin + response_format="text" # Texte brut + ) + + return transcript + +def main(): + # Load API key + load_dotenv() + api_key = os.getenv("OPENAI_API_KEY") + + if not api_key: + print("Error: OPENAI_API_KEY not found in .env") + return + + # Setup paths + input_dir = Path("input") + output_dir = Path("output") + output_dir.mkdir(exist_ok=True) + + # Get MP3 files + mp3_files = list(input_dir.glob("*.mp3")) + + if not mp3_files: + print(f"No MP3 files found in {input_dir}/") + return + + print(f"Found {len(mp3_files)} MP3 files to transcribe\n") + + # Process each file + for mp3_file in mp3_files: + print(f"Processing: {mp3_file.name}...") + + try: + # Transcribe + text = transcribe_audio(mp3_file, api_key) + + # Save to TXT + output_path = output_dir / f"{mp3_file.stem}.txt" + with open(output_path, "w", encoding="utf-8") as f: + 
f.write(text) + + print(f"โœ“ Saved to: {output_path}\n") + + except Exception as e: + print(f"โœ— Error: {e}\n") + + print("=== Transcription completed ===") + +if __name__ == "__main__": + main() +``` + +--- + +### Environment Variables (.env) + +```bash +OPENAI_API_KEY=sk-... +``` + +--- + +## Estimation Coรปts + +### Pour 10 heures de cours audio + +| Service | Coรปt | Calcul | +|---------|------|--------| +| **Whisper API** | **$3.60** | 10h ร— $0.006/min ร— 60min | + +**Ultra-abordable** pour extraction simple de texte. + +--- + +## Usage + +### Installation + +```bash +mkdir chinese-transcriber +cd chinese-transcriber + +# Crรฉer structure +mkdir input output + +# Installer dรฉpendances +pip install openai python-dotenv + +# Crรฉer .env +echo "OPENAI_API_KEY=sk-..." > .env + +# Copier le script transcribe.py +``` + +### Exรฉcution + +```bash +# 1. Placer tes MP3 dans input/ +cp /path/to/cours*.mp3 input/ + +# 2. Run script +python transcribe.py + +# Output: +# Found 3 MP3 files to transcribe +# +# Processing: cours_1.mp3... +# โœ“ Saved to: output/cours_1.txt +# +# Processing: cours_2.mp3... +# โœ“ Saved to: output/cours_2.txt +# ... +``` + +### Output + +Fichiers `.txt` avec texte chinois brut : + +``` +output/cours_1.txt: +ไฝ ๅฅฝใ€‚ๆˆ‘ๅซAlexisใ€‚ไปŠๅคฉๆˆ‘ไปฌๅญฆไน ๆฑ‰่ฏญใ€‚ +็ฌฌไธ€่ฏพๆ˜ฏๅ…ณไบŽ้—ฎๅ€™็š„ใ€‚ไฝ ๅฅฝๅ—๏ผŸๆˆ‘ๅพˆๅฅฝ๏ผŒ่ฐข่ฐขใ€‚ +... +``` + +--- + +## Statut + +โœ… **PLAN SIMPLE - PRรŠT ร€ UTILISER** + +Script minimaliste pour extraction texte MP3 โ†’ TXT. + +**Next steps si besoin** : +1. Tester sur tes fichiers MP3 chinois +2. 
Si besoin découpage automatique, voir options full TTS pipeline (commenté dans versions précédentes) + +--- + +*Créé : 27 octobre 2025* +*Stack : Python 3.10+, Whisper API seulement* diff --git a/debug_pdf_placement.py b/tools/debug_pdf_placement.py similarity index 100% rename from debug_pdf_placement.py rename to tools/debug_pdf_placement.py diff --git a/diagnose_alignment.py b/tools/diagnose_alignment.py similarity index 100% rename from diagnose_alignment.py rename to tools/diagnose_alignment.py diff --git a/download_audio_resources.py b/tools/download_audio_resources.py similarity index 100% rename from download_audio_resources.py rename to tools/download_audio_resources.py diff --git a/json_to_pdf_QUADRILATERAL.py b/tools/json_to_pdf_QUADRILATERAL.py similarity index 100% rename from json_to_pdf_QUADRILATERAL.py rename to tools/json_to_pdf_QUADRILATERAL.py diff --git a/json_to_pdf_SIMPLE.py b/tools/json_to_pdf_SIMPLE.py similarity index 100% rename from json_to_pdf_SIMPLE.py rename to tools/json_to_pdf_SIMPLE.py diff --git a/json_to_searchable_pdf.py b/tools/json_to_searchable_pdf.py similarity index 100% rename from json_to_searchable_pdf.py rename to tools/json_to_searchable_pdf.py diff --git a/ocr_pipeline.py b/tools/ocr_pipeline.py similarity index 100% rename from ocr_pipeline.py rename to tools/ocr_pipeline.py diff --git a/requirements-ocr.txt b/tools/requirements-ocr.txt similarity index 100% rename from requirements-ocr.txt rename to tools/requirements-ocr.txt diff --git a/test_center_scaling.py b/tools/test_center_scaling.py similarity index 100% rename from test_center_scaling.py rename to tools/test_center_scaling.py diff --git a/test_y_offset.py b/tools/test_y_offset.py similarity index 100% rename from test_y_offset.py rename to tools/test_y_offset.py diff --git a/test_y_scaling.py b/tools/test_y_scaling.py similarity index 100% rename from test_y_scaling.py rename to tools/test_y_scaling.py