From a61a32b57fa52ec09841b158ce33d30ba5559792 Mon Sep 17 00:00:00 2001
From: StillHammer <alexistrouve.pro@gmail.com>
Date: Mon, 27 Oct 2025 23:28:39 +0800
Subject: [PATCH] Reorganize repository structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move all Python scripts and requirements-ocr.txt to tools/ directory
- Move documentation files to docs/ directory and add docs/chinese_audio_tts_pipeline.md
- Create exams/ and homework/ directories for future use
- Remove temporary test file (page1_preview.png)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../Projet-Outil-Apprentissage.md             |   0
 README-OCR.md => docs/README-OCR.md           |   0
 docs/chinese_audio_tts_pipeline.md            | 229 ++++++++++++++++++
 .../debug_pdf_placement.py                    |   0
 .../diagnose_alignment.py                     |   0
 .../download_audio_resources.py               |   0
 .../json_to_pdf_QUADRILATERAL.py              |   0
 .../json_to_pdf_SIMPLE.py                     |   0
 .../json_to_searchable_pdf.py                 |   0
 ocr_pipeline.py => tools/ocr_pipeline.py      |   0
 .../requirements-ocr.txt                      |   0
 .../test_center_scaling.py                    |   0
 test_y_offset.py => tools/test_y_offset.py    |   0
 test_y_scaling.py => tools/test_y_scaling.py  |   0
 14 files changed, 229 insertions(+)
 rename Projet-Outil-Apprentissage.md => docs/Projet-Outil-Apprentissage.md (100%)
 rename README-OCR.md => docs/README-OCR.md (100%)
 create mode 100644 docs/chinese_audio_tts_pipeline.md
 rename debug_pdf_placement.py => tools/debug_pdf_placement.py (100%)
 rename diagnose_alignment.py => tools/diagnose_alignment.py (100%)
 rename download_audio_resources.py => tools/download_audio_resources.py (100%)
 rename json_to_pdf_QUADRILATERAL.py => tools/json_to_pdf_QUADRILATERAL.py (100%)
 rename json_to_pdf_SIMPLE.py => tools/json_to_pdf_SIMPLE.py (100%)
 rename json_to_searchable_pdf.py => tools/json_to_searchable_pdf.py (100%)
 rename ocr_pipeline.py => tools/ocr_pipeline.py (100%)
 rename requirements-ocr.txt => tools/requirements-ocr.txt (100%)
 rename test_center_scaling.py => tools/test_center_scaling.py (100%)
 rename test_y_offset.py => tools/test_y_offset.py (100%)
 rename test_y_scaling.py => tools/test_y_scaling.py (100%)

diff --git a/Projet-Outil-Apprentissage.md b/docs/Projet-Outil-Apprentissage.md
similarity index 100%
rename from Projet-Outil-Apprentissage.md
rename to docs/Projet-Outil-Apprentissage.md
diff --git a/README-OCR.md b/docs/README-OCR.md
similarity index 100%
rename from README-OCR.md
rename to docs/README-OCR.md
diff --git a/docs/chinese_audio_tts_pipeline.md b/docs/chinese_audio_tts_pipeline.md
new file mode 100644
index 0000000..17a147e
--- /dev/null
+++ b/docs/chinese_audio_tts_pipeline.md
@@ -0,0 +1,229 @@
+# Chinese Audio to Text Extractor - Simple Transcription
+
+## Objectif
+
+Extraire le texte de fichiers MP3 de cours de chinois en utilisant Whisper.
+
+### Problème résolu
+- Besoin de récupérer le contenu textuel des cours audio
+- Conversion MP3 → Texte simple et rapide
+
+### Solution
+Pipeline minimaliste : MP3 → Whisper → Texte brut
+
+---
+
+## Architecture Pipeline
+
+```
+┌─────────────────────────────────────────┐
+│  INPUT: cours_chinois.mp3 (45min)       │
+└──────────────┬──────────────────────────┘
+               │
+               ▼
+┌─────────────────────────────────────────┐
+│  Transcription (Whisper)                │
+│  ├─ Model: whisper-1 (OpenAI API)       │
+│  ├─ Language: zh (mandarin)             │
+│  └─ Output: cours_chinois.txt           │
+└──────────────┬──────────────────────────┘
+               │
+               ▼
+┌─────────────────────────────────────────┐
+│  OUTPUT: cours_chinois.txt              │
+│  你好。我叫Alexis。今天我们学习...      │
+└─────────────────────────────────────────┘
+```
+
+---
+
+## Plan d'Implémentation Python
+
+### Structure du projet
+
+```
+chinese-transcriber/
+├── transcribe.py           # Script principal
+├── input/                  # MP3 source
+├── output/                 # Fichiers .txt générés
+├── .env                    # API key
+└── requirements.txt
+```
+
+### Dépendances (requirements.txt)
+
+```txt
+openai>=1.0.0              # Whisper API
+python-dotenv>=1.0.0       # Env variables
+```
+
+### Script Principal (transcribe.py)
+
+```python
+"""
+Transcription simple MP3 → TXT avec Whisper
+"""
+import openai
+from pathlib import Path
+from dotenv import load_dotenv
+import os
+
+def transcribe_audio(audio_path: Path, api_key: str) -> str:
+    """
+    Transcrit un fichier MP3 en chinois
+
+    Args:
+        audio_path: Chemin vers MP3
+        api_key: Clé API OpenAI
+
+    Returns:
+        Texte transcrit
+    """
+    client = openai.OpenAI(api_key=api_key)
+
+    with open(audio_path, "rb") as audio_file:
+        transcript = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=audio_file,
+            language="zh",  # Force mandarin
+            response_format="text"  # Texte brut
+        )
+
+    return transcript
+
+def main():
+    # Load API key
+    load_dotenv()
+    api_key = os.getenv("OPENAI_API_KEY")
+
+    if not api_key:
+        print("Error: OPENAI_API_KEY not found in .env")
+        return
+
+    # Setup paths
+    input_dir = Path("input")
+    output_dir = Path("output")
+    output_dir.mkdir(exist_ok=True)
+
+    # Get MP3 files
+    mp3_files = list(input_dir.glob("*.mp3"))
+
+    if not mp3_files:
+        print(f"No MP3 files found in {input_dir}/")
+        return
+
+    print(f"Found {len(mp3_files)} MP3 files to transcribe\n")
+
+    # Process each file
+    for mp3_file in mp3_files:
+        print(f"Processing: {mp3_file.name}...")
+
+        try:
+            # Transcribe
+            text = transcribe_audio(mp3_file, api_key)
+
+            # Save to TXT
+            output_path = output_dir / f"{mp3_file.stem}.txt"
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write(text)
+
+            print(f"✓ Saved to: {output_path}\n")
+
+        except Exception as e:
+            print(f"✗ Error: {e}\n")
+
+    print("=== Transcription completed ===")
+
+if __name__ == "__main__":
+    main()
+```
+
+---
+
+### Environment Variables (.env)
+
+```bash
+OPENAI_API_KEY=sk-...
+```
+
+---
+
+## Estimation Coûts
+
+### Pour 10 heures de cours audio
+
+| Service | Coût | Calcul |
+|---------|------|--------|
+| **Whisper API** | **$3.60** | 10 h × 60 min/h × $0.006/min |
+
+**Ultra-abordable** pour extraction simple de texte.
+
+---
+
+## Usage
+
+### Installation
+
+```bash
+mkdir chinese-transcriber
+cd chinese-transcriber
+
+# Créer structure
+mkdir input output
+
+# Installer dépendances
+pip install openai python-dotenv
+
+# Créer .env (contient la clé secrète : l'ajouter à .gitignore, ne jamais le committer)
+echo "OPENAI_API_KEY=sk-..." > .env
+
+# Copier le script transcribe.py
+```
+
+### Exécution
+
+```bash
+# 1. Placer tes MP3 dans input/
+cp /path/to/cours*.mp3 input/
+
+# 2. Run script
+python transcribe.py
+
+# Output:
+# Found 3 MP3 files to transcribe
+#
+# Processing: cours_1.mp3...
+# ✓ Saved to: output/cours_1.txt
+#
+# Processing: cours_2.mp3...
+# ✓ Saved to: output/cours_2.txt
+# ...
+```
+
+### Output
+
+Fichiers `.txt` avec texte chinois brut :
+
+```
+output/cours_1.txt:
+你好。我叫Alexis。今天我们学习汉语。
+第一课是关于问候的。你好吗？我很好，谢谢。
+...
+```
+
+---
+
+## Statut
+
+✅ **PLAN SIMPLE - PRÊT À UTILISER**
+
+Script minimaliste pour extraction texte MP3 → TXT.
+
+**Next steps si besoin** :
+1. Tester sur tes fichiers MP3 chinois
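+2. L'API Whisper limite chaque fichier à 25 Mo : un MP3 de 45 min peut dépasser cette limite, il faut alors le découper ou le recompresser avant l'envoi (pour un découpage automatique, voir les options du pipeline TTS complet dans les versions précédentes de ce document)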
+
+---
+
+*Créé : 27 octobre 2025*
+*Stack : Python 3.10+, Whisper API seulement*
diff --git a/debug_pdf_placement.py b/tools/debug_pdf_placement.py
similarity index 100%
rename from debug_pdf_placement.py
rename to tools/debug_pdf_placement.py
diff --git a/diagnose_alignment.py b/tools/diagnose_alignment.py
similarity index 100%
rename from diagnose_alignment.py
rename to tools/diagnose_alignment.py
diff --git a/download_audio_resources.py b/tools/download_audio_resources.py
similarity index 100%
rename from download_audio_resources.py
rename to tools/download_audio_resources.py
diff --git a/json_to_pdf_QUADRILATERAL.py b/tools/json_to_pdf_QUADRILATERAL.py
similarity index 100%
rename from json_to_pdf_QUADRILATERAL.py
rename to tools/json_to_pdf_QUADRILATERAL.py
diff --git a/json_to_pdf_SIMPLE.py b/tools/json_to_pdf_SIMPLE.py
similarity index 100%
rename from json_to_pdf_SIMPLE.py
rename to tools/json_to_pdf_SIMPLE.py
diff --git a/json_to_searchable_pdf.py b/tools/json_to_searchable_pdf.py
similarity index 100%
rename from json_to_searchable_pdf.py
rename to tools/json_to_searchable_pdf.py
diff --git a/ocr_pipeline.py b/tools/ocr_pipeline.py
similarity index 100%
rename from ocr_pipeline.py
rename to tools/ocr_pipeline.py
diff --git a/requirements-ocr.txt b/tools/requirements-ocr.txt
similarity index 100%
rename from requirements-ocr.txt
rename to tools/requirements-ocr.txt
diff --git a/test_center_scaling.py b/tools/test_center_scaling.py
similarity index 100%
rename from test_center_scaling.py
rename to tools/test_center_scaling.py
diff --git a/test_y_offset.py b/tools/test_y_offset.py
similarity index 100%
rename from test_y_offset.py
rename to tools/test_y_offset.py
diff --git a/test_y_scaling.py b/tools/test_y_scaling.py
similarity index 100%
rename from test_y_scaling.py
rename to tools/test_y_scaling.py