chineseclass/download_audio_resources.py
StillHammer acbe1b4769 Clean up project structure and update content
Remove ClassGenSystem and obsolete tools from tracking, update course content files with latest revisions.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-27 16:24:41 +08:00

259 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Script to download all audio resources from BLCUP website
Downloads 185 MP3 files for Developing Chinese Intermediate Listening I
With concurrent downloads (max 20 threads) and progress bar per worker
"""
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from queue import Queue
from threading import Lock, Thread
from urllib.parse import unquote

import requests
from tqdm import tqdm
# Configuration
# Listing page that links every downloadable resource for this course.
MAIN_PAGE_URL = "http://www.blcup.com/MobileResSeries?rid=23fea77b-c66b-4bbe-bfcd-b3c784fb2a79"
# Local destination for the downloaded MP3 files (created on demand in main()).
DOWNLOAD_DIR = Path("audio_resources/intermediate_listening_1")
MAX_WORKERS = 20 # Maximum concurrent downloads
# Thread-safe counters
# stats is shared by all worker threads; always mutate it under stats_lock.
stats_lock = Lock()
stats = {"successful": 0, "failed": 0}
def extract_resource_ids(html_content):
    """Extract all unique resource IDs from the HTML page.

    Args:
        html_content: Raw HTML of the resource-series listing page.

    Returns:
        List of unique rid strings, in first-seen (page) order.
    """
    # Pattern to match: MobileResource?rid=UUID
    pattern = r'MobileResource\?rid=([a-f0-9\-]+)'
    resource_ids = re.findall(pattern, html_content, re.IGNORECASE)
    # dict.fromkeys() removes duplicates while preserving order, so downloads
    # proceed in page order instead of the arbitrary order list(set(...)) gave.
    return list(dict.fromkeys(resource_ids))
def extract_exercise_number(original_filename):
    """Pull an exercise number such as 1-1, 1-2, 6-1 out of a filename.

    Args:
        original_filename: Filename reported by the server (may carry a
            "Recording" prefix or other decoration around the number).

    Returns:
        The matched number string (e.g. "10-3-1"), or None if absent.
    """
    # Grab the first run of digits and hyphens at least two characters long,
    # which covers forms like "1-1", "6-1" and "10-3-1".
    found = re.search(r'(\d+[-\d]+)', original_filename)
    return found.group(1) if found else None
def download_resource(rid, output_dir, worker_bar, overall_bar, max_retries=3):
    """Download a single resource given its RID, with retries and progress.

    Args:
        rid: Resource UUID used to build the download URL.
        output_dir: Path directory where the MP3 is written.
        worker_bar: tqdm bar showing this worker's byte progress.
        overall_bar: shared tqdm bar counting completed files.
        max_retries: Attempts before giving up on this resource.

    Returns:
        Dict with "status" of "success", "skipped" or "failed", plus the
        filename/rid (and "error" on failure).
    """
    download_url = f"http://www.blcup.com/Common/DownRes?doi={rid}"
    for attempt in range(max_retries):
        try:
            # Fresh session per attempt; context managers guarantee the
            # connection is released on every exit path (the original called
            # session.close() manually, which leaked if a bar update raised
            # after the try block returned).
            with requests.Session() as session:
                with session.get(download_url, timeout=30, stream=True) as response:
                    response.raise_for_status()
                    # Try to get filename from Content-Disposition header
                    original_filename = None
                    if 'Content-Disposition' in response.headers:
                        cd = response.headers['Content-Disposition']
                        filename_match = re.findall('filename="?([^"]+)"?', cd)
                        if filename_match:
                            # Decode URL-encoded filename
                            original_filename = unquote(filename_match[0])
                    # Extract exercise number from original filename
                    exercise_number = None
                    if original_filename:
                        exercise_number = extract_exercise_number(original_filename)
                    # Use exercise number as filename, fallback to rid
                    if exercise_number:
                        filename = f"{exercise_number}.mp3"
                        display_name = exercise_number
                    else:
                        filename = f"{rid}.mp3"
                        display_name = rid[:8]
                    # Expected file size from server (0 if header missing)
                    total_size = int(response.headers.get('content-length', 0))
                    output_path = output_dir / filename
                    # Skip files already downloaded with the correct size
                    if output_path.exists():
                        local_size = output_path.stat().st_size
                        if total_size > 0 and local_size == total_size:
                            with stats_lock:
                                stats["successful"] += 1
                            worker_bar.set_postfix_str(f"{display_name} (OK)")
                            overall_bar.update(1)
                            return {"status": "skipped", "filename": filename, "rid": rid}
                        else:
                            # File exists but wrong size, will re-download
                            worker_bar.set_postfix_str(f"{display_name} (redownload)")
                    # Reset worker bar for the new file
                    worker_bar.reset(total=total_size if total_size > 0 else 100)
                    retry_info = f" (retry {attempt+1}/{max_retries})" if attempt > 0 else ""
                    worker_bar.set_postfix_str(f"📥 {display_name}{retry_info}")
                    # Stream the body to disk, advancing the worker bar per chunk.
                    # NOTE(review): a failed attempt can leave a truncated file
                    # behind; the size check above makes the next run redo it.
                    with open(output_path, 'wb') as f:
                        if total_size > 0:
                            for chunk in response.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)
                                    worker_bar.update(len(chunk))
                        else:
                            # No content-length: read everything in one go
                            content = response.content
                            f.write(content)
                            worker_bar.update(len(content))
                    with stats_lock:
                        stats["successful"] += 1
                    worker_bar.set_postfix_str(f"{display_name}")
                    overall_bar.update(1)
                    return {"status": "success", "filename": filename, "rid": rid}
        except Exception as e:
            if attempt < max_retries - 1:
                # Will retry after a short pause
                worker_bar.set_postfix_str(f"⟳ Retry {attempt+2}/{max_retries}...")
                time.sleep(1)
            else:
                # Final failure
                with stats_lock:
                    stats["failed"] += 1
                worker_bar.set_postfix_str(f"✗ Failed after {max_retries} tries")
                overall_bar.update(1)
                return {"status": "failed", "rid": rid, "error": str(e)}
    return {"status": "failed", "rid": rid, "error": "Max retries reached"}
def worker_thread(worker_id, task_queue, output_dir, worker_bar, overall_bar):
    """Consume download tasks from the queue until the poison pill arrives.

    Args:
        worker_id: Numeric id of this worker (unused inside the loop).
        task_queue: Queue of rid strings, terminated by a None sentinel.
        output_dir: Directory passed through to download_resource.
        worker_bar: This worker's tqdm progress bar.
        overall_bar: Shared tqdm bar counting completed files.
    """
    # iter() with a sentinel stops cleanly on the poison pill (None); the
    # sentinel itself is consumed without a matching task_done() call, just
    # like the explicit while-loop form.
    for rid in iter(task_queue.get, None):
        download_resource(rid, output_dir, worker_bar, overall_bar)
        task_queue.task_done()
def main():
    """Fetch the resource list and download every MP3 with a worker pool.

    Spawns MAX_WORKERS threads, each with its own tqdm byte-progress bar,
    fed from a single queue of resource IDs; prints a summary at the end.
    """
    print("=" * 70)
    print("BLCUP Audio Resource Downloader")
    print("发展汉语第2版中级听力I")
    print("=" * 70)
    print()
    # Create output directory
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    print(f"📁 Download directory: {DOWNLOAD_DIR.absolute()}")
    print()
    # Fetch the main page
    print("🌐 Fetching resource list from main page...")
    try:
        response = requests.get(MAIN_PAGE_URL, timeout=10)
        response.raise_for_status()
        html_content = response.text
    except Exception as e:
        print(f"❌ Error fetching main page: {e}")
        return
    # Extract all resource IDs
    resource_ids = extract_resource_ids(html_content)
    total_resources = len(resource_ids)
    # Report files already on disk (the actual per-file skip decision is
    # made inside download_resource by comparing sizes).
    existing_files = list(DOWNLOAD_DIR.glob("*.mp3")) if DOWNLOAD_DIR.exists() else []
    already_downloaded = len(existing_files)
    print(f"✓ Found {total_resources} resources to download")
    if already_downloaded > 0:
        print(f"{already_downloaded} files already exist (will skip)")
    print(f"⚡ Using {MAX_WORKERS} concurrent workers")
    print(f"🔄 Auto-retry up to 3 times on failure")
    print()
    if total_resources == 0:
        print("❌ No resources found. Please check the URL.")
        return
    # Download all resources with concurrent workers
    print("📥 Starting downloads with worker progress bars...")
    print()
    # Overall progress bar occupies terminal row 0; one row per worker below.
    overall_bar = tqdm(
        total=total_resources,
        desc="Overall ",
        position=0,
        unit="file",
        ncols=100,
        colour="green",
        leave=True
    )
    # Create a byte-progress bar for each worker
    worker_bars = []
    for i in range(MAX_WORKERS):
        bar = tqdm(
            total=100,
            desc=f"Worker {i+1:2d} ",
            position=i+1,
            leave=True,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            ncols=100,
            colour="cyan"
        )
        worker_bars.append(bar)
    # Fill the task queue, then add one poison pill (None) per worker so
    # every thread's loop terminates.
    task_queue = Queue()
    for rid in resource_ids:
        task_queue.put(rid)
    for _ in range(MAX_WORKERS):
        task_queue.put(None)
    # Start worker threads (Thread is imported at module level alongside Lock)
    threads = []
    for i in range(MAX_WORKERS):
        t = Thread(target=worker_thread, args=(i, task_queue, DOWNLOAD_DIR, worker_bars[i], overall_bar))
        t.start()
        threads.append(t)
    # Wait for all threads to complete
    for t in threads:
        t.join()
    # Close all bars
    for bar in worker_bars:
        bar.close()
    overall_bar.close()
    # Summary
    print()
    print("=" * 70)
    print("📊 Download Summary:")
    print(f" ✓ Successful: {stats['successful']}")
    print(f" ✗ Failed: {stats['failed']}")
    print(f" 📁 Files saved in: {DOWNLOAD_DIR.absolute()}")
    print("=" * 70)
    print()
    print("✅ Done!")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()