#!/usr/bin/env python3
"""
Script to download all audio resources from the BLCUP website.

Downloads 185 MP3 files for Developing Chinese Intermediate Listening I,
using concurrent downloads (max 20 worker threads) with a progress bar per worker.
"""

import re
import time
from pathlib import Path
from queue import Queue
from threading import Lock, Thread
from urllib.parse import unquote

import requests
from tqdm import tqdm
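
# Third-party dependencies (taken from the imports above): requests and tqdm.
# A typical setup, assuming pip is available:
#   pip install requests tqdm
# The script is then run directly, e.g. `python <this_script>.py`
# (the filename is illustrative; use whatever name this file is saved under).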

# Configuration
MAIN_PAGE_URL = "http://www.blcup.com/MobileResSeries?rid=23fea77b-c66b-4bbe-bfcd-b3c784fb2a79"
DOWNLOAD_DIR = Path("audio_resources/intermediate_listening_1")
MAX_WORKERS = 20  # Maximum concurrent downloads

# Thread-safe counters
stats_lock = Lock()
stats = {"successful": 0, "failed": 0}

def extract_resource_ids(html_content):
    """Extract all resource IDs from the HTML page."""
    # Pattern to match: MobileResource?rid=UUID
    pattern = r'MobileResource\?rid=([a-f0-9\-]+)'
    resource_ids = re.findall(pattern, html_content, re.IGNORECASE)
    return list(set(resource_ids))  # Remove duplicates


def extract_exercise_number(original_filename):
    """Extract an exercise number like 1-1, 1-2, or 6-1 from a filename."""
    # Pattern to match numbers like "1-1", "1-2", "6-1", "10-3-1", etc.;
    # also handles "Recording" prefix variations.
    match = re.search(r'(\d+[-\d]+)', original_filename)
    if match:
        return match.group(1)
    return None
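
# For illustration only (sample filenames are assumptions, not taken from the site):
#   extract_exercise_number("Recording 1-1.mp3")  # -> "1-1"
#   extract_exercise_number("10-3-1.mp3")         # -> "10-3-1"
#   extract_exercise_number("intro.mp3")          # -> None (download_resource() then falls back to the RID)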


def download_resource(rid, output_dir, worker_bar, overall_bar, max_retries=3):
    """Download a single resource given its RID, with per-worker progress and retry logic.

    Returns a dict whose "status" is "skipped", "success", or "failed".
    """
    download_url = f"http://www.blcup.com/Common/DownRes?doi={rid}"

    for attempt in range(max_retries):
        session = requests.Session()
        try:
            response = session.get(download_url, timeout=30, stream=True)
            response.raise_for_status()

            # Try to get the filename from the Content-Disposition header
            original_filename = None
            if 'Content-Disposition' in response.headers:
                cd = response.headers['Content-Disposition']
                filename_match = re.findall('filename="?([^"]+)"?', cd)
                if filename_match:
                    # Decode the URL-encoded filename
                    original_filename = unquote(filename_match[0])

            # Extract the exercise number from the original filename
            exercise_number = None
            if original_filename:
                exercise_number = extract_exercise_number(original_filename)

            # Use the exercise number as the filename, falling back to the RID
            if exercise_number:
                filename = f"{exercise_number}.mp3"
                display_name = exercise_number
            else:
                filename = f"{rid}.mp3"
                display_name = rid[:8]

            # Get the expected file size from the server
            total_size = int(response.headers.get('content-length', 0))

            # Check whether the file already exists with the correct size
            output_path = output_dir / filename
            if output_path.exists():
                local_size = output_path.stat().st_size
                if total_size > 0 and local_size == total_size:
                    # File exists with the correct size, skip it
                    with stats_lock:
                        stats["successful"] += 1
                    worker_bar.set_postfix_str(f"⊙ {display_name} (OK)")
                    overall_bar.update(1)
                    session.close()
                    return {"status": "skipped", "filename": filename, "rid": rid}
                else:
                    # File exists but has the wrong size; re-download it
                    worker_bar.set_postfix_str(f"⚠ {display_name} (redownload)")

            # Reset the worker bar for the new file
            worker_bar.reset(total=total_size if total_size > 0 else 100)
            retry_info = f" (retry {attempt+1}/{max_retries})" if attempt > 0 else ""
            worker_bar.set_postfix_str(f"📥 {display_name}{retry_info}")

            # Save the file, updating the progress bar as chunks arrive
            with open(output_path, 'wb') as f:
                if total_size > 0:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            worker_bar.update(len(chunk))
                else:
                    content = response.content
                    f.write(content)
                    worker_bar.update(len(content))

            with stats_lock:
                stats["successful"] += 1

            worker_bar.set_postfix_str(f"✓ {display_name}")
            overall_bar.update(1)
            session.close()
            return {"status": "success", "filename": filename, "rid": rid}

        except Exception as e:
            session.close()

            if attempt < max_retries - 1:
                # Will retry
                worker_bar.set_postfix_str(f"⟳ Retry {attempt+2}/{max_retries}...")
                time.sleep(1)  # Wait before retrying
            else:
                # Final failure
                with stats_lock:
                    stats["failed"] += 1
                worker_bar.set_postfix_str(f"✗ Failed after {max_retries} tries")
                overall_bar.update(1)
                return {"status": "failed", "rid": rid, "error": str(e)}

    return {"status": "failed", "rid": rid, "error": "Max retries reached"}


def worker_thread(worker_id, task_queue, output_dir, worker_bar, overall_bar):
    """Worker thread that processes download tasks from the queue."""
    while True:
        rid = task_queue.get()
        if rid is None:  # Poison pill to stop the worker
            break

        download_resource(rid, output_dir, worker_bar, overall_bar)
        task_queue.task_done()


def main():
    print("=" * 70)
    print("BLCUP Audio Resource Downloader")
    print("发展汉语(第2版)中级听力(I)")
    print("=" * 70)
    print()

    # Create the output directory
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    print(f"📁 Download directory: {DOWNLOAD_DIR.absolute()}")
    print()

    # Fetch the main page
    print("🌐 Fetching resource list from main page...")
    try:
        response = requests.get(MAIN_PAGE_URL, timeout=10)
        response.raise_for_status()
        html_content = response.text
    except Exception as e:
        print(f"❌ Error fetching main page: {e}")
        return

    # Extract all resource IDs
    resource_ids = extract_resource_ids(html_content)
    total_resources = len(resource_ids)

    # Check for already downloaded files
    existing_files = list(DOWNLOAD_DIR.glob("*.mp3")) if DOWNLOAD_DIR.exists() else []
    already_downloaded = len(existing_files)

    print(f"✓ Found {total_resources} resources to download")
    if already_downloaded > 0:
        print(f"⊙ {already_downloaded} files already exist (will skip)")
    print(f"⚡ Using {MAX_WORKERS} concurrent workers")
    print("🔄 Auto-retry up to 3 times on failure")
    print()

    if total_resources == 0:
        print("❌ No resources found. Please check the URL.")
        return

    # Download all resources with concurrent workers
    print("📥 Starting downloads with worker progress bars...")
    print()

    # Create the overall progress bar (position 0)
    overall_bar = tqdm(
        total=total_resources,
        desc="Overall ",
        position=0,
        unit="file",
        ncols=100,
        colour="green",
        leave=True
    )

    # Create a progress bar for each worker
    worker_bars = []
    for i in range(MAX_WORKERS):
        bar = tqdm(
            total=100,
            desc=f"Worker {i+1:2d} ",
            position=i + 1,
            leave=True,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            ncols=100,
            colour="cyan"
        )
        worker_bars.append(bar)

    # Create the task queue
    task_queue = Queue()

    # Fill the queue with tasks
    for rid in resource_ids:
        task_queue.put(rid)

    # Add poison pills to stop the workers
    for _ in range(MAX_WORKERS):
        task_queue.put(None)

    # Start the worker threads
    threads = []
    for i in range(MAX_WORKERS):
        t = Thread(target=worker_thread, args=(i, task_queue, DOWNLOAD_DIR, worker_bars[i], overall_bar))
        t.start()
        threads.append(t)

    # Wait for all threads to complete
    for t in threads:
        t.join()

    # Close all progress bars
    for bar in worker_bars:
        bar.close()
    overall_bar.close()

    # Summary
    print()
    print("=" * 70)
    print("📊 Download Summary:")
    print(f"   ✓ Successful: {stats['successful']}")
    print(f"   ✗ Failed: {stats['failed']}")
    print(f"   📁 Files saved in: {DOWNLOAD_DIR.absolute()}")
    print("=" * 70)
    print()
    print("✅ Done!")


if __name__ == "__main__":
    main()