chineseclass/download_audio_resources.py
StillHammer acbe1b4769 Clean up project structure and update content
Remove ClassGenSystem and obsolete tools from tracking, update course content files with latest revisions.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-27 16:24:41 +08:00

259 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Script to download all audio resources from BLCUP website
Downloads 185 MP3 files for Developing Chinese Intermediate Listening I
With concurrent downloads (max 20 threads) and progress bar per worker
"""
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from queue import Queue
from threading import Lock, Thread
from urllib.parse import unquote

import requests
from tqdm import tqdm
# Configuration
# Listing page that links every downloadable resource for this course.
MAIN_PAGE_URL = "http://www.blcup.com/MobileResSeries?rid=23fea77b-c66b-4bbe-bfcd-b3c784fb2a79"
# Local destination for the downloaded MP3 files (created on demand in main()).
DOWNLOAD_DIR = Path("audio_resources/intermediate_listening_1")
MAX_WORKERS = 20 # Maximum concurrent downloads
# Thread-safe counters
# stats is shared by all worker threads; always mutate it under stats_lock.
stats_lock = Lock()
stats = {"successful": 0, "failed": 0}
def extract_resource_ids(html_content):
    """Extract all unique resource IDs from the HTML page.

    Args:
        html_content: Raw HTML of the resource-series listing page.

    Returns:
        List of unique rid strings, in first-seen (page) order.
    """
    # Pattern to match: MobileResource?rid=UUID
    pattern = r'MobileResource\?rid=([a-f0-9\-]+)'
    resource_ids = re.findall(pattern, html_content, re.IGNORECASE)
    # dict.fromkeys() removes duplicates while preserving order, so downloads
    # proceed in page order instead of the arbitrary order list(set(...)) gave.
    return list(dict.fromkeys(resource_ids))
def extract_exercise_number(original_filename):
    """Pull an exercise number such as 1-1, 1-2, 6-1 out of a filename.

    Args:
        original_filename: Filename reported by the server (may carry a
            "Recording" prefix or other decoration around the number).

    Returns:
        The matched number string (e.g. "10-3-1"), or None if absent.
    """
    # Grab the first run of digits and hyphens at least two characters long,
    # which covers forms like "1-1", "6-1" and "10-3-1".
    found = re.search(r'(\d+[-\d]+)', original_filename)
    return found.group(1) if found else None
def download_resource(rid, output_dir, worker_bar, overall_bar, max_retries=3):
    """Download a single resource given its RID, with retries and progress.

    Args:
        rid: Resource UUID used to build the download URL.
        output_dir: Path directory where the MP3 is written.
        worker_bar: tqdm bar showing this worker's byte progress.
        overall_bar: shared tqdm bar counting completed files.
        max_retries: Attempts before giving up on this resource.

    Returns:
        Dict with "status" of "success", "skipped" or "failed", plus the
        filename/rid (and "error" on failure).
    """
    download_url = f"http://www.blcup.com/Common/DownRes?doi={rid}"
    for attempt in range(max_retries):
        try:
            # Fresh session per attempt; context managers guarantee the
            # connection is released on every exit path (the original called
            # session.close() manually, which leaked if a bar update raised
            # after the try block returned).
            with requests.Session() as session:
                with session.get(download_url, timeout=30, stream=True) as response:
                    response.raise_for_status()
                    # Try to get filename from Content-Disposition header
                    original_filename = None
                    if 'Content-Disposition' in response.headers:
                        cd = response.headers['Content-Disposition']
                        filename_match = re.findall('filename="?([^"]+)"?', cd)
                        if filename_match:
                            # Decode URL-encoded filename
                            original_filename = unquote(filename_match[0])
                    # Extract exercise number from original filename
                    exercise_number = None
                    if original_filename:
                        exercise_number = extract_exercise_number(original_filename)
                    # Use exercise number as filename, fallback to rid
                    if exercise_number:
                        filename = f"{exercise_number}.mp3"
                        display_name = exercise_number
                    else:
                        filename = f"{rid}.mp3"
                        display_name = rid[:8]
                    # Expected file size from server (0 if header missing)
                    total_size = int(response.headers.get('content-length', 0))
                    output_path = output_dir / filename
                    # Skip files already downloaded with the correct size
                    if output_path.exists():
                        local_size = output_path.stat().st_size
                        if total_size > 0 and local_size == total_size:
                            with stats_lock:
                                stats["successful"] += 1
                            worker_bar.set_postfix_str(f"{display_name} (OK)")
                            overall_bar.update(1)
                            return {"status": "skipped", "filename": filename, "rid": rid}
                        else:
                            # File exists but wrong size, will re-download
                            worker_bar.set_postfix_str(f"{display_name} (redownload)")
                    # Reset worker bar for the new file
                    worker_bar.reset(total=total_size if total_size > 0 else 100)
                    retry_info = f" (retry {attempt+1}/{max_retries})" if attempt > 0 else ""
                    worker_bar.set_postfix_str(f"📥 {display_name}{retry_info}")
                    # Stream the body to disk, advancing the worker bar per chunk.
                    # NOTE(review): a failed attempt can leave a truncated file
                    # behind; the size check above makes the next run redo it.
                    with open(output_path, 'wb') as f:
                        if total_size > 0:
                            for chunk in response.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)
                                    worker_bar.update(len(chunk))
                        else:
                            # No content-length: read everything in one go
                            content = response.content
                            f.write(content)
                            worker_bar.update(len(content))
                    with stats_lock:
                        stats["successful"] += 1
                    worker_bar.set_postfix_str(f"{display_name}")
                    overall_bar.update(1)
                    return {"status": "success", "filename": filename, "rid": rid}
        except Exception as e:
            if attempt < max_retries - 1:
                # Will retry after a short pause
                worker_bar.set_postfix_str(f"⟳ Retry {attempt+2}/{max_retries}...")
                time.sleep(1)
            else:
                # Final failure
                with stats_lock:
                    stats["failed"] += 1
                worker_bar.set_postfix_str(f"✗ Failed after {max_retries} tries")
                overall_bar.update(1)
                return {"status": "failed", "rid": rid, "error": str(e)}
    return {"status": "failed", "rid": rid, "error": "Max retries reached"}
def worker_thread(worker_id, task_queue, output_dir, worker_bar, overall_bar):
    """Consume download tasks from the queue until the poison pill arrives.

    Args:
        worker_id: Numeric id of this worker (unused inside the loop).
        task_queue: Queue of rid strings, terminated by a None sentinel.
        output_dir: Directory passed through to download_resource.
        worker_bar: This worker's tqdm progress bar.
        overall_bar: Shared tqdm bar counting completed files.
    """
    # iter() with a sentinel stops cleanly on the poison pill (None); the
    # sentinel itself is consumed without a matching task_done() call, just
    # like the explicit while-loop form.
    for rid in iter(task_queue.get, None):
        download_resource(rid, output_dir, worker_bar, overall_bar)
        task_queue.task_done()
def main():
    """Fetch the resource list and download every MP3 with a worker pool.

    Spawns MAX_WORKERS threads, each with its own tqdm byte-progress bar,
    fed from a single queue of resource IDs; prints a summary at the end.
    """
    print("=" * 70)
    print("BLCUP Audio Resource Downloader")
    print("发展汉语第2版中级听力I")
    print("=" * 70)
    print()
    # Create output directory
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    print(f"📁 Download directory: {DOWNLOAD_DIR.absolute()}")
    print()
    # Fetch the main page
    print("🌐 Fetching resource list from main page...")
    try:
        response = requests.get(MAIN_PAGE_URL, timeout=10)
        response.raise_for_status()
        html_content = response.text
    except Exception as e:
        print(f"❌ Error fetching main page: {e}")
        return
    # Extract all resource IDs
    resource_ids = extract_resource_ids(html_content)
    total_resources = len(resource_ids)
    # Report files already on disk (the actual per-file skip decision is
    # made inside download_resource by comparing sizes).
    existing_files = list(DOWNLOAD_DIR.glob("*.mp3")) if DOWNLOAD_DIR.exists() else []
    already_downloaded = len(existing_files)
    print(f"✓ Found {total_resources} resources to download")
    if already_downloaded > 0:
        print(f"{already_downloaded} files already exist (will skip)")
    print(f"⚡ Using {MAX_WORKERS} concurrent workers")
    print(f"🔄 Auto-retry up to 3 times on failure")
    print()
    if total_resources == 0:
        print("❌ No resources found. Please check the URL.")
        return
    # Download all resources with concurrent workers
    print("📥 Starting downloads with worker progress bars...")
    print()
    # Overall progress bar occupies terminal row 0; one row per worker below.
    overall_bar = tqdm(
        total=total_resources,
        desc="Overall ",
        position=0,
        unit="file",
        ncols=100,
        colour="green",
        leave=True
    )
    # Create a byte-progress bar for each worker
    worker_bars = []
    for i in range(MAX_WORKERS):
        bar = tqdm(
            total=100,
            desc=f"Worker {i+1:2d} ",
            position=i+1,
            leave=True,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            ncols=100,
            colour="cyan"
        )
        worker_bars.append(bar)
    # Fill the task queue, then add one poison pill (None) per worker so
    # every thread's loop terminates.
    task_queue = Queue()
    for rid in resource_ids:
        task_queue.put(rid)
    for _ in range(MAX_WORKERS):
        task_queue.put(None)
    # Start worker threads (Thread is imported at module level alongside Lock)
    threads = []
    for i in range(MAX_WORKERS):
        t = Thread(target=worker_thread, args=(i, task_queue, DOWNLOAD_DIR, worker_bars[i], overall_bar))
        t.start()
        threads.append(t)
    # Wait for all threads to complete
    for t in threads:
        t.join()
    # Close all bars
    for bar in worker_bars:
        bar.close()
    overall_bar.close()
    # Summary
    print()
    print("=" * 70)
    print("📊 Download Summary:")
    print(f" ✓ Successful: {stats['successful']}")
    print(f" ✗ Failed: {stats['failed']}")
    print(f" 📁 Files saved in: {DOWNLOAD_DIR.absolute()}")
    print("=" * 70)
    print()
    print("✅ Done!")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()