#!/usr/bin/env python3
"""Download all audio resources from the BLCUP website.

Fetches the resource listing page for "Developing Chinese Intermediate
Listening I" (185 MP3 files), extracts every resource ID, and downloads
the files concurrently (up to MAX_WORKERS threads) with one tqdm
progress bar per worker plus an overall progress bar.
"""

import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed  # kept: may be used elsewhere
from pathlib import Path
from queue import Queue
from threading import Lock, Thread
from urllib.parse import unquote

import requests
from tqdm import tqdm

# Configuration
MAIN_PAGE_URL = "http://www.blcup.com/MobileResSeries?rid=23fea77b-c66b-4bbe-bfcd-b3c784fb2a79"
DOWNLOAD_DIR = Path("audio_resources/intermediate_listening_1")
MAX_WORKERS = 20  # Maximum concurrent downloads

# Thread-safe counters shared by all worker threads.
stats_lock = Lock()
stats = {"successful": 0, "failed": 0}


def extract_resource_ids(html_content):
    """Extract all resource IDs from the HTML page.

    Args:
        html_content: Raw HTML text of the series listing page.

    Returns:
        De-duplicated list of UUID strings found in
        ``MobileResource?rid=<uuid>`` links.
    """
    # Pattern to match: MobileResource?rid=UUID
    pattern = r'MobileResource\?rid=([a-f0-9\-]+)'
    resource_ids = re.findall(pattern, html_content, re.IGNORECASE)
    return list(set(resource_ids))  # Remove duplicates


def extract_exercise_number(original_filename):
    """Extract an exercise number like ``1-1``, ``6-1``, ``10-3-1`` from a filename.

    Args:
        original_filename: Filename reported by the server (may carry a
            "Recording" prefix or other decoration).

    Returns:
        The matched digit-and-dash token, or ``None`` if no match.
    """
    # Matches runs of digits joined by dashes, e.g. "1-1", "1-2", "10-3-1".
    match = re.search(r'(\d+[-\d]+)', original_filename)
    if match:
        return match.group(1)
    return None


def download_resource(rid, output_dir, worker_bar, overall_bar, max_retries=3):
    """Download a single resource given its RID, with retries and progress.

    Already-downloaded files whose size matches the server's
    Content-Length are skipped (counted as successful).

    Args:
        rid: Resource UUID to download.
        output_dir: Directory (Path) to write the MP3 into.
        worker_bar: Per-worker tqdm bar showing byte-level progress.
        overall_bar: Shared tqdm bar counting completed files.
        max_retries: Number of attempts before giving up.

    Returns:
        Dict with a ``status`` key of ``"success"``, ``"skipped"`` or
        ``"failed"`` plus identifying fields.
    """
    download_url = f"http://www.blcup.com/Common/DownRes?doi={rid}"

    for attempt in range(max_retries):
        # Fresh session per attempt; the context manager guarantees the
        # connection is released on every exit path (the original closed
        # sessions by hand on each branch).
        with requests.Session() as session:
            try:
                response = session.get(download_url, timeout=30, stream=True)
                response.raise_for_status()

                # Try to get the original filename from Content-Disposition.
                original_filename = None
                if 'Content-Disposition' in response.headers:
                    cd = response.headers['Content-Disposition']
                    filename_match = re.findall(r'filename="?([^"]+)"?', cd)
                    if filename_match:
                        # Server sends URL-encoded filenames; decode them.
                        original_filename = unquote(filename_match[0])

                # Derive a human-friendly exercise number when possible.
                exercise_number = None
                if original_filename:
                    exercise_number = extract_exercise_number(original_filename)

                # Use the exercise number as filename; fall back to the RID.
                if exercise_number:
                    filename = f"{exercise_number}.mp3"
                    display_name = exercise_number
                else:
                    filename = f"{rid}.mp3"
                    display_name = rid[:8]

                # Expected file size from the server (0 if not reported).
                total_size = int(response.headers.get('content-length', 0))

                # Skip files already present with the correct size.
                output_path = output_dir / filename
                if output_path.exists():
                    local_size = output_path.stat().st_size
                    if total_size > 0 and local_size == total_size:
                        with stats_lock:
                            stats["successful"] += 1
                        worker_bar.set_postfix_str(f"βŠ™ {display_name} (OK)")
                        overall_bar.update(1)
                        return {"status": "skipped", "filename": filename, "rid": rid}
                    # File exists but has the wrong size: re-download it.
                    worker_bar.set_postfix_str(f"⚠ {display_name} (redownload)")

                # Reset the worker bar for this file (100 is a placeholder
                # total when the server omits Content-Length).
                worker_bar.reset(total=total_size if total_size > 0 else 100)
                retry_info = f" (retry {attempt+1}/{max_retries})" if attempt > 0 else ""
                worker_bar.set_postfix_str(f"πŸ“₯ {display_name}{retry_info}")

                # Stream the body to disk, updating the per-worker bar.
                with open(output_path, 'wb') as f:
                    if total_size > 0:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                                worker_bar.update(len(chunk))
                    else:
                        # Unknown length: read everything in one go.
                        content = response.content
                        f.write(content)
                        worker_bar.update(len(content))

                with stats_lock:
                    stats["successful"] += 1
                worker_bar.set_postfix_str(f"βœ“ {display_name}")
                overall_bar.update(1)
                return {"status": "success", "filename": filename, "rid": rid}

            except Exception as e:
                if attempt < max_retries - 1:
                    # Transient failure: brief pause, then retry.
                    worker_bar.set_postfix_str(f"⟳ Retry {attempt+2}/{max_retries}...")
                    time.sleep(1)
                else:
                    # Final failure after exhausting all attempts.
                    with stats_lock:
                        stats["failed"] += 1
                    worker_bar.set_postfix_str(f"βœ— Failed after {max_retries} tries")
                    overall_bar.update(1)
                    return {"status": "failed", "rid": rid, "error": str(e)}

    return {"status": "failed", "rid": rid, "error": "Max retries reached"}


def worker_thread(worker_id, task_queue, output_dir, worker_bar, overall_bar):
    """Worker loop: pull RIDs from the queue until a poison pill arrives.

    Args:
        worker_id: Index of this worker (unused; kept for signature compat).
        task_queue: Queue of resource IDs; ``None`` signals shutdown.
        output_dir: Directory to download into.
        worker_bar: This worker's tqdm bar.
        overall_bar: Shared overall tqdm bar.
    """
    while True:
        rid = task_queue.get()
        if rid is None:
            # Poison pill: stop this worker.
            break
        try:
            download_resource(rid, output_dir, worker_bar, overall_bar)
        finally:
            # Always mark the task done, even if download_resource raises
            # unexpectedly, so queue accounting stays correct.
            task_queue.task_done()


def main():
    """Entry point: fetch the resource list and download everything."""
    print("=" * 70)
    print("BLCUP Audio Resource Downloader")
    print("ε‘ε±•ζ±‰θ―­οΌˆη¬¬2η‰ˆοΌ‰δΈ­ηΊ§ε¬εŠ›οΌˆIοΌ‰")
    print("=" * 70)
    print()

    # Create output directory
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    print(f"πŸ“ Download directory: {DOWNLOAD_DIR.absolute()}")
    print()

    # Fetch the main page
    print("🌐 Fetching resource list from main page...")
    try:
        response = requests.get(MAIN_PAGE_URL, timeout=10)
        response.raise_for_status()
        html_content = response.text
    except Exception as e:
        print(f"❌ Error fetching main page: {e}")
        return

    # Extract all resource IDs
    resource_ids = extract_resource_ids(html_content)
    total_resources = len(resource_ids)

    # Check for already downloaded files
    existing_files = list(DOWNLOAD_DIR.glob("*.mp3")) if DOWNLOAD_DIR.exists() else []
    already_downloaded = len(existing_files)

    print(f"βœ“ Found {total_resources} resources to download")
    if already_downloaded > 0:
        print(f"βŠ™ {already_downloaded} files already exist (will skip)")
    print(f"⚑ Using {MAX_WORKERS} concurrent workers")
    print(f"πŸ”„ Auto-retry up to 3 times on failure")
    print()

    if total_resources == 0:
        print("❌ No resources found. Please check the URL.")
        return

    # Download all resources with concurrent workers
    print("πŸ“₯ Starting downloads with worker progress bars...")
    print()

    # Overall progress bar occupies terminal row 0.
    overall_bar = tqdm(
        total=total_resources,
        desc="Overall  ",
        position=0,
        unit="file",
        ncols=100,
        colour="green",
        leave=True,
    )

    # One byte-scaled bar per worker, stacked below the overall bar.
    worker_bars = []
    for i in range(MAX_WORKERS):
        bar = tqdm(
            total=100,
            desc=f"Worker {i+1:2d} ",
            position=i + 1,
            leave=True,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            ncols=100,
            colour="cyan",
        )
        worker_bars.append(bar)

    # Fill the queue with work, then one poison pill per worker so every
    # thread terminates after the real tasks are drained.
    task_queue = Queue()
    for rid in resource_ids:
        task_queue.put(rid)
    for _ in range(MAX_WORKERS):
        task_queue.put(None)

    # Start worker threads
    threads = []
    for i in range(MAX_WORKERS):
        t = Thread(
            target=worker_thread,
            args=(i, task_queue, DOWNLOAD_DIR, worker_bars[i], overall_bar),
        )
        t.start()
        threads.append(t)

    # Wait for all threads to complete
    for t in threads:
        t.join()

    # Close all bars
    for bar in worker_bars:
        bar.close()
    overall_bar.close()

    # Summary
    print()
    print("=" * 70)
    print("πŸ“Š Download Summary:")
    print(f"   βœ“ Successful: {stats['successful']}")
    print(f"   βœ— Failed: {stats['failed']}")
    print(f"   πŸ“ Files saved in: {DOWNLOAD_DIR.absolute()}")
    print("=" * 70)
    print()
    print("βœ… Done!")


if __name__ == "__main__":
    main()