#!/usr/bin/env python3
"""Diagnostic script to understand the alignment issue.

Prints precise coordinates to find the correction pattern.

The script reads the first page of an OCR JSON export, keeps the entries that
have both a ``bbox`` and a ``text``, rasterises the first page of the matching
PDF at 300 dpi, and prints the Y-coordinate arithmetic for the top, middle and
bottom text blocks (ordered by the smallest Y value of each bbox).
"""

import json

json_path = "Raw/DevelopChinese/OCR/听力Intro.json"
pdf_path = "Raw/DevelopChinese/PDF/听力Intro.pdf"


def select_details(page):
    """Return the entries of ``page['details']`` with a truthy bbox and text."""
    return [d for d in page['details'] if d.get('bbox') and d.get('text')]


def bbox_y_bounds(bbox):
    """Return ``(min_y, max_y)`` over the points of *bbox*.

    Each point is indexed as ``point[1]`` for its Y value.
    """
    ys = [point[1] for point in bbox]
    return min(ys), max(ys)


def estimate_ocr_size(details):
    """Return ``(width, height)`` estimated from the bbox extents.

    The largest X and Y found in any bbox are padded by 2% and truncated to
    int, exactly as the original inline computation did.

    Raises:
        ValueError: if *details* is empty (``max()`` of an empty sequence).
    """
    max_x = max(point[0] for d in details for point in d['bbox'])
    max_y = max(point[1] for d in details for point in d['bbox'])
    return int(max_x * 1.02), int(max_y * 1.02)


def main():
    """Run the diagnostic and print the report to stdout.

    Raises:
        SystemExit: if the first page has no usable OCR blocks, or the
            estimated OCR height is not positive (the scaling formula would
            divide by zero).
    """
    # Imported here rather than at module level so the pure helpers above can
    # be imported without the PDF rasterisation toolchain being installed.
    from pdf2image import convert_from_path

    # Load JSON. The encoding is explicit because the OCR text is Chinese and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)

    details = select_details(data['pages'][0])
    if not details:
        raise SystemExit(
            f"No OCR blocks with both 'bbox' and 'text' on the first page of {json_path}"
        )

    # Sort by Y coordinate (smallest Y of each bbox first).
    details_sorted = sorted(details, key=lambda d: bbox_y_bounds(d['bbox'])[0])

    # Convert PDF (first page only).
    imgs = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=1)
    img_width, img_height = imgs[0].size

    # Get OCR dimensions.
    ocr_width, ocr_height = estimate_ocr_size(details)
    if ocr_height <= 0:
        raise SystemExit(
            f"Estimated OCR height is {ocr_height}; cannot scale Y coordinates"
        )

    print(f"Image dimensions: {img_width} x {img_height}")
    print(f"OCR dimensions: {ocr_width} x {ocr_height}")
    print(f"\n{'='*80}")
    print("BBOX COORDINATES ANALYSIS")
    print(f"{'='*80}\n")

    # Using OCR dims as PDF dims.
    # NOTE(review): because pdf_height equals ocr_height here, the "current"
    # and "expected" values below are algebraically identical, so the printed
    # difference is ~0 for every block. Substitute the real PDF page height
    # if the goal is to expose a scaling error.
    pdf_height = float(ocr_height)

    # Analyze top, middle, bottom blocks.
    test_indices = [0, len(details_sorted) // 2, -1]
    positions = ["TOP", "MIDDLE", "BOTTOM"]

    for idx, pos in zip(test_indices, positions):
        detail = details_sorted[idx]
        text = detail['text'][:30]

        # Get bbox bounds.
        block_min_y, block_max_y = bbox_y_bounds(detail['bbox'])

        print(f"{pos} BLOCK: \"{text}\"")
        print(f" OCR Y coordinates: min_y={block_min_y}, max_y={block_max_y}")

        # Current formula (what we're using):
        # y_pdf = pdf_height - ((max_y / ocr_height) * pdf_height)
        y_current = pdf_height - ((block_max_y / ocr_height) * pdf_height)

        print(f" Current calculation: y_pdf = {pdf_height} - (({block_max_y}/{ocr_height}) * {pdf_height})")
        print(f" Current result: y_pdf = {y_current:.1f}")

        # Where SHOULD it be? (assuming 1:1 mapping)
        # In OCR space: text is at Y = min_y to max_y
        # In PDF space (origin bottom-left): text should be at
        # Y = (pdf_height - max_y) to (pdf_height - min_y)
        y_should_be = pdf_height - block_max_y

        print(f" Expected (1:1): y_pdf = {pdf_height} - {block_max_y} = {y_should_be:.1f}")
        print(f" Difference: {y_current - y_should_be:.1f} points")
        print()

    print(f"{'='*80}")
    print("DIAGNOSTIC QUESTIONS:")
    print(f"{'='*80}\n")
    print("If all differences are ~0, the formula is correct but something else is wrong.")
    print("If TOP has positive diff and BOTTOM has negative diff, there's a scaling issue.")
    print("If all have the same diff, there's an offset issue.")
    print("\nLook at the PDF in debug mode and tell me:")
    print("1. For TOP block: red box is HOW MANY pixels too high/low?")
    print("2. For MIDDLE block: red box is HOW MANY pixels too high/low?")
    print("3. For BOTTOM block: red box is HOW MANY pixels too high/low?")


if __name__ == "__main__":
    main()