#!/usr/bin/env python3
"""
ULTRA-SIMPLE version to test basic alignment.
No fancy corrections, just raw bbox to PDF mapping.
"""
import json
import sys
from pathlib import Path

from PIL import Image
from pdf2image import convert_from_path
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.colors import red, blue
import tempfile
import os

# Register Chinese font (fall back to Helvetica if unavailable)
try:
    pdfmetrics.registerFont(TTFont('ChineseFont', 'C:/Windows/Fonts/msyh.ttc'))
    font_name = 'ChineseFont'
except Exception:
    font_name = 'Helvetica'
    print("Warning: No Chinese font, using Helvetica")

# Paths
json_path = "Raw/DevelopChinese/OCR/听力Intro.json"
pdf_path = "Raw/DevelopChinese/PDF/听力Intro.pdf"
output_path = "test_SIMPLE.pdf"

# Load OCR JSON (UTF-8: the file contains Chinese text)
with open(json_path, encoding='utf-8') as f:
    data = json.load(f)

# Convert PDF to image at fixed 300 DPI
print("Converting PDF to image at 300 DPI...")
images = convert_from_path(pdf_path, dpi=300)
img = images[0]  # First page only
img_width, img_height = img.size
print(f"Image size: {img_width} x {img_height}")

# Create PDF with same dimensions as image (1:1 ratio)
c = canvas.Canvas(output_path)
c.setPageSize((img_width, img_height))

# Save and draw image
# (close the temp file before img.save() so Windows allows reopening the path)
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
    tmp_path = tmp.name
img.save(tmp_path, 'PNG')

img_reader = ImageReader(tmp_path)
c.drawImage(img_reader, 0, 0, width=img_width, height=img_height)

# Get page data
page_data = data['pages'][0]
details = page_data.get('details', [])

# Find max coordinates from ALL bboxes (this is our "OCR space")
max_x_ocr = 0
max_y_ocr = 0
for detail in details:
    bbox = detail.get('bbox', [])
    if bbox and len(bbox) == 4:
        for point in bbox:
            max_x_ocr = max(max_x_ocr, point[0])
            max_y_ocr = max(max_y_ocr, point[1])

if max_x_ocr == 0 or max_y_ocr == 0:
    sys.exit("No bounding boxes found in OCR data.")

print(f"OCR space: 0-{max_x_ocr} x 0-{max_y_ocr}")
print(f"PDF/Image space: 0-{img_width} x 0-{img_height}")
print(f"Scale factors: X={img_width/max_x_ocr:.4f}, Y={img_height/max_y_ocr:.4f}")

# Draw bboxes
text_count = 0
for detail in details:
    text = detail.get('text', '').strip()
    if not text:
        continue

    bbox = detail.get('bbox', [])
    if not bbox or len(bbox) != 4:
        continue

    # Get bbox bounds in OCR space
    xs = [p[0] for p in bbox]
    ys = [p[1] for p in bbox]
    min_x = min(xs)
    max_x = max(xs)
    min_y = min(ys)
    max_y = max(ys)

    # SIMPLE mapping: scale from OCR space to PDF space
    x_pdf = (min_x / max_x_ocr) * img_width
    width_pdf = ((max_x - min_x) / max_x_ocr) * img_width
    # Y is inverted (PDF origin at bottom)
    y_pdf = img_height - ((max_y / max_y_ocr) * img_height)
    height_pdf = ((max_y - min_y) / max_y_ocr) * img_height

    # Draw red box
    c.setStrokeColor(red)
    c.setLineWidth(0.5)
    c.rect(x_pdf, y_pdf, width_pdf, height_pdf, stroke=1, fill=0)

    # Draw blue text
    font_size = max(1, height_pdf * 0.7)
    c.setFont(font_name, font_size)
    c.setFillColor(blue)
    c.drawString(x_pdf, y_pdf, text)
    text_count += 1

print(f"Added {text_count} text blocks")

c.showPage()
c.save()

# Cleanup
os.unlink(tmp_path)

print(f"\nDone! Check {output_path}")
print("If boxes are STILL misaligned, the problem is in the OCR data itself.")