#!/usr/bin/env python3
"""
Test different Y offsets to find the correct one.

Renders the first page of the source PDF to an image, then writes a
multi-page PDF in which every page shows that image overlaid with the OCR
bounding boxes and text, shifted vertically by a different offset.
Browse the result to see which offset makes the boxes line up.
"""

import json
import os
import tempfile

from pdf2image import convert_from_path
from reportlab.lib.colors import blue, red
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas

# Input / output locations.
OCR_JSON_PATH = 'Raw/DevelopChinese/OCR/听力Intro.json'
SOURCE_PDF_PATH = 'Raw/DevelopChinese/PDF/听力Intro.pdf'
OUTPUT_PDF_PATH = 'test_Y_OFFSETS.pdf'

# Font used for the OCR text overlay; Helvetica is the fallback.
CJK_FONT_NAME = 'CF'
CJK_FONT_PATH = 'C:/Windows/Fonts/msyh.ttc'
FALLBACK_FONT_NAME = 'Helvetica'

RENDER_DPI = 300
TEST_OFFSETS = [-200, -150, -100, -50, 0, 50, 100, 150, 200]
MAX_BOXES_PER_PAGE = 30  # Limit to the first 30 boxes for clarity


def _register_font() -> str:
    """Register the CJK TrueType font and return the font name to use.

    Falls back to Helvetica when the font cannot be registered (for
    example when the font file is not present on this machine).
    """
    try:
        pdfmetrics.registerFont(TTFont(CJK_FONT_NAME, CJK_FONT_PATH))
    except Exception as exc:  # best-effort: any registration failure -> fallback
        print(f'Could not register {CJK_FONT_PATH} ({exc}); '
              f'falling back to {FALLBACK_FONT_NAME}')
        return FALLBACK_FONT_NAME
    return CJK_FONT_NAME


def _bbox_to_rect(bbox, page_height, offset):
    """Convert a four-point bbox into an ``(x, y, width, height)`` rectangle.

    ``bbox`` is a sequence of points indexed as ``[x, y]``. The rectangle's
    lower edge is placed at ``page_height - max(y) + offset``.
    """
    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]
    left = min(xs)
    bottom_edge = max(ys)
    # NOTE(review): presumably OCR y grows downward from the top of the image
    # while the PDF canvas y grows upward, hence the flip -- confirm against
    # the OCR tool's output format.
    y = page_height - bottom_edge + offset
    return left, y, max(xs) - left, bottom_edge - min(ys)


def _draw_test_page(c, image_path, page_w, page_h, details, offset, font):
    """Draw one test page: background image, title, and offset OCR boxes."""
    c.setPageSize((page_w, page_h))

    # Draw image
    c.drawImage(ImageReader(image_path), 0, 0, width=page_w, height=page_h)

    # Draw title
    c.setFont('Helvetica', 20)
    c.setFillColorRGB(1, 0, 0)
    c.drawString(50, page_h - 50, f'Y OFFSET: {offset:+d} pixels')

    # Box outline and text colour are the same for every box on the page.
    c.setStrokeColor(red)
    c.setLineWidth(0.5)
    c.setFillColor(blue)

    drawn = 0
    for detail in details:
        if drawn >= MAX_BOXES_PER_PAGE:
            break
        text = (detail.get('text') or '').strip()
        if not text:
            continue
        bbox = detail.get('bbox') or []
        if len(bbox) != 4:
            continue

        x, y, width, height = _bbox_to_rect(bbox, page_h, offset)
        c.rect(x, y, width, height, stroke=1, fill=0)

        # Scale the text to the box height; never below 1 so the size stays valid.
        c.setFont(font, max(1, height * 0.7))
        c.drawString(x, y, text)
        drawn += 1

    c.showPage()


def main() -> None:
    """Build the multi-page Y-offset comparison PDF."""
    font = _register_font()

    # Load OCR results. The file contains non-ASCII text, so read it as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(OCR_JSON_PATH, encoding='utf-8') as f:
        data = json.load(f)
    details = data['pages'][0]['details']

    # Convert the first PDF page to an image.
    pages = convert_from_path(SOURCE_PDF_PATH, dpi=RENDER_DPI,
                              first_page=1, last_page=1)
    img = pages[0]
    w, h = img.size
    print(f'Image: {w}x{h}')

    # Reserve a temp file name, closing the handle before writing to it by name.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
        tmp_path = tmp.name
    try:
        # Save image once
        img.save(tmp_path, 'PNG')

        # Create PDF with one test page per offset
        c = canvas.Canvas(OUTPUT_PDF_PATH)
        for offset in TEST_OFFSETS:
            _draw_test_page(c, tmp_path, w, h, details, offset, font)
            print(f'Page with offset {offset:+d} done')
        c.save()
    finally:
        # Remove the temp image even if rendering failed part-way.
        os.unlink(tmp_path)

    print(f'\nCreated test_Y_OFFSETS.pdf with {len(TEST_OFFSETS)} pages')
    print('Browse through pages to find which offset aligns boxes correctly!')
    print('Offsets tested:', TEST_OFFSETS)


if __name__ == '__main__':
    main()