#!/usr/bin/env python3
"""
Test different Y scaling factors to find the correct one.

The issue: center is aligned but top/bottom are off progressively.
This suggests the OCR coordinate space has a different Y scale than the image.

The script renders the first page of the source PDF at 300 dpi and writes
``test_Y_SCALING.pdf`` with one page per candidate OCR height.  Each page shows
the rendered image with the first OCR boxes (red outline, blue text) drawn on
top, so the candidate giving the best alignment can be picked by eye.
"""
import json
import os
import tempfile

from pdf2image import convert_from_path
from reportlab.lib.colors import blue, red
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas

# Only the first boxes are drawn on each page to keep the overlay readable.
MAX_BOXES_PER_PAGE = 30


def _bbox_to_pdf_rect(bbox, ocr_h, page_h):
    """Return ``(x, y, width, height)`` of *bbox* in PDF page coordinates.

    *bbox* is a sequence of points indexed as ``p[0]`` (x) and ``p[1]`` (y).
    X values are used unchanged.  Y values are rescaled from an assumed OCR
    height *ocr_h* to the page height *page_h* and flipped, because the
    ReportLab canvas origin is at the bottom-left while the OCR Y values are
    treated here as measured from the top of the page.
    """
    xs = [p[0] for p in bbox]
    ys = [p[1] for p in bbox]
    x = min(xs)
    width = max(xs) - x

    y_ocr_max = max(ys)
    y_ocr_min = min(ys)

    # The largest OCR Y is the lowest edge on the page, i.e. the rectangle's
    # bottom edge once flipped into PDF space.
    y_pdf = page_h - ((y_ocr_max / ocr_h) * page_h)
    height = ((y_ocr_max - y_ocr_min) / ocr_h) * page_h
    return x, y_pdf, width, height


# Font
try:
    pdfmetrics.registerFont(TTFont('CF', 'C:/Windows/Fonts/msyh.ttc'))
    font = 'CF'
except Exception as exc:  # best effort: font file missing or unreadable
    # Helvetica has no CJK glyphs, so say so instead of falling back silently.
    print(f'Could not register CJK font ({exc}); falling back to Helvetica')
    font = 'Helvetica'

# Load
# Binary mode lets the json module detect the encoding (UTF-8/16/32) itself
# instead of relying on the platform's default text encoding.
with open('Raw/DevelopChinese/OCR/听力Intro.json', 'rb') as f:
    data = json.load(f)

# Convert
imgs = convert_from_path('Raw/DevelopChinese/PDF/听力Intro.pdf', dpi=300, first_page=1, last_page=1)
img = imgs[0]
w, h = img.size

print(f'Image: {w}x{h}')

# Find actual OCR max Y
details = data['pages'][0]['details']
max_y_ocr = max(
    (p[1] for d in details if d.get('bbox') for p in d['bbox']),
    default=0,  # no bounding boxes at all: report 0 rather than crash
)
print(f'OCR max Y: {max_y_ocr}')

# Reserve a temp file name; the handle is closed before the image is written
# so the path can be reopened on every platform.
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
    tmp_path = tmp.name

# Test different OCR heights (scale factors)
# If center is good but extremes are off, we need to find the right OCR height
test_ocr_heights = [3200, 3300, 3366, 3400, 3433, 3500, 3600, 3700]

try:
    # Save image
    img.save(tmp_path, 'PNG')

    # Create PDF
    c = canvas.Canvas('test_Y_SCALING.pdf')

    # The background is identical on every page, so read it once.
    page_image = ImageReader(tmp_path)

    for ocr_h in test_ocr_heights:
        c.setPageSize((w, h))
        c.drawImage(page_image, 0, 0, width=w, height=h)

        # Draw title
        c.setFont('Helvetica', 20)
        c.setFillColorRGB(1, 0, 0)
        scale = h / ocr_h
        c.drawString(50, h - 50, f'OCR_HEIGHT: {ocr_h} (scale Y: {scale:.4f})')

        # Draw boxes with this scaling
        count = 0
        for d in details:
            if count >= MAX_BOXES_PER_PAGE:
                break

            text = d.get('text', '').strip()
            if not text:
                continue

            bbox = d.get('bbox')
            if not bbox or len(bbox) != 4:
                continue

            # Scale Y from OCR space (0-ocr_h) to PDF space (0-h)
            x, y_pdf, width, height = _bbox_to_pdf_rect(bbox, ocr_h, h)

            c.setStrokeColor(red)
            c.setLineWidth(0.5)
            c.rect(x, y_pdf, width, height, stroke=1, fill=0)

            # Text size follows the box height; never below 1pt.
            font_size = max(1, height * 0.7)
            c.setFont(font, font_size)
            c.setFillColor(blue)
            c.drawString(x, y_pdf, text)

            count += 1

        c.showPage()
        print(f'Page with OCR height {ocr_h} done (scale: {scale:.4f})')

    c.save()
finally:
    # Remove the temp image even when rendering fails part-way.
    os.unlink(tmp_path)

print(f'\nCreated test_Y_SCALING.pdf with {len(test_ocr_heights)} pages')
print('Find which OCR height gives best alignment everywhere!')
print(f'Current OCR max Y from data: {max_y_ocr}')
print(f'Image height: {h}')