chineseclass/tools/test_y_scaling.py
StillHammer a61a32b57f Reorganize repository structure
- Move all Python scripts to tools/ directory
- Move documentation files to docs/ directory
- Create exams/ and homework/ directories for future use
- Remove temporary test file (page1_preview.png)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-27 23:28:39 +08:00

106 lines
3.0 KiB
Python

#!/usr/bin/env python3
"""
Test different Y scaling factors to find the correct one.
The issue: center is aligned but top/bottom are off progressively.
This suggests the OCR coordinate space has a different Y scale than the image.
"""
import json
from pdf2image import convert_from_path
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from reportlab.lib.colors import red, blue
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import tempfile
import os
# Font
# Try to register the TrueType collection at C:/Windows/Fonts/msyh.ttc under
# the alias 'CF' so the OCR text can be drawn with it. The path only exists on
# Windows, so registration is best-effort: on any ordinary failure the script
# falls back to the built-in 'Helvetica' font instead of aborting.
# NOTE(review): Helvetica presumably cannot render the Chinese OCR text, so
# overlay labels may be unreadable after a fallback -- confirm.
try:
    pdfmetrics.registerFont(TTFont('CF', 'C:/Windows/Fonts/msyh.ttc'))
except Exception as exc:
    # `Exception` rather than a bare `except:` so Ctrl-C (KeyboardInterrupt)
    # and SystemExit are not swallowed by the fallback.
    print(f'Could not register font CF ({exc}); falling back to Helvetica')
    font = 'Helvetica'
else:
    font = 'CF'
# Load
# Parse the OCR result for the document. The encoding is given explicitly
# because open() otherwise decodes with the locale's preferred encoding (a
# legacy code page on many Windows setups), which can corrupt or fail on
# UTF-8 encoded JSON containing non-ASCII text.
with open('Raw/DevelopChinese/OCR/听力Intro.json', encoding='utf-8') as f:
    data = json.load(f)
# Convert
# Rasterize only page 1 of the source PDF at 300 DPI; the result is a list
# of page images, of which the first is the page under test.
imgs = convert_from_path(
    'Raw/DevelopChinese/PDF/听力Intro.pdf',
    dpi=300,
    first_page=1,
    last_page=1,
)
img = imgs[0]

# Pixel dimensions of the rendered page; reused below as the PDF page size.
w, h = img.size
print(f'Image: {w}x{h}')
# Find actual OCR max Y
# `details` holds the OCR entries of the first page. The largest Y value over
# every point of every non-empty bbox is printed as a reference for choosing
# the OCR-space height.
details = data['pages'][0]['details']
max_y_ocr = max(
    point[1]
    for entry in details
    if entry.get('bbox')
    for point in entry['bbox']
)
print(f'OCR max Y: {max_y_ocr}')
# Save image
# Reserve a temporary .png path. The handle is closed as soon as the name is
# recorded so the image is written to a path that is not already open.
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
    tmp_path = tmp.name

# Test different OCR heights (scale factors)
# If center is good but extremes are off, we need to find the right OCR height
test_ocr_heights = [3200, 3300, 3366, 3400, 3433, 3500, 3600, 3700]

# Upper bound on the number of boxes overlaid on each page.
max_boxes = 30

# The boxes to overlay do not depend on the candidate height, so collect them
# once instead of re-deriving them for every page. Each item is
# (text, x, width, y_ocr_min, y_ocr_max) in OCR coordinates.
boxes = []
for d in details:
    if len(boxes) >= max_boxes:
        # Cap reached: stop scanning instead of skipping entry after entry.
        break
    text = d.get('text', '').strip()
    if not text:
        continue
    bbox = d.get('bbox', [])
    # Only 4-point boxes are drawn; missing or malformed ones are skipped
    # without counting towards the cap.
    if not bbox or len(bbox) != 4:
        continue
    xs = [p[0] for p in bbox]
    ys = [p[1] for p in bbox]
    x = min(xs)
    boxes.append((text, x, max(xs) - x, min(ys), max(ys)))

try:
    img.save(tmp_path, 'PNG')

    # Create PDF: one page per candidate OCR height, each showing the page
    # image with the OCR boxes and text overlaid using that height.
    c = canvas.Canvas('test_Y_SCALING.pdf')

    for ocr_h in test_ocr_heights:
        # Page is sized to the image so image pixels map 1:1 to PDF units.
        c.setPageSize((w, h))
        c.drawImage(ImageReader(tmp_path), 0, 0, width=w, height=h)

        # Draw title
        c.setFont('Helvetica', 20)
        c.setFillColorRGB(1, 0, 0)
        scale = h / ocr_h
        c.drawString(50, h - 50, f'OCR_HEIGHT: {ocr_h} (scale Y: {scale:.4f})')

        # Draw boxes with this scaling
        for text, x, width, y_ocr_min, y_ocr_max in boxes:
            # Scale Y from OCR space (0-ocr_h) to PDF space (0-h). The result
            # is subtracted from h because the OCR Y axis is treated as
            # growing downwards while the canvas Y axis grows upwards, so the
            # box's largest OCR Y becomes its lower edge on the page.
            # X is used unscaled: only the Y mapping is under test here.
            y_pdf = h - ((y_ocr_max / ocr_h) * h)
            height = ((y_ocr_max - y_ocr_min) / ocr_h) * h

            c.setStrokeColor(red)
            c.setLineWidth(0.5)
            c.rect(x, y_pdf, width, height, stroke=1, fill=0)

            # Text size follows the box height, never below 1.
            font_size = max(1, height * 0.7)
            c.setFont(font, font_size)
            c.setFillColor(blue)
            c.drawString(x, y_pdf, text)

        c.showPage()
        print(f'Page with OCR height {ocr_h} done (scale: {scale:.4f})')

    c.save()
finally:
    # Always remove the temporary image, even if rendering failed part-way.
    os.unlink(tmp_path)

print(f'\nCreated test_Y_SCALING.pdf with {len(test_ocr_heights)} pages')
print('Find which OCR height gives best alignment everywhere!')
print(f'Current OCR max Y from data: {max_y_ocr}')
print(f'Image height: {h}')