# chineseclass/json_to_pdf_SIMPLE.py

#!/usr/bin/env python3
"""
ULTRA-SIMPLE version to test basic alignment.
No fancy corrections, just raw bbox to PDF mapping.
"""

import json
import sys
from pathlib import Path
from PIL import Image
from pdf2image import convert_from_path
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.colors import red, blue
import tempfile
import os

# Register a TrueType font for the Chinese text overlay.
# The font file is looked up at a fixed Windows path (msyh.ttc in the Windows
# fonts directory); if registration fails for any reason we fall back to
# Helvetica so the script can still draw the boxes.
try:
    pdfmetrics.registerFont(TTFont('ChineseFont', 'C:/Windows/Fonts/msyh.ttc'))
    font_name = 'ChineseFont'
except Exception:
    # `except Exception` instead of a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit are not silently swallowed here.
    # NOTE(review): Helvetica presumably has no CJK glyphs, so the blue text
    # may be unreadable in fallback mode -- confirm if the text matters.
    font_name = 'Helvetica'
    print("Warning: No Chinese font, using Helvetica")

# Paths (relative to the current working directory)
json_path = "Raw/DevelopChinese/OCR/听力Intro.json"
pdf_path = "Raw/DevelopChinese/PDF/听力Intro.pdf"
output_path = "test_SIMPLE.pdf"

# Load JSON
# Decode explicitly instead of relying on the platform default encoding: on
# Windows the default is a legacy code page, which fails on (or garbles)
# UTF-8 encoded Chinese text. "utf-8-sig" reads plain UTF-8 and also skips a
# leading byte-order mark if the file has one.
with open(json_path, encoding="utf-8-sig") as f:
    data = json.load(f)

# Convert PDF to image at fixed 300 DPI
# Only the first page is used below, so ask pdf2image to render just that page
# instead of rasterizing the whole document at 300 DPI and discarding the rest.
print("Converting PDF to image at 300 DPI...")
images = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=1)
if not images:
    # Fail with a readable message rather than an IndexError on images[0].
    sys.exit(f"Error: no pages could be rendered from {pdf_path}")
img = images[0]  # First page only
img_width, img_height = img.size
print(f"Image size: {img_width} x {img_height}")

# Build the output canvas with a page measured in image pixels, so one PDF
# unit corresponds to exactly one pixel of the rendered page (1:1 ratio).
page_size = (img_width, img_height)
c = canvas.Canvas(output_path)
c.setPageSize(page_size)

# Dump the rendered page to a temporary PNG on disk. The file is created with
# delete=False, so it outlives this `with` block; its path is kept in
# tmp_path so it can be removed once the PDF has been written.
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as png_file:
    img.save(png_file.name, 'PNG')
    tmp_path = png_file.name

# Paint the page image as the background, anchored at the origin and
# stretched over the full page.
img_reader = ImageReader(tmp_path)
c.drawImage(img_reader, 0, 0, width=page_size[0], height=page_size[1])

# Get page data (first entry of 'pages'; only one page image was prepared)
page_data = data['pages'][0]
details = page_data.get('details', [])

# Find max coordinates from ALL bboxes (this is our "OCR space").
# Every corner of every 4-point bbox is considered, including entries whose
# text is empty. The extent is floored at 0, so it is 0 when there are no
# usable bboxes.
# NOTE(review): this treats the largest bbox corner as the size of the image
# the OCR ran on, which is only an approximation -- confirm against the OCR
# tool's real input dimensions if they are available.
corner_points = [
    point
    for detail in details
    if len(detail.get('bbox') or []) == 4
    for point in detail['bbox']
]
max_x_ocr = max([0] + [point[0] for point in corner_points])
max_y_ocr = max([0] + [point[1] for point in corner_points])

print(f"OCR space: 0-{max_x_ocr} x 0-{max_y_ocr}")
print(f"PDF/Image space: 0-{img_width} x 0-{img_height}")
if max_x_ocr > 0 and max_y_ocr > 0:
    print(f"Scale factors: X={img_width/max_x_ocr:.4f}, Y={img_height/max_y_ocr:.4f}")
else:
    # Without any usable bbox the extent is 0 and the ratio is undefined;
    # report that instead of crashing with ZeroDivisionError.
    print("Scale factors: n/a (no usable bounding boxes in OCR data)")

# Draw bboxes
# For every OCR entry with non-empty text and a 4-point bbox, draw a red
# outline around the bbox and the recognised text in blue at its bottom-left.

# Stroke colour, line width and fill colour are the same for every entry, so
# set them once instead of on every iteration (only the font size varies).
c.setStrokeColor(red)
c.setLineWidth(0.5)
c.setFillColor(blue)

# The mapping divides by the OCR extent; if it is zero there is nothing that
# can be placed meaningfully, so skip drawing rather than divide by zero.
can_map = max_x_ocr > 0 and max_y_ocr > 0

text_count = 0
for detail in (details if can_map else []):
    # `or ''` also covers an explicit null text value in the JSON.
    text = (detail.get('text') or '').strip()
    if not text:
        continue

    bbox = detail.get('bbox', [])
    if not bbox or len(bbox) != 4:
        continue

    # Get bbox bounds in OCR space (axis-aligned envelope of the 4 corners)
    xs = [p[0] for p in bbox]
    ys = [p[1] for p in bbox]
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)

    # SIMPLE mapping: scale from OCR space to PDF space
    x_pdf = (min_x / max_x_ocr) * img_width
    width_pdf = ((max_x - min_x) / max_x_ocr) * img_width

    # Y is inverted (PDF origin at bottom): the bbox's largest OCR y becomes
    # the lower edge of the rectangle in PDF coordinates.
    y_pdf = img_height - ((max_y / max_y_ocr) * img_height)
    height_pdf = ((max_y - min_y) / max_y_ocr) * img_height

    # Draw red box (outline only)
    c.rect(x_pdf, y_pdf, width_pdf, height_pdf, stroke=1, fill=0)

    # Draw blue text, sized to 70% of the box height (never below 1)
    font_size = max(1, height_pdf * 0.7)
    c.setFont(font_name, font_size)
    c.drawString(x_pdf, y_pdf, text)

    text_count += 1

print(f"Added {text_count} text blocks")

# Finish the page and write the PDF to output_path. The temporary PNG is
# removed in `finally` so it is not left behind when finishing or saving the
# PDF raises.
try:
    c.showPage()
    c.save()
finally:
    # Cleanup
    os.unlink(tmp_path)

print(f"\nDone! Check {output_path}")
# NOTE(review): the hint below holds only if the largest bbox corner really
# matches the size of the image the OCR ran on -- the scaling assumes that.
print("If boxes are STILL misaligned, the problem is in the OCR data itself.")