# chineseclass/diagnose_alignment.py

#!/usr/bin/env python3
"""
Diagnostic script to understand the alignment issue.
Prints precise coordinates to find the correction pattern.
"""

import json
from pdf2image import convert_from_path

# Inputs: the OCR result (JSON) and the PDF that is rasterized for comparison.
json_path = "Raw/DevelopChinese/OCR/听力Intro.json"
pdf_path = "Raw/DevelopChinese/PDF/听力Intro.pdf"

# Load JSON
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, which is not UTF-8 everywhere (presumably the OCR text is
# non-ASCII, given the file names -- confirm against the OCR export).
with open(json_path, encoding="utf-8") as f:
    data = json.load(f)

# Only the first page of the OCR result is analysed.
page = data['pages'][0]
# Keep only entries that carry both a non-empty bbox and non-empty text;
# everything below indexes into bbox points and slices the text.
details = [d for d in page['details'] if d.get('bbox') and d.get('text')]

# Fail fast with a readable message when the page has no usable OCR blocks.
# Without this guard the script first rasterizes the PDF and only then dies
# on a bare ValueError from max() over an empty sequence.
if not details:
    raise SystemExit(
        f"No OCR entries with both 'bbox' and 'text' on the first page of {json_path}"
    )

# Sort by Y coordinate
# The key is the smallest Y among a block's bbox points, so blocks are ordered
# by ascending minimum Y.
details_sorted = sorted(details, key=lambda d: min(p[1] for p in d['bbox']))

# Convert PDF
# Only page 1 is rendered (first_page=last_page=1) at 300 dpi; .size is
# unpacked as (width, height).
imgs = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=1)
img_width, img_height = imgs[0].size

# Get OCR dimensions
# The OCR canvas size is not read from the JSON here; it is estimated as the
# largest bbox coordinate on the page plus a 2% margin, truncated to int.
max_x = max(max(p[0] for p in d['bbox']) for d in details)
max_y = max(max(p[1] for p in d['bbox']) for d in details)
ocr_width = int(max_x * 1.02)
ocr_height = int(max_y * 1.02)

# Show the rendered-image size next to the estimated OCR canvas size, then
# open the per-block analysis section with a ruled banner.
header_rule = '=' * 80
print(
    f"Image dimensions: {img_width} x {img_height}",
    f"OCR dimensions:   {ocr_width} x {ocr_height}",
    f"\n{header_rule}",
    "BBOX COORDINATES ANALYSIS",
    f"{header_rule}\n",
    sep="\n",
)

# Analyze top, middle, bottom blocks
# These are the first, middle and last entries of the Y-sorted list. With
# fewer than three blocks the same entry is reported under more than one label.
test_indices = [0, len(details_sorted)//2, -1]
positions = ["TOP", "MIDDLE", "BOTTOM"]

# The page height does not change between blocks, so compute it once.
pdf_height = float(ocr_height)  # Using OCR dims as PDF dims

for idx, pos in zip(test_indices, positions):
    detail = details_sorted[idx]
    text = detail['text'][:30]  # first 30 characters are enough to identify the block
    bbox = detail['bbox']

    # Get bbox bounds
    # Named block_* so the page-wide max_y computed earlier is not overwritten.
    ys = [p[1] for p in bbox]
    block_min_y = min(ys)
    block_max_y = max(ys)

    print(f"{pos} BLOCK: \"{text}\"")
    print(f"  OCR Y coordinates: min_y={block_min_y}, max_y={block_max_y}")

    # Current formula (what we're using)
    # y_pdf = pdf_height - ((max_y / ocr_height) * pdf_height)
    y_current = pdf_height - ((block_max_y / ocr_height) * pdf_height)

    print(f"  Current calculation: y_pdf = {pdf_height} - (({block_max_y}/{ocr_height}) * {pdf_height})")
    print(f"  Current result: y_pdf = {y_current:.1f}")

    # Where SHOULD it be? (assuming 1:1 mapping)
    # In OCR space: text is at Y = min_y to max_y
    # In PDF space (origin bottom-left): text should be at Y = (pdf_height - max_y) to (pdf_height - min_y)
    y_should_be = pdf_height - block_max_y

    # NOTE(review): because pdf_height is set equal to ocr_height above, the
    # "current" and "expected" expressions are algebraically the same, so this
    # difference is ~0 (float rounding only) for every block. Comparing against
    # the real PDF page height would be needed to expose a scale mismatch.
    print(f"  Expected (1:1): y_pdf = {pdf_height} - {block_max_y} = {y_should_be:.1f}")
    print(f"  Difference: {y_current - y_should_be:.1f} points")
    print()

# Closing section: a ruled banner, a short guide for interpreting the
# per-block differences printed above, and the three measurements the reader
# is asked to report back from the debug PDF.
footer_rule = '=' * 80
footer_lines = (
    footer_rule,
    "DIAGNOSTIC QUESTIONS:",
    f"{footer_rule}\n",
    "If all differences are ~0, the formula is correct but something else is wrong.",
    "If TOP has positive diff and BOTTOM has negative diff, there's a scaling issue.",
    "If all have the same diff, there's an offset issue.",
    "\nLook at the PDF in debug mode and tell me:",
    "1. For TOP block: red box is HOW MANY pixels too high/low?",
    "2. For MIDDLE block: red box is HOW MANY pixels too high/low?",
    "3. For BOTTOM block: red box is HOW MANY pixels too high/low?",
)
for footer_line in footer_lines:
    print(footer_line)