chineseclass/tools/diagnose_alignment.py
StillHammer a61a32b57f Reorganize repository structure
- Move all Python scripts to tools/ directory
- Move documentation files to docs/ directory
- Create exams/ and homework/ directories for future use
- Remove temporary test file (page1_preview.png)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-27 23:28:39 +08:00

83 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""
Diagnostic script to understand the alignment issue.
Prints precise coordinates to find the correction pattern.
"""
import json
from pdf2image import convert_from_path
# Input files for the diagnostic: the OCR result and the PDF it was produced from.
json_path = "Raw/DevelopChinese/OCR/听力Intro.json"
pdf_path = "Raw/DevelopChinese/PDF/听力Intro.pdf"

# Horizontal rule used to frame the report sections.
RULE = "=" * 80


def load_details(path):
    """Return the first page's OCR entries that have both a bbox and text.

    The file is read explicitly as UTF-8: the OCR output contains Chinese
    text, and relying on the platform default encoding breaks on systems
    whose locale is not UTF-8.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    page = data['pages'][0]
    return [d for d in page['details'] if d.get('bbox') and d.get('text')]


def top_edge(detail):
    """Return the smallest Y coordinate among the entry's bbox points."""
    return min(p[1] for p in detail['bbox'])


def ocr_dimensions(details):
    """Estimate the OCR coordinate space as (width, height).

    The largest X and Y found in any bbox are each scaled by 1.02 and
    truncated to int.

    Raises:
        ValueError: if ``details`` is empty, since no extent can be derived.
    """
    if not details:
        raise ValueError("no OCR entries with both 'bbox' and 'text' were found")
    extent_x = max(p[0] for d in details for p in d['bbox'])
    extent_y = max(p[1] for d in details for p in d['bbox'])
    return int(extent_x * 1.02), int(extent_y * 1.02)


def sample_blocks(details_sorted):
    """Pair the labels TOP/MIDDLE/BOTTOM with the first, middle and last entry.

    ``details_sorted`` must be non-empty; with fewer than three entries the
    same entry is returned under more than one label.
    """
    indices = [0, len(details_sorted) // 2, -1]
    labels = ["TOP", "MIDDLE", "BOTTOM"]
    return [(label, details_sorted[i]) for i, label in zip(indices, labels)]


def block_report_lines(pos, detail, ocr_height):
    """Build the report lines for one sampled block.

    Args:
        pos: label printed in front of the block ("TOP", "MIDDLE", ...).
        detail: OCR entry with 'text' and 'bbox' keys.
        ocr_height: height of the OCR coordinate space.

    Returns:
        A list of strings, one per output line; the last one is empty so the
        blocks are separated by a blank line when printed.
    """
    text = detail['text'][:30]
    ys = [p[1] for p in detail['bbox']]
    # Named block_* so the page-wide extents are not confused with per-block
    # bounds (the original reused the name max_y for both).
    block_min_y = min(ys)
    block_max_y = max(ys)

    # Current formula (what we're using):
    #   y_pdf = pdf_height - ((max_y / ocr_height) * pdf_height)
    pdf_height = float(ocr_height)  # Using OCR dims as PDF dims
    y_current = pdf_height - ((block_max_y / ocr_height) * pdf_height)

    # Where SHOULD it be? (assuming 1:1 mapping)
    # In OCR space the text spans Y = min_y .. max_y; in PDF space (origin
    # bottom-left) it should span (pdf_height - max_y) .. (pdf_height - min_y).
    y_should_be = pdf_height - block_max_y

    # NOTE: because pdf_height is set equal to ocr_height above, the current
    # formula reduces algebraically to pdf_height - max_y, so the reported
    # difference is 0 up to floating-point rounding.
    return [
        f"{pos} BLOCK: \"{text}\"",
        f" OCR Y coordinates: min_y={block_min_y}, max_y={block_max_y}",
        f" Current calculation: y_pdf = {pdf_height} - (({block_max_y}/{ocr_height}) * {pdf_height})",
        f" Current result: y_pdf = {y_current:.1f}",
        f" Expected (1:1): y_pdf = {pdf_height} - {block_max_y} = {y_should_be:.1f}",
        f" Difference: {y_current - y_should_be:.1f} points",
        "",
    ]


def main():
    """Print image/OCR dimensions and the Y-coordinate analysis of three blocks."""
    # Load JSON
    details = load_details(json_path)

    # Get OCR dimensions first: this fails fast on an empty OCR page, before
    # the comparatively slow PDF rasterisation below.
    ocr_width, ocr_height = ocr_dimensions(details)

    # Sort by Y coordinate
    details_sorted = sorted(details, key=top_edge)

    # Convert PDF (first page only)
    imgs = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=1)
    img_width, img_height = imgs[0].size

    print(f"Image dimensions: {img_width} x {img_height}")
    print(f"OCR dimensions: {ocr_width} x {ocr_height}")
    print(f"\n{RULE}")
    print("BBOX COORDINATES ANALYSIS")
    print(f"{RULE}\n")

    # Analyze top, middle, bottom blocks
    for pos, detail in sample_blocks(details_sorted):
        for line in block_report_lines(pos, detail, ocr_height):
            print(line)

    print(RULE)
    print("DIAGNOSTIC QUESTIONS:")
    print(f"{RULE}\n")
    print("If all differences are ~0, the formula is correct but something else is wrong.")
    print("If TOP has positive diff and BOTTOM has negative diff, there's a scaling issue.")
    print("If all have the same diff, there's an offset issue.")
    print("\nLook at the PDF in debug mode and tell me:")
    print("1. For TOP block: red box is HOW MANY pixels too high/low?")
    print("2. For MIDDLE block: red box is HOW MANY pixels too high/low?")
    print("3. For BOTTOM block: red box is HOW MANY pixels too high/low?")


if __name__ == "__main__":
    main()