chineseclass/json_to_searchable_pdf.py

#!/usr/bin/env python3
"""
Script to convert OCR JSON files to searchable PDFs.
Takes OCR data from JSON and creates a PDF with an invisible text layer over the original page images.
Modes:
- Single file: python json_to_searchable_pdf.py -i input.json -o output.pdf
- Auto batch: python json_to_searchable_pdf.py --auto [directory]
"""
import json
import argparse
from pathlib import Path
from typing import List, Dict, Tuple
from PIL import Image
from pdf2image import convert_from_path
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import tempfile
import os
import sys
import numpy as np
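# Third-party dependencies: Pillow, pdf2image (which needs a Poppler install on PATH),
# reportlab, and numpy.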
class SearchablePDFGenerator:
"""Generator for searchable PDFs from OCR JSON data."""
# Class variable to track if Chinese font is registered
_chinese_font_registered = False
_chinese_font_name = None
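    # reportlab font registration is process-wide, so the class registers the CJK font
    # once and every instance reuses it.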
def __init__(self, json_path: str, output_path: str = None, auto_mode: bool = False, debug_mode: bool = False):
"""
Initialize the PDF generator.
Args:
json_path: Path to the OCR JSON file
output_path: Output PDF path (optional in auto mode)
auto_mode: If True, use default output paths
debug_mode: If True, make text visible for debugging
"""
self.json_path = Path(json_path)
self.data = self._load_json()
self.auto_mode = auto_mode
self.debug_mode = debug_mode
# Register Chinese font if not already done
if not SearchablePDFGenerator._chinese_font_registered:
self._register_chinese_font()
# Determine output path
if output_path:
self.output_path = Path(output_path)
elif auto_mode:
            # In auto mode, write <stem>_searchable.pdf next to the source PDF
source_pdf = Path(self.data.get("source_pdf", ""))
self.output_path = source_pdf.parent / f"{source_pdf.stem}_searchable.pdf"
else:
raise ValueError("Output path must be specified in non-auto mode")
# Get source PDF path from JSON
self.source_pdf = Path(self.data.get("source_pdf", ""))
if not self.source_pdf.exists():
raise FileNotFoundError(f"Source PDF not found: {self.source_pdf}")
def _register_chinese_font(self):
"""Register a Chinese-compatible font."""
# Common Chinese font paths on different systems
font_paths = [
# Windows
"C:/Windows/Fonts/simsun.ttc",
"C:/Windows/Fonts/msyh.ttc",
"C:/Windows/Fonts/simhei.ttf",
# Linux
"/usr/share/fonts/truetype/arphic/uming.ttc",
"/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
# macOS
"/System/Library/Fonts/PingFang.ttc",
"/Library/Fonts/Arial Unicode.ttf",
]
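        # Note: .ttc files are TrueType collections; reportlab's TTFont opens them at
        # subfont index 0 by default, so registration can fail and we fall through to the next path.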
for font_path in font_paths:
if Path(font_path).exists():
try:
pdfmetrics.registerFont(TTFont('ChineseFont', font_path))
SearchablePDFGenerator._chinese_font_name = 'ChineseFont'
SearchablePDFGenerator._chinese_font_registered = True
print(f" Registered Chinese font: {Path(font_path).name}")
return
                except Exception:
                    # Registration failed for this candidate font; try the next one
                    continue
# Fallback to Helvetica (won't show Chinese properly but won't crash)
print(" Warning: No Chinese font found, using Helvetica (Chinese chars may not display)")
SearchablePDFGenerator._chinese_font_name = 'Helvetica'
SearchablePDFGenerator._chinese_font_registered = True
def _load_json(self) -> Dict:
"""Load and parse the OCR JSON file."""
with open(self.json_path, 'r', encoding='utf-8') as f:
return json.load(f)
def _detect_margins(self, img: Image.Image, threshold: int = 240) -> Tuple[int, int, int, int]:
"""
Detect white margins in image.
Args:
img: PIL Image
threshold: Brightness threshold for "white" (0-255)
Returns:
Tuple of (top, right, bottom, left) margin sizes in pixels
"""
img_gray = np.array(img.convert('L'))
height, width = img_gray.shape
# Detect top margin
top_margin = 0
for row in range(min(height // 4, 200)): # Check first 25% or 200px
if img_gray[row, :].mean() >= threshold:
top_margin = row
else:
break
# Detect bottom margin
bottom_margin = 0
for row in range(height - 1, max(height * 3 // 4, height - 200), -1):
if img_gray[row, :].mean() >= threshold:
bottom_margin = height - row - 1
else:
break
        # Left/right margins are not detected (Chinese text usually uses the full width)
left_margin = 0
right_margin = 0
return (top_margin, right_margin, bottom_margin, left_margin)
def _get_ocr_dimensions_and_offsets(self, page_data: Dict) -> Tuple[int, int, int, int]:
"""
Estimate OCR image dimensions and offsets from bbox coordinates.
Args:
page_data: Page data from JSON
Returns:
Tuple of (width, height, offset_x, offset_y) in pixels
"""
details = page_data.get("details", [])
min_x = float('inf')
max_x = 0
min_y = float('inf')
max_y = 0
for detail in details:
bbox = detail.get("bbox", [])
if bbox and len(bbox) == 4:
for point in bbox:
min_x = min(min_x, point[0])
max_x = max(max_x, point[0])
min_y = min(min_y, point[1])
max_y = max(max_y, point[1])
# Handle edge case of no valid bboxes
if min_x == float('inf'):
return (0, 0, 0, 0)
        # Pad the detected extent by 2% so text near the page edges still falls inside the page
width = int(max_x * 1.02)
height = int(max_y * 1.02)
offset_x = int(min_x)
offset_y = int(min_y)
return (width, height, offset_x, offset_y)
def _get_ocr_dimensions(self, page_data: Dict) -> Tuple[int, int]:
"""Legacy wrapper for backward compatibility."""
width, height, _, _ = self._get_ocr_dimensions_and_offsets(page_data)
return (width, height)
def _convert_pdf_to_images(self, target_width: int = None, target_height: int = None) -> List[Image.Image]:
"""
Convert source PDF to images.
Args:
target_width: Desired image width (will calculate DPI to match)
target_height: Desired image height (will calculate DPI to match)
"""
print(f" Converting PDF to images: {self.source_pdf.name}")
if target_width or target_height:
# Calculate DPI to match target dimensions
# First convert at 300 DPI to get aspect ratio
test_img = convert_from_path(str(self.source_pdf), dpi=300, first_page=1, last_page=1)[0]
test_w, test_h = test_img.size
# Calculate required DPI
if target_width:
required_dpi = int((target_width / test_w) * 300)
print(f" Calculated DPI to match width {target_width}: {required_dpi}")
elif target_height:
required_dpi = int((target_height / test_h) * 300)
print(f" Calculated DPI to match height {target_height}: {required_dpi}")
images = convert_from_path(str(self.source_pdf), dpi=required_dpi)
else:
# Default: 300 DPI
images = convert_from_path(str(self.source_pdf), dpi=300)
return images
def _normalize_bbox(self, bbox: List[List[int]], img_width: int, img_height: int,
pdf_width: float, pdf_height: float,
ocr_width: int = None, ocr_height: int = None,
margins: Tuple[int, int, int, int] = None,
ocr_offsets: Tuple[int, int] = None) -> Tuple[float, float, float, float]:
"""
Convert bbox coordinates from image space to PDF space.
Args:
bbox: Bounding box [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
img_width: Original image width (from PDF conversion)
img_height: Original image height (from PDF conversion)
pdf_width: PDF page width
pdf_height: PDF page height
ocr_width: OCR image width (if different from img_width)
ocr_height: OCR image height (if different from img_height)
            margins: (top, right, bottom, left) margins to compensate for
            ocr_offsets: (offset_x, offset_y) origin of the OCR bbox space, subtracted from coordinates when given
Returns:
Tuple of (x, y, width, height) in PDF coordinates
"""
# Use OCR dimensions if provided, otherwise use image dimensions
source_width = ocr_width if ocr_width else img_width
source_height = ocr_height if ocr_height else img_height
# DEBUG: Print scaling info for first few calls
import threading
if not hasattr(threading.current_thread(), '_bbox_debug_count'):
threading.current_thread()._bbox_debug_count = 0
if threading.current_thread()._bbox_debug_count < 3:
threading.current_thread()._bbox_debug_count += 1
scale_y = pdf_height / source_height if source_height > 0 else 1.0
margin_info = f", top_margin={margins[0] if margins else 0}" if margins else ""
print(f" [normalize_bbox] source_height={source_height}, pdf_height={pdf_height}, scale_y={scale_y:.4f}{margin_info}")
# Get margins - if OCR was done on cropped image, we need to add margins back
if margins:
top_margin, right_margin, bottom_margin, left_margin = margins
else:
top_margin = right_margin = bottom_margin = left_margin = 0
# Get min/max coordinates from bbox
xs = [point[0] for point in bbox]
ys = [point[1] for point in bbox]
min_x = min(xs)
max_x = max(xs)
min_y = min(ys)
max_y = max(ys)
# Subtract OCR offsets if provided (bbox coords may not start at 0,0)
if ocr_offsets:
offset_x, offset_y = ocr_offsets
min_x_adj = min_x - offset_x
max_x_adj = max_x - offset_x
min_y_adj = min_y - offset_y
max_y_adj = max_y - offset_y
else:
min_x_adj = min_x
max_x_adj = max_x
min_y_adj = min_y
max_y_adj = max_y
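        # Mapping used below: a point (px, py) in OCR pixel space maps to PDF space as
        #   x_pdf = px / source_width  * pdf_width
        #   y_pdf = pdf_height - (py / source_height) * pdf_height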
# Convert to PDF coordinates (origin at bottom-left)
# CRITICAL: divide by source dimensions (OCR space), not image dimensions!
x = (min_x_adj / source_width) * pdf_width
width = ((max_x_adj - min_x_adj) / source_width) * pdf_width
# Y coordinate is inverted in PDF (origin at bottom)
# We want the bottom-left corner of the text box
y_bottom = pdf_height - ((max_y_adj / source_height) * pdf_height)
height = ((max_y_adj - min_y_adj) / source_height) * pdf_height
# Adjust Y to be at the baseline (bottom of text box)
y = y_bottom
return (x, y, width, height)
def _calculate_font_size(self, text: str, bbox_width: float, bbox_height: float,
c: canvas.Canvas, font_name: str = None) -> float:
"""
Calculate optimal font size to fit text in bbox.
Args:
text: The text to fit
bbox_width: Width of bounding box
bbox_height: Height of bounding box
c: Canvas object for text width calculation
font_name: Font name to use (defaults to Chinese font)
Returns:
Optimal font size in points
"""
if font_name is None:
font_name = SearchablePDFGenerator._chinese_font_name
# Start with height-based estimate (reduced by factor for better fit)
font_size = bbox_height * 0.7
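        # 0.7 is a heuristic: rendered glyphs usually occupy roughly 70% of the detected line height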
# Check if text fits horizontally, adjust if needed
max_iterations = 10
iteration = 0
while iteration < max_iterations:
try:
text_width = c.stringWidth(text, font_name, font_size)
if text_width <= bbox_width * 1.05: # Allow 5% overflow for safety
break
# Text too wide, reduce font size
font_size *= (bbox_width / text_width) * 0.95
if font_size < 1:
font_size = 1
break
iteration += 1
            except Exception:
                # stringWidth failed (e.g. unregistered font); fall back to height-based sizing
font_size = bbox_height * 0.5
break
return max(1, font_size)
def _add_text_layer(self, c: canvas.Canvas, page_data: Dict,
img_width: int, img_height: int,
pdf_width: float, pdf_height: float,
margins: Tuple[int, int, int, int] = None):
"""
Add invisible text layer to PDF page.
Args:
c: ReportLab canvas
page_data: Page data from JSON
img_width: Image width
img_height: Image height
pdf_width: PDF page width
            pdf_height: PDF page height
            margins: Detected white margins (accepted but not applied yet)
        Returns:
            Dict with metrics: text_count, char_count, confidences, errors, empty_blocks, invalid_bbox
"""
# Detect OCR image dimensions and offsets from bbox coordinates
ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(page_data)
ocr_offsets = (offset_x, offset_y)
if self.debug_mode:
if ocr_width > 0:
print(f" OCR dims: {ocr_width}x{ocr_height}, offsets: ({offset_x}, {offset_y})")
print(f" Converted img: {img_width}x{img_height}")
print(f" Using OCR dims as source, scaling to PDF/image dims")
details = page_data.get("details", [])
metrics = {
"text_count": 0,
"char_count": 0,
"confidences": [],
"errors": 0,
"empty_blocks": 0,
"invalid_bbox": 0
}
for detail in details:
text = detail.get("text", "").strip()
confidence = detail.get("confidence", 0.0)
if not text:
metrics["empty_blocks"] += 1
continue
bbox = detail.get("bbox", [])
if not bbox or len(bbox) != 4:
metrics["invalid_bbox"] += 1
continue
# Convert bbox to PDF coordinates
            # Margins and OCR offsets are deliberately passed as None for now; get the base scaling right first
x, y, width, height = self._normalize_bbox(bbox, img_width, img_height,
pdf_width, pdf_height,
ocr_width, ocr_height, None, None)
# Calculate optimal font size
font_size = self._calculate_font_size(text, width, height, c)
try:
# Use Chinese font
font_name = SearchablePDFGenerator._chinese_font_name
c.setFont(font_name, font_size)
if self.debug_mode:
# Debug mode: draw visible text with bounding boxes
from reportlab.lib.colors import red, blue
# Draw red rectangle around bbox
c.setStrokeColor(red)
c.setLineWidth(0.5)
c.rect(x, y, width, height, stroke=1, fill=0)
# Draw visible text in blue
c.setFillColor(blue)
c.drawString(x, y, text)
else:
# Normal mode: invisible text
text_obj = c.beginText(x, y)
text_obj.setTextRenderMode(3) # Mode 3 = invisible text
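                    # Render mode 3 paints neither fill nor stroke, so the text is
                    # selectable and searchable without being visible over the page image.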
text_obj.textLine(text)
c.drawText(text_obj)
metrics["text_count"] += 1
metrics["char_count"] += len(text)
metrics["confidences"].append(confidence)
except Exception as e:
metrics["errors"] += 1
if metrics["errors"] <= 3: # Only show first 3 errors
print(f" Warning: Could not add text '{text[:20]}...': {e}")
continue
return metrics
def generate(self):
"""Generate the searchable PDF."""
print(f"\n{'='*60}")
print(f"Processing: {self.json_path.name}")
print(f"Output: {self.output_path}")
print(f"{'='*60}")
# Create output directory if needed
self.output_path.parent.mkdir(parents=True, exist_ok=True)
pages = self.data.get("pages", [])
# Detect OCR dimensions from first page to match PDF conversion
if pages:
ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(pages[0])
print(f" Detected OCR dimensions: {ocr_width} x {ocr_height} (offsets: {offset_x}, {offset_y})")
            # Pick a render DPI from both OCR dimensions, not just the width,
            # so the converted images stay as close as possible to the OCR coordinate space
if ocr_width and ocr_height:
# Calculate DPI needed to match both dimensions
test_img = convert_from_path(str(self.source_pdf), dpi=300, first_page=1, last_page=1)[0]
test_w, test_h = test_img.size
# Calculate DPI for each dimension
dpi_for_width = int((ocr_width / test_w) * 300)
dpi_for_height = int((ocr_height / test_h) * 300)
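                # pdf2image output size scales linearly with DPI, so
                #   dpi_needed = (target_pixels / pixels_at_300dpi) * 300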
                # Average the two; height mismatches caused most of the alignment issues
target_dpi = int((dpi_for_width + dpi_for_height) / 2)
print(f" DPI for width: {dpi_for_width}, for height: {dpi_for_height}, using: {target_dpi}")
images = convert_from_path(str(self.source_pdf), dpi=target_dpi)
else:
images = self._convert_pdf_to_images()
else:
images = self._convert_pdf_to_images()
# Create PDF
c = canvas.Canvas(str(self.output_path))
# Global metrics
total_metrics = {
"text_blocks": 0,
"characters": 0,
"confidences": [],
"errors": 0,
"empty_blocks": 0,
"invalid_bbox": 0
}
for idx, page_data in enumerate(pages):
if idx >= len(images):
print(f" Warning: No image for page {idx + 1}")
break
img = images[idx]
img_width, img_height = img.size
            # Detect white margins in the image (computed but not yet applied when placing text)
margins = self._detect_margins(img)
# Get OCR dimensions for this page
page_ocr_width, page_ocr_height = self._get_ocr_dimensions(page_data)
# CRITICAL: Use IMAGE dimensions as PDF dimensions, NOT OCR dimensions
# The bbox will be scaled from OCR space to image space in normalize_bbox
pdf_width = float(img_width)
pdf_height = float(img_height)
if self.debug_mode:
print(f" PDF dims (from image): {pdf_width}x{pdf_height}")
print(f" OCR dims (from bbox): {page_ocr_width}x{page_ocr_height}")
if page_ocr_width > 0:
scale_x = pdf_width / page_ocr_width
scale_y = pdf_height / page_ocr_height
print(f" Scale factors: X={scale_x:.4f}, Y={scale_y:.4f}")
c.setPageSize((pdf_width, pdf_height))
# Save image to temporary file
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
img.save(tmp.name, 'PNG')
tmp_path = tmp.name
try:
# Draw image as background
img_reader = ImageReader(tmp_path)
c.drawImage(img_reader, 0, 0, width=pdf_width, height=pdf_height)
# Add invisible text layer
page_metrics = self._add_text_layer(c, page_data, img_width, img_height,
pdf_width, pdf_height, margins)
# Accumulate metrics
total_metrics["text_blocks"] += page_metrics["text_count"]
total_metrics["characters"] += page_metrics["char_count"]
total_metrics["confidences"].extend(page_metrics["confidences"])
total_metrics["errors"] += page_metrics["errors"]
total_metrics["empty_blocks"] += page_metrics["empty_blocks"]
total_metrics["invalid_bbox"] += page_metrics["invalid_bbox"]
# Show page
c.showPage()
# Calculate page average confidence
if page_metrics["confidences"]:
avg_conf = sum(page_metrics["confidences"]) / len(page_metrics["confidences"])
print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks, "
f"{page_metrics['char_count']} chars, "
f"avg conf: {avg_conf:.1%}")
else:
print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks")
finally:
# Clean up temp file
os.unlink(tmp_path)
# Save PDF
c.save()
# Display summary metrics
print(f"\n{''*60}")
print(f"QUALITY METRICS:")
print(f"{''*60}")
print(f" Total text blocks: {total_metrics['text_blocks']}")
print(f" Total characters: {total_metrics['characters']}")
if total_metrics["confidences"]:
avg_confidence = sum(total_metrics["confidences"]) / len(total_metrics["confidences"])
min_confidence = min(total_metrics["confidences"])
max_confidence = max(total_metrics["confidences"])
# Count high/low confidence blocks
high_conf = sum(1 for c in total_metrics["confidences"] if c >= 0.9)
low_conf = sum(1 for c in total_metrics["confidences"] if c < 0.7)
print(f" Average confidence: {avg_confidence:.1%}")
print(f" Confidence range: {min_confidence:.1%} - {max_confidence:.1%}")
print(f" High confidence (≥90%): {high_conf}/{total_metrics['text_blocks']} ({high_conf/total_metrics['text_blocks']*100:.1f}%)")
print(f" Low confidence (<70%): {low_conf}/{total_metrics['text_blocks']} ({low_conf/total_metrics['text_blocks']*100:.1f}%)")
if total_metrics["errors"] > 0:
print(f" Errors: {total_metrics['errors']}")
if total_metrics["empty_blocks"] > 0:
print(f" Empty blocks: {total_metrics['empty_blocks']}")
if total_metrics["invalid_bbox"] > 0:
print(f" Invalid bboxes: {total_metrics['invalid_bbox']}")
print(f"{''*60}")
print(f" ✓ Saved to: {self.output_path}")
return total_metrics
def process_single_file(input_path: str, output_path: str, debug_mode: bool = False):
"""Process a single JSON file."""
try:
generator = SearchablePDFGenerator(input_path, output_path, auto_mode=False, debug_mode=debug_mode)
generator.generate()
return True
except Exception as e:
print(f"Error processing {input_path}: {e}")
import traceback
traceback.print_exc()
return False
def process_auto_mode(base_path: str = None):
"""Auto-process all JSON files in OCR directories."""
if base_path:
base_path = Path(base_path)
else:
base_path = Path(__file__).parent / "Raw"
# Find all OCR JSON files
json_files = list(base_path.glob("*/OCR/*.json"))
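    # Matches any JSON file one level inside an OCR/ folder that sits directly under base_path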
if not json_files:
print(f"No JSON files found in {base_path}/*/OCR/")
return False
print(f"\nFound {len(json_files)} JSON files to process")
print("="*60)
success_count = 0
error_count = 0
for json_file in json_files:
try:
generator = SearchablePDFGenerator(str(json_file), auto_mode=True)
generator.generate()
success_count += 1
except Exception as e:
print(f"\n❌ Error processing {json_file.name}: {e}")
error_count += 1
continue
print("\n" + "="*60)
print(f"SUMMARY: {success_count} succeeded, {error_count} failed")
print("="*60)
return error_count == 0
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description="Convert OCR JSON to searchable PDF",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Process single file with output
python json_to_searchable_pdf.py -i input.json -o output.pdf
# Debug mode: visible text with red bounding boxes
python json_to_searchable_pdf.py -i input.json -o output.pdf --debug
# Auto-process all JSON files in Raw directory
python json_to_searchable_pdf.py --auto
# Auto-process from specific directory
python json_to_searchable_pdf.py --auto /path/to/directory
"""
)
parser.add_argument(
"-i", "--input",
help="Input OCR JSON file"
)
parser.add_argument(
"-o", "--output",
help="Output PDF file"
)
parser.add_argument(
"--auto",
nargs="?",
const=True,
help="Auto-process all JSON files (optionally specify base directory)"
)
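    # nargs="?" with const=True: a bare "--auto" stores True, "--auto DIR" stores the directory string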
parser.add_argument(
"--debug",
action="store_true",
help="Debug mode: make text visible with red bounding boxes"
)
args = parser.parse_args()
# Validate arguments
if args.auto:
# Auto mode
base_dir = args.auto if isinstance(args.auto, str) else None
success = process_auto_mode(base_dir)
return 0 if success else 1
elif args.input and args.output:
# Single file mode
success = process_single_file(args.input, args.output, debug_mode=args.debug)
return 0 if success else 1
else:
parser.print_help()
print("\n❌ Error: Either specify --auto or both -i and -o")
return 1
if __name__ == "__main__":
exit(main())
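# Quick sanity check of the text layer (assumes poppler-utils is installed and
# "output.pdf" stands in for the generated file):
#   pdftotext output.pdf - | head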