Remove ClassGenSystem and obsolete tools from tracking, update course content files with latest revisions. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
705 lines
26 KiB
Python
#!/usr/bin/env python3
"""
Script to convert OCR JSON files to searchable PDFs.

Takes OCR data from JSON and creates a PDF with an invisible text layer over the original images.

Modes:
- Single file: python json_to_searchable_pdf.py -i input.json -o output.pdf
- Auto batch: python json_to_searchable_pdf.py --auto [directory]
"""
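
# The OCR JSON is expected to look roughly like the sketch below (field names are
# taken from the accesses in this script; the exact schema depends on the upstream
# OCR tool):
#
#   {
#     "source_pdf": "path/to/original.pdf",
#     "pages": [
#       {
#         "details": [
#           {"text": "...", "confidence": 0.97,
#            "bbox": [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]}
#         ]
#       }
#     ]
#   }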

import json
import argparse
from pathlib import Path
from typing import List, Dict, Tuple
from PIL import Image
from pdf2image import convert_from_path
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import tempfile
import os
import sys
import numpy as np


class SearchablePDFGenerator:
    """Generator for searchable PDFs from OCR JSON data."""

    # Class variables to track if a Chinese font is registered
    _chinese_font_registered = False
    _chinese_font_name = None

    def __init__(self, json_path: str, output_path: str = None, auto_mode: bool = False, debug_mode: bool = False):
        """
        Initialize the PDF generator.

        Args:
            json_path: Path to the OCR JSON file
            output_path: Output PDF path (optional in auto mode)
            auto_mode: If True, use default output paths
            debug_mode: If True, make text visible for debugging
        """
        self.json_path = Path(json_path)
        self.data = self._load_json()
        self.auto_mode = auto_mode
        self.debug_mode = debug_mode

        # Register Chinese font if not already done
        if not SearchablePDFGenerator._chinese_font_registered:
            self._register_chinese_font()

        # Determine output path
        if output_path:
            self.output_path = Path(output_path)
        elif auto_mode:
            # In auto mode, write a "<stem>_searchable.pdf" next to the source PDF
            source_pdf = Path(self.data.get("source_pdf", ""))
            self.output_path = source_pdf.parent / f"{source_pdf.stem}_searchable.pdf"
        else:
            raise ValueError("Output path must be specified in non-auto mode")

        # Get source PDF path from JSON
        self.source_pdf = Path(self.data.get("source_pdf", ""))
        if not self.source_pdf.exists():
            raise FileNotFoundError(f"Source PDF not found: {self.source_pdf}")

    def _register_chinese_font(self):
        """Register a Chinese-compatible font."""
        # Common Chinese font paths on different systems
        font_paths = [
            # Windows
            "C:/Windows/Fonts/simsun.ttc",
            "C:/Windows/Fonts/msyh.ttc",
            "C:/Windows/Fonts/simhei.ttf",
            # Linux
            "/usr/share/fonts/truetype/arphic/uming.ttc",
            "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            # macOS
            "/System/Library/Fonts/PingFang.ttc",
            "/Library/Fonts/Arial Unicode.ttf",
        ]

        for font_path in font_paths:
            if Path(font_path).exists():
                try:
                    pdfmetrics.registerFont(TTFont('ChineseFont', font_path))
                    SearchablePDFGenerator._chinese_font_name = 'ChineseFont'
                    SearchablePDFGenerator._chinese_font_registered = True
                    print(f" Registered Chinese font: {Path(font_path).name}")
                    return
                except Exception:
                    # Registration failed; try the next candidate font
                    continue

        # Fallback to Helvetica (won't show Chinese properly but won't crash)
        print(" Warning: No Chinese font found, using Helvetica (Chinese chars may not display)")
        SearchablePDFGenerator._chinese_font_name = 'Helvetica'
        SearchablePDFGenerator._chinese_font_registered = True

    def _load_json(self) -> Dict:
        """Load and parse the OCR JSON file."""
        with open(self.json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _detect_margins(self, img: Image.Image, threshold: int = 240) -> Tuple[int, int, int, int]:
        """
        Detect white margins in image.

        Args:
            img: PIL Image
            threshold: Brightness threshold for "white" (0-255)

        Returns:
            Tuple of (top, right, bottom, left) margin sizes in pixels
        """
        img_gray = np.array(img.convert('L'))
        height, width = img_gray.shape

        # Detect top margin
        top_margin = 0
        for row in range(min(height // 4, 200)):  # Check first 25% or 200px
            if img_gray[row, :].mean() >= threshold:
                top_margin = row
            else:
                break

        # Detect bottom margin
        bottom_margin = 0
        for row in range(height - 1, max(height * 3 // 4, height - 200), -1):
            if img_gray[row, :].mean() >= threshold:
                bottom_margin = height - row - 1
            else:
                break

        # For left/right, check less aggressively (Chinese text usually uses full width)
        left_margin = 0
        right_margin = 0

        return (top_margin, right_margin, bottom_margin, left_margin)
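
    # Example (hypothetical numbers): on a 2000 px tall scan whose first 120 rows
    # average above the brightness threshold, this reports a top margin of roughly
    # 120 px; left/right margins are always returned as 0 by design.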

    def _get_ocr_dimensions_and_offsets(self, page_data: Dict) -> Tuple[int, int, int, int]:
        """
        Estimate OCR image dimensions and offsets from bbox coordinates.

        Args:
            page_data: Page data from JSON

        Returns:
            Tuple of (width, height, offset_x, offset_y) in pixels
        """
        details = page_data.get("details", [])
        min_x = float('inf')
        max_x = 0
        min_y = float('inf')
        max_y = 0

        for detail in details:
            bbox = detail.get("bbox", [])
            if bbox and len(bbox) == 4:
                for point in bbox:
                    min_x = min(min_x, point[0])
                    max_x = max(max_x, point[0])
                    min_y = min(min_y, point[1])
                    max_y = max(max_y, point[1])

        # Handle edge case of no valid bboxes
        if min_x == float('inf'):
            return (0, 0, 0, 0)

        # Add small margin for edge cases (needed for bottom alignment)
        width = int(max_x * 1.02)
        height = int(max_y * 1.02)
        offset_x = int(min_x)
        offset_y = int(min_y)

        return (width, height, offset_x, offset_y)
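
    # Worked example (made-up numbers): if the bboxes on a page span x in [50, 1400]
    # and y in [80, 2000], this returns width = int(1400 * 1.02) = 1428,
    # height = int(2000 * 1.02) = 2040, and offsets (50, 80).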

    def _get_ocr_dimensions(self, page_data: Dict) -> Tuple[int, int]:
        """Legacy wrapper for backward compatibility."""
        width, height, _, _ = self._get_ocr_dimensions_and_offsets(page_data)
        return (width, height)

    def _convert_pdf_to_images(self, target_width: int = None, target_height: int = None) -> List[Image.Image]:
        """
        Convert source PDF to images.

        Args:
            target_width: Desired image width (will calculate DPI to match; takes precedence if both are given)
            target_height: Desired image height (will calculate DPI to match)
        """
        print(f" Converting PDF to images: {self.source_pdf.name}")

        if target_width or target_height:
            # Calculate DPI to match target dimensions
            # First convert at 300 DPI to get aspect ratio
            test_img = convert_from_path(str(self.source_pdf), dpi=300, first_page=1, last_page=1)[0]
            test_w, test_h = test_img.size

            # Calculate required DPI
            if target_width:
                required_dpi = int((target_width / test_w) * 300)
                print(f" Calculated DPI to match width {target_width}: {required_dpi}")
            elif target_height:
                required_dpi = int((target_height / test_h) * 300)
                print(f" Calculated DPI to match height {target_height}: {required_dpi}")

            images = convert_from_path(str(self.source_pdf), dpi=required_dpi)
        else:
            # Default: 300 DPI
            images = convert_from_path(str(self.source_pdf), dpi=300)

        return images
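
    # Worked DPI example (hypothetical numbers): if the 300 DPI probe render is
    # 2550 px wide and target_width is 2040, the required DPI is
    # int((2040 / 2550) * 300) = 240, and the PDF is re-rendered at that DPI.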

    def _normalize_bbox(self, bbox: List[List[int]], img_width: int, img_height: int,
                        pdf_width: float, pdf_height: float,
                        ocr_width: int = None, ocr_height: int = None,
                        margins: Tuple[int, int, int, int] = None,
                        ocr_offsets: Tuple[int, int] = None) -> Tuple[float, float, float, float]:
        """
        Convert bbox coordinates from image space to PDF space.

        Args:
            bbox: Bounding box [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            img_width: Original image width (from PDF conversion)
            img_height: Original image height (from PDF conversion)
            pdf_width: PDF page width
            pdf_height: PDF page height
            ocr_width: OCR image width (if different from img_width)
            ocr_height: OCR image height (if different from img_height)
            margins: (top, right, bottom, left) margins to compensate for
            ocr_offsets: (offset_x, offset_y) to subtract from bbox coordinates

        Returns:
            Tuple of (x, y, width, height) in PDF coordinates
        """
        # Use OCR dimensions if provided, otherwise use image dimensions
        source_width = ocr_width if ocr_width else img_width
        source_height = ocr_height if ocr_height else img_height

        # DEBUG: Print scaling info for the first few calls
        import threading
        if not hasattr(threading.current_thread(), '_bbox_debug_count'):
            threading.current_thread()._bbox_debug_count = 0

        if threading.current_thread()._bbox_debug_count < 3:
            threading.current_thread()._bbox_debug_count += 1
            scale_y = pdf_height / source_height if source_height > 0 else 1.0
            margin_info = f", top_margin={margins[0] if margins else 0}" if margins else ""
            print(f" [normalize_bbox] source_height={source_height}, pdf_height={pdf_height}, scale_y={scale_y:.4f}{margin_info}")

        # Unpack margins (currently not applied to the coordinates; kept so cropped
        # OCR input could be compensated for later)
        if margins:
            top_margin, right_margin, bottom_margin, left_margin = margins
        else:
            top_margin = right_margin = bottom_margin = left_margin = 0

        # Get min/max coordinates from bbox
        xs = [point[0] for point in bbox]
        ys = [point[1] for point in bbox]

        min_x = min(xs)
        max_x = max(xs)
        min_y = min(ys)
        max_y = max(ys)

        # Subtract OCR offsets if provided (bbox coords may not start at 0,0)
        if ocr_offsets:
            offset_x, offset_y = ocr_offsets
            min_x_adj = min_x - offset_x
            max_x_adj = max_x - offset_x
            min_y_adj = min_y - offset_y
            max_y_adj = max_y - offset_y
        else:
            min_x_adj = min_x
            max_x_adj = max_x
            min_y_adj = min_y
            max_y_adj = max_y

        # Convert to PDF coordinates (origin at bottom-left)
        # CRITICAL: divide by source dimensions (OCR space), not image dimensions!
        x = (min_x_adj / source_width) * pdf_width
        width = ((max_x_adj - min_x_adj) / source_width) * pdf_width

        # Y coordinate is inverted in PDF (origin at bottom)
        # We want the bottom-left corner of the text box
        y_bottom = pdf_height - ((max_y_adj / source_height) * pdf_height)
        height = ((max_y_adj - min_y_adj) / source_height) * pdf_height

        # Adjust Y to be at the baseline (bottom of text box)
        y = y_bottom

        return (x, y, width, height)
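
    # Worked example for _normalize_bbox (hypothetical numbers): with an OCR source of
    # 1000x1500 px and a PDF page of 1000x1500 points, a bbox spanning x in [100, 300]
    # and y in [200, 250] maps to x=100, width=200, height=50 and y = 1500 - 250 = 1250,
    # because PDF y grows upward from the bottom-left origin.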

    def _calculate_font_size(self, text: str, bbox_width: float, bbox_height: float,
                             c: canvas.Canvas, font_name: str = None) -> float:
        """
        Calculate optimal font size to fit text in bbox.

        Args:
            text: The text to fit
            bbox_width: Width of bounding box
            bbox_height: Height of bounding box
            c: Canvas object for text width calculation
            font_name: Font name to use (defaults to Chinese font)

        Returns:
            Optimal font size in points
        """
        if font_name is None:
            font_name = SearchablePDFGenerator._chinese_font_name

        # Start with height-based estimate (reduced by factor for better fit)
        font_size = bbox_height * 0.7

        # Check if text fits horizontally, adjust if needed
        max_iterations = 10
        iteration = 0

        while iteration < max_iterations:
            try:
                text_width = c.stringWidth(text, font_name, font_size)

                if text_width <= bbox_width * 1.05:  # Allow 5% overflow for safety
                    break

                # Text too wide, reduce font size
                font_size *= (bbox_width / text_width) * 0.95

                if font_size < 1:
                    font_size = 1
                    break

                iteration += 1
            except Exception:
                # Fallback to height-based sizing
                font_size = bbox_height * 0.5
                break

        return max(1, font_size)
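
    # Example of the width-based shrink (hypothetical numbers): a 20 pt tall bbox starts
    # at 20 * 0.7 = 14 pt; if stringWidth() then reports 300 pt against a 200 pt wide
    # bbox, the next candidate is 14 * (200 / 300) * 0.95 ≈ 8.87 pt, which fits.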

    def _add_text_layer(self, c: canvas.Canvas, page_data: Dict,
                        img_width: int, img_height: int,
                        pdf_width: float, pdf_height: float,
                        margins: Tuple[int, int, int, int] = None):
        """
        Add invisible text layer to PDF page.

        Args:
            c: ReportLab canvas
            page_data: Page data from JSON
            img_width: Image width
            img_height: Image height
            pdf_width: PDF page width
            pdf_height: PDF page height
            margins: Detected white margins (currently unused; offset compensation is disabled below)

        Returns:
            Dict with metrics: text_count, char_count, confidences, errors,
            empty_blocks, invalid_bbox
        """
        # Detect OCR image dimensions and offsets from bbox coordinates
        ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(page_data)
        ocr_offsets = (offset_x, offset_y)

        if self.debug_mode:
            if ocr_width > 0:
                print(f" OCR dims: {ocr_width}x{ocr_height}, offsets: ({offset_x}, {offset_y})")
                print(f" Converted img: {img_width}x{img_height}")
                print(" Using OCR dims as source, scaling to PDF/image dims")

        details = page_data.get("details", [])
        metrics = {
            "text_count": 0,
            "char_count": 0,
            "confidences": [],
            "errors": 0,
            "empty_blocks": 0,
            "invalid_bbox": 0
        }

        for detail in details:
            text = detail.get("text", "").strip()
            confidence = detail.get("confidence", 0.0)

            if not text:
                metrics["empty_blocks"] += 1
                continue

            bbox = detail.get("bbox", [])
            if not bbox or len(bbox) != 4:
                metrics["invalid_bbox"] += 1
                continue

            # Convert bbox to PDF coordinates
            # Disable offsets for now - focus on getting the scaling right first
            x, y, width, height = self._normalize_bbox(bbox, img_width, img_height,
                                                       pdf_width, pdf_height,
                                                       ocr_width, ocr_height, None, None)

            # Calculate optimal font size
            font_size = self._calculate_font_size(text, width, height, c)

            try:
                # Use Chinese font
                font_name = SearchablePDFGenerator._chinese_font_name
                c.setFont(font_name, font_size)

                if self.debug_mode:
                    # Debug mode: draw visible text with bounding boxes
                    from reportlab.lib.colors import red, blue

                    # Draw red rectangle around bbox
                    c.setStrokeColor(red)
                    c.setLineWidth(0.5)
                    c.rect(x, y, width, height, stroke=1, fill=0)

                    # Draw visible text in blue
                    c.setFillColor(blue)
                    c.drawString(x, y, text)
                else:
                    # Normal mode: invisible text
                    text_obj = c.beginText(x, y)
                    text_obj.setTextRenderMode(3)  # Mode 3 = invisible text
                    text_obj.textLine(text)
                    c.drawText(text_obj)

                metrics["text_count"] += 1
                metrics["char_count"] += len(text)
                metrics["confidences"].append(confidence)

            except Exception as e:
                metrics["errors"] += 1
                if metrics["errors"] <= 3:  # Only show first 3 errors
                    print(f" Warning: Could not add text '{text[:20]}...': {e}")
                continue

        return metrics

    def generate(self):
        """Generate the searchable PDF."""
        print(f"\n{'='*60}")
        print(f"Processing: {self.json_path.name}")
        print(f"Output: {self.output_path}")
        print(f"{'='*60}")

        # Create output directory if needed
        self.output_path.parent.mkdir(parents=True, exist_ok=True)

        pages = self.data.get("pages", [])

        # Detect OCR dimensions from first page to match PDF conversion
        if pages:
            ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(pages[0])
            print(f" Detected OCR dimensions: {ocr_width} x {ocr_height} (offsets: {offset_x}, {offset_y})")

            # CRITICAL FIX: Convert PDF to match OCR height too, not just width!
            # This ensures perfect 1:1 aspect ratio match
            if ocr_width and ocr_height:
                # Calculate DPI needed to match both dimensions
                test_img = convert_from_path(str(self.source_pdf), dpi=300, first_page=1, last_page=1)[0]
                test_w, test_h = test_img.size

                # Calculate DPI for each dimension
                dpi_for_width = int((ocr_width / test_w) * 300)
                dpi_for_height = int((ocr_height / test_h) * 300)

                # Use average or prioritize height (where we had more issues)
                target_dpi = int((dpi_for_width + dpi_for_height) / 2)
                print(f" DPI for width: {dpi_for_width}, for height: {dpi_for_height}, using: {target_dpi}")

                images = convert_from_path(str(self.source_pdf), dpi=target_dpi)
            else:
                images = self._convert_pdf_to_images()
        else:
            images = self._convert_pdf_to_images()

        # Create PDF
        c = canvas.Canvas(str(self.output_path))

        # Global metrics
        total_metrics = {
            "text_blocks": 0,
            "characters": 0,
            "confidences": [],
            "errors": 0,
            "empty_blocks": 0,
            "invalid_bbox": 0
        }

        for idx, page_data in enumerate(pages):
            if idx >= len(images):
                print(f" Warning: No image for page {idx + 1}")
                break

            img = images[idx]
            img_width, img_height = img.size

            # Detect white margins in the image
            margins = self._detect_margins(img)

            # Get OCR dimensions for this page
            page_ocr_width, page_ocr_height = self._get_ocr_dimensions(page_data)

            # CRITICAL: Use IMAGE dimensions as PDF dimensions, NOT OCR dimensions
            # The bbox will be scaled from OCR space to image space in normalize_bbox
            pdf_width = float(img_width)
            pdf_height = float(img_height)

            if self.debug_mode:
                print(f" PDF dims (from image): {pdf_width}x{pdf_height}")
                print(f" OCR dims (from bbox): {page_ocr_width}x{page_ocr_height}")
                if page_ocr_width > 0:
                    scale_x = pdf_width / page_ocr_width
                    scale_y = pdf_height / page_ocr_height
                    print(f" Scale factors: X={scale_x:.4f}, Y={scale_y:.4f}")

            c.setPageSize((pdf_width, pdf_height))

            # Save image to temporary file
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                img.save(tmp.name, 'PNG')
                tmp_path = tmp.name

            try:
                # Draw image as background
                img_reader = ImageReader(tmp_path)
                c.drawImage(img_reader, 0, 0, width=pdf_width, height=pdf_height)

                # Add invisible text layer
                page_metrics = self._add_text_layer(c, page_data, img_width, img_height,
                                                    pdf_width, pdf_height, margins)

                # Accumulate metrics
                total_metrics["text_blocks"] += page_metrics["text_count"]
                total_metrics["characters"] += page_metrics["char_count"]
                total_metrics["confidences"].extend(page_metrics["confidences"])
                total_metrics["errors"] += page_metrics["errors"]
                total_metrics["empty_blocks"] += page_metrics["empty_blocks"]
                total_metrics["invalid_bbox"] += page_metrics["invalid_bbox"]

                # Show page
                c.showPage()

                # Calculate page average confidence
                if page_metrics["confidences"]:
                    avg_conf = sum(page_metrics["confidences"]) / len(page_metrics["confidences"])
                    print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks, "
                          f"{page_metrics['char_count']} chars, "
                          f"avg conf: {avg_conf:.1%}")
                else:
                    print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks")

            finally:
                # Clean up temp file
                os.unlink(tmp_path)

        # Save PDF
        c.save()

        # Display summary metrics
        print(f"\n{'─'*60}")
        print("QUALITY METRICS:")
        print(f"{'─'*60}")
        print(f" Total text blocks: {total_metrics['text_blocks']}")
        print(f" Total characters: {total_metrics['characters']}")

        if total_metrics["confidences"]:
            avg_confidence = sum(total_metrics["confidences"]) / len(total_metrics["confidences"])
            min_confidence = min(total_metrics["confidences"])
            max_confidence = max(total_metrics["confidences"])

            # Count high/low confidence blocks
            high_conf = sum(1 for conf in total_metrics["confidences"] if conf >= 0.9)
            low_conf = sum(1 for conf in total_metrics["confidences"] if conf < 0.7)

            print(f" Average confidence: {avg_confidence:.1%}")
            print(f" Confidence range: {min_confidence:.1%} - {max_confidence:.1%}")
            print(f" High confidence (≥90%): {high_conf}/{total_metrics['text_blocks']} ({high_conf/total_metrics['text_blocks']*100:.1f}%)")
            print(f" Low confidence (<70%): {low_conf}/{total_metrics['text_blocks']} ({low_conf/total_metrics['text_blocks']*100:.1f}%)")

        if total_metrics["errors"] > 0:
            print(f" Errors: {total_metrics['errors']}")
        if total_metrics["empty_blocks"] > 0:
            print(f" Empty blocks: {total_metrics['empty_blocks']}")
        if total_metrics["invalid_bbox"] > 0:
            print(f" Invalid bboxes: {total_metrics['invalid_bbox']}")

        print(f"{'─'*60}")
        print(f" ✓ Saved to: {self.output_path}")

        return total_metrics
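
# Programmatic use (a minimal sketch; the paths are placeholders):
#
#   gen = SearchablePDFGenerator("Raw/Book/OCR/book.json", "Raw/Book/book_searchable.pdf")
#   metrics = gen.generate()
#   print(metrics["text_blocks"], metrics["characters"])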


def process_single_file(input_path: str, output_path: str, debug_mode: bool = False):
    """Process a single JSON file."""
    try:
        generator = SearchablePDFGenerator(input_path, output_path, auto_mode=False, debug_mode=debug_mode)
        generator.generate()
        return True
    except Exception as e:
        print(f"Error processing {input_path}: {e}")
        import traceback
        traceback.print_exc()
        return False


def process_auto_mode(base_path: str = None):
    """Auto-process all JSON files in OCR directories."""
    if base_path:
        base_path = Path(base_path)
    else:
        base_path = Path(__file__).parent / "Raw"

    # Find all OCR JSON files
    json_files = list(base_path.glob("*/OCR/*.json"))

    if not json_files:
        print(f"No JSON files found in {base_path}/*/OCR/")
        return False

    print(f"\nFound {len(json_files)} JSON files to process")
    print("="*60)

    success_count = 0
    error_count = 0

    for json_file in json_files:
        try:
            generator = SearchablePDFGenerator(str(json_file), auto_mode=True)
            generator.generate()
            success_count += 1
        except Exception as e:
            print(f"\n❌ Error processing {json_file.name}: {e}")
            error_count += 1
            continue

    print("\n" + "="*60)
    print(f"SUMMARY: {success_count} succeeded, {error_count} failed")
    print("="*60)

    return error_count == 0
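
# Directory layout auto mode expects (a sketch; the book folder name is a placeholder).
# The source PDF itself can live anywhere, since its path is read from the JSON's
# "source_pdf" field:
#
#   Raw/
#     SomeBook/
#       OCR/
#         SomeBook.json   -> produces <source_pdf stem>_searchable.pdf next to the source PDF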


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Convert OCR JSON to searchable PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process single file with output
  python json_to_searchable_pdf.py -i input.json -o output.pdf

  # Debug mode: visible text with red bounding boxes
  python json_to_searchable_pdf.py -i input.json -o output.pdf --debug

  # Auto-process all JSON files in Raw directory
  python json_to_searchable_pdf.py --auto

  # Auto-process from specific directory
  python json_to_searchable_pdf.py --auto /path/to/directory
"""
    )

    parser.add_argument(
        "-i", "--input",
        help="Input OCR JSON file"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output PDF file"
    )
    parser.add_argument(
        "--auto",
        nargs="?",
        const=True,
        help="Auto-process all JSON files (optionally specify base directory)"
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode: make text visible with red bounding boxes"
    )

    args = parser.parse_args()

    # Validate arguments
    if args.auto:
        # Auto mode
        base_dir = args.auto if isinstance(args.auto, str) else None
        success = process_auto_mode(base_dir)
        return 0 if success else 1

    elif args.input and args.output:
        # Single file mode
        success = process_single_file(args.input, args.output, debug_mode=args.debug)
        return 0 if success else 1

    else:
        parser.print_help()
        print("\n❌ Error: Either specify --auto or both -i and -o")
        return 1


if __name__ == "__main__":
    sys.exit(main())