#!/usr/bin/env python3
"""
Script to convert OCR JSON files to searchable PDFs.

Takes OCR data from JSON and creates a PDF with invisible text layer over the
original images.

Modes:
- Single file: python json_to_searchable_pdf.py -i input.json -o output.pdf
- Auto batch: python json_to_searchable_pdf.py --auto [directory]
"""

import argparse
import json
import os
import sys
import tempfile
import traceback
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
from pdf2image import convert_from_path
from PIL import Image
from reportlab.lib.colors import blue, red
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas


class SearchablePDFGenerator:
    """Generator for searchable PDFs from OCR JSON data."""

    # Class-level state so the font lookup/registration runs once per process,
    # no matter how many generators are created (e.g. in auto batch mode).
    _chinese_font_registered = False
    _chinese_font_name = None

    def __init__(self, json_path: str, output_path: Optional[str] = None,
                 auto_mode: bool = False, debug_mode: bool = False):
        """
        Initialize the PDF generator.

        Args:
            json_path: Path to the OCR JSON file
            output_path: Output PDF path (optional in auto mode)
            auto_mode: If True, use default output paths
            debug_mode: If True, make text visible for debugging

        Raises:
            ValueError: No output path was given and auto mode is off.
            FileNotFoundError: The JSON's "source_pdf" is missing or is not
                an existing file.
        """
        self.json_path = Path(json_path)
        self.data = self._load_json()
        self.auto_mode = auto_mode
        self.debug_mode = debug_mode
        # Number of bbox scaling lines already printed in debug mode.
        self._bbox_debug_count = 0

        # Register Chinese font if not already done
        if not SearchablePDFGenerator._chinese_font_registered:
            self._register_chinese_font()

        # Source PDF path as recorded in the JSON ("" when the key is absent)
        source_pdf = Path(self.data.get("source_pdf") or "")

        # Determine output path
        if output_path:
            self.output_path = Path(output_path)
        elif auto_mode:
            # Auto mode writes "<name>_searchable.pdf" next to the source PDF;
            # the original PDF itself is left untouched.
            self.output_path = source_pdf.parent / f"{source_pdf.stem}_searchable.pdf"
        else:
            raise ValueError("Output path must be specified in non-auto mode")

        self.source_pdf = source_pdf
        # is_file() rather than exists(): a missing "source_pdf" key yields
        # Path("") == Path("."), which *exists* (it is the current directory)
        # and would otherwise slip through to fail later inside pdf2image.
        if not self.source_pdf.is_file():
            raise FileNotFoundError(f"Source PDF not found: {self.source_pdf}")

    def _register_chinese_font(self):
        """
        Register a Chinese-compatible font.

        Tries a list of well-known font locations and registers the first one
        ReportLab can load under the name 'ChineseFont'. Falls back to
        Helvetica when none can be loaded. Either way the class-level flag is
        set so the search is not repeated.
        """
        # Common Chinese font paths on different systems
        font_paths = [
            # Windows
            "C:/Windows/Fonts/simsun.ttc",
            "C:/Windows/Fonts/msyh.ttc",
            "C:/Windows/Fonts/simhei.ttf",
            # Linux
            "/usr/share/fonts/truetype/arphic/uming.ttc",
            "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            # macOS
            "/System/Library/Fonts/PingFang.ttc",
            "/Library/Fonts/Arial Unicode.ttf",
        ]

        for font_path in font_paths:
            if not Path(font_path).exists():
                continue
            try:
                pdfmetrics.registerFont(TTFont('ChineseFont', font_path))
            except Exception as e:
                # Best effort: an unreadable or unsupported font file must not
                # stop the search, but say why it was skipped.
                print(f" Skipping font {Path(font_path).name}: {e}")
                continue
            SearchablePDFGenerator._chinese_font_name = 'ChineseFont'
            SearchablePDFGenerator._chinese_font_registered = True
            print(f" Registered Chinese font: {Path(font_path).name}")
            return

        # Fallback to Helvetica (won't show Chinese properly but won't crash)
        print(" Warning: No Chinese font found, using Helvetica (Chinese chars may not display)")
        SearchablePDFGenerator._chinese_font_name = 'Helvetica'
        SearchablePDFGenerator._chinese_font_registered = True

    def _load_json(self) -> Dict:
        """Load and parse the OCR JSON file."""
        with open(self.json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _count_leading_true(flags) -> int:
        """Return how many consecutive truthy values start the sequence."""
        count = 0
        for flag in flags:
            if not flag:
                break
            count += 1
        return count

    def _detect_margins(self, img: Image.Image, threshold: int = 240) -> Tuple[int, int, int, int]:
        """
        Detect white margins in image.

        A row counts as white when its mean grayscale value is >= threshold.
        Only the first/last quarter of the image (capped at 200 rows) is
        scanned. Left and right margins are not measured and are always 0.

        Args:
            img: PIL Image
            threshold: Brightness threshold for "white" (0-255)

        Returns:
            Tuple of (top, right, bottom, left) margin sizes in pixels
        """
        img_gray = np.array(img.convert('L'))
        height = img_gray.shape[0]

        # One mean per row, computed in a single vectorized pass.
        row_is_white = img_gray.mean(axis=1) >= threshold

        # Top: number of consecutive white rows from row 0 (a count, so a
        # single white row gives 1, not the row index 0).
        top_limit = min(height // 4, 200)
        top_margin = self._count_leading_true(row_is_white[:top_limit])

        # Bottom: same scan walking upward from the last row; the row at
        # index bottom_stop itself is excluded.
        bottom_stop = max(height * 3 // 4, height - 200)
        bottom_margin = self._count_leading_true(row_is_white[:bottom_stop:-1])

        # For left/right, check less aggressively (Chinese text usually uses full width)
        left_margin = 0
        right_margin = 0

        return (top_margin, right_margin, bottom_margin, left_margin)

    def _get_ocr_dimensions_and_offsets(self, page_data: Dict) -> Tuple[int, int, int, int]:
        """
        Estimate OCR image dimensions and offsets from bbox coordinates.

        Only bboxes with exactly four points are considered. The size is the
        largest x / y coordinate seen (never below 0) plus 2%; the offsets are
        the smallest x / y coordinate seen.

        Args:
            page_data: Page data from JSON

        Returns:
            Tuple of (width, height, offset_x, offset_y) in pixels;
            (0, 0, 0, 0) when the page has no usable bbox.
        """
        xs = []
        ys = []
        for detail in page_data.get("details", []):
            bbox = detail.get("bbox", [])
            if bbox and len(bbox) == 4:
                for point in bbox:
                    xs.append(point[0])
                    ys.append(point[1])

        # Handle edge case of no valid bboxes
        if not xs:
            return (0, 0, 0, 0)

        # Add small margin for edge cases (needed for bottom alignment)
        width = int(max(0, max(xs)) * 1.02)
        height = int(max(0, max(ys)) * 1.02)
        offset_x = int(min(xs))
        offset_y = int(min(ys))

        return (width, height, offset_x, offset_y)

    def _get_ocr_dimensions(self, page_data: Dict) -> Tuple[int, int]:
        """Legacy wrapper for backward compatibility."""
        width, height, _, _ = self._get_ocr_dimensions_and_offsets(page_data)
        return (width, height)

    def _first_page_size(self) -> Tuple[int, int]:
        """Render only page 1 at 300 DPI and return its (width, height) in pixels."""
        first_page = convert_from_path(str(self.source_pdf), dpi=300,
                                       first_page=1, last_page=1)[0]
        return first_page.size

    def _convert_pdf_to_images(self, target_width: int = None,
                               target_height: int = None) -> List[Image.Image]:
        """
        Convert source PDF to images.

        Args:
            target_width: Desired image width (will calculate DPI to match)
            target_height: Desired image height (will calculate DPI to match);
                only used when target_width is not given

        Returns:
            One PIL image per PDF page, rendered at 300 DPI when no target
            size is given.
        """
        print(f" Converting PDF to images: {self.source_pdf.name}")

        if not (target_width or target_height):
            # Default: 300 DPI
            return convert_from_path(str(self.source_pdf), dpi=300)

        # Probe the first page at 300 DPI, then scale the DPI linearly so the
        # rendered size matches the requested dimension.
        test_w, test_h = self._first_page_size()

        # max(1, ...): int() truncation of a tiny ratio must not give DPI 0.
        if target_width:
            required_dpi = max(1, int((target_width / test_w) * 300))
            print(f" Calculated DPI to match width {target_width}: {required_dpi}")
        else:
            required_dpi = max(1, int((target_height / test_h) * 300))
            print(f" Calculated DPI to match height {target_height}: {required_dpi}")

        return convert_from_path(str(self.source_pdf), dpi=required_dpi)

    def _normalize_bbox(self, bbox: List[List[int]], img_width: int, img_height: int,
                        pdf_width: float, pdf_height: float,
                        ocr_width: int = None, ocr_height: int = None,
                        margins: Tuple[int, int, int, int] = None,
                        ocr_offsets: Tuple[int, int] = None) -> Tuple[float, float, float, float]:
        """
        Convert bbox coordinates from image space to PDF space.

        Args:
            bbox: Bounding box [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            img_width: Original image width (from PDF conversion)
            img_height: Original image height (from PDF conversion)
            pdf_width: PDF page width
            pdf_height: PDF page height
            ocr_width: OCR image width (if different from img_width)
            ocr_height: OCR image height (if different from img_height)
            margins: (top, right, bottom, left) margins. Accepted for
                interface compatibility; only reported in debug output and
                not applied to the coordinates.
            ocr_offsets: (offset_x, offset_y) subtracted from the bbox
                coordinates before scaling, when provided

        Returns:
            Tuple of (x, y, width, height) in PDF coordinates, where (x, y)
            is the bottom-left corner of the box.
        """
        # Use OCR dimensions if provided, otherwise use image dimensions
        source_width = ocr_width if ocr_width else img_width
        source_height = ocr_height if ocr_height else img_height

        # Debug aid: show the vertical scaling for the first few boxes only.
        # Gated on debug_mode so normal runs stay quiet.
        if getattr(self, "debug_mode", False):
            printed = getattr(self, "_bbox_debug_count", 0)
            if printed < 3:
                self._bbox_debug_count = printed + 1
                scale_y = pdf_height / source_height if source_height > 0 else 1.0
                margin_info = f", top_margin={margins[0] if margins else 0}" if margins else ""
                print(f" [normalize_bbox] source_height={source_height}, "
                      f"pdf_height={pdf_height}, scale_y={scale_y:.4f}{margin_info}")

        # Get min/max coordinates from bbox
        xs = [point[0] for point in bbox]
        ys = [point[1] for point in bbox]
        min_x, max_x = min(xs), max(xs)
        min_y, max_y = min(ys), max(ys)

        # Subtract OCR offsets if provided (bbox coords may not start at 0,0)
        if ocr_offsets:
            offset_x, offset_y = ocr_offsets
            min_x -= offset_x
            max_x -= offset_x
            min_y -= offset_y
            max_y -= offset_y

        # Convert to PDF coordinates (origin at bottom-left)
        # CRITICAL: divide by source dimensions (OCR space), not image dimensions!
        x = (min_x / source_width) * pdf_width
        width = ((max_x - min_x) / source_width) * pdf_width

        # Y coordinate is inverted in PDF (origin at bottom), so the box's
        # largest image-space y becomes its bottom edge in PDF space.
        y = pdf_height - ((max_y / source_height) * pdf_height)
        height = ((max_y - min_y) / source_height) * pdf_height

        return (x, y, width, height)

    def _calculate_font_size(self, text: str, bbox_width: float, bbox_height: float,
                             c: canvas.Canvas, font_name: str = None) -> float:
        """
        Calculate optimal font size to fit text in bbox.

        Starts from 70% of the box height and shrinks (at most 10 rounds)
        until the rendered width is within 105% of the box width.

        Args:
            text: The text to fit
            bbox_width: Width of bounding box
            bbox_height: Height of bounding box
            c: Canvas object for text width calculation
            font_name: Font name to use (defaults to Chinese font)

        Returns:
            Optimal font size in points (never below 1)
        """
        if font_name is None:
            font_name = SearchablePDFGenerator._chinese_font_name

        # Start with height-based estimate (reduced by factor for better fit)
        font_size = bbox_height * 0.7

        for _ in range(10):
            try:
                text_width = c.stringWidth(text, font_name, font_size)
            except Exception:
                # Width could not be measured (e.g. unknown font name):
                # fall back to height-based sizing.
                font_size = bbox_height * 0.5
                break

            # Fits (5% overflow allowed), or nothing measurable to scale by.
            if text_width <= 0 or text_width <= bbox_width * 1.05:
                break

            # Text too wide, reduce font size
            font_size *= (bbox_width / text_width) * 0.95
            if font_size < 1:
                font_size = 1
                break

        return max(1, font_size)

    def _add_text_layer(self, c: canvas.Canvas, page_data: Dict,
                        img_width: int, img_height: int,
                        pdf_width: float, pdf_height: float,
                        margins: Tuple[int, int, int, int] = None):
        """
        Add invisible text layer to PDF page.

        In debug mode the text is drawn visibly in blue with a red rectangle
        around each bbox instead.

        Args:
            c: ReportLab canvas
            page_data: Page data from JSON
            img_width: Image width
            img_height: Image height
            pdf_width: PDF page width
            pdf_height: PDF page height
            margins: Detected page margins; accepted but currently not used
                for positioning

        Returns:
            Dict with keys: text_count, char_count, confidences, errors,
            empty_blocks, invalid_bbox
        """
        # Detect OCR image dimensions and offsets from bbox coordinates
        ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(page_data)

        if self.debug_mode and ocr_width > 0:
            print(f" OCR dims: {ocr_width}x{ocr_height}, offsets: ({offset_x}, {offset_y})")
            print(f" Converted img: {img_width}x{img_height}")
            print(f" Using OCR dims as source, scaling to PDF/image dims")

        metrics = {
            "text_count": 0,
            "char_count": 0,
            "confidences": [],
            "errors": 0,
            "empty_blocks": 0,
            "invalid_bbox": 0,
        }
        font_name = SearchablePDFGenerator._chinese_font_name

        for detail in page_data.get("details", []):
            # Tolerate a null/absent "text" or "confidence" in the OCR output.
            text = str(detail.get("text") or "").strip()
            confidence = detail.get("confidence") or 0.0

            if not text:
                metrics["empty_blocks"] += 1
                continue

            bbox = detail.get("bbox", [])
            if not bbox or len(bbox) != 4:
                metrics["invalid_bbox"] += 1
                continue

            try:
                # Convert bbox to PDF coordinates.
                # Margins/offsets are deliberately not applied (None, None):
                # only the OCR-space -> page-space scaling is used.
                x, y, width, height = self._normalize_bbox(
                    bbox, img_width, img_height, pdf_width, pdf_height,
                    ocr_width, ocr_height, None, None)

                # Calculate optimal font size
                font_size = self._calculate_font_size(text, width, height, c)

                c.setFont(font_name, font_size)

                if self.debug_mode:
                    # Debug mode: draw visible text with bounding boxes
                    c.setStrokeColor(red)
                    c.setLineWidth(0.5)
                    c.rect(x, y, width, height, stroke=1, fill=0)
                    c.setFillColor(blue)
                    c.drawString(x, y, text)
                else:
                    # Normal mode: invisible text
                    text_obj = c.beginText(x, y)
                    text_obj.setTextRenderMode(3)  # Mode 3 = invisible text
                    text_obj.textLine(text)
                    c.drawText(text_obj)
            except Exception as e:
                # One malformed block (bad bbox points, unencodable text, ...)
                # must not abort the whole document: count it and move on.
                metrics["errors"] += 1
                if metrics["errors"] <= 3:  # Only show first 3 errors
                    print(f" Warning: Could not add text '{text[:20]}...': {e}")
                continue

            metrics["text_count"] += 1
            metrics["char_count"] += len(text)
            metrics["confidences"].append(confidence)

        return metrics

    def _render_pages(self, pages: List[Dict]) -> List[Image.Image]:
        """
        Render the source PDF at a DPI chosen to match the OCR coordinate space.

        The OCR size is estimated from the first page's bboxes; the DPI is the
        average of the DPIs that would match its width and its height. Falls
        back to the default conversion when there are no pages or no usable
        bboxes on the first page.
        """
        if not pages:
            return self._convert_pdf_to_images()

        ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(pages[0])
        print(f" Detected OCR dimensions: {ocr_width} x {ocr_height} (offsets: {offset_x}, {offset_y})")

        if not (ocr_width and ocr_height):
            return self._convert_pdf_to_images()

        test_w, test_h = self._first_page_size()

        # Calculate DPI for each dimension
        dpi_for_width = int((ocr_width / test_w) * 300)
        dpi_for_height = int((ocr_height / test_h) * 300)

        # Average of both; clamped because a DPI of 0 cannot be rendered.
        target_dpi = max(1, int((dpi_for_width + dpi_for_height) / 2))
        print(f" DPI for width: {dpi_for_width}, for height: {dpi_for_height}, using: {target_dpi}")

        return convert_from_path(str(self.source_pdf), dpi=target_dpi)

    def _print_summary(self, total_metrics: Dict) -> None:
        """Print the document-level quality metrics collected by generate()."""
        print(f"\n{'─'*60}")
        print(f"QUALITY METRICS:")
        print(f"{'─'*60}")
        print(f" Total text blocks: {total_metrics['text_blocks']}")
        print(f" Total characters: {total_metrics['characters']}")

        confidences = total_metrics["confidences"]
        if confidences:
            # One confidence is recorded per written block, so text_blocks
            # is non-zero whenever this branch runs.
            block_total = total_metrics['text_blocks']
            avg_confidence = sum(confidences) / len(confidences)
            min_confidence = min(confidences)
            max_confidence = max(confidences)

            # Count high/low confidence blocks
            high_conf = sum(1 for conf in confidences if conf >= 0.9)
            low_conf = sum(1 for conf in confidences if conf < 0.7)

            print(f" Average confidence: {avg_confidence:.1%}")
            print(f" Confidence range: {min_confidence:.1%} - {max_confidence:.1%}")
            print(f" High confidence (≥90%): {high_conf}/{block_total} ({high_conf/block_total*100:.1f}%)")
            print(f" Low confidence (<70%): {low_conf}/{block_total} ({low_conf/block_total*100:.1f}%)")

        if total_metrics["errors"] > 0:
            print(f" Errors: {total_metrics['errors']}")
        if total_metrics["empty_blocks"] > 0:
            print(f" Empty blocks: {total_metrics['empty_blocks']}")
        if total_metrics["invalid_bbox"] > 0:
            print(f" Invalid bboxes: {total_metrics['invalid_bbox']}")

        print(f"{'─'*60}")

    def generate(self):
        """
        Generate the searchable PDF.

        Each source page is rendered to an image, drawn as the page
        background, and overlaid with the OCR text. Pages in the JSON beyond
        the number of rendered images are skipped with a warning.

        Returns:
            Dict of totals: text_blocks, characters, confidences, errors,
            empty_blocks, invalid_bbox
        """
        print(f"\n{'='*60}")
        print(f"Processing: {self.json_path.name}")
        print(f"Output: {self.output_path}")
        print(f"{'='*60}")

        # Create output directory if needed
        self.output_path.parent.mkdir(parents=True, exist_ok=True)

        pages = self.data.get("pages", [])
        images = self._render_pages(pages)

        # Create PDF
        c = canvas.Canvas(str(self.output_path))

        # Global metrics
        total_metrics = {
            "text_blocks": 0,
            "characters": 0,
            "confidences": [],
            "errors": 0,
            "empty_blocks": 0,
            "invalid_bbox": 0,
        }

        for idx, page_data in enumerate(pages):
            if idx >= len(images):
                print(f" Warning: No image for page {idx + 1}")
                break

            img = images[idx]
            img_width, img_height = img.size

            # Detect white margins in the image
            margins = self._detect_margins(img)

            # Get OCR dimensions for this page
            page_ocr_width, page_ocr_height = self._get_ocr_dimensions(page_data)

            # CRITICAL: Use IMAGE dimensions as PDF dimensions, NOT OCR dimensions
            # The bbox will be scaled from OCR space to image space in normalize_bbox
            pdf_width = float(img_width)
            pdf_height = float(img_height)

            if self.debug_mode:
                print(f" PDF dims (from image): {pdf_width}x{pdf_height}")
                print(f" OCR dims (from bbox): {page_ocr_width}x{page_ocr_height}")
                if page_ocr_width > 0:
                    scale_x = pdf_width / page_ocr_width
                    scale_y = pdf_height / page_ocr_height
                    print(f" Scale factors: X={scale_x:.4f}, Y={scale_y:.4f}")

            c.setPageSize((pdf_width, pdf_height))

            # Save image to a temporary file. The path is recorded before
            # anything can fail so the finally below always removes it.
            tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
            tmp_path = tmp.name
            try:
                with tmp:
                    # Write through the open handle instead of re-opening by name.
                    img.save(tmp, 'PNG')

                # Draw image as background
                img_reader = ImageReader(tmp_path)
                c.drawImage(img_reader, 0, 0, width=pdf_width, height=pdf_height)

                # Add invisible text layer
                page_metrics = self._add_text_layer(c, page_data, img_width, img_height,
                                                    pdf_width, pdf_height, margins)

                # Accumulate metrics
                total_metrics["text_blocks"] += page_metrics["text_count"]
                total_metrics["characters"] += page_metrics["char_count"]
                total_metrics["confidences"].extend(page_metrics["confidences"])
                total_metrics["errors"] += page_metrics["errors"]
                total_metrics["empty_blocks"] += page_metrics["empty_blocks"]
                total_metrics["invalid_bbox"] += page_metrics["invalid_bbox"]

                # Show page
                c.showPage()

                # Calculate page average confidence
                if page_metrics["confidences"]:
                    avg_conf = sum(page_metrics["confidences"]) / len(page_metrics["confidences"])
                    print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks, "
                          f"{page_metrics['char_count']} chars, "
                          f"avg conf: {avg_conf:.1%}")
                else:
                    print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks")
            finally:
                # Clean up temp file
                os.unlink(tmp_path)

        # Save PDF
        c.save()

        # Display summary metrics
        self._print_summary(total_metrics)
        print(f" ✓ Saved to: {self.output_path}")

        return total_metrics


def process_single_file(input_path: str, output_path: str, debug_mode: bool = False):
    """
    Process a single JSON file.

    Returns:
        True on success; False if any exception occurred (the error and its
        traceback are printed).
    """
    try:
        generator = SearchablePDFGenerator(input_path, output_path,
                                           auto_mode=False, debug_mode=debug_mode)
        generator.generate()
    except Exception as e:
        print(f"Error processing {input_path}: {e}")
        traceback.print_exc()
        return False
    return True


def process_auto_mode(base_path: str = None, debug_mode: bool = False):
    """
    Auto-process all JSON files in OCR directories.

    Looks for "<base_path>/*/OCR/*.json". A failure on one file is reported
    and does not stop the remaining files.

    Args:
        base_path: Directory to scan; defaults to "Raw" next to this script
        debug_mode: If True, generate PDFs with visible text and bboxes

    Returns:
        True if at least one file was found and all of them succeeded.
    """
    if base_path:
        base_path = Path(base_path)
    else:
        base_path = Path(__file__).parent / "Raw"

    # Find all OCR JSON files (sorted so runs are processed in a stable order)
    json_files = sorted(base_path.glob("*/OCR/*.json"))

    if not json_files:
        print(f"No JSON files found in {base_path}/*/OCR/")
        return False

    print(f"\nFound {len(json_files)} JSON files to process")
    print("="*60)

    success_count = 0
    error_count = 0

    for json_file in json_files:
        try:
            generator = SearchablePDFGenerator(str(json_file), auto_mode=True,
                                               debug_mode=debug_mode)
            generator.generate()
        except Exception as e:
            print(f"\n❌ Error processing {json_file.name}: {e}")
            error_count += 1
        else:
            success_count += 1

    print("\n" + "="*60)
    print(f"SUMMARY: {success_count} succeeded, {error_count} failed")
    print("="*60)

    return error_count == 0


def main():
    """
    Main entry point.

    Returns:
        Process exit code: 0 on success, 1 on failure or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Convert OCR JSON to searchable PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process single file with output
  python json_to_searchable_pdf.py -i input.json -o output.pdf

  # Debug mode: visible text with red bounding boxes
  python json_to_searchable_pdf.py -i input.json -o output.pdf --debug

  # Auto-process all JSON files in Raw directory
  python json_to_searchable_pdf.py --auto

  # Auto-process from specific directory
  python json_to_searchable_pdf.py --auto /path/to/directory
"""
    )

    parser.add_argument(
        "-i", "--input",
        help="Input OCR JSON file"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output PDF file"
    )
    parser.add_argument(
        "--auto",
        nargs="?",
        const=True,
        help="Auto-process all JSON files (optionally specify base directory)"
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode: make text visible with red bounding boxes"
    )

    args = parser.parse_args()

    # Validate arguments
    if args.auto:
        # Auto mode: args.auto is True for a bare --auto, or the directory string
        base_dir = args.auto if isinstance(args.auto, str) else None
        success = process_auto_mode(base_dir, debug_mode=args.debug)
        return 0 if success else 1

    if args.input and args.output:
        # Single file mode
        success = process_single_file(args.input, args.output, debug_mode=args.debug)
        return 0 if success else 1

    parser.print_help()
    print("\n❌ Error: Either specify --auto or both -i and -o")
    return 1


if __name__ == "__main__":
    # sys.exit, not the interactive-only builtin exit() installed by `site`.
    sys.exit(main())