chineseclass/json_to_searchable_pdf.py

#!/usr/bin/env python3
"""
Script to convert OCR JSON files to searchable PDFs.
Takes OCR data from JSON and creates a PDF with an invisible text layer over the original page images.
Modes:
- Single file: python json_to_searchable_pdf.py -i input.json -o output.pdf
- Auto batch: python json_to_searchable_pdf.py --auto [directory]
"""
import json
import argparse
from pathlib import Path
from typing import List, Dict, Tuple
from PIL import Image
from pdf2image import convert_from_path
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import tempfile
import os
import sys
import numpy as np
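# Third-party dependencies: Pillow, pdf2image (which needs a Poppler install on PATH),
# reportlab, and numpy.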
class SearchablePDFGenerator:
"""Generator for searchable PDFs from OCR JSON data."""
# Class variable to track if Chinese font is registered
_chinese_font_registered = False
_chinese_font_name = None
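    # reportlab font registration is process-wide, so the class registers the CJK font
    # once and every instance reuses it.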
def __init__(self, json_path: str, output_path: str = None, auto_mode: bool = False, debug_mode: bool = False):
"""
Initialize the PDF generator.
Args:
json_path: Path to the OCR JSON file
output_path: Output PDF path (optional in auto mode)
auto_mode: If True, use default output paths
debug_mode: If True, make text visible for debugging
"""
self.json_path = Path(json_path)
self.data = self._load_json()
self.auto_mode = auto_mode
self.debug_mode = debug_mode
# Register Chinese font if not already done
if not SearchablePDFGenerator._chinese_font_registered:
self._register_chinese_font()
# Determine output path
if output_path:
self.output_path = Path(output_path)
elif auto_mode:
            # In auto mode, write <stem>_searchable.pdf next to the source PDF
source_pdf = Path(self.data.get("source_pdf", ""))
self.output_path = source_pdf.parent / f"{source_pdf.stem}_searchable.pdf"
else:
raise ValueError("Output path must be specified in non-auto mode")
# Get source PDF path from JSON
self.source_pdf = Path(self.data.get("source_pdf", ""))
if not self.source_pdf.exists():
raise FileNotFoundError(f"Source PDF not found: {self.source_pdf}")
def _register_chinese_font(self):
"""Register a Chinese-compatible font."""
# Common Chinese font paths on different systems
font_paths = [
# Windows
"C:/Windows/Fonts/simsun.ttc",
"C:/Windows/Fonts/msyh.ttc",
"C:/Windows/Fonts/simhei.ttf",
# Linux
"/usr/share/fonts/truetype/arphic/uming.ttc",
"/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
# macOS
"/System/Library/Fonts/PingFang.ttc",
"/Library/Fonts/Arial Unicode.ttf",
]
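        # Note: .ttc files are TrueType collections; reportlab's TTFont opens them at
        # subfont index 0 by default, so registration can fail and we fall through to the next path.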
for font_path in font_paths:
if Path(font_path).exists():
try:
pdfmetrics.registerFont(TTFont('ChineseFont', font_path))
SearchablePDFGenerator._chinese_font_name = 'ChineseFont'
SearchablePDFGenerator._chinese_font_registered = True
print(f" Registered Chinese font: {Path(font_path).name}")
return
                except Exception:
                    # Registration failed for this candidate font; try the next one
                    continue
# Fallback to Helvetica (won't show Chinese properly but won't crash)
print(" Warning: No Chinese font found, using Helvetica (Chinese chars may not display)")
SearchablePDFGenerator._chinese_font_name = 'Helvetica'
SearchablePDFGenerator._chinese_font_registered = True
def _load_json(self) -> Dict:
"""Load and parse the OCR JSON file."""
with open(self.json_path, 'r', encoding='utf-8') as f:
return json.load(f)
def _detect_margins(self, img: Image.Image, threshold: int = 240) -> Tuple[int, int, int, int]:
"""
Detect white margins in image.
Args:
img: PIL Image
threshold: Brightness threshold for "white" (0-255)
Returns:
Tuple of (top, right, bottom, left) margin sizes in pixels
"""
img_gray = np.array(img.convert('L'))
height, width = img_gray.shape
# Detect top margin
top_margin = 0
for row in range(min(height // 4, 200)): # Check first 25% or 200px
if img_gray[row, :].mean() >= threshold:
top_margin = row
else:
break
# Detect bottom margin
bottom_margin = 0
for row in range(height - 1, max(height * 3 // 4, height - 200), -1):
if img_gray[row, :].mean() >= threshold:
bottom_margin = height - row - 1
else:
break
        # Left/right margins are not detected (Chinese text usually uses the full width)
left_margin = 0
right_margin = 0
return (top_margin, right_margin, bottom_margin, left_margin)
def _get_ocr_dimensions_and_offsets(self, page_data: Dict) -> Tuple[int, int, int, int]:
"""
Estimate OCR image dimensions and offsets from bbox coordinates.
Args:
page_data: Page data from JSON
Returns:
Tuple of (width, height, offset_x, offset_y) in pixels
"""
details = page_data.get("details", [])
min_x = float('inf')
max_x = 0
min_y = float('inf')
max_y = 0
for detail in details:
bbox = detail.get("bbox", [])
if bbox and len(bbox) == 4:
for point in bbox:
min_x = min(min_x, point[0])
max_x = max(max_x, point[0])
min_y = min(min_y, point[1])
max_y = max(max_y, point[1])
# Handle edge case of no valid bboxes
if min_x == float('inf'):
return (0, 0, 0, 0)
        # Pad the detected extent by 2% so text near the page edges still falls inside the page
width = int(max_x * 1.02)
height = int(max_y * 1.02)
offset_x = int(min_x)
offset_y = int(min_y)
return (width, height, offset_x, offset_y)
def _get_ocr_dimensions(self, page_data: Dict) -> Tuple[int, int]:
"""Legacy wrapper for backward compatibility."""
width, height, _, _ = self._get_ocr_dimensions_and_offsets(page_data)
return (width, height)
def _convert_pdf_to_images(self, target_width: int = None, target_height: int = None) -> List[Image.Image]:
"""
Convert source PDF to images.
Args:
target_width: Desired image width (will calculate DPI to match)
target_height: Desired image height (will calculate DPI to match)
"""
print(f" Converting PDF to images: {self.source_pdf.name}")
if target_width or target_height:
# Calculate DPI to match target dimensions
# First convert at 300 DPI to get aspect ratio
test_img = convert_from_path(str(self.source_pdf), dpi=300, first_page=1, last_page=1)[0]
test_w, test_h = test_img.size
# Calculate required DPI
if target_width:
required_dpi = int((target_width / test_w) * 300)
print(f" Calculated DPI to match width {target_width}: {required_dpi}")
elif target_height:
required_dpi = int((target_height / test_h) * 300)
print(f" Calculated DPI to match height {target_height}: {required_dpi}")
images = convert_from_path(str(self.source_pdf), dpi=required_dpi)
else:
# Default: 300 DPI
images = convert_from_path(str(self.source_pdf), dpi=300)
return images
def _normalize_bbox(self, bbox: List[List[int]], img_width: int, img_height: int,
pdf_width: float, pdf_height: float,
ocr_width: int = None, ocr_height: int = None,
margins: Tuple[int, int, int, int] = None,
ocr_offsets: Tuple[int, int] = None) -> Tuple[float, float, float, float]:
"""
Convert bbox coordinates from image space to PDF space.
Args:
bbox: Bounding box [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
img_width: Original image width (from PDF conversion)
img_height: Original image height (from PDF conversion)
pdf_width: PDF page width
pdf_height: PDF page height
ocr_width: OCR image width (if different from img_width)
ocr_height: OCR image height (if different from img_height)
            margins: (top, right, bottom, left) margins to compensate for
            ocr_offsets: (offset_x, offset_y) origin of the OCR bbox space, subtracted from coordinates when given
Returns:
Tuple of (x, y, width, height) in PDF coordinates
"""
# Use OCR dimensions if provided, otherwise use image dimensions
source_width = ocr_width if ocr_width else img_width
source_height = ocr_height if ocr_height else img_height
# DEBUG: Print scaling info for first few calls
import threading
if not hasattr(threading.current_thread(), '_bbox_debug_count'):
threading.current_thread()._bbox_debug_count = 0
if threading.current_thread()._bbox_debug_count < 3:
threading.current_thread()._bbox_debug_count += 1
scale_y = pdf_height / source_height if source_height > 0 else 1.0
margin_info = f", top_margin={margins[0] if margins else 0}" if margins else ""
print(f" [normalize_bbox] source_height={source_height}, pdf_height={pdf_height}, scale_y={scale_y:.4f}{margin_info}")
# Get margins - if OCR was done on cropped image, we need to add margins back
if margins:
top_margin, right_margin, bottom_margin, left_margin = margins
else:
top_margin = right_margin = bottom_margin = left_margin = 0
# Get min/max coordinates from bbox
xs = [point[0] for point in bbox]
ys = [point[1] for point in bbox]
min_x = min(xs)
max_x = max(xs)
min_y = min(ys)
max_y = max(ys)
# Subtract OCR offsets if provided (bbox coords may not start at 0,0)
if ocr_offsets:
offset_x, offset_y = ocr_offsets
min_x_adj = min_x - offset_x
max_x_adj = max_x - offset_x
min_y_adj = min_y - offset_y
max_y_adj = max_y - offset_y
else:
min_x_adj = min_x
max_x_adj = max_x
min_y_adj = min_y
max_y_adj = max_y
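        # Mapping used below: a point (px, py) in OCR pixel space maps to PDF space as
        #   x_pdf = px / source_width  * pdf_width
        #   y_pdf = pdf_height - (py / source_height) * pdf_height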
# Convert to PDF coordinates (origin at bottom-left)
# CRITICAL: divide by source dimensions (OCR space), not image dimensions!
x = (min_x_adj / source_width) * pdf_width
width = ((max_x_adj - min_x_adj) / source_width) * pdf_width
# Y coordinate is inverted in PDF (origin at bottom)
# We want the bottom-left corner of the text box
y_bottom = pdf_height - ((max_y_adj / source_height) * pdf_height)
height = ((max_y_adj - min_y_adj) / source_height) * pdf_height
# Adjust Y to be at the baseline (bottom of text box)
y = y_bottom
return (x, y, width, height)
def _calculate_font_size(self, text: str, bbox_width: float, bbox_height: float,
c: canvas.Canvas, font_name: str = None) -> float:
"""
Calculate optimal font size to fit text in bbox.
Args:
text: The text to fit
bbox_width: Width of bounding box
bbox_height: Height of bounding box
c: Canvas object for text width calculation
font_name: Font name to use (defaults to Chinese font)
Returns:
Optimal font size in points
"""
if font_name is None:
font_name = SearchablePDFGenerator._chinese_font_name
# Start with height-based estimate (reduced by factor for better fit)
font_size = bbox_height * 0.7
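        # 0.7 is a heuristic: rendered glyphs usually occupy roughly 70% of the detected line height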
# Check if text fits horizontally, adjust if needed
max_iterations = 10
iteration = 0
while iteration < max_iterations:
try:
text_width = c.stringWidth(text, font_name, font_size)
if text_width <= bbox_width * 1.05: # Allow 5% overflow for safety
break
# Text too wide, reduce font size
font_size *= (bbox_width / text_width) * 0.95
if font_size < 1:
font_size = 1
break
iteration += 1
            except Exception:
                # stringWidth failed (e.g. unregistered font); fall back to height-based sizing
font_size = bbox_height * 0.5
break
return max(1, font_size)
def _add_text_layer(self, c: canvas.Canvas, page_data: Dict,
img_width: int, img_height: int,
pdf_width: float, pdf_height: float,
margins: Tuple[int, int, int, int] = None):
"""
Add invisible text layer to PDF page.
Args:
c: ReportLab canvas
page_data: Page data from JSON
img_width: Image width
img_height: Image height
pdf_width: PDF page width
            pdf_height: PDF page height
            margins: Detected white margins (accepted but not applied yet)
        Returns:
            Dict with metrics: text_count, char_count, confidences, errors, empty_blocks, invalid_bbox
"""
# Detect OCR image dimensions and offsets from bbox coordinates
ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(page_data)
ocr_offsets = (offset_x, offset_y)
if self.debug_mode:
if ocr_width > 0:
print(f" OCR dims: {ocr_width}x{ocr_height}, offsets: ({offset_x}, {offset_y})")
print(f" Converted img: {img_width}x{img_height}")
print(f" Using OCR dims as source, scaling to PDF/image dims")
details = page_data.get("details", [])
metrics = {
"text_count": 0,
"char_count": 0,
"confidences": [],
"errors": 0,
"empty_blocks": 0,
"invalid_bbox": 0
}
for detail in details:
text = detail.get("text", "").strip()
confidence = detail.get("confidence", 0.0)
if not text:
metrics["empty_blocks"] += 1
continue
bbox = detail.get("bbox", [])
if not bbox or len(bbox) != 4:
metrics["invalid_bbox"] += 1
continue
# Convert bbox to PDF coordinates
            # Margins and OCR offsets are deliberately passed as None for now; get the base scaling right first
x, y, width, height = self._normalize_bbox(bbox, img_width, img_height,
pdf_width, pdf_height,
ocr_width, ocr_height, None, None)
# Calculate optimal font size
font_size = self._calculate_font_size(text, width, height, c)
try:
# Use Chinese font
font_name = SearchablePDFGenerator._chinese_font_name
c.setFont(font_name, font_size)
if self.debug_mode:
# Debug mode: draw visible text with bounding boxes
from reportlab.lib.colors import red, blue
# Draw red rectangle around bbox
c.setStrokeColor(red)
c.setLineWidth(0.5)
c.rect(x, y, width, height, stroke=1, fill=0)
# Draw visible text in blue
c.setFillColor(blue)
c.drawString(x, y, text)
else:
# Normal mode: invisible text
text_obj = c.beginText(x, y)
text_obj.setTextRenderMode(3) # Mode 3 = invisible text
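                    # Render mode 3 paints neither fill nor stroke, so the text is
                    # selectable and searchable without being visible over the page image.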
text_obj.textLine(text)
c.drawText(text_obj)
metrics["text_count"] += 1
metrics["char_count"] += len(text)
metrics["confidences"].append(confidence)
except Exception as e:
metrics["errors"] += 1
if metrics["errors"] <= 3: # Only show first 3 errors
print(f" Warning: Could not add text '{text[:20]}...': {e}")
continue
return metrics
def generate(self):
"""Generate the searchable PDF."""
print(f"\n{'='*60}")
print(f"Processing: {self.json_path.name}")
print(f"Output: {self.output_path}")
print(f"{'='*60}")
# Create output directory if needed
self.output_path.parent.mkdir(parents=True, exist_ok=True)
pages = self.data.get("pages", [])
# Detect OCR dimensions from first page to match PDF conversion
if pages:
ocr_width, ocr_height, offset_x, offset_y = self._get_ocr_dimensions_and_offsets(pages[0])
print(f" Detected OCR dimensions: {ocr_width} x {ocr_height} (offsets: {offset_x}, {offset_y})")
            # Pick a render DPI from both OCR dimensions, not just the width,
            # so the converted images stay as close as possible to the OCR coordinate space
if ocr_width and ocr_height:
# Calculate DPI needed to match both dimensions
test_img = convert_from_path(str(self.source_pdf), dpi=300, first_page=1, last_page=1)[0]
test_w, test_h = test_img.size
# Calculate DPI for each dimension
dpi_for_width = int((ocr_width / test_w) * 300)
dpi_for_height = int((ocr_height / test_h) * 300)
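                # pdf2image output size scales linearly with DPI, so
                #   dpi_needed = (target_pixels / pixels_at_300dpi) * 300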
                # Average the two; height mismatches caused most of the alignment issues
target_dpi = int((dpi_for_width + dpi_for_height) / 2)
print(f" DPI for width: {dpi_for_width}, for height: {dpi_for_height}, using: {target_dpi}")
images = convert_from_path(str(self.source_pdf), dpi=target_dpi)
else:
images = self._convert_pdf_to_images()
else:
images = self._convert_pdf_to_images()
# Create PDF
c = canvas.Canvas(str(self.output_path))
# Global metrics
total_metrics = {
"text_blocks": 0,
"characters": 0,
"confidences": [],
"errors": 0,
"empty_blocks": 0,
"invalid_bbox": 0
}
for idx, page_data in enumerate(pages):
if idx >= len(images):
print(f" Warning: No image for page {idx + 1}")
break
img = images[idx]
img_width, img_height = img.size
            # Detect white margins in the image (computed but not yet applied when placing text)
margins = self._detect_margins(img)
# Get OCR dimensions for this page
page_ocr_width, page_ocr_height = self._get_ocr_dimensions(page_data)
# CRITICAL: Use IMAGE dimensions as PDF dimensions, NOT OCR dimensions
# The bbox will be scaled from OCR space to image space in normalize_bbox
pdf_width = float(img_width)
pdf_height = float(img_height)
if self.debug_mode:
print(f" PDF dims (from image): {pdf_width}x{pdf_height}")
print(f" OCR dims (from bbox): {page_ocr_width}x{page_ocr_height}")
if page_ocr_width > 0:
scale_x = pdf_width / page_ocr_width
scale_y = pdf_height / page_ocr_height
print(f" Scale factors: X={scale_x:.4f}, Y={scale_y:.4f}")
c.setPageSize((pdf_width, pdf_height))
# Save image to temporary file
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
img.save(tmp.name, 'PNG')
tmp_path = tmp.name
try:
# Draw image as background
img_reader = ImageReader(tmp_path)
c.drawImage(img_reader, 0, 0, width=pdf_width, height=pdf_height)
# Add invisible text layer
page_metrics = self._add_text_layer(c, page_data, img_width, img_height,
pdf_width, pdf_height, margins)
# Accumulate metrics
total_metrics["text_blocks"] += page_metrics["text_count"]
total_metrics["characters"] += page_metrics["char_count"]
total_metrics["confidences"].extend(page_metrics["confidences"])
total_metrics["errors"] += page_metrics["errors"]
total_metrics["empty_blocks"] += page_metrics["empty_blocks"]
total_metrics["invalid_bbox"] += page_metrics["invalid_bbox"]
# Show page
c.showPage()
# Calculate page average confidence
if page_metrics["confidences"]:
avg_conf = sum(page_metrics["confidences"]) / len(page_metrics["confidences"])
print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks, "
f"{page_metrics['char_count']} chars, "
f"avg conf: {avg_conf:.1%}")
else:
print(f" Page {idx + 1}/{len(pages)}: {page_metrics['text_count']} blocks")
finally:
# Clean up temp file
os.unlink(tmp_path)
# Save PDF
c.save()
# Display summary metrics
print(f"\n{''*60}")
print(f"QUALITY METRICS:")
print(f"{''*60}")
print(f" Total text blocks: {total_metrics['text_blocks']}")
print(f" Total characters: {total_metrics['characters']}")
if total_metrics["confidences"]:
avg_confidence = sum(total_metrics["confidences"]) / len(total_metrics["confidences"])
min_confidence = min(total_metrics["confidences"])
max_confidence = max(total_metrics["confidences"])
# Count high/low confidence blocks
high_conf = sum(1 for c in total_metrics["confidences"] if c >= 0.9)
low_conf = sum(1 for c in total_metrics["confidences"] if c < 0.7)
print(f" Average confidence: {avg_confidence:.1%}")
print(f" Confidence range: {min_confidence:.1%} - {max_confidence:.1%}")
print(f" High confidence (≥90%): {high_conf}/{total_metrics['text_blocks']} ({high_conf/total_metrics['text_blocks']*100:.1f}%)")
print(f" Low confidence (<70%): {low_conf}/{total_metrics['text_blocks']} ({low_conf/total_metrics['text_blocks']*100:.1f}%)")
if total_metrics["errors"] > 0:
print(f" Errors: {total_metrics['errors']}")
if total_metrics["empty_blocks"] > 0:
print(f" Empty blocks: {total_metrics['empty_blocks']}")
if total_metrics["invalid_bbox"] > 0:
print(f" Invalid bboxes: {total_metrics['invalid_bbox']}")
print(f"{''*60}")
print(f" ✓ Saved to: {self.output_path}")
return total_metrics
def process_single_file(input_path: str, output_path: str, debug_mode: bool = False):
"""Process a single JSON file."""
try:
generator = SearchablePDFGenerator(input_path, output_path, auto_mode=False, debug_mode=debug_mode)
generator.generate()
return True
except Exception as e:
print(f"Error processing {input_path}: {e}")
import traceback
traceback.print_exc()
return False
def process_auto_mode(base_path: str = None):
"""Auto-process all JSON files in OCR directories."""
if base_path:
base_path = Path(base_path)
else:
base_path = Path(__file__).parent / "Raw"
# Find all OCR JSON files
json_files = list(base_path.glob("*/OCR/*.json"))
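    # Matches any JSON file one level inside an OCR/ folder that sits directly under base_path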
if not json_files:
print(f"No JSON files found in {base_path}/*/OCR/")
return False
print(f"\nFound {len(json_files)} JSON files to process")
print("="*60)
success_count = 0
error_count = 0
for json_file in json_files:
try:
generator = SearchablePDFGenerator(str(json_file), auto_mode=True)
generator.generate()
success_count += 1
except Exception as e:
print(f"\n❌ Error processing {json_file.name}: {e}")
error_count += 1
continue
print("\n" + "="*60)
print(f"SUMMARY: {success_count} succeeded, {error_count} failed")
print("="*60)
return error_count == 0
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description="Convert OCR JSON to searchable PDF",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Process single file with output
python json_to_searchable_pdf.py -i input.json -o output.pdf
# Debug mode: visible text with red bounding boxes
python json_to_searchable_pdf.py -i input.json -o output.pdf --debug
# Auto-process all JSON files in Raw directory
python json_to_searchable_pdf.py --auto
# Auto-process from specific directory
python json_to_searchable_pdf.py --auto /path/to/directory
"""
)
parser.add_argument(
"-i", "--input",
help="Input OCR JSON file"
)
parser.add_argument(
"-o", "--output",
help="Output PDF file"
)
parser.add_argument(
"--auto",
nargs="?",
const=True,
help="Auto-process all JSON files (optionally specify base directory)"
)
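    # nargs="?" with const=True: a bare "--auto" stores True, "--auto DIR" stores the directory string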
parser.add_argument(
"--debug",
action="store_true",
help="Debug mode: make text visible with red bounding boxes"
)
args = parser.parse_args()
# Validate arguments
if args.auto:
# Auto mode
base_dir = args.auto if isinstance(args.auto, str) else None
success = process_auto_mode(base_dir)
return 0 if success else 1
elif args.input and args.output:
# Single file mode
success = process_single_file(args.input, args.output, debug_mode=args.debug)
return 0 if success else 1
else:
parser.print_help()
print("\n❌ Error: Either specify --auto or both -i and -o")
return 1
if __name__ == "__main__":
exit(main())
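# Quick sanity check of the text layer (assumes poppler-utils is installed and
# "output.pdf" stands in for the generated file):
#   pdftotext output.pdf - | head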