# railseek6/debug_ocr_extraction.py

import os
import sys
import fitz  # PyMuPDF
import tempfile
import paddleocr
from pathlib import Path

def debug_ocr_on_pdf(pdf_path):
    """Print what PaddleOCR extracts from each page of a PDF.

    Each page is rendered to a temporary PNG with PyMuPDF and passed to
    PaddleOCR. The raw OCR result, every recognised text fragment with its
    confidence, and the combined text are printed to stdout. A missing
    input file, OCR initialisation failures, and PDF/OCR processing errors
    are reported with a message instead of being raised.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. All diagnostics go to stdout.
    """
    print(f"Testing OCR extraction on: {pdf_path}")
    file_exists = os.path.exists(pdf_path)
    print(f"File exists: {file_exists}")
    if not file_exists:
        # os.path.getsize raises on a missing file, so stop with a clear
        # message instead of a traceback.
        print(f"Cannot run OCR: {pdf_path} does not exist")
        return
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    # Initialize PaddleOCR with current configuration
    try:
        ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')
        print("PaddleOCR initialized successfully")
    except Exception as e:
        print(f"Error initializing PaddleOCR: {e}")
        return

    # Open PDF and process each page
    doc = None
    try:
        doc = fitz.open(pdf_path)
        print(f"PDF has {len(doc)} pages")

        page_texts = []

        for page_num in range(len(doc)):
            print(f"\n--- Processing Page {page_num + 1} ---")
            page = doc.load_page(page_num)
            pix = page.get_pixmap()
            img_data = pix.tobytes("png")

            # delete=False keeps the file on disk after the handle closes so
            # the OCR engine can open it by path; it is removed in `finally`.
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
                temp_img.write(img_data)
                temp_img_path = temp_img.name

            try:
                # Run OCR on the temporary image
                result = ocr.ocr(temp_img_path)
                print(f"OCR result structure: {type(result)}")

                if result:
                    # NOTE(review): the nested loops below treat `result` as
                    # a list of groups of [box, (text, confidence)] entries,
                    # so this count may be groups rather than text lines --
                    # confirm against the installed PaddleOCR version.
                    print(f"Number of lines detected: {len(result)}")

                    fragments = []
                    for line_num, line in enumerate(result):
                        print(f"Line {line_num}: {line}")
                        if not line:  # Skip empty/None groups
                            continue
                        for word_info in line:
                            if len(word_info) < 2:
                                continue
                            text = word_info[1][0]
                            confidence = word_info[1][1] if len(word_info[1]) > 1 else "N/A"
                            print(f"  Word: '{text}' (confidence: {confidence})")
                            fragments.append(text)

                    page_text = " ".join(fragments).strip()
                    print(f"Extracted text from page {page_num + 1}: '{page_text}'")
                    page_texts.append(page_text)
                else:
                    print(f"No OCR results for page {page_num + 1}")

            except Exception as e:
                # Keep going: one unreadable page should not hide the others.
                print(f"Error during OCR processing page {page_num + 1}: {e}")
            finally:
                # Clean up temp file (best effort, but report the failure)
                try:
                    os.unlink(temp_img_path)
                except OSError as e:
                    print(f"Warning: could not remove temp file {temp_img_path}: {e}")

        # One line per page that produced an OCR result.
        all_extracted_text = "".join(f"{text}\n" for text in page_texts)

        print("\n--- FINAL EXTRACTED TEXT ---")
        print(all_extracted_text)
        print(f"Total characters extracted: {len(all_extracted_text)}")

    except Exception as e:
        print(f"Error processing PDF: {e}")
    finally:
        # Close the document even when a page fails part-way through.
        if doc is not None:
            doc.close()

if __name__ == "__main__":
    # Use the PDF given as the first command-line argument; with no
    # argument, fall back to the ocr.pdf test file.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf"
    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} not found. Looking for similar files...")
        # Pick the first PDF in the current directory whose name mentions "ocr"
        for file in os.listdir("."):
            if file.lower().endswith(".pdf"):
                print(f"Found PDF: {file}")
                if "ocr" in file.lower():
                    pdf_path = file
                    break
        else:
            # No candidate matched: running the debugger on a path that does
            # not exist cannot produce anything useful, so stop here.
            print("No matching PDF found; nothing to process.")
            sys.exit(1)

    debug_ocr_on_pdf(pdf_path)