# railseek6/test_ocr_current.py

#!/usr/bin/env python3
"""
Test script to verify current OCR functionality
"""

import paddleocr
import os
from pathlib import Path

def extract_text_lines(result):
    """Return the recognized text strings contained in an OCR result.

    Two per-page shapes are understood so the script works regardless of
    which PaddleOCR result layout is returned:

    * a list of ``[box, (text, score)]`` entries, from which ``entry[1][0]``
      is taken;
    * a dict-like page result exposing the texts under ``"rec_texts"``.

    Args:
        result: The value returned by the OCR call; may be ``None`` or empty.

    Returns:
        A list of text strings in page order. Empty pages (``None``/``[]``)
        and pages of any other type are skipped.
    """
    texts = []
    for page in result or []:
        if not page:
            # Pages without detections come back as None or an empty list.
            continue
        if isinstance(page, dict):
            texts.extend(page.get("rec_texts") or [])
        elif isinstance(page, list):
            texts.extend(
                line[1][0] for line in page if line and len(line) >= 2
            )
    return texts


def test_current_ocr():
    """Smoke-test PaddleOCR with the parameters currently in use.

    Initializes the engine, then, if ``test_ocr.pdf`` or
    ``test_documents/sample_ocr.pdf`` exists, runs OCR on the first one found
    and prints a sample of the extracted text. Every outcome is reported via
    ``print``; the function never raises and returns ``None``.
    """
    print("Testing current OCR functionality...")

    # Keep initialization in its own try block so that the
    # "initialization failed" message is only printed for real
    # initialization errors, not for later extraction problems.
    try:
        ocr = paddleocr.PaddleOCR(use_textline_orientation=True, lang='en')
    except Exception as e:
        print(f"✗ OCR initialization failed: {e}")
        return
    print("✓ PaddleOCR initialized successfully with current parameters")

    try:
        # Test with a simple document if available, preferring the local one.
        candidates = (
            Path("test_ocr.pdf"),
            Path("test_documents/sample_ocr.pdf"),
        )
        test_image_path = next((p for p in candidates if p.exists()), None)
        if test_image_path is None:
            print("No test file found, but OCR engine initialized successfully")
            return

        print(f"Testing OCR on: {test_image_path}")
        result = ocr.ocr(str(test_image_path))
        if not result:
            print("✗ OCR extraction failed - no results")
            return

        print("✓ OCR extraction successful")
        text_content = " ".join(extract_text_lines(result))
    except Exception as e:
        # Best-effort script: report the failure instead of crashing.
        print(f"✗ OCR extraction failed: {e}")
        return

    # Print the first part of the extracted text.
    if text_content.strip():
        print(f"Extracted text sample: {text_content[:200]}...")
    else:
        print("No text extracted")

# Run the OCR smoke test only when this file is executed directly as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    test_current_ocr()