#!/usr/bin/env python3
"""
Test script to verify current OCR functionality
"""

import os
from pathlib import Path

import paddleocr

# Candidate documents, tried in order; the first one that exists is used.
_CANDIDATE_DOCUMENTS = (
    Path("test_ocr.pdf"),
    Path("test_documents/sample_ocr.pdf"),
)


def _find_test_document():
    """Return the first existing candidate document, or None if none exist."""
    return next((path for path in _CANDIDATE_DOCUMENTS if path.exists()), None)


def _iter_page_texts(page):
    """Yield the recognized text strings contained in one page of OCR output.

    Two result layouts are handled:

    * dict-like page results exposing a ``rec_texts`` list (the layout
      documented for PaddleOCR 3.x -- NOTE(review): confirm against the
      installed version);
    * the legacy layout, where a page is a list of ``[box, (text, score)]``
      entries.

    Empty or ``None`` pages yield nothing.
    """
    if not page:
        return
    if isinstance(page, dict):
        yield from page.get("rec_texts") or ()
    elif isinstance(page, list):
        for line in page:
            if line and len(line) >= 2:
                yield line[1][0]


def test_current_ocr():
    """Smoke-test the OCR engine with the parameters currently in use.

    Initializes PaddleOCR, then, if a sample document is found, runs OCR on
    it and prints up to the first 200 characters of the extracted text.
    Every outcome is reported on stdout; nothing is returned and no
    exception escapes.
    """
    print("Testing current OCR functionality...")

    # Test with the current parameters we're using.  Initialization gets its
    # own try block so that later failures are not misreported as init errors.
    try:
        ocr = paddleocr.PaddleOCR(use_textline_orientation=True, lang='en')
    except Exception as e:
        print(f"✗ OCR initialization failed: {e}")
        return
    print("✓ PaddleOCR initialized successfully with current parameters")

    # Test with a sample document if available, or just verify the model loads
    test_image_path = _find_test_document()
    if test_image_path is None:
        print("No test file found, but OCR engine initialized successfully")
        return

    print(f"Testing OCR on: {test_image_path}")
    try:
        result = ocr.ocr(str(test_image_path))
        if not result:
            print("✗ OCR extraction failed - no results")
            return
        print("✓ OCR extraction successful")

        texts = []
        for page in result:
            texts.extend(_iter_page_texts(page))
    except Exception as e:
        # Covers both the OCR call itself and unexpected result shapes.
        print(f"✗ OCR extraction failed: {e}")
        return

    # Each fragment is followed by a space, matching the original output.
    text_content = "".join(f"{text} " for text in texts)
    if text_content.strip():
        # Print only the first 200 characters of extracted text
        print(f"Extracted text sample: {text_content[:200]}...")
    else:
        print("No text extracted")


if __name__ == "__main__":
    test_current_ocr()