Files
railseek6/test_ocr_current.py

50 lines
1.8 KiB
Python

#!/usr/bin/env python3
"""
Test script to verify current OCR functionality
"""
import paddleocr
import os
from pathlib import Path
def _collect_text(result):
    """Return the recognized text strings contained in a PaddleOCR result.

    Two per-page layouts are accepted:

    * dict-like pages carrying the recognized strings under ``rec_texts``
      (the layout described in the PaddleOCR 3.x documentation);
    * list pages whose entries look like ``[box, (text, score)]`` (the
      layout the original version of this script expected).

    Empty/``None`` pages and malformed entries are skipped.
    """
    texts = []
    for page in result or []:
        if not page:
            continue
        if isinstance(page, dict):
            rec_texts = page.get("rec_texts")
            if rec_texts is not None:
                texts.extend(str(text) for text in rec_texts)
        elif isinstance(page, list):
            for line in page:
                if line and len(line) >= 2:
                    texts.append(line[1][0])
    return texts


def test_current_ocr():
    """Smoke-test the OCR engine with the parameters currently in use.

    Initializes PaddleOCR, then, if ``test_ocr.pdf`` or
    ``test_documents/sample_ocr.pdf`` exists, runs OCR on the first one found
    and prints a sample of the extracted text.  Every outcome is reported on
    stdout; the function never raises and always returns ``None``.
    """
    print("Testing current OCR functionality...")

    # Test with the current parameters we're using.
    try:
        ocr = paddleocr.PaddleOCR(use_textline_orientation=True, lang='en')
    except Exception as e:  # diagnostic script: report the failure, don't crash
        print(f"✗ OCR initialization failed: {e}")
        return
    print("✓ PaddleOCR initialized successfully with current parameters")

    # Test with a sample document if available, otherwise the successful
    # model load above is the whole check.
    candidates = (Path("test_ocr.pdf"), Path("test_documents/sample_ocr.pdf"))
    test_image_path = next((p for p in candidates if p.exists()), None)
    if test_image_path is None:
        print("No test file found, but OCR engine initialized successfully")
        return

    print(f"Testing OCR on: {test_image_path}")
    # Kept separate from the initialization handler so that an extraction
    # error is not misreported as an initialization failure.
    try:
        result = ocr.ocr(str(test_image_path))
        texts = _collect_text(result)
    except Exception as e:
        print(f"✗ OCR extraction failed: {e}")
        return

    if not result:
        print("✗ OCR extraction failed - no results")
        return
    print("✓ OCR extraction successful")

    # Print the first part of the extracted text (each fragment is followed
    # by a single space, matching the original output format).
    text_content = "".join(f"{text} " for text in texts)
    if text_content.strip():
        print(f"Extracted text sample: {text_content[:200]}...")
    else:
        print("No text extracted")
# Script entry point: run the OCR smoke test only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_current_ocr()