50 lines
1.8 KiB
Python
50 lines
1.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
Test script to verify that the current PaddleOCR configuration initializes
and, when a sample PDF is available, extracts text from it.
"""
|
|
|
|
import paddleocr
|
|
import os
|
|
from pathlib import Path
|
|
|
|
def _collect_texts(result):
    """Return the recognized text strings found in an OCR result, in order.

    Two page layouts are accepted:

    * dict-like pages carrying a ``rec_texts`` sequence;
    * list pages whose entries look like ``[box, (text, score)]``.

    Pages or entries of any other shape are skipped rather than raising.

    NOTE(review): the dict layout is what PaddleOCR 3.x is documented to
    return and the list layout is the 2.x one -- confirm against the
    installed PaddleOCR version.
    """
    texts = []
    for page in result or ():
        if not page:
            continue
        if isinstance(page, dict):
            texts.extend(str(text) for text in page.get("rec_texts") or ())
        elif isinstance(page, list):
            for line in page:
                if line and len(line) >= 2:
                    # line[1] is expected to be a (text, score) pair.
                    texts.append(line[1][0])
    return texts


def test_current_ocr():
    """Smoke-test the PaddleOCR setup and report the outcome on stdout.

    Initializes ``paddleocr.PaddleOCR`` with ``use_textline_orientation=True``
    and ``lang='en'``. If ``test_ocr.pdf`` or, failing that,
    ``test_documents/sample_ocr.pdf`` exists relative to the working
    directory, OCR is run on it and the first 200 characters of the
    recognized text are printed.

    Returns:
        None. Every outcome, including failures, is printed; exceptions
        raised by initialization or extraction are caught and reported.
    """
    print("Testing current OCR functionality...")

    # Initialization and extraction are guarded separately so the message
    # printed on failure names the step that actually broke.
    try:
        ocr = paddleocr.PaddleOCR(use_textline_orientation=True, lang='en')
    except Exception as e:
        print(f"✗ OCR initialization failed: {e}")
        return
    print("✓ PaddleOCR initialized successfully with current parameters")

    # Use the first sample document that exists; without one, a successful
    # model load is all that can be verified.
    candidates = (Path("test_ocr.pdf"), Path("test_documents/sample_ocr.pdf"))
    test_image_path = next((p for p in candidates if p.exists()), None)
    if test_image_path is None:
        print("No test file found, but OCR engine initialized successfully")
        return

    print(f"Testing OCR on: {test_image_path}")
    try:
        result = ocr.ocr(str(test_image_path))
        texts = _collect_texts(result)
    except Exception as e:
        print(f"✗ OCR extraction failed: {e}")
        return

    if not result:
        print("✗ OCR extraction failed - no results")
        return
    print("✓ OCR extraction successful")

    # Print only the first few lines' worth of extracted text.
    text_content = " ".join(texts)
    if text_content.strip():
        print(f"Extracted text sample: {text_content[:200]}...")
    else:
        print("No text extracted")
|
|
|
|
# Script entry point: run the OCR smoke check only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    test_current_ocr()