# File: railseek6/test_direct_ocr.py
# (113 lines, 3.4 KiB, Python)

import os
import sys
import asyncio
from pathlib import Path
# Add LightRAG to the import path so that `lightrag` can be imported below.
# The entry is a relative path, so it is resolved against the current working
# directory when the import is attempted; presumably the script is meant to be
# launched from the directory that contains LightRAG-main -- TODO confirm.
sys.path.append('LightRAG-main')
async def test_direct_ocr():
    """Run the document processor on ``ocr.pdf`` and report the outcome.

    The processor is obtained from ``lightrag.document_processor`` (imported
    inside the function so a missing package is reported rather than crashing
    the script) and asked to process ``ocr.pdf``, looked up relative to the
    current working directory. Progress and diagnostics are printed to stdout.

    Returns:
        bool: ``True`` when the processing result reports success. ``False``
        when the input file is missing, an import fails, the result reports
        failure, or any other exception is raised. All exceptions derived
        from ``Exception`` are caught and reported here, never propagated.
    """
    print("Testing direct OCR with GPU...")
    print("=" * 50)

    try:
        # Import the document processor
        from lightrag.document_processor import get_document_processor

        print("Initializing document processor...")
        processor = get_document_processor()

        # Test file
        ocr_pdf_path = "ocr.pdf"
        if not os.path.exists(ocr_pdf_path):
            print(f"Error: OCR PDF file not found at {ocr_pdf_path}")
            return False

        print(f"Processing OCR PDF: {ocr_pdf_path}")

        # Process the document directly
        result = await processor.process_document(ocr_pdf_path)

        # A result (typically a failed one) may carry no content at all.
        # Normalise None to "" so the diagnostics below cannot raise TypeError
        # and divert a plain processing failure into the "unexpected error"
        # handler.
        content = result.content or ""

        print(f"Processing result: {result.success}")
        print(f"Error message: {result.error}")
        print(f"Content length: {len(content)}")

        if not result.success:
            print(f"❌ OCR processing failed: {result.error}")
            return False

        print("✅ OCR processing successful!")
        print(f"Extracted content preview: {content[:500]}...")

        # Check if GPU was used (only reported when the processor exposes it)
        if hasattr(processor, 'use_gpu'):
            print(f"GPU usage: {processor.use_gpu}")

        # Check for any fallback messages embedded in the extracted text
        if "fallback" in content.lower():
            print("⚠️ WARNING: Fallback detected in content")

        return True

    except ImportError as e:
        print(f"❌ Import error: {e}")
        return False
    except Exception as e:
        # Top-level boundary of the diagnostic: report with a traceback and
        # signal failure through the return value instead of crashing.
        print(f"❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return False
async def check_paddleocr_gpu():
    """Print PaddlePaddle's version, CUDA build flag, device and visible GPUs.

    Nothing is returned. A missing ``paddle`` package and any other exception
    raised while querying it are reported on stdout instead of propagating.
    """
    print("\nChecking PaddleOCR GPU status...")
    print("=" * 30)

    try:
        import paddle

        print(f"Paddle version: {paddle.__version__}")
        print(f"Paddle is compiled with CUDA: {paddle.is_compiled_with_cuda()}")
        print(f"Paddle device: {paddle.get_device()}")

        cuda = paddle.device.cuda
        # The device count is only consulted here when the build has CUDA.
        gpu_available = paddle.is_compiled_with_cuda() and cuda.device_count() > 0
        print(f"GPU available: {gpu_available}")

        gpu_total = cuda.device_count()
        if gpu_total <= 0:
            print("No GPUs detected by PaddlePaddle")
        else:
            print(f"Number of GPUs: {gpu_total}")
            for index in range(gpu_total):
                print(f"GPU {index}: {cuda.get_device_name(index)}")

    except ImportError:
        print("❌ PaddlePaddle not installed")
    except Exception as exc:
        print(f"❌ Error checking PaddleOCR: {exc}")
async def main():
    """Drive the suite: GPU diagnostics first, then the direct OCR test.

    Returns:
        The value returned by ``test_direct_ocr`` (truthy on success).
    """
    divider = "=" * 50

    print("Direct OCR Test Suite")
    print(divider)

    # Report the PaddlePaddle / GPU status before running the OCR test.
    await check_paddleocr_gpu()
    print("\n" + divider)

    # Run the direct OCR test and keep its outcome for the caller.
    passed = await test_direct_ocr()
    print("\n" + divider)

    verdict = (
        "🎉 Direct OCR test completed successfully!"
        if passed
        else "💥 Direct OCR test failed!"
    )
    print(verdict)

    return passed
if __name__ == "__main__":
    # Run the async tests and propagate the outcome through the process exit
    # status (0 on success, 1 on failure) so shells and CI jobs can detect a
    # failed run instead of always seeing a clean exit.
    sys.exit(0 if asyncio.run(main()) else 1)