# File listing metadata: 113 lines, 3.4 KiB, Python
"""Direct OCR smoke test.

Prints PaddlePaddle's GPU status, then runs ``ocr.pdf`` through the LightRAG
document processor and prints the outcome.
"""

import os
import sys
import asyncio
from pathlib import Path

# Add LightRAG to path.
# The entry is relative, so it depends on the current working directory.
# Skip the append when it is already present so that importing or reloading
# this module more than once does not pile up duplicate entries.
if 'LightRAG-main' not in sys.path:
    sys.path.append('LightRAG-main')


async def test_direct_ocr():
    """Run ``ocr.pdf`` through the document processor and print the outcome.

    The processor is obtained from
    ``lightrag.document_processor.get_document_processor`` and the file is
    handed to its ``process_document`` coroutine. The result's ``success``,
    ``error`` and ``content`` attributes are printed.

    Returns:
        bool: ``True`` when the result reports success. ``False`` when the
        PDF is missing, the processor reports failure, an import fails, or
        any other exception is raised (the traceback is printed in that case).
    """
    print("Testing direct OCR with GPU...")
    print("=" * 50)

    try:
        # Imported inside the try block so that a missing LightRAG package is
        # reported as a failed test rather than crashing at module import.
        from lightrag.document_processor import get_document_processor

        print("Initializing document processor...")
        processor = get_document_processor()

        # Test file (relative to the current working directory).
        ocr_pdf_path = "ocr.pdf"

        if not os.path.exists(ocr_pdf_path):
            print(f"Error: OCR PDF file not found at {ocr_pdf_path}")
            return False

        print(f"Processing OCR PDF: {ocr_pdf_path}")

        # Process the document directly.
        result = await processor.process_document(ocr_pdf_path)

        # A failed run may carry no content at all. Normalise it once so the
        # length report below cannot raise TypeError and hide the processor's
        # own error message behind "Unexpected error".
        # NOTE(review): assumes content is a string when present - confirm.
        content = result.content or ""

        print(f"Processing result: {result.success}")
        print(f"Error message: {result.error}")
        print(f"Content length: {len(content)}")

        if not result.success:
            print(f"❌ OCR processing failed: {result.error}")
            return False

        print("✅ OCR processing successful!")
        print(f"Extracted content preview: {content[:500]}...")

        # Report the GPU flag only if the processor exposes one.
        if hasattr(processor, 'use_gpu'):
            print(f"GPU usage: {processor.use_gpu}")

        # Warn when the extracted text itself mentions a fallback.
        if "fallback" in content.lower():
            print("⚠️ WARNING: Fallback detected in content")

        return True

    except ImportError as e:
        print(f"❌ Import error: {e}")
        return False
    except Exception as e:
        # Top-level boundary of the test: report everything and fail.
        print(f"❌ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return False


async def check_paddleocr_gpu():
    """Print PaddlePaddle's version, CUDA build flag, device and GPU list.

    Nothing is returned. A missing PaddlePaddle install, or any error raised
    while querying it, is reported on stdout instead of being propagated.
    """
    print("\nChecking PaddleOCR GPU status...")
    print("=" * 30)

    try:
        import paddle

        print(f"Paddle version: {paddle.__version__}")

        cuda_build = paddle.is_compiled_with_cuda()
        print(f"Paddle is compiled with CUDA: {cuda_build}")
        print(f"Paddle device: {paddle.get_device()}")

        # Query the GPU count once, and only on a CUDA build, so that the
        # "GPU available" line and the enumeration below always agree.
        gpu_count = paddle.device.cuda.device_count() if cuda_build else 0
        print(f"GPU available: {gpu_count > 0}")

        if gpu_count > 0:
            print(f"Number of GPUs: {gpu_count}")
            for i in range(gpu_count):
                print(f"GPU {i}: {paddle.device.cuda.get_device_name(i)}")
        else:
            print("No GPUs detected by PaddlePaddle")

    except ImportError:
        print("❌ PaddlePaddle not installed")
    except Exception as e:
        print(f"❌ Error checking PaddleOCR: {e}")


async def main():
    """Run the GPU status check followed by the direct OCR test.

    Returns:
        The value returned by ``test_direct_ocr()``.
    """
    divider = "=" * 50

    print("Direct OCR Test Suite")
    print(divider)

    # Step 1: report the PaddleOCR GPU status.
    await check_paddleocr_gpu()
    print(f"\n{divider}")

    # Step 2: run the direct OCR test and keep its verdict.
    success = await test_direct_ocr()
    print(f"\n{divider}")

    verdict = (
        "🎉 Direct OCR test completed successfully!"
        if success
        else "💥 Direct OCR test failed!"
    )
    print(verdict)

    return success


if __name__ == "__main__":
    # Run the async tests and turn the verdict into the process exit status,
    # so shells and CI jobs can tell a failed OCR run from a successful one.
    sys.exit(0 if asyncio.run(main()) else 1)