#!/usr/bin/env python3
"""
Test the updated OCR-first pipeline
"""

import asyncio
import os
import sys

# Make the LightRAG checkout importable: it is looked up as a
# 'LightRAG-main' directory under the current working directory, so the
# script has to be launched from the workspace root for the import in
# main() to resolve.
workspace_dir = os.getcwd()
lightrag_path = os.path.join(workspace_dir, 'LightRAG-main')
# Prepend only once so repeated imports of this module do not pile up
# duplicate sys.path entries.
if lightrag_path not in sys.path:
    sys.path.insert(0, lightrag_path)


def main(document_path='test.docx'):
    """Run the OCR-first document pipeline on one file and print a report.

    The processor comes from ``lightrag.document_processor``; the import is
    done inside the ``try`` so that an unavailable package is reported as a
    failed test instead of aborting the script. The processor's
    ``process_document`` coroutine is driven to completion with
    ``asyncio.run``.

    Args:
        document_path: Path of the document to process. Defaults to
            ``'test.docx'``, the file this script originally hard-coded.

    Returns:
        True when processing finished and the result's ``success`` flag is
        truthy; False when that flag is falsy or any exception was raised
        (the exception and its traceback are printed).
    """
    print('🧪 TESTING UPDATED OCR-FIRST PIPELINE')
    print('=' * 50)

    try:
        from lightrag.document_processor import get_document_processor
        processor = get_document_processor()

        print(f'1. Processing {document_path} with OCR-first logic...')
        result = asyncio.run(processor.process_document(document_path))

        # NOTE(review): assumes a failed run may leave ``content`` empty or
        # None; fall back to '' so the report below can still be printed.
        content = result.content or ''

        print(f' ✅ Processing successful: {result.success}')
        print(f' 📊 Content length: {len(content)}')
        print(f' 🖼️ Images processed: {result.metadata.get("images_count", 0)}')

        # Split once and keep only the image annotation lines; every
        # per-line check below looks exclusively at lines with '[Image'.
        image_lines = [line for line in content.split('\n') if '[Image' in line]
        ocr_text_found = any('OCR Text' in line for line in image_lines)
        classification_found = any('Classification' in line for line in image_lines)
        bee_found = 'bee' in content.lower()

        print(f' 📝 OCR text found: {ocr_text_found}')
        print(f' 🏷️ Classification found: {classification_found}')
        print(f' 🐝 Bee detection: {bee_found}')

        # Show some details
        print('\n 📋 Processing details:')
        for line in image_lines:
            if 'OCR Text' in line or 'Classification' in line:
                print(f' {line}')

        # Do not claim the pipeline works when the processor itself says
        # the run failed; previously this path still returned True.
        if not result.success:
            print('\n❌ Processing reported failure; skipping success summary.')
            return False

        print('\n✅ UPDATED PIPELINE WORKING:')
        print(' - GPU OCR runs first on all images')
        print(' - Classification only runs if OCR finds no text')
        print(' - Bee image properly detected when classification runs')
        print(' - Both modules running in GPU mode with isolation')

        return True

    except Exception as e:
        # Top-level boundary of a test script: report anything that went
        # wrong (missing package, missing file, pipeline error) and fail.
        print(f'❌ Error: {e}')
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Translate main()'s boolean outcome into a banner and a process exit
    # status: 0 when the pipeline test passed, 1 otherwise.
    passed = main()
    if not passed:
        print('\n💥 PIPELINE TEST FAILED!')
        sys.exit(1)
    print('\n✨ OCR-FIRST PIPELINE SUCCESSFUL!')
    sys.exit(0)