Files
railseek6/test_ocr_first.py

67 lines
2.3 KiB
Python

#!/usr/bin/env python3
"""
Test the updated OCR-first pipeline
"""
import sys
import os
import asyncio
# Make the bundled LightRAG checkout importable.
# The checkout is looked up relative to the *current working directory*, so
# this script is expected to be launched from the workspace root.
workspace_dir = os.getcwd()
lightrag_path = os.path.join(workspace_dir, 'LightRAG-main')
# Prepend (rather than append) so this checkout shadows any other `lightrag`
# already on the path; skip the insert when it is present to avoid duplicates.
if lightrag_path not in sys.path:
    sys.path.insert(0, lightrag_path)
def main():
    """Run the OCR-first pipeline on ``test.docx`` and print a report.

    Obtains a processor via ``lightrag.document_processor.get_document_processor``,
    awaits ``process_document('test.docx')`` (a relative path), and prints
    the reported success flag, the content length, the ``images_count``
    metadata entry, and every content line that carries an ``[Image`` marker
    together with ``OCR Text`` or ``Classification``.

    Returns:
        bool: True only when processing finished and the result reports
        success. False when the result reports failure, or when any
        exception was raised (its traceback is printed).
    """
    print('🧪 TESTING UPDATED OCR-FIRST PIPELINE')
    print('=' * 50)
    try:
        # Imported inside the try block so that a missing or broken LightRAG
        # install is reported through the handler below instead of crashing.
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()
        print('1. Processing test.docx with OCR-first logic...')
        result = asyncio.run(processor.process_document('test.docx'))
        print(f' ✅ Processing successful: {result.success}')
        print(f' 📊 Content length: {len(result.content)}')
        print(f' 🖼️ Images processed: {result.metadata.get("images_count", 0)}')

        # Split the content once and keep only the per-image annotation
        # lines; every check below is derived from this single list.
        image_lines = [
            line
            for line in result.content.split('\n')
            if '[Image' in line and ('OCR Text' in line or 'Classification' in line)
        ]
        ocr_text_found = any('OCR Text' in line for line in image_lines)
        classification_found = any('Classification' in line for line in image_lines)
        bee_found = 'bee' in result.content.lower()
        print(f' 📝 OCR text found: {ocr_text_found}')
        print(f' 🏷️ Classification found: {classification_found}')
        print(f' 🐝 Bee detection: {bee_found}')

        # Show some details
        print('\n 📋 Processing details:')
        for line in image_lines:
            print(f' {line}')

        # A result that reports failure must fail the test; without this
        # guard the success banner and a True return would follow regardless.
        if not result.success:
            print('\n❌ Processor reported failure; see details above.')
            return False

        # NOTE: the lines below are a static description of the intended
        # pipeline behaviour; they are not individually verified here.
        print('\n✅ UPDATED PIPELINE WORKING:')
        print(' - GPU OCR runs first on all images')
        print(' - Classification only runs if OCR finds no text')
        print(' - Bee image properly detected when classification runs')
        print(' - Both modules running in GPU mode with isolation')
        return True
    except Exception as e:
        # Top-level boundary of a test script: report everything and turn it
        # into a failing result rather than letting the exception escape.
        print(f'❌ Error: {e}')
        import traceback
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Translate main()'s boolean result into a process exit status so that
    # shells and CI runners can detect a failed run (0 = pass, 1 = fail).
    success = main()
    if success:
        print('\n✨ OCR-FIRST PIPELINE SUCCESSFUL!')
        sys.exit(0)
    else:
        print('\n💥 PIPELINE TEST FAILED!')
        sys.exit(1)