169 lines
7.0 KiB
Python
169 lines
7.0 KiB
Python
"""
Complete Test for Document Processing Pipeline with Dependency Isolation

Tests OCR, Image Classification, and Bee Detection
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import os
|
|
import tempfile
|
|
import zipfile
|
|
|
|
# Put the LightRAG-main directory at the front of sys.path so its packages can be imported
|
|
sys.path.insert(0, "LightRAG-main")
|
|
|
|
async def test_complete_solution():
    """Run the end-to-end document processing check and print a report.

    Processes ``test.docx`` with the document processor, prints per-image
    OCR and classification outcomes followed by a summary, then runs the
    two additional smoke checks. Any exception raised along the way is
    printed together with its traceback; nothing is returned or re-raised.
    """
    print("🧪 COMPLETE SOLUTION TEST")
    print("=" * 50)

    try:
        # Imported inside the try so an import failure is reported by the handler below.
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        print("🎯 COMPONENT STATUS:")
        ocr_status = '✅ Available' if processor.ocr_processor.ocr_available else '❌ Not Available'
        print(f" OCR: {ocr_status}")
        classifier_ready = processor.image_classifier and processor.image_classifier.available
        classifier_status = '✅ Available' if classifier_ready else '❌ Not Available'
        print(f" Image Classifier: {classifier_status}")

        # Test 1: process test.docx (expected to contain a bee image)
        docx_path = "test.docx"
        if not os.path.exists(docx_path):
            print(f"❌ Test file not found: {docx_path}")
            return

        print(f"\n📄 PROCESSING: {docx_path}")
        result = await processor.process_document(docx_path)

        if not result.success:
            print(f"❌ Processing failed: {result.error}")
            return

        print("✅ Processing successful")
        print(f"📊 Metadata: {result.metadata}")

        # OCR report: one line per image, plus a text preview when text was found.
        print("\n🔤 OCR PERFORMANCE:")
        ocr_success = False
        for number, image in enumerate(result.images, start=1):
            if 'ocr_text' not in image or not image['ocr_text'].strip():
                if 'ocr_error' in image:
                    print(f" ❌ Image {number}: {image['ocr_error']}")
                else:
                    print(f" ⚠️ Image {number}: No OCR text")
                continue

            ocr_success = True
            text = image['ocr_text']
            confidence = image.get('ocr_confidence', 0)
            print(f" ✅ Image {number}: {len(text)} chars, confidence: {confidence:.3f}")
            # The text is known to be non-blank here, so the preview is always shown.
            print(f" Text: {text[:50]}...")

        # Classification report: only the first prediction of each image is shown.
        print("\n🖼️ CLASSIFICATION PERFORMANCE:")
        classification_success = False
        bee_found = False
        for number, image in enumerate(result.images, start=1):
            if 'classification' not in image or not image['classification']:
                if 'classification_error' in image:
                    print(f" ❌ Image {number}: {image['classification_error']}")
                else:
                    print(f" ⚠️ Image {number}: No classification")
                continue

            classification_success = True
            best = image['classification'][0]
            label = best.get('label', 'unknown')
            score = best.get('confidence', 0)
            print(f" ✅ Image {number}: {label} (score: {score:.3f})")
            if 'bee' in label.lower():
                bee_found = True
                print(" 🎯 BEE DETECTED!")

        ocr_verdict = '✅ WORKING' if ocr_success else '❌ FAILED'
        classification_verdict = '✅ WORKING' if classification_success else '❌ FAILED'
        bee_verdict = '✅ SUCCESS' if bee_found else '❌ NOT FOUND'
        # Isolation is reported as achieved only when both OCR and classification produced output.
        isolation_verdict = '✅ ACHIEVED' if ocr_success and classification_success else '❌ FAILED'

        print("\n🎯 FINAL RESULTS:")
        print(f" OCR: {ocr_verdict}")
        print(f" Classification: {classification_verdict}")
        print(f" Bee Detection: {bee_verdict}")
        print(f" Dependency Isolation: {isolation_verdict}")

        # Test 2: OCR smoke check with the simple processor
        print("\n🧪 ADDITIONAL OCR TEST:")
        test_simple_ocr()

        # Test 3: classification smoke check on an image taken from the document
        print("\n🧪 ADDITIONAL CLASSIFICATION TEST:")
        await test_simple_classification(processor)

    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
def test_simple_ocr():
    """Smoke-test the simple OCR processor and print the outcome.

    Obtains the processor from ``simple_ocr_processor``, returns early with
    a message when it reports itself unavailable, and otherwise calls
    ``extract_text_from_image`` once, printing whether the returned value
    contains a ``'text'`` entry. Every exception (including a failed import)
    is caught and printed; the function always returns ``None``.
    """
    try:
        from simple_ocr_processor import get_simple_ocr_processor

        processor = get_simple_ocr_processor()
        if not processor.available:
            print(" ❌ Simple OCR processor not available")
            return

        # Reserve a placeholder image path. The file is closed immediately and kept on
        # disk (delete=False); it is not yet passed to the processor.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            test_image_path = f.name

        try:
            # For now only the call path is exercised: "test.docx" is not an image, so
            # extraction is presumably expected to fail without raising.
            result = processor.extract_text_from_image("test.docx")
            print(f" ✅ OCR subprocess execution: {'Working' if 'text' in result else 'Failed'}")
        finally:
            # Remove the placeholder even when the OCR call or the check above raises,
            # so a failing run does not leave a stray temp file behind.
            if os.path.exists(test_image_path):
                os.unlink(test_image_path)

    except Exception as e:
        print(f" ❌ OCR test failed: {e}")
|
|
|
|
async def test_simple_classification(processor):
    """Classify the first embedded image of ``test.docx`` and print the result.

    Args:
        processor: Object exposing ``image_classifier``; the classifier must
            provide ``available`` and ``classify_image(path, top_k=...)``.

    Prints a message and returns when the classifier is missing or
    unavailable. Otherwise the first archive entry under ``word/media/`` is
    copied to a temporary directory and classified with ``top_k=3``.
    Exceptions raised during extraction or classification are caught and
    printed.
    """
    classifier = processor.image_classifier
    if not (classifier and classifier.available):
        print(" ❌ Image classifier not available")
        return

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            extracted_path = None

            # A .docx file is a zip archive; embedded images live under word/media/.
            with zipfile.ZipFile("test.docx", 'r') as archive:
                first_media = next(
                    (entry for entry in archive.filelist
                     if entry.filename.startswith('word/media/')),
                    None,
                )
                if first_media is not None:
                    # Only the first image is needed for this check.
                    extracted_path = os.path.join(
                        temp_dir, os.path.basename(first_media.filename)
                    )
                    with archive.open(first_media.filename) as source, \
                            open(extracted_path, 'wb') as target:
                        target.write(source.read())

            if extracted_path is None:
                print(" ⚠️ No images found in test.docx for classification test")
                return

            print(f" Testing classification on: {os.path.basename(extracted_path)}")
            results = classifier.classify_image(extracted_path, top_k=3)

            # An empty result, or a first entry carrying an 'error' key, counts as failure.
            if not results or 'error' in results[0]:
                print(f" ❌ Classification failed: {results}")
                return

            print(" ✅ Classification working")
            for prediction in results:
                print(f" {prediction['label']}: {prediction['confidence']:.4f}")
                if 'bee' in prediction['label'].lower():
                    print(" 🎯 BEE CLASSIFICATION SUCCESS!")

    except Exception as e:
        print(f" ❌ Classification test failed: {e}")
|
|
|
|
if __name__ == "__main__":
    # Entry point when executed as a script: run the async pipeline check to completion.
    pipeline_check = test_complete_solution()
    asyncio.run(pipeline_check)