235 lines
8.5 KiB
Python
235 lines
8.5 KiB
Python
"""
|
||
Standalone Test for Enhanced Document Processor with Bee Classification
|
||
Tests the document processing pipeline directly without server dependencies
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
import asyncio
|
||
from pathlib import Path
|
||
|
||
# Add the LightRAG directory to path to import our enhanced processor
|
||
sys.path.insert(0, 'LightRAG-main')
|
||
|
||
def _find_keywords(content, keywords):
    """Return the entries of *keywords* that occur in *content* (case-insensitive)."""
    content_lower = content.lower()
    return [keyword for keyword in keywords if keyword in content_lower]


def _print_image_details(images):
    """Print the classification / OCR fields present on each processed image."""
    print("\n🖼️ Image processing details:")
    for index, image in enumerate(images, start=1):
        print(f" Image {index}:")
        if 'primary_classification' in image:
            print(f" Classification: {image['primary_classification']}")
        if 'classification' in image:
            print(f" Full classification: {image['classification']}")
        if 'ocr_text' in image:
            # The key may be present with an empty/None value; slicing None
            # would raise and turn an otherwise successful run into a failure.
            ocr_text = image['ocr_text'] or ''
            print(f" OCR Text: {ocr_text[:100]}...")


def test_document_processor():
    """Process ``test.docx`` with the document processor and look for bee keywords.

    Imports ``DocumentProcessor`` from ``lightrag.document_processor``, runs its
    ``process_document`` coroutine on ``test.docx`` (resolved against the current
    working directory), prints a report, and scans the extracted text for a
    fixed list of classification keywords.

    This is a script-style check that reports through ``print`` and a boolean,
    not through assertions.

    Returns:
        bool: ``True`` when processing succeeded and at least one keyword was
        found in the extracted content. ``False`` when the test file is
        missing, processing reports failure, no keyword is found, or any
        exception (including a failed import) is raised along the way.
    """
    print("🧪 TESTING ENHANCED DOCUMENT PROCESSOR")
    print("=" * 50)

    try:
        # Imported lazily so a missing package is reported as a test failure
        # instead of breaking the whole script at import time.
        from lightrag.document_processor import DocumentProcessor

        print("✅ Successfully imported enhanced document processor")

        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file {test_file} not found")
            return False

        print(f"📄 Testing with file: {test_file}")
        print("⏳ Processing document...")

        processor = DocumentProcessor()

        async def process_doc():
            # Wrapping in a coroutine lets asyncio.run() accept whatever
            # awaitable process_document returns.
            return await processor.process_document(test_file)

        result = asyncio.run(process_doc())

        print(f"✅ Document processing completed: {result.success}")

        if not result.success:
            print(f"❌ Document processing failed: {result.error}")
            return False

        # Guard against a successful result that carries no text at all.
        content = result.content or ""

        print("📊 Processing results:")
        print(f" - Content length: {len(content)} characters")
        print(f" - Metadata: {result.metadata}")
        print(f" - Images processed: {len(result.images) if result.images else 0}")

        bee_keywords = ['bee', 'insect', 'animal', 'classification', 'photo of a bee']

        print("\n🔍 Searching for bee classification in content...")
        found_keywords = _find_keywords(content, bee_keywords)
        for keyword in found_keywords:
            print(f"✅ Found keyword: '{keyword}'")
        bee_found = bool(found_keywords)

        # Show every line that mentions a classification, for manual inspection.
        classification_lines = [
            line for line in content.split('\n') if 'classification' in line.lower()
        ]
        if classification_lines:
            print("\n📋 Classification results found:")
            for line in classification_lines:
                print(f" - {line}")

        if bee_found:
            print("\n🎉 SUCCESS: Bee classification detected in processed content!")
            print(" The enhanced document processor is working correctly.")
            print(" Bee entities should now be searchable in LightRAG.")
        else:
            print("\n❌ No bee classification found in processed content")
            print(" This may indicate that the image classification didn't run")
            print(" or the bee image wasn't properly classified.")

        if result.images:
            _print_image_details(result.images)

        return bee_found

    except Exception as e:
        # Top-level boundary of this check: report everything and fail softly
        # so the remaining checks in the suite still run.
        print(f"❌ Error testing document processor: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
||
def test_image_classifier():
    """Check that the image classifier loads and, if possible, classify sample images.

    Imports ``get_image_classifier`` from ``fast_image_classifier``, obtains a
    classifier, and checks its ``available`` flag. When available, every
    existing file among ``test_image.jpg``, ``test_image.jpeg`` and
    ``test_image.png`` (relative to the current working directory) is
    classified with ``top_k=3`` and the label/confidence pairs are printed.

    This is a script-style check that reports through ``print`` and a boolean,
    not through assertions.

    Returns:
        bool: ``True`` when the classifier reports itself available (whether
        or not sample images were found). ``False`` when it is unavailable or
        any exception (including a failed import) is raised.
    """
    print("\n" + "=" * 50)
    print("🖼️ TESTING IMAGE CLASSIFIER")
    print("=" * 50)

    try:
        # Imported lazily so a missing module is reported as a failed check.
        from fast_image_classifier import get_image_classifier

        classifier = get_image_classifier()

        if not classifier.available:
            print("❌ Image classifier is not available")
            return False

        print("✅ Image classifier is available")

        # Only sample images that actually exist on disk are classified.
        candidates = [f"test_image{ext}" for ext in ('.jpg', '.jpeg', '.png')]
        test_images = [path for path in candidates if os.path.exists(path)]

        if not test_images:
            print("ℹ️ No test images found for direct classification test")
            return True

        for test_img in test_images:
            print(f"🔍 Testing classification on {test_img}...")
            results = classifier.classify_image(test_img, top_k=3)
            print(f"📊 Classification results for {test_img}:")
            for result in results:
                print(f" - {result['label']}: {result['confidence']:.2f}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report and fail softly. The
        # traceback is printed as well, matching the document-processor check.
        print(f"❌ Error testing image classifier: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
||
def check_dependencies(dependencies=None):
    """Report which dependencies can be imported.

    Each dependency is checked by actually importing its module, and one
    status line per dependency is printed.

    Args:
        dependencies: Optional mapping of display name -> importable module
            name. When omitted, the pipeline's default set is checked:
            PaddleOCR, OpenCLIP, PyMuPDF (fitz), python-docx and
            BeautifulSoup.

    Returns:
        bool: ``True`` when every listed module imported successfully (also
        for an empty mapping), ``False`` when at least one raised
        ``ImportError``.
    """
    # Local import keeps this function self-contained.
    import importlib

    if dependencies is None:
        dependencies = {
            'PaddleOCR': 'paddleocr',
            'OpenCLIP': 'open_clip',
            'PyMuPDF (fitz)': 'fitz',
            'python-docx': 'docx',
            'BeautifulSoup': 'bs4',
        }

    print("🔍 CHECKING DEPENDENCIES")
    print("=" * 50)

    all_available = True
    for label, module_name in dependencies.items():
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"❌ {label}: Not available")
            all_available = False
        else:
            print(f"✅ {label}: Available")

    return all_available
|
||
|
||
def main():
    """Run the dependency check and both component checks, then print a summary.

    Missing dependencies only produce a warning; the image-classifier and
    document-processor checks are run regardless. The closing success/failure
    message is driven by the document-processor result alone.
    """
    print("🚀 ENHANCED DOCUMENT PROCESSOR TEST SUITE")
    print("=" * 60)
    print("This test verifies the complete document processing pipeline")
    print("with enhanced entity extraction for bee classification.")
    print()

    # A missing dependency is reported but does not abort the run.
    if not check_dependencies():
        print("\n⚠️ Some dependencies are missing, but continuing with tests...")

    classifier_ok = test_image_classifier()
    processor_ok = test_document_processor()

    print("\n" + "=" * 60)
    print("📊 TEST RESULTS SUMMARY")
    print("=" * 60)
    # The status icon follows the actual result, so a failing component is
    # never shown with a check mark.
    summary = (
        ("Image Classifier", classifier_ok),
        ("Document Processor", processor_ok),
    )
    for label, ok in summary:
        icon = "✅" if ok else "❌"
        status = "WORKING" if ok else "ISSUES"
        print(f"{icon} {label}: {status}")

    if processor_ok:
        print("\n🎉 SUCCESS: Enhanced document processing pipeline is working!")
        print(" The bee classification should now be searchable in LightRAG.")
        print(" The enhanced entity extraction inserts bee classification")
        print(" as explicit entities for spaCy to extract.")
    else:
        print("\n❌ ISSUES: There are problems with the document processing pipeline")
        print(" Check the error messages above for details.")

    print("\n💡 Next steps:")
    print(" 1. Ensure LightRAG server is configured to use the enhanced processor")
    print(" 2. Upload test.docx to verify bee classification appears in search")
    print(" 3. Check server logs for document processing details")
|
||
|
||
# Run the suite only when executed as a script, not when imported.
if __name__ == "__main__":
    main()