Files
railseek6/test_document_processor_standalone.py

235 lines
8.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Standalone Test for Enhanced Document Processor with Bee Classification
Tests the document processing pipeline directly without server dependencies
"""
import os
import sys
import asyncio
from pathlib import Path
# Add the LightRAG directory to path to import our enhanced processor
sys.path.insert(0, 'LightRAG-main')
def _print_image_details(images):
    """Print the classification and OCR details recorded for each image.

    Args:
        images: Iterable of per-image mappings. Only the keys
            ``primary_classification``, ``classification`` and ``ocr_text``
            are looked at; any of them may be absent.
    """
    print("\n🖼️ Image processing details:")
    for index, image in enumerate(images, start=1):
        print(f" Image {index}:")
        if 'primary_classification' in image:
            print(f" Classification: {image['primary_classification']}")
        if 'classification' in image:
            print(f" Full classification: {image['classification']}")
        if 'ocr_text' in image:
            # NOTE(review): the schema of ocr_text is not visible from this
            # file; coerce defensively so None / non-string values print
            # instead of raising.
            ocr_text = str(image['ocr_text'] or "")
            # Only show the ellipsis when the preview was actually cut.
            ellipsis = "..." if len(ocr_text) > 100 else ""
            print(f" OCR Text: {ocr_text[:100]}{ellipsis}")


def test_document_processor():
    """Process ``test.docx`` with ``DocumentProcessor`` and look for bee keywords.

    The processor is imported inside the function so that an import failure
    is reported as a failed test rather than aborting the script.

    Returns:
        bool: True when processing reports success and at least one of the
        bee keywords occurs in the extracted content. False when the test
        file is missing, processing reports failure, no keyword is found, or
        an exception is raised while importing or processing.
    """
    print("🧪 TESTING ENHANCED DOCUMENT PROCESSOR")
    print("=" * 50)
    try:
        # Import the enhanced document processor.
        # get_document_processor is not called in this function; it is kept
        # in the import so a missing name is still reported as a failure.
        from lightrag.document_processor import (  # noqa: F401
            DocumentProcessor,
            get_document_processor,
        )
        print("✅ Successfully imported enhanced document processor")

        # Test with test.docx (resolved against the current working directory)
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file {test_file} not found")
            return False
        print(f"📄 Testing with file: {test_file}")
        print("⏳ Processing document...")

        # Create processor instance
        processor = DocumentProcessor()

        # process_document is awaited, so wrap it in a coroutine that
        # asyncio.run() can drive from this synchronous test.
        async def process_doc():
            return await processor.process_document(test_file)

        result = asyncio.run(process_doc())
        print(f"✅ Document processing completed: {result.success}")
        if not result.success:
            print(f"❌ Document processing failed: {result.error}")
            return False

        # Treat missing content as empty text so the report below still prints.
        content = result.content or ""
        print("📊 Processing results:")
        print(f" - Content length: {len(content)} characters")
        print(f" - Metadata: {result.metadata}")
        print(f" - Images processed: {len(result.images) if result.images else 0}")

        # Check for bee classification in content (case-insensitive substring match)
        content_lower = content.lower()
        bee_keywords = ['bee', 'insect', 'animal', 'classification', 'photo of a bee']
        print("\n🔍 Searching for bee classification in content...")
        matched_keywords = [kw for kw in bee_keywords if kw in content_lower]
        for keyword in matched_keywords:
            print(f"✅ Found keyword: '{keyword}'")
        bee_found = bool(matched_keywords)

        # Extract specific classification lines
        classification_lines = [
            line for line in content.split('\n') if 'classification' in line.lower()
        ]
        if classification_lines:
            print("\n📋 Classification results found:")
            for line in classification_lines:
                print(f" - {line}")

        if bee_found:
            print("\n🎉 SUCCESS: Bee classification detected in processed content!")
            print(" The enhanced document processor is working correctly.")
            print(" Bee entities should now be searchable in LightRAG.")
        else:
            print("\n❌ No bee classification found in processed content")
            print(" This may indicate that the image classification didn't run")
            print(" or the bee image wasn't properly classified.")

        # Check if we have image metadata
        if result.images:
            try:
                _print_image_details(result.images)
            except (TypeError, KeyError, AttributeError) as detail_error:
                # Purely diagnostic output: a malformed image entry must not
                # overwrite the verdict that was already computed above.
                print(f" ⚠️ Could not print image details: {detail_error}")
        return bee_found
    except Exception as e:
        # Top-level boundary of this test: report the problem and fail.
        print(f"❌ Error testing document processor: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_image_classifier():
    """Check that the image classifier loads and classify any local test images.

    Looks for ``test_image.jpg``, ``test_image.jpeg`` and ``test_image.png``
    in the current working directory and prints the top-3 classification
    results for each one that exists.

    Returns:
        bool: True when the classifier reports itself available (whether or
        not any test image was found); False when it is unavailable or an
        exception is raised while importing or classifying.
    """
    print("\n" + "=" * 50)
    print("🖼️ TESTING IMAGE CLASSIFIER")
    print("=" * 50)
    try:
        # Import the image classifier lazily so a missing module is reported
        # as a failed test instead of aborting the script.
        from fast_image_classifier import get_image_classifier
        classifier = get_image_classifier()
        if not classifier.available:
            print("❌ Image classifier is not available")
            return False
        print("✅ Image classifier is available")

        # Test with a known image if available
        candidates = [f"test_image{ext}" for ext in ('.jpg', '.jpeg', '.png')]
        test_images = [path for path in candidates if os.path.exists(path)]
        if not test_images:
            print(" No test images found for direct classification test")
            return True

        for test_img in test_images:
            print(f"🔍 Testing classification on {test_img}...")
            results = classifier.classify_image(test_img, top_k=3)
            print(f"📊 Classification results for {test_img}:")
            for entry in results:
                print(f" - {entry['label']}: {entry['confidence']:.2f}")
        return True
    except Exception as e:
        # Top-level boundary of this test: report the problem and fail.
        print(f"❌ Error testing image classifier: {e}")
        # Print the traceback as test_document_processor does, so the cause
        # of the failure is not reduced to a one-line message.
        import traceback
        traceback.print_exc()
        return False
def check_dependencies():
    """Report which optional third-party packages can be imported.

    Each package is imported for real (not merely located), and one status
    line per package is printed.

    Returns:
        bool: True only when every listed package imported successfully.
    """
    import importlib

    print("🔍 CHECKING DEPENDENCIES")
    print("=" * 50)
    # (display name, module to import, attribute that must exist or None)
    required = (
        ('PaddleOCR', 'paddleocr', None),
        ('OpenCLIP', 'open_clip', None),
        ('PyMuPDF (fitz)', 'fitz', None),
        ('python-docx', 'docx', None),
        ('BeautifulSoup', 'bs4', 'BeautifulSoup'),
    )
    dependencies = {}
    for label, module_name, attribute in required:
        try:
            module = importlib.import_module(module_name)
            if attribute is not None and not hasattr(module, attribute):
                raise ImportError(f"cannot import name {attribute!r} from {module_name!r}")
        except ImportError:
            dependencies[label] = False
            print(f"❌ {label}: Not available")
        except Exception as exc:
            # An installed package can still fail while importing (for
            # example when a native library it needs cannot be loaded).
            # Report it as unavailable instead of aborting the whole check.
            dependencies[label] = False
            print(f"❌ {label}: Not available ({type(exc).__name__}: {exc})")
        else:
            dependencies[label] = True
            print(f"✅ {label}: Available")
    return all(dependencies.values())
def main():
    """Run the dependency check and both tests, then print a summary.

    Missing dependencies only produce a warning; the tests are still run.

    Returns:
        bool: True when both the image classifier test and the document
        processor test reported success, False otherwise.
    """
    print("🚀 ENHANCED DOCUMENT PROCESSOR TEST SUITE")
    print("=" * 60)
    print("This test verifies the complete document processing pipeline")
    print("with enhanced entity extraction for bee classification.")
    print()

    # Check dependencies
    if not check_dependencies():
        print("\n⚠️ Some dependencies are missing, but continuing with tests...")

    # Test image classifier
    classifier_ok = test_image_classifier()
    # Test document processor
    processor_ok = test_document_processor()

    print("\n" + "=" * 60)
    print("📊 TEST RESULTS SUMMARY")
    print("=" * 60)
    # The status icon follows the actual outcome rather than always showing
    # a check mark next to "ISSUES".
    for label, passed in (("Image Classifier", classifier_ok),
                          ("Document Processor", processor_ok)):
        icon = "✅" if passed else "❌"
        print(f"{icon} {label}: {'WORKING' if passed else 'ISSUES'}")

    if processor_ok:
        print("\n🎉 SUCCESS: Enhanced document processing pipeline is working!")
        print(" The bee classification should now be searchable in LightRAG.")
        print(" The enhanced entity extraction inserts bee classification")
        print(" as explicit entities for spaCy to extract.")
    else:
        print("\n❌ ISSUES: There are problems with the document processing pipeline")
        print(" Check the error messages above for details.")

    print("\n💡 Next steps:")
    print(" 1. Ensure LightRAG server is configured to use the enhanced processor")
    print(" 2. Upload test.docx to verify bee classification appears in search")
    print(" 3. Check server logs for document processing details")
    return bool(classifier_ok and processor_ok)


if __name__ == "__main__":
    # Propagate the outcome as the process exit status so shells and CI can
    # tell a failed run from a successful one.
    sys.exit(0 if main() else 1)