# railseek6/test_document_processor_standalone.py

"""
Standalone Test for Enhanced Document Processor with Bee Classification
Tests the document processing pipeline directly without server dependencies
"""

import os
import sys
import asyncio
from pathlib import Path

# Put the LightRAG directory first on the import path so that
# `lightrag.document_processor` (our enhanced processor) can be imported below.
# NOTE: 'LightRAG-main' is a relative path, so it is resolved against the
# current working directory -- run this script from the directory that
# contains `LightRAG-main`.
sys.path.insert(0, 'LightRAG-main')

def test_document_processor():
    """Process ``test.docx`` with DocumentProcessor and look for bee keywords.

    The processor is imported lazily, ``test.docx`` is read from the current
    working directory, and the extracted content is scanned for a fixed list
    of bee-related keywords. A human-readable report is printed as it goes.

    Returns:
        bool: True only if processing reported success and at least one
        keyword occurs in the content. False if the import fails, the test
        file is missing, processing reports failure, or any exception is
        raised while testing (the traceback is printed in that case).
    """
    print("🧪 TESTING ENHANCED DOCUMENT PROCESSOR")
    print("=" * 50)

    try:
        # Imported inside the try so an import problem is reported as a
        # failed test rather than aborting the whole script.
        from lightrag.document_processor import get_document_processor, DocumentProcessor

        print("✅ Successfully imported enhanced document processor")

        docx_path = "test.docx"
        if not os.path.exists(docx_path):
            print(f"❌ Test file {docx_path} not found")
            return False

        print(f"📄 Testing with file: {docx_path}")
        print("⏳ Processing document...")

        engine = DocumentProcessor()

        # asyncio.run() needs a coroutine, so the awaitable returned by
        # process_document() is awaited from a tiny wrapper.
        async def _run_processing():
            return await engine.process_document(docx_path)

        result = asyncio.run(_run_processing())

        print(f"✅ Document processing completed: {result.success}")

        # Guard clause: nothing more to inspect when processing failed.
        if not result.success:
            print(f"❌ Document processing failed: {result.error}")
            return False

        print("📊 Processing results:")
        print(f"   - Content length: {len(result.content)} characters")
        print(f"   - Metadata: {result.metadata}")
        print(f"   - Images processed: {len(result.images) if result.images else 0}")

        # Case-insensitive substring search for each keyword.
        haystack = result.content.lower()

        print("\n🔍 Searching for bee classification in content...")
        hits = [
            word
            for word in ('bee', 'insect', 'animal', 'classification', 'photo of a bee')
            if word in haystack
        ]
        for word in hits:
            print(f"✅ Found keyword: '{word}'")
        bee_found = bool(hits)

        # Echo every content line that mentions "classification".
        tagged_rows = [
            row for row in result.content.split('\n')
            if 'classification' in row.lower()
        ]
        if tagged_rows:
            print("\n📋 Classification results found:")
            for row in tagged_rows:
                print(f"   - {row}")

        if bee_found:
            print("\n🎉 SUCCESS: Bee classification detected in processed content!")
            print("   The enhanced document processor is working correctly.")
            print("   Bee entities should now be searchable in LightRAG.")
        else:
            print("\n❌ No bee classification found in processed content")
            print("   This may indicate that the image classification didn't run")
            print("   or the bee image wasn't properly classified.")

        # Per-image details; each field is printed only when present.
        if result.images:
            print("\n🖼️  Image processing details:")
            for number, image in enumerate(result.images, start=1):
                print(f"   Image {number}:")
                if 'primary_classification' in image:
                    print(f"      Classification: {image['primary_classification']}")
                if 'classification' in image:
                    print(f"      Full classification: {image['classification']}")
                if 'ocr_text' in image:
                    print(f"      OCR Text: {image['ocr_text'][:100]}...")

        return bee_found

    except Exception as exc:
        print(f"❌ Error testing document processor: {exc}")
        import traceback
        traceback.print_exc()
        return False

def test_image_classifier():
    """Check that the image classifier loads and try it on local sample images.

    Imports ``get_image_classifier`` lazily and inspects the classifier's
    ``available`` flag. When available, any of ``test_image.jpg``,
    ``test_image.jpeg`` or ``test_image.png`` found in the current working
    directory is classified with ``top_k=3`` and the results are printed.

    Returns:
        bool: True when the classifier reports itself available (whether or
        not sample images exist); False when it is unavailable or when any
        exception is raised, including the import failing.
    """
    print("\n" + "=" * 50)
    print("🖼️  TESTING IMAGE CLASSIFIER")
    print("=" * 50)

    try:
        from fast_image_classifier import get_image_classifier

        classifier = get_image_classifier()

        # Guard clause: bail out early if the classifier could not be set up.
        if not classifier.available:
            print("❌ Image classifier is not available")
            return False

        print("✅ Image classifier is available")

        # Collect whichever sample images exist, in a fixed extension order.
        candidates = (f"test_image{suffix}" for suffix in ('.jpg', '.jpeg', '.png'))
        sample_images = [path for path in candidates if os.path.exists(path)]

        if not sample_images:
            print("ℹ️  No test images found for direct classification test")

        for path in sample_images:
            print(f"🔍 Testing classification on {path}...")
            predictions = classifier.classify_image(path, top_k=3)
            print(f"📊 Classification results for {path}:")
            for prediction in predictions:
                print(f"   - {prediction['label']}: {prediction['confidence']:.2f}")

        return True

    except Exception as exc:
        print(f"❌ Error testing image classifier: {exc}")
        return False

def check_dependencies():
    """Report which optional third-party packages can be imported.

    Each dependency is probed by importing it; a line per dependency is
    printed saying whether it is available. Only ``ImportError`` is treated
    as "not available"; any other exception raised during import propagates.

    Returns:
        bool: True if every probed dependency imported successfully,
        otherwise False.
    """
    print("🔍 CHECKING DEPENDENCIES")
    print("=" * 50)

    # (display label, module to import, attribute that must be importable
    # from the module -- or None for a plain `import module`).
    probes = (
        ('PaddleOCR', 'paddleocr', None),
        ('OpenCLIP', 'open_clip', None),
        ('PyMuPDF (fitz)', 'fitz', None),
        ('python-docx', 'docx', None),
        ('BeautifulSoup', 'bs4', 'BeautifulSoup'),
    )

    dependencies = {label: False for label, _, _ in probes}

    for label, module_name, required_attr in probes:
        try:
            if required_attr is None:
                __import__(module_name)
            else:
                # Equivalent of `from module import attr`: import with a
                # fromlist, then insist that the attribute really exists.
                module = __import__(module_name, fromlist=[required_attr])
                if not hasattr(module, required_attr):
                    raise ImportError(
                        f"cannot import name {required_attr!r} from {module_name!r}"
                    )
        except ImportError:
            print(f"❌ {label}: Not available")
        else:
            dependencies[label] = True
            print(f"✅ {label}: Available")

    return all(dependencies.values())

def main():
    """Run the dependency check and both tests, then print a summary.

    Missing dependencies only produce a warning; the image-classifier test
    and the document-processor test are run regardless. The closing
    success/failure message depends solely on the document-processor result.
    """
    rule = "=" * 60

    for line in (
        "🚀 ENHANCED DOCUMENT PROCESSOR TEST SUITE",
        rule,
        "This test verifies the complete document processing pipeline",
        "with enhanced entity extraction for bee classification.",
        "",
    ):
        print(line)

    # Dependency problems are reported but never stop the run.
    if not check_dependencies():
        print("\n⚠️  Some dependencies are missing, but continuing with tests...")

    # Classifier first, then the full document pipeline.
    classifier_ok = test_image_classifier()
    processor_ok = test_document_processor()

    status = {True: 'WORKING', False: 'ISSUES'}

    print("\n" + rule)
    print("📊 TEST RESULTS SUMMARY")
    print(rule)
    print(f"✅ Image Classifier: {status[bool(classifier_ok)]}")
    print(f"✅ Document Processor: {status[bool(processor_ok)]}")

    if processor_ok:
        verdict = (
            "\n🎉 SUCCESS: Enhanced document processing pipeline is working!",
            "   The bee classification should now be searchable in LightRAG.",
            "   The enhanced entity extraction inserts bee classification",
            "   as explicit entities for spaCy to extract.",
        )
    else:
        verdict = (
            "\n❌ ISSUES: There are problems with the document processing pipeline",
            "   Check the error messages above for details.",
        )

    follow_up = (
        "\n💡 Next steps:",
        "   1. Ensure LightRAG server is configured to use the enhanced processor",
        "   2. Upload test.docx to verify bee classification appears in search",
        "   3. Check server logs for document processing details",
    )

    for line in verdict + follow_up:
        print(line)

# Run the test suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()