# railseek6/test_document_processor.py

#!/usr/bin/env python3
"""
Test script to verify document processor functionality
"""

import sys
import os
sys.path.append('LightRAG-main')
from lightrag.document_processor import get_document_processor
import asyncio

async def test_processor(document_path='test.docx', processor=None):
    """Process one document and print a human-readable report.

    The report lists the success flag, content length, image count, the
    ``primary_classification`` of every image, each content line that
    mentions 'classification' or 'bee' (case-insensitive), and the
    result metadata.

    Args:
        document_path: Path handed to ``processor.process_document``.
            Defaults to ``'test.docx'``.
        processor: Object exposing an awaitable
            ``process_document(path)``. When ``None``, the one returned
            by ``get_document_processor()`` is used.

    Returns:
        None. Everything is reported through ``print``; any exception is
        caught, printed together with its traceback, and not re-raised.
    """
    print("🧪 Testing Document Processor")
    print("=" * 40)

    try:
        if processor is None:
            processor = get_document_processor()
        result = await processor.process_document(document_path)

        # Normalise missing values so a result without content is reported
        # as empty (like missing images) instead of raising a TypeError.
        content = result.content or ""
        images = result.images or []

        print(f"Success: {result.success}")
        print(f"Content length: {len(content)}")
        print(f"Images count: {len(images)}")

        for img in images:
            classification = img.get("primary_classification", "No classification")
            print(f"Image {img.get('index')}: {classification}")

        # Print content lines that mention 'classification' or 'bee'
        print("\n🔍 Searching for classification content...")
        found_classification = False
        for line in content.split('\n'):
            lowered = line.lower()  # lower-case once, test both keywords
            if 'classification' in lowered or 'bee' in lowered:
                print(f"Found: {line}")
                found_classification = True

        if not found_classification:
            print("❌ No classification content found in document")

        # Check metadata
        print(f"\n📊 Metadata: {result.metadata}")

    except Exception as e:
        # Script-level boundary: report the failure instead of crashing.
        print(f"❌ Error testing processor: {e}")
        import traceback
        traceback.print_exc()

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    report_run = test_processor()
    asyncio.run(report_run)