#!/usr/bin/env python3
"""
Test script to verify document processor functionality
"""

import asyncio
import os
import sys

# The relative entry is resolved against the current working directory, so the
# script is expected to be launched from the directory containing
# 'LightRAG-main' (presumably a local LightRAG checkout - confirm).
# It must be added before the `lightrag` import below.
sys.path.append('LightRAG-main')

from lightrag.document_processor import get_document_processor  # noqa: E402


async def test_processor():
    """Test the document processor with test.docx and print a report.

    Obtains a processor from ``get_document_processor()``, awaits
    ``process_document('test.docx')`` and prints:

    * the success flag, content length and image count,
    * the ``primary_classification`` of every returned image,
    * every content line mentioning "classification" or "bee",
    * the result metadata.

    Any ``Exception`` raised along the way is caught, reported and its
    traceback printed; nothing is returned.
    """
    print("🧪 Testing Document Processor")
    print("=" * 40)

    try:
        processor = get_document_processor()
        result = await processor.process_document('test.docx')

        # Normalise possibly-empty fields once so that a result without
        # content is reported as "length 0" instead of raising TypeError
        # (images were already guarded this way; content was not).
        content = result.content or ""
        images = result.images or []

        print(f"Success: {result.success}")
        print(f"Content length: {len(content)}")
        print(f"Images count: {len(images)}")

        for img in images:
            classification = img.get("primary_classification", "No classification")
            print(f"Image {img.get('index')}: {classification}")

        # Print content snippets that contain 'classification' or 'bee'
        print("\n🔍 Searching for classification content...")
        found_classification = False
        for line in content.split('\n'):
            lowered = line.lower()  # lower-case once per line for both checks
            if 'classification' in lowered or 'bee' in lowered:
                print(f"Found: {line}")
                found_classification = True

        if not found_classification:
            print("❌ No classification content found in document")

        # Check metadata
        print(f"\n📊 Metadata: {result.metadata}")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report everything,
        # including the full traceback, rather than crashing.
        print(f"❌ Error testing processor: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    # Script entry point: drive the async test to completion on a fresh event loop.
    asyncio.run(test_processor())