# railseek6/test_fix_safedistance.py

import asyncio
import sys
import os
sys.path.insert(0, 'LightRAG-main')
from lightrag.document_processor import get_document_processor

async def test_pdf():
    """Process ``test/safedistance.pdf`` and print a diagnostic summary.

    Prints, in order: the result's success flag, the length of the extracted
    content, the number of tables (with up to the first three rows of each),
    whether the metadata reports ``processed_with_ocr``, and the first 500
    characters of the extracted content.

    This is a manual smoke check: it makes no assertions and returns ``None``.
    """
    processor = get_document_processor()
    result = await processor.process_document('test/safedistance.pdf')

    # Normalise optional fields once. ``tables`` was already treated as
    # possibly None/empty; ``content`` and ``metadata`` get the same treatment
    # so a result with missing fields still yields a full report instead of a
    # TypeError/AttributeError part-way through.
    # NOTE(review): whether the processor can really return None for content
    # or metadata is not visible from this file -- confirm against the
    # processor's result type.
    content = result.content or ""
    tables = result.tables or []
    metadata = result.metadata or {}

    print(f"Success: {result.success}")
    print(f"Content length: {len(content)}")
    print(f"Tables found: {len(tables)}")
    for i, table in enumerate(tables):
        print(f"Table {i}: rows={table.get('rows')}, cols={table.get('columns')}")
        # Print first few rows; ``or []`` also covers an explicit None value,
        # which the ``.get('data', [])`` default alone would not.
        data = table.get('data') or []
        for r, row in enumerate(data[:3]):
            print(f"  Row {r}: {row}")

    # Check if OCR was used
    if metadata.get('processed_with_ocr'):
        print("OCR was used (good)")
    else:
        print("OCR was NOT used (maybe text extraction succeeded?)")

    # Print first 500 chars of content
    print("\n--- First 500 characters of extracted content ---")
    print(content[:500])

# Script entry point: when executed directly (not imported), build the
# coroutine for the PDF check and run it to completion on a fresh event loop.
if __name__ == "__main__":
    pdf_check = test_pdf()
    asyncio.run(pdf_check)