30 lines
1.1 KiB
Python
30 lines
1.1 KiB
Python
import asyncio
|
|
import sys
|
|
import os
|
|
# Put the 'LightRAG-main' directory at the front of the module search path,
# ahead of any other entry, before the `lightrag` import below runs.
# NOTE(review): the entry is a relative path, so this presumably requires the
# script to be started from the directory that contains LightRAG-main — confirm.
sys.path.insert(0, 'LightRAG-main')
|
|
from lightrag.document_processor import get_document_processor
|
|
|
|
async def test_pdf(pdf_path='test/safedistance.pdf'):
    """Process a PDF with the document processor and print a diagnostic report.

    The report contains, in order: the success flag, the length of the
    extracted content, the number of tables found (with the row/column
    counts and up to three data rows of each), whether the
    ``processed_with_ocr`` metadata flag is set, and the first 500
    characters of the extracted content.

    Args:
        pdf_path: Path of the document passed to ``process_document``.
            Defaults to the sample file this script was written for.
    """
    processor = get_document_processor()
    result = await processor.process_document(pdf_path)

    # The original only guarded ``tables`` against being None/empty. Treat
    # ``content`` and ``metadata`` the same way so the report is still
    # printed in full when one of them is missing, instead of raising on
    # ``len(None)`` or ``None.get``.
    content = result.content or ""
    tables = result.tables or []
    metadata = result.metadata or {}

    print(f"Success: {result.success}")
    print(f"Content length: {len(content)}")
    print(f"Tables found: {len(tables)}")

    for i, table in enumerate(tables):
        print(f"Table {i}: rows={table.get('rows')}, cols={table.get('columns')}")
        # Print first few rows; ``or []`` also covers a 'data' key that is
        # present but set to None.
        for r, row in enumerate((table.get('data') or [])[:3]):
            print(f" Row {r}: {row}")

    # Check if OCR was used
    if metadata.get('processed_with_ocr'):
        print("OCR was used (good)")
    else:
        print("OCR was NOT used (maybe text extraction succeeded?)")

    # Print first 500 chars of content
    print("\n--- First 500 characters of extracted content ---")
    print(content[:500])
|
|
|
|
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(test_pdf())