ocr improved
This commit is contained in:
30
test_fix_safedistance.py
Normal file
30
test_fix_safedistance.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, 'LightRAG-main')
|
||||
from lightrag.document_processor import get_document_processor
|
||||
|
||||
async def test_pdf():
    """Process the sample PDF and print a human-readable summary.

    Runs the document processor on 'test/safedistance.pdf' and prints, in
    order: the success flag, the length of the extracted content, the
    number of tables, up to three rows of each table, whether the result
    metadata reports OCR usage, and the first 500 characters of content.
    Nothing is asserted or returned; the output is meant for manual
    inspection.
    """
    outcome = await get_document_processor().process_document('test/safedistance.pdf')

    print(f"Success: {outcome.success}")
    print(f"Content length: {len(outcome.content)}")

    # `tables` may be empty or falsy; report 0 in that case.
    tables = outcome.tables
    table_count = len(tables) if tables else 0
    print(f"Tables found: {table_count}")

    for i, table in enumerate(tables or []):
        row_total = table.get('rows')
        col_total = table.get('columns')
        print(f"Table {i}: rows={row_total}, cols={col_total}")

        # Preview at most the first three rows of the table data.
        data = table.get('data', [])
        preview_count = min(3, len(data))
        for r in range(preview_count):
            print(f" Row {r}: {data[r]}")

    # Report whether the processor flagged this document as OCR-processed.
    ocr_message = (
        "OCR was used (good)"
        if outcome.metadata.get('processed_with_ocr')
        else "OCR was NOT used (maybe text extraction succeeded?)"
    )
    print(ocr_message)

    # Show the beginning of the extracted text for a quick sanity check.
    print("\n--- First 500 characters of extracted content ---")
    print(outcome.content[:500])
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the async check to completion.
    main_coro = test_pdf()
    asyncio.run(main_coro)
|
||||
Reference in New Issue
Block a user