ocr improved

This commit is contained in:
2026-01-13 18:25:49 +08:00
parent 9745ca2476
commit a5eb381384
104 changed files with 818 additions and 229 deletions

30
test_fix_safedistance.py Normal file
View File

@@ -0,0 +1,30 @@
import asyncio
import sys
import os
sys.path.insert(0, 'LightRAG-main')
from lightrag.document_processor import get_document_processor
async def test_pdf():
    """Process ``test/safedistance.pdf`` and print a summary of the result.

    Prints, in order: the success flag, the length of the extracted
    content, the number of tables (with up to three data rows of each),
    whether the result metadata has ``processed_with_ocr`` set, and the
    first 500 characters of the extracted content.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost; the OCR check and the content preview are
    # assumed to sit at function level, not inside the table loop -- confirm.
    doc_processor = get_document_processor()
    result = await doc_processor.process_document('test/safedistance.pdf')

    print(f"Success: {result.success}")
    print(f"Content length: {len(result.content)}")

    tables = result.tables
    table_count = len(tables) if tables else 0
    print(f"Tables found: {table_count}")

    # A missing or empty table collection simply skips the per-table dump.
    for idx, table in enumerate(tables or ()):
        n_rows = table.get('rows')
        n_cols = table.get('columns')
        print(f"Table {idx}: rows={n_rows}, cols={n_cols}")

        # Show only the first few rows so large tables stay readable.
        rows = table.get('data', [])
        for row_no in range(min(3, len(rows))):
            print(f" Row {row_no}: {rows[row_no]}")

    # Report whether the metadata says OCR was used for this document.
    ocr_used = result.metadata.get('processed_with_ocr')
    print(
        "OCR was used (good)"
        if ocr_used
        else "OCR was NOT used (maybe text extraction succeeded?)"
    )

    # Preview the beginning of the extracted text.
    print("\n--- First 500 characters of extracted content ---")
    print(result.content[:500])
# Script entry point: run the async check to completion when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    main_coro = test_pdf()
    asyncio.run(main_coro)