# File-viewer metadata captured with this listing: 30 lines, 1006 B, Python.
"""
|
|
Test to analyze the content being processed by the document processor
|
|
"""
|
|
import asyncio
import os
import sys

# Extend the import search path with the relative 'LightRAG-main' directory
# before importing from `lightrag` (presumably so that the import below can
# resolve; a relative entry is looked up against the current working
# directory, so run this script from the directory containing it).
sys.path.append('LightRAG-main')

from lightrag.document_processor import get_document_processor  # noqa: E402
|
|
|
|
async def test():
    """Process 'ocr.pdf' and print diagnostics about the returned content.

    Obtains a processor from ``get_document_processor()``, awaits its
    ``process_document('ocr.pdf')`` call, and prints:

    * the content length and its first / last 100 characters (as ``repr``),
    * whether the content is whitespace-only and its stripped length,
    * the result's ``success`` and ``error`` attributes,
    * a final one-line verdict (plus the raw content when it is blank).
    """
    doc_processor = get_document_processor()
    outcome = await doc_processor.process_document('ocr.pdf')
    text = outcome.content

    print('Content length:', len(text))
    print('Content starts with:', repr(text[:100]))
    print('Content ends with:', repr(text[-100:]))

    # Strip once and reuse the value for every whitespace-related report.
    trimmed = text.strip()
    print('Is whitespace only:', not trimmed)
    print('Stripped length:', len(trimmed))
    print('Success:', outcome.success)
    print('Error:', outcome.error)

    # Check if there's a whitespace issue
    if trimmed:
        print("✅ Content has non-whitespace characters")
    else:
        print("⚠️ CONTENT IS EMPTY AFTER STRIPPING!")
        print("Raw content:", repr(text))
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async diagnostic coroutine to completion.
    asyncio.run(test())