Files
railseek6/test_content_analysis.py

30 lines
1006 B
Python

"""
Test to analyze the content being processed by the document processor
"""
import sys
import os
sys.path.append('LightRAG-main')
from lightrag.document_processor import get_document_processor
import asyncio
async def test():
    """Process ``ocr.pdf`` and print diagnostics about the extracted content.

    Obtains a processor from ``get_document_processor()``, awaits
    ``process_document('ocr.pdf')`` and prints, in order: the content length,
    its first and last 100 characters, whether it is whitespace-only, its
    stripped length, and the result's ``success`` and ``error`` attributes.
    A final line states whether the content is empty after stripping.

    A ``None`` content is measured as an empty string so that the
    ``Success`` / ``Error`` lines are still printed instead of the script
    failing on ``len(None)`` before it can report anything.
    """
    processor = get_document_processor()
    result = await processor.process_document('ocr.pdf')

    # NOTE(review): assumes a failed run may leave ``content`` as None --
    # confirm against the processor's result type. Only None is replaced;
    # any other value is used exactly as returned.
    content = "" if result.content is None else result.content
    # Strip once; the whitespace check, the stripped length and the final
    # verdict below all rely on the same value.
    stripped = content.strip()

    print('Content length:', len(content))
    print('Content starts with:', repr(content[:100]))
    print('Content ends with:', repr(content[-100:]))
    print('Is whitespace only:', not stripped)
    print('Stripped length:', len(stripped))
    print('Success:', result.success)
    print('Error:', result.error)

    # Check if there's a whitespace issue
    if not stripped:
        print("⚠️ CONTENT IS EMPTY AFTER STRIPPING!")
        # Show the untouched attribute so None vs. '' vs. whitespace-only
        # content can be told apart in the output.
        print("Raw content:", repr(result.content))
    else:
        print("✅ Content has non-whitespace characters")
# Script entry point: run the async diagnostic to completion only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    diagnostic = test()
    asyncio.run(diagnostic)