74 lines
2.5 KiB
Python
74 lines
2.5 KiB
Python
"""
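Debug script to trace the server-side processing issue.

Compares the direct document processor with the server's
pipeline_enqueue_file path on the same PDF ('ocr.pdf').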
"""
|
|
import sys
|
|
import os
|
|
sys.path.append('LightRAG-main')
|
|
|
|
import asyncio
|
|
from lightrag.document_processor import get_document_processor
|
|
from lightrag.api.routers.document_routes import pipeline_enqueue_file
|
|
from lightrag import LightRAG
|
|
from pathlib import Path
|
|
|
|
async def debug_server_processing():
    """Trace one PDF through the direct processor and the server enqueue path.

    Two checks are run against ``ocr.pdf`` (resolved relative to the current
    working directory) and their outcome is printed:

    1. The document processor is called directly; the success flag and the
       raw / stripped content lengths of its result are reported.
    2. The PDF is copied to ``temp_ocr.pdf`` and passed to
       ``pipeline_enqueue_file`` together with a throwaway ``LightRAG``
       instance. If the enqueue reports failure, every document returned by
       ``get_docs_by_status("FAILED")`` is listed with its error message.

    Exceptions raised by the pipeline step are printed with a traceback
    instead of propagating. Afterwards the temporary copy is removed and the
    ``text_chunks``, ``full_docs`` and ``doc_status`` storages of the
    throwaway instance are dropped, each independently of the others.
    """
    print("🔍 Debugging server-side processing pipeline...")

    # Test 1: Direct document processor (already working)
    print("\n📄 Test 1: Direct Document Processor")
    processor = get_document_processor()
    direct_result = await processor.process_document('ocr.pdf')
    print(f" Direct processing - Success: {direct_result.success}")
    print(f" Direct processing - Content length: {len(direct_result.content)}")
    print(f" Direct processing - Content stripped: {len(direct_result.content.strip())}")

    # Test 2: Simulate server pipeline
    print("\n🔄 Test 2: Simulating Server Pipeline")

    # Imported here rather than at module level, as in the original script.
    from lightrag.utils import generate_track_id

    # Create a minimal RAG instance with correct parameters
    # NOTE(review): no storage initialisation call is made on this instance
    # in this script — confirm pipeline_enqueue_file does not require one.
    rag = LightRAG(
        workspace="test_workspace",
        enable_llm_cache_for_entity_extract=True,
    )

    # Save the file to simulate upload. An already-present copy is reused
    # as-is; it is still removed during cleanup below.
    temp_file = Path("temp_ocr.pdf")
    if not temp_file.exists():
        import shutil

        shutil.copy2('ocr.pdf', temp_file)

    try:
        # Test the pipeline_enqueue_file function directly
        print(" Testing pipeline_enqueue_file...")
        success, track_id = await pipeline_enqueue_file(
            rag, temp_file, generate_track_id("debug")
        )

        print(f" Pipeline result - Success: {success}")
        print(f" Pipeline result - Track ID: {track_id}")

        if not success:
            print(" ❌ Pipeline failed - checking document status...")
            # Check what documents are in the system
            docs_by_status = await rag.get_docs_by_status("FAILED")
            for doc_id, doc_status in docs_by_status.items():
                print(f" Failed doc: {doc_id} - {doc_status.error_msg}")

    except Exception as e:
        # Top-level boundary of a debug script: report and keep going so the
        # cleanup below still runs.
        print(f" ❌ Pipeline error: {e}")
        import traceback

        traceback.print_exc()

    finally:
        # Cleanup
        if temp_file.exists():
            temp_file.unlink()

        # Clean up RAG storage. Each drop is attempted on its own so that a
        # failure in one storage does not leave the remaining ones populated.
        for storage_name in ("text_chunks", "full_docs", "doc_status"):
            try:
                await getattr(rag, storage_name).drop()
            except Exception as cleanup_error:
                print(f" ❌ Cleanup of {storage_name} failed: {cleanup_error}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the coroutine, then run it to completion.
    debug_coroutine = debug_server_processing()
    asyncio.run(debug_coroutine)