101 lines
4.0 KiB
Python
101 lines
4.0 KiB
Python
import requests
|
|
import time
|
|
import json
|
|
|
|
def _find_document(statuses, status, file_name):
    """Return the entry under *status* whose 'file_path' equals *file_name*, else None."""
    for doc in statuses.get(status) or ():
        if doc.get('file_path') == file_name:
            return doc
    return None


def upload_ocr_pdf_and_monitor(
    *,
    base_url="http://localhost:3015",
    api_key="jleu1212",
    file_path="ocr.pdf",
    max_attempts=30,
    poll_interval=10,
    request_timeout=60,
):
    """Upload a PDF to the document server and poll until it is processed.

    The file is POSTed to ``{base_url}/documents/upload``; afterwards
    ``{base_url}/documents`` is polled and its ``statuses`` mapping is
    searched for an entry whose ``file_path`` equals the uploaded file name.
    Called with no arguments, the behaviour matches the original hard-coded
    script (``ocr.pdf`` against ``http://localhost:3015``).

    Args:
        base_url: Root URL of the server, without a trailing slash.
        api_key: Value sent in the ``X-API-Key`` header.
            NOTE(review): the default is a credential committed to source;
            consider supplying it from the environment instead.
        file_path: Path of the PDF to upload. Its base name is used both as
            the upload file name and as the name matched in status entries.
        max_attempts: Maximum number of status polls.
        poll_interval: Seconds to wait between two polls.
        request_timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        True if the document shows up under ``PROCESSED``; False if the
        upload fails, the document shows up under ``FAILED``, or it is still
        unfinished after ``max_attempts`` polls.
    """
    import os  # local import: only needed to derive the uploaded file name

    file_name = os.path.basename(file_path)
    display_name = file_name.upper()
    headers = {"X-API-Key": api_key}

    print(f"=== UPLOADING {display_name} VIA WEB UI ===")

    # Step 1: Upload the file
    print(f"1. Uploading {file_name}...")
    try:
        with open(file_path, 'rb') as f:
            upload_response = requests.post(
                f"{base_url}/documents/upload",
                files={'file': (file_name, f, 'application/pdf')},
                headers=headers,
                timeout=request_timeout,
            )

        if upload_response.status_code != 200:
            print(f" ❌ Upload failed: {upload_response.status_code}")
            print(f" Response: {upload_response.text}")
            return False

        print(" ✅ File uploaded successfully")
        print(f" Upload result: {upload_response.json()}")
    except Exception as e:
        # Deliberately broad: any failure here (missing file, network error,
        # non-JSON reply) is reported and turned into a False result.
        print(f" ❌ Upload error: {e}")
        return False

    # Step 2: Monitor document status
    print("\n2. Monitoring document processing...")
    for attempt in range(1, max_attempts + 1):
        try:
            # The timeout keeps a stalled connection from blocking the loop
            # indefinitely, which would defeat the max_attempts cap.
            status_response = requests.get(
                f"{base_url}/documents",
                headers=headers,
                timeout=request_timeout,
            )
            if status_response.status_code == 200:
                statuses = status_response.json().get('statuses', {})

                doc = _find_document(statuses, 'PROCESSED', file_name)
                if doc is not None:
                    print(f" ✅ {display_name} PROCESSED SUCCESSFULLY!")
                    print(f" Content summary: {doc.get('content_summary', 'N/A')}")
                    return True

                doc = _find_document(statuses, 'FAILED', file_name)
                if doc is not None:
                    error_msg = doc.get('error_msg', 'Unknown error')
                    print(f" ❌ {display_name} FAILED: {error_msg}")
                    return False

                if _find_document(statuses, 'PROCESSING', file_name) is not None:
                    print(f" ⏳ Still processing... (attempt {attempt}/{max_attempts})")
                else:
                    print(f" ⏳ Waiting for processing to start... (attempt {attempt}/{max_attempts})")
            else:
                print(f" ⚠️ Status check failed: {status_response.status_code}")
        except Exception as e:
            # Best effort: a transient failure or malformed payload must not
            # abort monitoring; report it and try again on the next poll.
            print(f" ⚠️ Status check error: {e}")

        # No point in waiting after the final poll; report the timeout at once.
        if attempt < max_attempts:
            time.sleep(poll_interval)

    waited_minutes = max_attempts * poll_interval / 60
    print(f" ❌ Processing timeout - document not processed within {waited_minutes:g} minutes")
    return False
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the upload-and-monitor check and turn its
    # boolean result into the process exit status (0 = success, 1 = failure).
    success = upload_ocr_pdf_and_monitor()
    if not success:
        # NOTE(review): these hints are printed for every kind of failure
        # (upload error, FAILED status, timeout), not only a whitespace error.
        print("\n=== DEBUGGING WHITESPACE ERROR ===")
        print("The OCR PDF upload failed with a whitespace error.")
        print("This indicates that the document processor detected no text content.")
        print("\nNext steps:")
        print("1. Check LightRAG server logs for detailed error information")
        print("2. Examine the document processor code for whitespace detection logic")
        print("3. Verify PaddleOCR is working correctly in the LightRAG environment")
    # SystemExit is raised directly because the bare exit() helper is injected
    # by the `site` module and is not guaranteed to exist (e.g. `python -S`).
    raise SystemExit(0 if success else 1)