Files
railseek6/upload_and_monitor_ocr.py

101 lines
4.0 KiB
Python

import requests
import time
import json
def _find_document(documents, file_name):
    """Return the first entry of *documents* whose ``file_path`` equals *file_name*.

    *documents* may be ``None`` or empty (status list absent in the server
    reply); ``None`` is returned when no entry matches.
    """
    return next(
        (doc for doc in documents or () if doc.get('file_path') == file_name),
        None,
    )


def upload_ocr_pdf_and_monitor():
    """Upload ``ocr.pdf`` to the local server and poll until it is processed.

    ``ocr.pdf`` is read from the current working directory and POSTed to
    ``/documents/upload``.  ``/documents`` is then polled up to 30 times,
    10 seconds apart, looking for an entry whose ``file_path`` is
    ``ocr.pdf`` in the ``PROCESSED``, ``FAILED`` or ``PROCESSING`` lists of
    the ``statuses`` mapping in the JSON reply.  Progress is printed to
    stdout.

    Returns:
        bool: ``True`` once the document is listed as ``PROCESSED``;
        ``False`` if the upload fails (non-200 reply or any exception,
        including a missing ``ocr.pdf``), the document is listed as
        ``FAILED``, or it is not processed within the polling budget.
    """
    base_url = "http://localhost:3015"
    # NOTE(review): credential is hard-coded in source; consider loading it
    # from configuration instead.
    api_key = "jleu1212"
    file_name = 'ocr.pdf'
    headers = {"X-API-Key": api_key}

    print("=== UPLOADING OCR.PDF VIA WEB UI ===")

    # Step 1: Upload the file
    print("1. Uploading ocr.pdf...")
    try:
        with open(file_name, 'rb') as f:
            files = {'file': (file_name, f, 'application/pdf')}
            upload_response = requests.post(
                f"{base_url}/documents/upload",
                files=files,
                headers=headers,
                timeout=60,
            )
        # The file handle is no longer needed once the request has completed.
        if upload_response.status_code == 200:
            print(" ✅ File uploaded successfully")
            upload_result = upload_response.json()
            print(f" Upload result: {upload_result}")
        else:
            print(f" ❌ Upload failed: {upload_response.status_code}")
            print(f" Response: {upload_response.text}")
            return False
    except Exception as e:
        # Script-level boundary: any failure (missing file, connection
        # error, invalid JSON) is reported and turned into a False result.
        print(f" ❌ Upload error: {e}")
        return False

    # Step 2: Monitor document status
    print("\n2. Monitoring document processing...")
    max_attempts = 30  # 30 polls, 10 s apart: 5 minutes max
    poll_interval = 10  # seconds between status checks
    status_timeout = 30  # seconds; without it a stalled server blocks forever

    for attempt in range(1, max_attempts + 1):
        try:
            status_response = requests.get(
                f"{base_url}/documents",
                headers=headers,
                timeout=status_timeout,
            )
            if status_response.status_code == 200:
                docs_data = status_response.json()
                statuses = docs_data.get('statuses', {})

                # Check for processed documents
                doc = _find_document(statuses.get('PROCESSED'), file_name)
                if doc is not None:
                    print(" ✅ OCR.PDF PROCESSED SUCCESSFULLY!")
                    print(f" Content summary: {doc.get('content_summary', 'N/A')}")
                    return True

                # Check for failed documents
                doc = _find_document(statuses.get('FAILED'), file_name)
                if doc is not None:
                    error_msg = doc.get('error_msg', 'Unknown error')
                    print(f" ❌ OCR.PDF FAILED: {error_msg}")
                    return False

                # Check for processing documents
                if _find_document(statuses.get('PROCESSING'), file_name) is not None:
                    print(f" ⏳ Still processing... (attempt {attempt}/{max_attempts})")
                else:
                    print(f" ⏳ Waiting for processing to start... (attempt {attempt}/{max_attempts})")
            else:
                print(f" ⚠️ Status check failed: {status_response.status_code}")
        except Exception as e:
            # Best-effort polling: a transient error must not abort the
            # monitor, so report it and try again on the next attempt.
            print(f" ⚠️ Status check error: {e}")

        # No point in waiting after the last attempt; report the timeout now.
        if attempt < max_attempts:
            time.sleep(poll_interval)  # Check every 10 seconds

    print(" ❌ Processing timeout - document not processed within 5 minutes")
    return False
if __name__ == "__main__":
    # Run the upload/monitor cycle and translate its boolean result into a
    # process exit status (0 on success, 1 on failure).
    success = upload_ocr_pdf_and_monitor()
    if not success:
        # NOTE(review): these hints are printed for every False result
        # (upload error, FAILED status, polling timeout), not only for a
        # whitespace error -- confirm the actual cause in the output above.
        print("\n=== DEBUGGING WHITESPACE ERROR ===")
        print("The OCR PDF upload failed with a whitespace error.")
        print("This indicates that the document processor detected no text content.")
        print("\nNext steps:")
        print("1. Check LightRAG server logs for detailed error information")
        print("2. Examine the document processor code for whitespace detection logic")
        print("3. Verify PaddleOCR is working correctly in the LightRAG environment")
    # Raise SystemExit directly: the ``exit`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (e.g. under ``python -S``).
    raise SystemExit(0 if success else 1)