import json
import sys
import time

import requests

# File that is uploaded and whose name is matched against each document's
# 'file_path' field in the server's status listing.
_OCR_FILE_NAME = 'ocr.pdf'


def _find_document(documents, file_name):
    """Return the first entry of *documents* whose 'file_path' equals *file_name*, or None."""
    return next(
        (doc for doc in documents if doc.get('file_path') == file_name),
        None,
    )


def upload_ocr_pdf_and_monitor(*, base_url="http://localhost:3015", api_key="jleu1212"):
    """Upload ocr.pdf via web UI and monitor for errors.

    The file ``ocr.pdf`` is read from the current working directory and
    POSTed to ``{base_url}/documents/upload``.  Afterwards
    ``{base_url}/documents`` is polled and the ``statuses`` mapping of the
    JSON response is searched for an entry whose ``file_path`` is
    ``ocr.pdf``.  Progress is reported with ``print``.

    Args:
        base_url: Root URL of the server; defaults to the original
            hard-coded local address.
        api_key: Value sent in the ``X-API-Key`` header.
            NOTE(review): the default key is hard-coded in source; consider
            passing it in from configuration instead.

    Returns:
        True if the document shows up under ``PROCESSED``.  False if the
        upload fails (non-200 response or any exception), the document
        shows up under ``FAILED``, or it is not processed within the
        polling budget.
    """
    max_attempts = 30    # with a 10 s interval this is roughly 5 minutes
    poll_interval = 10   # seconds to wait between status checks
    status_timeout = 30  # seconds; a hung status request must not block forever

    print("=== UPLOADING OCR.PDF VIA WEB UI ===")

    # Step 1: Upload the file
    print("1. Uploading ocr.pdf...")

    headers = {"X-API-Key": api_key}

    try:
        with open(_OCR_FILE_NAME, 'rb') as f:
            files = {'file': (_OCR_FILE_NAME, f, 'application/pdf')}
            upload_response = requests.post(
                f"{base_url}/documents/upload",
                files=files,
                headers=headers,
                timeout=60
            )

        if upload_response.status_code == 200:
            print(" ✅ File uploaded successfully")
            upload_result = upload_response.json()
            print(f" Upload result: {upload_result}")
        else:
            print(f" ❌ Upload failed: {upload_response.status_code}")
            print(f" Response: {upload_response.text}")
            return False

    except Exception as e:
        # Top-level script boundary: a missing file, a connection problem or
        # an undecodable response body all count as a failed upload.
        print(f" ❌ Upload error: {e}")
        return False

    # Step 2: Monitor document status
    print("\n2. Monitoring document processing...")

    for attempt in range(1, max_attempts + 1):
        try:
            status_response = requests.get(
                f"{base_url}/documents",
                headers=headers,
                timeout=status_timeout,
            )

            if status_response.status_code == 200:
                statuses = status_response.json().get('statuses', {})

                # Check for processed documents
                processed_doc = _find_document(statuses.get('PROCESSED', []), _OCR_FILE_NAME)
                if processed_doc is not None:
                    print(" ✅ OCR.PDF PROCESSED SUCCESSFULLY!")
                    print(f" Content summary: {processed_doc.get('content_summary', 'N/A')}")
                    return True

                # Check for failed documents
                failed_doc = _find_document(statuses.get('FAILED', []), _OCR_FILE_NAME)
                if failed_doc is not None:
                    error_msg = failed_doc.get('error_msg', 'Unknown error')
                    print(f" ❌ OCR.PDF FAILED: {error_msg}")
                    return False

                # Check for processing documents
                if _find_document(statuses.get('PROCESSING', []), _OCR_FILE_NAME) is not None:
                    print(f" ⏳ Still processing... (attempt {attempt}/{max_attempts})")
                else:
                    print(f" ⏳ Waiting for processing to start... (attempt {attempt}/{max_attempts})")
            else:
                print(f" ⚠️ Status check failed: {status_response.status_code}")

        except Exception as e:
            # Best-effort polling: report the problem and retry on the next attempt.
            print(f" ⚠️ Status check error: {e}")

        # Nothing is checked after the final attempt, so do not wait for it.
        if attempt < max_attempts:
            time.sleep(poll_interval)  # Check every 10 seconds

    print(" ❌ Processing timeout - document not processed within 5 minutes")
    return False


if __name__ == "__main__":
    success = upload_ocr_pdf_and_monitor()

    if not success:
        # NOTE(review): these hints are printed for every failure path
        # (upload error, FAILED status, timeout), not only when the server
        # actually reported a whitespace error.
        print("\n=== DEBUGGING WHITESPACE ERROR ===")
        print("The OCR PDF upload failed with a whitespace error.")
        print("This indicates that the document processor detected no text content.")
        print("\nNext steps:")
        print("1. Check LightRAG server logs for detailed error information")
        print("2. Examine the document processor code for whitespace detection logic")
        print("3. Verify PaddleOCR is working correctly in the LightRAG environment")

    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module.
    sys.exit(0 if success else 1)