# railseek6/upload_and_monitor_ocr.py

import requests
import time
import json

def _find_document(statuses, state, name):
    """Return the first entry listed under *state* whose 'file_path' equals *name*.

    Returns None when no entry matches, or when *state* is absent or empty.
    """
    for doc in statuses.get(state) or []:
        if doc.get('file_path') == name:
            return doc
    return None


def upload_ocr_pdf_and_monitor(
    file_path="ocr.pdf",
    *,
    base_url="http://localhost:3015",
    api_key="jleu1212",
    max_attempts=30,
    poll_interval=10,
    request_timeout=60,
):
    """Upload a PDF over the server's HTTP API and poll until it is processed.

    The file is POSTed to ``{base_url}/documents/upload``; afterwards
    ``{base_url}/documents`` is polled and the ``statuses`` mapping in its JSON
    body is searched for an entry whose ``file_path`` equals the uploaded
    file's base name. Progress and errors are reported with ``print``.

    Args:
        file_path: Path of the PDF to upload. Its base name is sent as the
            upload filename and used to recognise the document while polling.
        base_url: Root URL of the server.
        api_key: Value sent in the ``X-API-Key`` header of every request.
        max_attempts: Maximum number of status polls.
        poll_interval: Seconds to wait between two polls.
        request_timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        True if the document shows up under ``PROCESSED``. False if the upload
        fails, the document shows up under ``FAILED``, or it is still not
        processed after ``max_attempts`` polls.
    """
    # Local import: only this function needs it, which keeps the block
    # self-contained without touching the file-level import list.
    import os

    name = os.path.basename(file_path)
    label = name.upper()
    headers = {"X-API-Key": api_key}

    print(f"=== UPLOADING {label} VIA WEB UI ===")

    # Step 1: Upload the file
    print(f"1. Uploading {name}...")

    try:
        with open(file_path, 'rb') as f:
            files = {'file': (name, f, 'application/pdf')}
            upload_response = requests.post(
                f"{base_url}/documents/upload",
                files=files,
                headers=headers,
                timeout=request_timeout,
            )

        if upload_response.status_code != 200:
            print(f"   ❌ Upload failed: {upload_response.status_code}")
            print(f"   Response: {upload_response.text}")
            return False

        print("   ✅ File uploaded successfully")
        upload_result = upload_response.json()
        print(f"   Upload result: {upload_result}")

    except Exception as e:
        # Script boundary: a missing file, a network failure or a non-JSON
        # body all end the run the same way, so report and bail out.
        print(f"   ❌ Upload error: {e}")
        return False

    # Step 2: Monitor document status
    print("\n2. Monitoring document processing...")

    for attempt in range(1, max_attempts + 1):
        try:
            # Without a timeout a single stalled connection would block
            # forever and the attempt budget would never be enforced.
            status_response = requests.get(
                f"{base_url}/documents",
                headers=headers,
                timeout=request_timeout,
            )
            if status_response.status_code == 200:
                statuses = status_response.json().get('statuses', {})

                # Terminal states first: success, then failure.
                doc = _find_document(statuses, 'PROCESSED', name)
                if doc is not None:
                    print(f"   ✅ {label} PROCESSED SUCCESSFULLY!")
                    print(f"   Content summary: {doc.get('content_summary', 'N/A')}")
                    return True

                doc = _find_document(statuses, 'FAILED', name)
                if doc is not None:
                    error_msg = doc.get('error_msg', 'Unknown error')
                    print(f"   ❌ {label} FAILED: {error_msg}")
                    return False

                if _find_document(statuses, 'PROCESSING', name) is not None:
                    print(f"   ⏳ Still processing... (attempt {attempt}/{max_attempts})")
                else:
                    print(f"   ⏳ Waiting for processing to start... (attempt {attempt}/{max_attempts})")
            else:
                print(f"   ⚠️  Status check failed: {status_response.status_code}")

        except Exception as e:
            # Best effort: a transient error or an unexpected payload should
            # not abort monitoring; report it and try again on the next poll.
            print(f"   ⚠️  Status check error: {e}")

        # No point in sleeping after the final poll; report the timeout now.
        if attempt < max_attempts:
            time.sleep(poll_interval)

    budget_minutes = max_attempts * poll_interval / 60
    print(f"   ❌ Processing timeout - document not processed within {budget_minutes:g} minutes")
    return False

if __name__ == "__main__":
    # Script entry point: run the upload/monitor routine and translate its
    # boolean result into a process exit status (0 on success, 1 on failure).
    success = upload_ocr_pdf_and_monitor()
    if not success:
        # NOTE(review): this guidance is printed for every failure path of
        # upload_ocr_pdf_and_monitor(), not only when the server actually
        # reported a whitespace error -- confirm the diagnosis in the logs.
        print("\n=== DEBUGGING WHITESPACE ERROR ===")
        print("The OCR PDF upload failed with a whitespace error.")
        print("This indicates that the document processor detected no text content.")
        print("\nNext steps:")
        print("1. Check LightRAG server logs for detailed error information")
        print("2. Examine the document processor code for whitespace detection logic")
        print("3. Verify PaddleOCR is working correctly in the LightRAG environment")
    # The built-in exit() is installed by the site module for interactive
    # sessions and may be absent (python -S, frozen builds); raising
    # SystemExit directly always works and needs no import.
    raise SystemExit(0 if success else 1)