# railseek6/test_ocr_upload_and_status.py

import requests
import json
import time

def test_ocr_upload_and_status():
    """Log in, make sure ``ocr.pdf`` is uploaded, and wait until it is processed.

    The check talks to the server at ``http://localhost:3015``: it posts the
    login form, lists ``/documents``, uploads ``ocr.pdf`` from the current
    working directory when it is not already listed as processed, and then
    polls ``/documents`` until the file shows up in the ``processed`` or
    ``failed`` bucket.

    Returns:
        bool: True when ``ocr.pdf`` is reported as processed. False when the
        login, listing or upload request is rejected, when the server reports
        the file as failed, when polling times out, or when any exception is
        raised along the way (the exception is printed, never propagated).
    """
    print("Testing LightRAG OCR upload and status...")

    try:
        # A single session is used for the whole run so that whatever the
        # login establishes (cookies, bearer token) accompanies every later
        # request instead of being thrown away.
        with requests.Session() as session:
            return _run_ocr_check(session)
    except Exception as e:
        # Script-level boundary: callers rely on a bool result, so every
        # failure (network error, missing file, malformed payload) is
        # reported and mapped to False.
        print(f'Error during test: {e}')
        return False


def _run_ocr_check(session, base_url='http://localhost:3015',
                   file_name='ocr.pdf', request_timeout=60,
                   poll_interval=10, max_polls=30):
    """Run the login / upload / poll sequence on *session*; return True on success.

    ``request_timeout`` (seconds) is applied to every HTTP call because
    ``requests`` waits forever by default. ``max_polls * poll_interval``
    bounds the monitoring phase (30 * 10 s = 5 minutes by default).
    """
    docs_url = f'{base_url}/documents'

    # NOTE(review): credentials are hard-coded in the original script and are
    # kept as-is here; consider moving them out of the source tree.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    response = session.post(f'{base_url}/login', data=login_data,
                            timeout=request_timeout)
    print(f'Login response: {response.status_code}')
    if response.status_code != 200:
        print(f'Login failed: {response.text}')
        return False
    print('Login successful')
    _apply_bearer_token(session, response)

    # Get documents list to check current status
    docs_response = session.get(docs_url, timeout=request_timeout)
    print(f'Documents response: {docs_response.status_code}')
    if docs_response.status_code != 200:
        print(f'Failed to get documents: {docs_response.text}')
        return False

    documents = docs_response.json()
    print(f'Documents response: {documents}')

    buckets = _split_statuses(documents)
    if buckets is None:
        print(f'Unexpected documents format: {type(documents)}')
        processed_docs = []
    else:
        processed_docs, failed_docs = buckets
        print(f'Processed documents: {len(processed_docs)}')
        print(f'Failed documents: {len(failed_docs)}')
        for doc in processed_docs:
            print(f'  - {doc.get("file_path", "Unknown")}: {doc.get("status", "Unknown")}')
        for doc in failed_docs:
            print(f'  - {doc.get("file_path", "Unknown")}: {doc.get("status", "Unknown")} - {doc.get("error_msg", "No error message")}')

    # Check if the file is already uploaded and processed
    ocr_doc = _find_document(processed_docs, file_name)
    if ocr_doc is not None:
        print(f'\nOCR PDF found with status: {ocr_doc.get("status")}')
        if ocr_doc.get('status') == 'processed':
            print('OCR PDF already processed successfully!')
            return True
        print('OCR PDF exists but not processed, monitoring status...')
    else:
        print('\nOCR PDF not found, uploading...')
        with open(file_name, 'rb') as f:
            files = {'file': (file_name, f, 'application/pdf')}
            upload_response = session.post(f'{docs_url}/upload', files=files,
                                           timeout=request_timeout)
        print(f'Upload response: {upload_response.status_code}')
        if upload_response.status_code != 200:
            print(f'Upload failed: {upload_response.text}')
            return False
        print('OCR PDF uploaded successfully!')

    # Monitor processing status
    print('\nMonitoring processing status...')
    for attempt in range(1, max_polls + 1):
        time.sleep(poll_interval)
        docs_response = session.get(docs_url, timeout=request_timeout)
        if docs_response.status_code != 200:
            # Keep polling, but make the failed check visible instead of
            # skipping it silently.
            print(f'Status check {attempt}: documents request returned {docs_response.status_code}')
            continue

        documents = docs_response.json()
        buckets = _split_statuses(documents)
        if buckets is None:
            print(f'Unexpected status format: {type(documents)}')
            continue

        processed_docs, failed_docs = buckets
        if _find_document(processed_docs, file_name) is not None:
            print('OCR PDF processing completed successfully!')
            return True

        failed_doc = _find_document(failed_docs, file_name)
        if failed_doc is not None:
            print(f'OCR PDF processing failed: {failed_doc.get("error_msg", "Unknown error")}')
            return False

        print(f'Status check {attempt}: OCR PDF still processing...')

    print('Timeout waiting for processing to complete')
    return False


def _apply_bearer_token(session, login_response):
    """Attach the login's ``access_token`` (if any) to *session* as a Bearer header."""
    # NOTE(review): assumes an OAuth2-style JSON body ({"access_token": ...})
    # such as LightRAG's /login returns -- confirm for this deployment. When
    # the body is not JSON or has no token, only session cookies are reused.
    try:
        payload = login_response.json()
    except ValueError:
        return
    token = payload.get('access_token') if isinstance(payload, dict) else None
    if token:
        session.headers['Authorization'] = f'Bearer {token}'


def _split_statuses(documents):
    """Return ``(processed, failed)`` lists from a /documents payload, or None if unrecognised."""
    if isinstance(documents, dict) and 'statuses' in documents:
        statuses = documents['statuses']
        return statuses.get('processed', []), statuses.get('failed', [])
    return None


def _find_document(docs, file_name):
    """Return the first entry of *docs* whose ``file_path`` equals *file_name*, else None."""
    return next((doc for doc in docs if doc.get('file_path') == file_name), None)

if __name__ == '__main__':
    # Run the check as a script and report the outcome both on stdout and
    # through the process exit status (0 = passed, 1 = failed), so shells and
    # CI jobs can detect a failure without parsing the output.
    success = test_ocr_upload_and_status()
    if success:
        print('\n✅ OCR upload and processing test PASSED!')
    else:
        print('\n❌ OCR upload and processing test FAILED!')
    raise SystemExit(0 if success else 1)