Files
railseek6/test_ocr_upload_and_status.py

117 lines
5.4 KiB
Python

import requests
import json
import time
def _split_statuses(documents):
    """Return ``(processed, failed)`` lists from a documents payload.

    The payload is recognised when it is a dict containing a ``'statuses'``
    key; the ``'processed'`` and ``'failed'`` entries under it default to
    empty lists. Returns ``None`` for any other payload shape.
    """
    if isinstance(documents, dict) and 'statuses' in documents:
        statuses = documents['statuses']
        return statuses.get('processed', []), statuses.get('failed', [])
    return None


def _find_document(docs, file_path):
    """Return the first entry of *docs* whose ``'file_path'`` equals *file_path*, else ``None``."""
    return next((doc for doc in docs if doc.get('file_path') == file_path), None)


def test_ocr_upload_and_status():
    """Log in, make sure ``ocr.pdf`` is uploaded, and wait for it to be processed.

    Steps:
      1. POST the credentials to ``/login``.
      2. GET ``/documents`` and print the processed / failed entries.
      3. If ``ocr.pdf`` is not listed as processed, upload it from the current
         working directory via ``/documents/upload``.
      4. Poll ``/documents`` until ``ocr.pdf`` appears in the processed or
         failed list, or the polling budget is exhausted.

    Returns:
        bool: ``True`` when ``ocr.pdf`` is (or becomes) processed; ``False`` on
        login/listing/upload failure, processing failure, polling timeout, or
        any exception (which is printed, not re-raised).
    """
    base_url = 'http://localhost:3015'
    login_url = f'{base_url}/login'
    docs_url = f'{base_url}/documents'
    upload_url = f'{base_url}/documents/upload'
    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to environment variables or a local config file.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    pdf_name = 'ocr.pdf'

    # Without explicit timeouts an unresponsive server would block forever.
    request_timeout = 30   # seconds, for login and listing calls
    upload_timeout = 300   # seconds, uploads may take noticeably longer
    poll_interval = 10     # seconds between status checks
    max_checks = 30        # 30 checks x 10 s = roughly 5 minutes of polling

    print("Testing LightRAG OCR upload and status...")

    try:
        # One Session for the whole flow, so that any cookies set by the login
        # response are sent with the later requests (and the connection is reused).
        with requests.Session() as session:
            # Login
            response = session.post(login_url, data=login_data, timeout=request_timeout)
            print(f'Login response: {response.status_code}')
            if response.status_code != 200:
                print(f'Login failed: {response.text}')
                return False
            print('Login successful')

            # Get documents list to check current status
            docs_response = session.get(docs_url, timeout=request_timeout)
            print(f'Documents response: {docs_response.status_code}')
            if docs_response.status_code != 200:
                print(f'Failed to get documents: {docs_response.text}')
                return False

            documents = docs_response.json()
            print(f'Documents response: {documents}')

            # Handle the API response format with statuses
            processed_docs, failed_docs = [], []
            split = _split_statuses(documents)
            if split is None:
                print(f'Unexpected documents format: {type(documents)}')
            else:
                processed_docs, failed_docs = split
                print(f'Processed documents: {len(processed_docs)}')
                print(f'Failed documents: {len(failed_docs)}')
                for doc in processed_docs:
                    print(f' - {doc.get("file_path", "Unknown")}: {doc.get("status", "Unknown")}')
                for doc in failed_docs:
                    print(f' - {doc.get("file_path", "Unknown")}: {doc.get("status", "Unknown")} - {doc.get("error_msg", "No error message")}')

            # Check if ocr.pdf is already uploaded and processed.
            # Only the processed list is searched, so a document that is
            # currently failed or still pending is uploaded again.
            ocr_doc = _find_document(processed_docs, pdf_name)
            if ocr_doc:
                print(f'\nOCR PDF found with status: {ocr_doc.get("status")}')
                if ocr_doc.get('status') == 'processed':
                    print('OCR PDF already processed successfully!')
                    return True
                print('OCR PDF exists but not processed, monitoring status...')
            else:
                print('\nOCR PDF not found, uploading...')
                # Upload ocr.pdf; the file handle must stay open while the
                # request body is being sent.
                with open(pdf_name, 'rb') as f:
                    files = {'file': (pdf_name, f, 'application/pdf')}
                    upload_response = session.post(upload_url, files=files, timeout=upload_timeout)

                print(f'Upload response: {upload_response.status_code}')
                if upload_response.status_code != 200:
                    print(f'Upload failed: {upload_response.text}')
                    return False
                print('OCR PDF uploaded successfully!')

            # Monitor processing status
            print('\nMonitoring processing status...')
            for check in range(1, max_checks + 1):
                time.sleep(poll_interval)
                docs_response = session.get(docs_url, timeout=request_timeout)
                if docs_response.status_code != 200:
                    # Report the failed poll instead of skipping it silently.
                    print(f'Status check {check}: documents request returned {docs_response.status_code}')
                    continue

                documents = docs_response.json()
                split = _split_statuses(documents)
                if split is None:
                    print(f'Unexpected status format: {type(documents)}')
                    continue

                processed_docs, failed_docs = split
                if _find_document(processed_docs, pdf_name):
                    print('OCR PDF processing completed successfully!')
                    return True

                failed_doc = _find_document(failed_docs, pdf_name)
                if failed_doc:
                    print(f'OCR PDF processing failed: {failed_doc.get("error_msg", "Unknown error")}')
                    return False

                print(f'Status check {check}: OCR PDF still processing...')

            print('Timeout waiting for processing to complete')
            return False

    except Exception as e:
        # Top-level boundary of the script: report any error (network failure,
        # request timeout, missing ocr.pdf, invalid JSON) as a failed run.
        print(f'Error during test: {e}')
        return False
if __name__ == '__main__':
    # Script entry point: run the check once and report the outcome.
    success = test_ocr_upload_and_status()

    if success:
        print('\n✅ OCR upload and processing test PASSED!')
    else:
        print('\n❌ OCR upload and processing test FAILED!')
        # Exit non-zero so shells and CI pipelines can detect the failure;
        # previously the script always exited with status 0.
        raise SystemExit(1)