"""Manual smoke test: upload an OCR PDF to a locally running LightRAG server.

The script checks that the server answers, uploads ``inputs/ocr.pdf``, waits
briefly, lists the documents known to the server and finally issues a search
request. Progress is reported on stdout.
"""

import json
import os
import time

import requests

# (connect, read) timeouts in seconds. Without an explicit timeout ``requests``
# waits indefinitely, so a stalled server would hang the whole script.
_REQUEST_TIMEOUT = (5, 30)
# The upload gets a longer read timeout because the server has to receive and
# accept the whole PDF before it answers.
_UPLOAD_TIMEOUT = (5, 120)


def test_ocr_pdf_upload():
    """Upload the OCR test PDF to the LightRAG server and probe the result.

    Steps, in order:

    1. Verify that ``inputs/ocr.pdf`` exists (listing ``inputs/`` if not).
    2. GET ``/`` to confirm the server answers with HTTP 200.
    3. POST the PDF as multipart form data to ``/documents/upload``.
    4. Sleep five seconds, then GET ``/documents`` and print the response.
    5. POST a sample query to ``/search`` and print the response.

    Steps 4 and 5 are informational only: a non-200 answer there prints a
    warning but does not fail the test.

    Returns:
        bool: ``True`` when the server was reachable and the upload returned
        HTTP 200; ``False`` when the PDF is missing, the server or the upload
        returned a non-200 status, or any exception was raised while talking
        to the server.
    """
    # Server configuration
    base_url = "http://localhost:3015"
    api_key = "lightrag-test-key"

    # File to upload
    pdf_file = "inputs/ocr.pdf"

    if not os.path.exists(pdf_file):
        print(f"❌ Test file not found: {pdf_file}")
        print("Available files in inputs directory:")
        # isdir (not exists) so a stray regular file named "inputs" cannot
        # make os.listdir raise outside of any exception handler.
        if os.path.isdir("inputs"):
            for name in os.listdir("inputs"):
                print(f" - inputs/{name}")
        return False

    print(f"📁 Testing OCR PDF upload: {pdf_file}")

    # Authenticate with the API key header only (no login/token flow).
    headers = {"X-API-Key": api_key}

    try:
        # Test if server is accessible
        print("🔍 Testing server accessibility...")
        test_response = requests.get(
            f"{base_url}/", headers=headers, timeout=_REQUEST_TIMEOUT
        )
        if test_response.status_code != 200:
            print(f"❌ Server not accessible: {test_response.status_code}")
            return False
        print("✅ Server is accessible")

        # Upload the PDF file
        upload_url = f"{base_url}/documents/upload"
        print("📤 Uploading PDF file...")
        with open(pdf_file, "rb") as pdf_handle:
            files = {
                "file": (os.path.basename(pdf_file), pdf_handle, "application/pdf")
            }
            upload_response = requests.post(
                upload_url, files=files, headers=headers, timeout=_UPLOAD_TIMEOUT
            )

        if upload_response.status_code != 200:
            print(
                f"❌ Upload failed: {upload_response.status_code} - "
                f"{upload_response.text}"
            )
            return False

        upload_result = upload_response.json()
        print(f"✅ Upload successful: {json.dumps(upload_result, indent=2)}")

        # Give the server a moment to process the document before querying.
        print("⏳ Waiting for OCR processing...")
        time.sleep(5)

        # Check document status
        docs_url = f"{base_url}/documents"
        print("📋 Checking document status...")
        docs_response = requests.get(
            docs_url, headers=headers, timeout=_REQUEST_TIMEOUT
        )
        if docs_response.status_code == 200:
            documents = docs_response.json()
            print(f"📄 Documents in system: {json.dumps(documents, indent=2)}")
        else:
            print(f"⚠️ Could not fetch documents: {docs_response.status_code}")

        # Try a simple search to verify content was indexed
        search_url = f"{base_url}/search"
        search_data = {"query": "test document", "top_k": 5}
        print("🔍 Testing search functionality...")
        search_response = requests.post(
            search_url, json=search_data, headers=headers, timeout=_REQUEST_TIMEOUT
        )
        if search_response.status_code == 200:
            search_results = search_response.json()
            print(f"🔎 Search results: {json.dumps(search_results, indent=2)}")
        else:
            print(
                f"⚠️ Search failed: {search_response.status_code} - "
                f"{search_response.text}"
            )

        return True

    except requests.exceptions.ConnectionError:
        # Also covers ConnectTimeout, which subclasses ConnectionError.
        print(
            "❌ Cannot connect to server. "
            "Make sure LightRAG server is running on port 3015."
        )
        return False
    except requests.exceptions.Timeout:
        print("❌ Request timed out while waiting for the server to respond.")
        return False
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and fail the test
        # instead of crashing with a traceback.
        print(f"❌ Unexpected error: {e}")
        return False


if __name__ == "__main__":
    print("🚀 Starting OCR PDF upload test...")
    success = test_ocr_pdf_upload()

    if success:
        print("\n🎉 Test completed successfully!")
    else:
        print("\n💥 Test failed!")
        print("\n📋 Troubleshooting steps:")
        print("1. Check if server is running: http://localhost:3015")
        # The script reads inputs/ocr.pdf, so point the user at inputs/.
        print("2. Verify the PDF file exists in inputs/")
        print("3. Check server logs for OCR processing errors")
        print("4. Ensure PaddleOCR is properly configured")