Files
railseek6/test_ocr_upload_final.py

106 lines
3.8 KiB
Python

import requests
import json
import os
import time
def test_ocr_pdf_upload():
    """Upload an OCR PDF to a local LightRAG server and report the outcome.

    The check runs these steps against ``http://localhost:3015`` and sends
    the ``X-API-Key`` header with every request:

    1. Confirm ``inputs/ocr.pdf`` exists (otherwise list ``inputs/``).
    2. GET ``/`` and require HTTP 200.
    3. POST the PDF as multipart form data to ``/documents/upload`` and
       require HTTP 200.
    4. Sleep five seconds, then GET ``/documents`` and POST ``/search``.
       Failures in this step are only reported as warnings.

    Every HTTP call carries a timeout so that an unresponsive server makes
    the check fail instead of blocking forever.

    Returns:
        bool: True when the upload step succeeded; False when the input
        file is missing, the server root or upload did not answer 200, or
        any exception (connection error, timeout, bad JSON, ...) occurred.
    """
    # Server configuration
    base_url = "http://localhost:3015"
    api_key = "lightrag-test-key"

    # Seconds to wait on a request before giving up. The upload gets a
    # larger budget because the server has to receive the whole file.
    request_timeout = 30
    upload_timeout = 120

    # File to upload
    pdf_file = "inputs/ocr.pdf"

    if not os.path.exists(pdf_file):
        print(f"❌ Test file not found: {pdf_file}")
        print("Available files in inputs directory:")
        if os.path.exists("inputs"):
            for entry in os.listdir("inputs"):
                print(f" - inputs/{entry}")
        return False

    print(f"📁 Testing OCR PDF upload: {pdf_file}")

    # Try without authentication first (API key only)
    headers = {
        "X-API-Key": api_key
    }

    try:
        # Test if server is accessible
        print("🔍 Testing server accessibility...")
        test_response = requests.get(
            f"{base_url}/", headers=headers, timeout=request_timeout
        )
        if test_response.status_code != 200:
            print(f"❌ Server not accessible: {test_response.status_code}")
            return False
        print("✅ Server is accessible")

        # Upload the PDF file
        upload_url = f"{base_url}/documents/upload"
        print("📤 Uploading PDF file...")
        with open(pdf_file, "rb") as pdf_handle:
            files = {
                "file": (os.path.basename(pdf_file), pdf_handle, "application/pdf")
            }
            upload_response = requests.post(
                upload_url, files=files, headers=headers, timeout=upload_timeout
            )

        if upload_response.status_code != 200:
            print(f"❌ Upload failed: {upload_response.status_code} - {upload_response.text}")
            return False

        upload_result = upload_response.json()
        print(f"✅ Upload successful: {json.dumps(upload_result, indent=2)}")

        # Wait a bit for processing
        print("⏳ Waiting for OCR processing...")
        time.sleep(5)

        # Check document status (informational only: a failure here does
        # not change the return value)
        docs_url = f"{base_url}/documents"
        print("📋 Checking document status...")
        docs_response = requests.get(
            docs_url, headers=headers, timeout=request_timeout
        )
        if docs_response.status_code == 200:
            documents = docs_response.json()
            print(f"📄 Documents in system: {json.dumps(documents, indent=2)}")
        else:
            print(f"⚠️ Could not fetch documents: {docs_response.status_code}")

        # Try a simple search to verify content was indexed (also
        # informational only)
        search_url = f"{base_url}/search"
        search_data = {
            "query": "test document",
            "top_k": 5
        }
        print("🔍 Testing search functionality...")
        search_response = requests.post(
            search_url, json=search_data, headers=headers, timeout=request_timeout
        )
        if search_response.status_code == 200:
            search_results = search_response.json()
            print(f"🔎 Search results: {json.dumps(search_results, indent=2)}")
        else:
            print(f"⚠️ Search failed: {search_response.status_code} - {search_response.text}")

        return True

    except requests.exceptions.ConnectionError:
        # Connect timeouts are ConnectionError subclasses too, so they are
        # reported here rather than by the Timeout handler below.
        print("❌ Cannot connect to server. Make sure LightRAG server is running on port 3015.")
        return False
    except requests.exceptions.Timeout:
        print("❌ Server did not respond in time. Check that the LightRAG server is not stalled.")
        return False
    except Exception as e:
        # Top-level boundary of the script: report and fail rather than crash.
        print(f"❌ Unexpected error: {e}")
        return False
if __name__ == "__main__":
    # Script entry point: run the upload check once, print a summary and
    # signal the result through the process exit status.
    print("🚀 Starting OCR PDF upload test...")
    success = test_ocr_pdf_upload()
    if success:
        print("\n🎉 Test completed successfully!")
    else:
        print("\n💥 Test failed!")
        print("\n📋 Troubleshooting steps:")
        print("1. Check if server is running: http://localhost:3015")
        # The check reads inputs/ocr.pdf, so point the user at that path.
        print("2. Verify the PDF file exists at inputs/ocr.pdf")
        print("3. Check server logs for OCR processing errors")
        print("4. Ensure PaddleOCR is properly configured")
        # Non-zero exit status lets shells and CI detect the failure.
        raise SystemExit(1)