106 lines
3.8 KiB
Python
106 lines
3.8 KiB
Python
import requests
|
|
import json
|
|
import os
|
|
import time
|
|
|
|
def test_ocr_pdf_upload():
    """Upload ``inputs/ocr.pdf`` to a local LightRAG server and smoke-test it.

    The check runs in four steps: confirm the server answers on ``/``,
    upload the PDF to ``/documents/upload``, list ``/documents`` after a
    short pause, and send a sample query to ``/search``. Progress is
    printed to stdout.

    Returns:
        bool: ``True`` when the server is reachable and the upload returns
        HTTP 200. A failing document listing or search is only reported as
        a warning and does not change the result. ``False`` when the PDF is
        missing, the server cannot be reached or does not answer in time,
        the upload is rejected, or any other error occurs.
    """
    # Server configuration
    base_url = "http://localhost:3015"
    api_key = "lightrag-test-key"

    # Without an explicit timeout, requests waits forever on a server that
    # accepts the connection but never responds. The upload gets a longer
    # budget because the server may process the file before answering.
    request_timeout = 10
    upload_timeout = 120

    # File to upload
    pdf_file = "inputs/ocr.pdf"
    if not os.path.exists(pdf_file):
        print(f"❌ Test file not found: {pdf_file}")
        print("Available files in inputs directory:")
        if os.path.exists("inputs"):
            for name in os.listdir("inputs"):
                print(f" - inputs/{name}")
        return False

    print(f"📁 Testing OCR PDF upload: {pdf_file}")

    # Authenticate with the API key header only.
    headers = {
        "X-API-Key": api_key,
    }

    try:
        # Test if server is accessible
        print("🔍 Testing server accessibility...")
        test_response = requests.get(
            f"{base_url}/", headers=headers, timeout=request_timeout
        )
        if test_response.status_code != 200:
            print(f"❌ Server not accessible: {test_response.status_code}")
            return False
        print("✅ Server is accessible")

        # Upload the PDF file
        upload_url = f"{base_url}/documents/upload"

        print("📤 Uploading PDF file...")
        with open(pdf_file, "rb") as pdf_handle:
            files = {
                "file": (os.path.basename(pdf_file), pdf_handle, "application/pdf")
            }
            upload_response = requests.post(
                upload_url, files=files, headers=headers, timeout=upload_timeout
            )

        if upload_response.status_code != 200:
            print(f"❌ Upload failed: {upload_response.status_code} - {upload_response.text}")
            return False

        upload_result = upload_response.json()
        print(f"✅ Upload successful: {json.dumps(upload_result, indent=2)}")

        # Wait a bit for processing
        print("⏳ Waiting for OCR processing...")
        time.sleep(5)

        # Check document status (a failure here is reported but not fatal)
        docs_url = f"{base_url}/documents"
        print("📋 Checking document status...")
        docs_response = requests.get(
            docs_url, headers=headers, timeout=request_timeout
        )

        if docs_response.status_code == 200:
            documents = docs_response.json()
            print(f"📄 Documents in system: {json.dumps(documents, indent=2)}")
        else:
            print(f"⚠️ Could not fetch documents: {docs_response.status_code}")

        # Try a simple search to verify content was indexed (also non-fatal)
        search_url = f"{base_url}/search"
        search_data = {
            "query": "test document",
            "top_k": 5,
        }

        print("🔍 Testing search functionality...")
        search_response = requests.post(
            search_url, json=search_data, headers=headers, timeout=request_timeout
        )

        if search_response.status_code == 200:
            search_results = search_response.json()
            print(f"🔎 Search results: {json.dumps(search_results, indent=2)}")
        else:
            print(f"⚠️ Search failed: {search_response.status_code} - {search_response.text}")

        return True

    except requests.exceptions.ConnectionError:
        # Also covers connect timeouts (ConnectTimeout subclasses ConnectionError).
        print("❌ Cannot connect to server. Make sure LightRAG server is running on port 3015.")
        return False
    except requests.exceptions.Timeout:
        # Connection succeeded but the server did not answer within the budget.
        print("❌ Request timed out. The server did not respond in time.")
        return False
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected
        # (e.g. a non-JSON response body) instead of crashing.
        print(f"❌ Unexpected error: {e}")
        return False
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the smoke test once and report the outcome.
    print("🚀 Starting OCR PDF upload test...")
    success = test_ocr_pdf_upload()

    if success:
        print("\n🎉 Test completed successfully!")
    else:
        print("\n💥 Test failed!")
        print("\n📋 Troubleshooting steps:")
        print("1. Check if server is running: http://localhost:3015")
        # The upload test reads inputs/ocr.pdf, so point at that directory.
        print("2. Verify the PDF file exists in inputs/")
        print("3. Check server logs for OCR processing errors")
        print("4. Ensure PaddleOCR is properly configured")
        # Exit non-zero so shells and CI pipelines can detect the failure.
        raise SystemExit(1)