# railseek6/test_ocr_upload_final.py

import requests
import json
import os
import time

def test_ocr_pdf_upload():
    """Upload a PDF to a LightRAG server and smoke-test the follow-up endpoints.

    Steps, in order:

    1. Check that ``inputs/ocr.pdf`` exists; if not, list the contents of
       ``inputs`` (when that directory exists) and give up.
    2. GET the server root and require HTTP 200.
    3. POST the PDF as a multipart upload to ``/documents/upload`` and
       require HTTP 200.
    4. Sleep five seconds, then GET ``/documents`` and POST a query to
       ``/search``. Non-200 answers from these two calls are only reported
       as warnings; they do not fail the test.

    Every HTTP call carries a timeout so that a server which accepts the
    connection but never answers makes the test fail instead of hanging.

    Returns:
        bool: True when the server answered and the upload returned 200.
        False when the input file is missing, the root or upload request
        returned a non-200 status, the connection failed, a request timed
        out, or any other exception was raised.
    """

    # Server configuration
    base_url = "http://localhost:3015"
    api_key = "lightrag-test-key"

    # Seconds to wait for a response. The upload gets a much larger budget
    # because the request body is a whole PDF and the server may do work
    # before answering.
    request_timeout = 30
    upload_timeout = 300

    # File to upload
    pdf_file = "inputs/ocr.pdf"
    if not os.path.exists(pdf_file):
        print(f"❌ Test file not found: {pdf_file}")
        print("Available files in inputs directory:")
        if os.path.exists("inputs"):
            for name in os.listdir("inputs"):
                print(f"  - inputs/{name}")
        return False

    print(f"📁 Testing OCR PDF upload: {pdf_file}")

    # Try without authentication first (API key only)
    headers = {
        "X-API-Key": api_key
    }

    try:
        # Test if server is accessible
        print("🔍 Testing server accessibility...")
        test_response = requests.get(
            f"{base_url}/", headers=headers, timeout=request_timeout
        )
        if test_response.status_code != 200:
            print(f"❌ Server not accessible: {test_response.status_code}")
            return False
        print("✅ Server is accessible")

        # Upload the PDF file
        upload_url = f"{base_url}/documents/upload"

        print("📤 Uploading PDF file...")
        with open(pdf_file, "rb") as pdf_handle:
            files = {
                "file": (os.path.basename(pdf_file), pdf_handle, "application/pdf")
            }
            upload_response = requests.post(
                upload_url, files=files, headers=headers, timeout=upload_timeout
            )

        if upload_response.status_code != 200:
            print(f"❌ Upload failed: {upload_response.status_code} - {upload_response.text}")
            return False

        upload_result = upload_response.json()
        print(f"✅ Upload successful: {json.dumps(upload_result, indent=2)}")

        # Wait a bit for processing
        print("⏳ Waiting for OCR processing...")
        time.sleep(5)

        # Check document status (informational only; does not affect result)
        docs_url = f"{base_url}/documents"
        print("📋 Checking document status...")
        docs_response = requests.get(
            docs_url, headers=headers, timeout=request_timeout
        )

        if docs_response.status_code == 200:
            documents = docs_response.json()
            print(f"📄 Documents in system: {json.dumps(documents, indent=2)}")
        else:
            print(f"⚠️ Could not fetch documents: {docs_response.status_code}")

        # Try a simple search to verify content was indexed
        # (informational only; does not affect result)
        search_url = f"{base_url}/search"
        search_data = {
            "query": "test document",
            "top_k": 5
        }

        print("🔍 Testing search functionality...")
        search_response = requests.post(
            search_url, json=search_data, headers=headers, timeout=request_timeout
        )

        if search_response.status_code == 200:
            search_results = search_response.json()
            print(f"🔎 Search results: {json.dumps(search_results, indent=2)}")
        else:
            print(f"⚠️ Search failed: {search_response.status_code} - {search_response.text}")

        return True

    except requests.exceptions.ConnectionError:
        # Also covers connect timeouts, which subclass ConnectionError.
        print("❌ Cannot connect to server. Make sure LightRAG server is running on port 3015.")
        return False
    except requests.exceptions.Timeout:
        # Connection was established but the server did not answer in time.
        print("❌ Request timed out waiting for the server to respond.")
        return False
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        return False

if __name__ == "__main__":
    # Script entry point: run the upload smoke test once and print a
    # human-readable verdict, with troubleshooting hints on failure.
    print("🚀 Starting OCR PDF upload test...")
    success = test_ocr_pdf_upload()

    if success:
        print("\n🎉 Test completed successfully!")
    else:
        print("\n💥 Test failed!")
        print("\n📋 Troubleshooting steps:")
        print("1. Check if server is running: http://localhost:3015")
        # The test reads its PDF from inputs/ (inputs/ocr.pdf), so point the
        # user at that directory.
        print("2. Verify the PDF file exists in inputs/")
        print("3. Check server logs for OCR processing errors")
        print("4. Ensure PaddleOCR is properly configured")