# railseek6/test_ocr_pdf_correct_endpoints.py

"""
Complete OCR PDF Upload, Indexing, and Search Test for LightRAG Web UI
Tests the entire workflow: upload ocr.pdf → indexing → search functionality
Uses correct API endpoints based on server structure
"""

import requests
import time
import json
import os
from pathlib import Path

# Configuration
# Connection settings can be overridden through environment variables so the
# script can target another server or account without editing the file (and
# without committing real credentials). The defaults preserve the values the
# script has always used.
BASE_URL = os.environ.get("LIGHTRAG_BASE_URL", "http://localhost:3015")
USERNAME = os.environ.get("LIGHTRAG_USERNAME", "jleu3482")
PASSWORD = os.environ.get("LIGHTRAG_PASSWORD", "jleu1212")
OCR_PDF_PATH = os.environ.get("LIGHTRAG_OCR_PDF_PATH", "ocr.pdf")

# Queries issued against the indexed document; the search step passes when at
# least one of them returns chunks.
TEST_QUERIES = [
    "LightRAG",
    "OCR",
    "document processing",
    "text extraction",
]

class LightRAGWebUITest:
    """Drive an end-to-end OCR PDF check against a running LightRAG server.

    The workflow is: log in, probe the health and web UI endpoints, upload
    the PDF, wait for the indexing pipeline, run a few context-only queries
    and finally read the document status counts. Every step prints its
    outcome and returns a boolean instead of raising, so a failing step is
    reported in the summary rather than aborting the run.
    """

    # Upper bound, in seconds, applied to every HTTP request. requests has no
    # default timeout, so without this a stalled server would block a call
    # forever and the deadline in wait_for_indexing could never fire.
    REQUEST_TIMEOUT = 120

    def __init__(self):
        """Create the HTTP session and copy the module-level connection settings."""
        self.session = requests.Session()
        self.base_url = BASE_URL
        self.username = USERNAME
        self.password = PASSWORD
        self.access_token = None

    def _auth_headers(self, extra=None):
        """Return headers carrying the bearer token, merged with *extra* if given."""
        headers = {"Authorization": f"Bearer {self.access_token}"}
        if extra:
            headers.update(extra)
        return headers

    def login(self):
        """Login and store the access token.

        Returns:
            True when the server answered 200 with a non-empty
            ``access_token`` field, False otherwise (including on any error).
        """
        print("=== Logging In ===")
        try:
            # Credentials are sent as form data (OAuth2 password flow).
            response = self.session.post(
                f"{self.base_url}/login",
                data={
                    "username": self.username,
                    "password": self.password,
                },
                headers={"Content-Type": "application/x-www-form-urlencoded"},
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code != 200:
                print(f"❌ Login failed: {response.status_code} - {response.text}")
                return False

            self.access_token = response.json().get("access_token")
            if not self.access_token:
                print("❌ Login failed: No access token received")
                return False

            print("✅ Login successful")
            return True
        except Exception as e:
            print(f"❌ Login error: {e}")
            return False

    def test_health(self):
        """Test server health.

        Returns:
            True when ``/health`` answers 200; the binding names found under
            the response's ``configuration`` key are printed for reference.
        """
        print("=== Testing Server Health ===")
        try:
            response = self.session.get(
                f"{self.base_url}/health",
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"❌ Server health check failed: {response.status_code}")
                return False

            configuration = response.json().get('configuration', {})
            print("✅ Server is healthy")
            print(f"   LLM Binding: {configuration.get('llm_binding', 'N/A')}")
            print(f"   Embedding Binding: {configuration.get('embedding_binding', 'N/A')}")
            print(f"   Rerank Binding: {configuration.get('rerank_binding', 'N/A')}")
            return True
        except Exception as e:
            print(f"❌ Server health check error: {e}")
            return False

    def test_webui_accessibility(self):
        """Test web UI accessibility.

        Returns:
            True when ``/webui/`` answers 200. No bearer header is attached
            to this request.
        """
        print("\n=== Testing Web UI Accessibility ===")
        try:
            response = self.session.get(
                f"{self.base_url}/webui/",
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                print("✅ Web UI accessible")
                return True

            print(f"❌ Web UI access failed: {response.status_code}")
            return False
        except Exception as e:
            print(f"❌ Web UI access error: {e}")
            return False

    def upload_ocr_pdf(self):
        """Upload the PDF at ``OCR_PDF_PATH`` to ``/documents/upload``.

        Returns:
            True when the server answered 200, False when the file is
            missing, the server rejected the upload, or an error occurred.
        """
        print("\n=== Uploading OCR PDF ===")

        if not os.path.exists(OCR_PDF_PATH):
            print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
            return False

        try:
            # The context manager guarantees the file handle is released even
            # if the request raises. The response body is read before the
            # handle closes because the request is not streamed.
            with open(OCR_PDF_PATH, 'rb') as pdf_file:
                files = {
                    'file': (
                        os.path.basename(OCR_PDF_PATH),
                        pdf_file,
                        'application/pdf',
                    )
                }

                print(f"📤 Uploading {OCR_PDF_PATH}...")
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    headers=self._auth_headers(),
                    timeout=self.REQUEST_TIMEOUT,
                )

            if response.status_code == 200:
                result = response.json()
                print(f"✅ Upload successful: {result}")
                return True

            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def wait_for_indexing(self, timeout=120):
        """Poll the server until indexing looks finished or *timeout* elapses.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Returns:
            True as soon as the pipeline reports ``busy`` as false, or the
            document listing shows processed documents with nothing pending
            or processing; False when the deadline passes first.
        """
        print(f"\n=== Waiting for Indexing (max {timeout}s) ===")

        headers = self._auth_headers()
        # Monotonic clock: the deadline must not move if the wall clock is
        # adjusted while we are polling.
        deadline = time.monotonic() + timeout

        while time.monotonic() < deadline:
            try:
                # Check pipeline status
                response = self.session.get(
                    f"{self.base_url}/documents/pipeline_status",
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )

                if response.status_code == 200:
                    pipeline_status = response.json()
                    busy = pipeline_status.get('busy', False)
                    latest_message = pipeline_status.get('latest_message', '')
                    print(f"🔄 Pipeline status: busy={busy}, message='{latest_message}'")

                    # NOTE(review): an idle pipeline is treated as "done"; if
                    # the server has not started processing the upload yet
                    # this could return early — confirm against the server.
                    if not busy:
                        print("✅ Pipeline processing completed!")
                        return True

                # Check document status
                response = self.session.get(
                    f"{self.base_url}/documents",
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                if response.status_code == 200:
                    statuses = response.json().get('statuses', {})

                    processed_count = len(statuses.get('PROCESSED', []))
                    pending_count = len(statuses.get('PENDING', []))
                    processing_count = len(statuses.get('PROCESSING', []))

                    print(f"📊 Documents: {processed_count} processed, {pending_count} pending, {processing_count} processing")

                    if pending_count == 0 and processing_count == 0 and processed_count > 0:
                        print("✅ All documents processed!")
                        return True

                time.sleep(5)  # Wait 5 seconds between checks

            except Exception as e:
                # A failed poll is not fatal: report it and try again.
                print(f"⚠️  Error checking indexing status: {e}")
                time.sleep(5)

        print("⏰ Indexing timeout reached")
        return False

    def test_search_queries(self):
        """Run every query in ``TEST_QUERIES`` against the ``/query`` endpoint.

        A query counts as successful when the JSON response contains a
        non-empty ``chunks`` list.

        Returns:
            True when at least one query was successful.
        """
        print("\n=== Testing Search Queries ===")

        headers = self._auth_headers({"Content-Type": "application/json"})
        successful_searches = 0

        for query in TEST_QUERIES:
            print(f"\n🔍 Testing query: '{query}'")
            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True,  # Only return context for search
                }

                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT,
                )

                if response.status_code != 200:
                    print(f"❌ Search failed: {response.status_code} - {response.text}")
                    continue

                results = response.json()
                if results and 'chunks' in results and len(results['chunks']) > 0:
                    chunks = results['chunks']
                    print(f"✅ Search successful: Found {len(chunks)} chunks")
                    successful_searches += 1

                    # Show first chunk snippet
                    content_preview = chunks[0].get('text', '')[:200] + "..."
                    print(f"   📄 First chunk preview: {content_preview}")
                else:
                    print(f"⚠️  Search returned no results for: '{query}'")
                    print(f"   Response: {results}")

            except Exception as e:
                print(f"❌ Search error for '{query}': {e}")

        print(f"\n📊 Search Summary: {successful_searches}/{len(TEST_QUERIES)} queries successful")
        return successful_searches > 0

    def check_database_storage(self):
        """Check that the server reports at least one processed document.

        Reads ``/documents/status_counts`` and looks at the ``PROCESSED``
        entry of its ``status_counts`` mapping.

        Returns:
            True when that count is greater than zero, False otherwise.
        """
        print("\n=== Checking Database Storage ===")

        try:
            response = self.session.get(
                f"{self.base_url}/documents/status_counts",
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )

            if response.status_code != 200:
                print(f"❌ Could not get status counts: {response.status_code}")
                return False

            status_counts = response.json().get('status_counts', {})
            print(f"📊 Document Status Counts: {status_counts}")

            if status_counts.get('PROCESSED', 0) > 0:
                print("✅ Data stored in databases")
                return True

            print("⚠️  No processed documents found")
            return False

        except Exception as e:
            print(f"❌ Database check error: {e}")
            return False

    def run_complete_test(self):
        """Run the complete OCR PDF workflow test and print a summary.

        Login is mandatory: when it fails the run stops immediately.
        Indexing is only attempted after a successful upload, and search
        only after successful indexing; skipped steps are recorded as
        failures.

        Returns:
            True when every step passed, False otherwise.
        """
        print("🚀 Starting Complete OCR PDF Web UI Workflow Test")
        print("=" * 60)

        test_results = {}

        # Step 1: Login first
        test_results['login'] = self.login()
        if not test_results['login']:
            print("❌ Login failed, cannot proceed with other tests")
            return False

        # Step 2: Test server health
        test_results['health'] = self.test_health()

        # Step 3: Test web UI accessibility
        test_results['webui'] = self.test_webui_accessibility()

        # Step 4: Upload OCR PDF
        test_results['upload'] = self.upload_ocr_pdf()

        # Step 5: Wait for indexing (skipped, and failed, without an upload)
        test_results['indexing'] = (
            self.wait_for_indexing() if test_results['upload'] else False
        )

        # Step 6: Test search queries (skipped, and failed, without indexing)
        test_results['search'] = (
            self.test_search_queries() if test_results['indexing'] else False
        )

        # Step 7: Check database storage
        test_results['storage'] = self.check_database_storage()

        # Summary
        print("\n" + "=" * 60)
        print("📋 TEST SUMMARY")
        print("=" * 60)

        for test_name, result in test_results.items():
            status = "✅ PASS" if result else "❌ FAIL"
            print(f"{test_name.upper():<12} : {status}")

        overall_success = all(test_results.values())

        if overall_success:
            print("\n🎉 ALL TESTS PASSED! OCR PDF workflow is working correctly.")
            print("   - Login successful")
            print("   - Upload successful")
            print("   - Indexing completed")
            print("   - Search functionality working")
            print("   - Data stored in databases")
        else:
            print("\n⚠️  SOME TESTS FAILED. Check the logs above for details.")

        return overall_success

def main():
    """Run the complete workflow test and exit with its result.

    Terminates the process with exit code 0 when every step passed and 1
    otherwise.
    """
    # Imported locally because this is the only place that needs it.
    import sys

    test = LightRAGWebUITest()
    success = test.run_complete_test()

    # Use sys.exit rather than the exit() helper: that helper is installed by
    # the site module for interactive use and is not guaranteed to exist
    # (for example under `python -S`).
    sys.exit(0 if success else 1)

# Run the workflow only when this file is executed as a script, so importing
# the module does not trigger any network calls.
if __name__ == "__main__":
    main()