# railseek6/test_ocr_workflow_with_auth.py

#!/usr/bin/env python3
"""
OCR PDF Upload and Search Validation Script with Authentication
Handles login and authentication for LightRAG web UI
"""

import requests
import json
import time
import sys
import os
from pathlib import Path

# Configuration
# The server location, PDF path and credentials can be overridden through
# environment variables; the literals below remain the defaults, so running the
# script without any of these variables set behaves exactly as before.
BASE_URL = os.environ.get("LIGHTRAG_BASE_URL", "http://localhost:3015").rstrip("/")
WEBUI_URL = f"{BASE_URL}/webui/"
API_URL = f"{BASE_URL}/api"
OCR_PDF_PATH = os.environ.get("OCR_PDF_PATH", "ocr.pdf")
TEST_QUERY = "document processing"
MAX_WAIT_TIME = 300  # seconds to wait for indexing before giving up
POLL_INTERVAL = 10  # seconds slept between indexing status polls

# Authentication
# NOTE(review): default credentials are committed in source. Prefer supplying
# LIGHTRAG_USERNAME / LIGHTRAG_PASSWORD from the environment and rotating these.
USERNAME = os.environ.get("LIGHTRAG_USERNAME", "jleu3482")
PASSWORD = os.environ.get("LIGHTRAG_PASSWORD", "jleu1212")

class OCRWorkflowValidator:
    """Drive the OCR PDF workflow against the server and report each step.

    The workflow is: server check, authentication, PDF verification, upload,
    wait for indexing, search.  Every step method logs its progress through
    ``log_step`` and returns ``True`` on success or ``False`` on failure.
    Because the exact server routes are not known, most steps probe a list of
    candidate endpoints and use the first one that answers as expected.
    """

    # Keys under which a server payload may carry the document identifier.
    _DOC_ID_KEYS = ('document_id', 'id', 'doc_id')
    # Status strings (compared lower-cased) grouped by what they mean for polling.
    _DONE_STATUSES = frozenset({'completed', 'done', 'indexed'})
    _BUSY_STATUSES = frozenset({'processing', 'indexing'})
    _FAILED_STATUSES = frozenset({'failed', 'error'})

    def __init__(self):
        # One shared session so auth headers/cookies carry across all requests.
        self.session = requests.Session()
        # Filled in by upload_ocr_pdf() when the server reports an ID.
        self.doc_id = None
        # Bearer token returned by a login endpoint, if any.
        self.auth_token = None

    def log_step(self, message, status="INFO"):
        """Print ``message`` prefixed with a local timestamp and ``status`` tag."""
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{status}] {message}")

    def _extract_doc_id(self, payload):
        """Return the first non-None document ID found in ``payload``, else None."""
        if not isinstance(payload, dict):
            return None
        for key in self._DOC_ID_KEYS:
            value = payload.get(key)
            if value is not None:
                return value
        return None

    def authenticate(self):
        """Try each authentication strategy in turn.

        Returns:
            True as soon as one strategy succeeds, False if all of them fail.
        """
        self.log_step("Authenticating with LightRAG server...")

        # Try multiple authentication methods
        auth_methods = [
            self._authenticate_via_login_api,
            self._authenticate_via_basic_auth,
            self._authenticate_via_webui_login
        ]

        for auth_method in auth_methods:
            try:
                if auth_method():
                    self.log_step("✓ Authentication successful")
                    return True
            except Exception as e:
                # Boundary handler: one broken strategy must not block the others.
                self.log_step(f"Authentication method failed: {e}", "WARNING")
                continue

        self.log_step("✗ All authentication methods failed", "ERROR")
        return False

    def _authenticate_via_login_api(self):
        """POST the credentials as JSON to the candidate login endpoints.

        A 200 response with a JSON body counts as success.  If the body carries
        ``access_token`` or ``token`` it is stored and sent as a Bearer header
        on all later requests.

        Returns:
            True if any endpoint accepted the login, False otherwise.
        """
        login_payload = {
            "username": USERNAME,
            "password": PASSWORD
        }

        login_endpoints = [
            f"{API_URL}/auth/login",
            f"{BASE_URL}/auth/login",
            f"{WEBUI_URL}auth/login"
        ]

        for endpoint in login_endpoints:
            self.log_step(f"Trying login endpoint: {endpoint}")
            try:
                response = self.session.post(
                    endpoint,
                    json=login_payload,
                    timeout=10
                )
            except requests.exceptions.RequestException:
                continue

            if response.status_code != 200:
                continue

            try:
                result = response.json()
            except ValueError:
                # A 200 with a non-JSON body is not a login API; keep probing
                # instead of aborting the remaining endpoints.
                continue

            token = None
            if isinstance(result, dict):
                token = result.get('access_token') or result.get('token')
            if token:
                self.auth_token = token
                self.session.headers.update({'Authorization': f'Bearer {self.auth_token}'})
            return True

        return False

    def _authenticate_via_basic_auth(self):
        """Attach HTTP basic auth to the session and probe the server root.

        Returns:
            True if ``GET /`` answers 200 with basic auth attached; otherwise
            the session auth is cleared and False is returned.
        """
        from requests.auth import HTTPBasicAuth
        self.session.auth = HTTPBasicAuth(USERNAME, PASSWORD)

        # Test if basic auth works
        try:
            response = self.session.get(f"{BASE_URL}/", timeout=5)
        except requests.exceptions.RequestException:
            response = None

        if response is not None and response.status_code == 200:
            return True

        self.session.auth = None
        return False

    def _authenticate_via_webui_login(self):
        """Probe the web UI login page and fall back to basic auth if it exists.

        Returns:
            True if the login page answers 200 (basic auth is then attached to
            the session), False otherwise.
        """
        # Imported here because the other local import lives in a sibling
        # method; without it this method raised NameError and never succeeded.
        from requests.auth import HTTPBasicAuth

        try:
            login_page = self.session.get(f"{WEBUI_URL}login", timeout=5)
        except requests.exceptions.RequestException:
            return False

        if login_page.status_code == 200:
            # For now, just set basic auth for webui endpoints
            self.session.auth = HTTPBasicAuth(USERNAME, PASSWORD)
            return True

        return False

    def check_server_status(self):
        """Check that at least one well-known endpoint answers with 200.

        Returns:
            True on the first 200 response, False if none is found.
        """
        self.log_step("Checking server status...")

        endpoints_to_try = [
            "/",
            "/health",
            "/api/health"
        ]

        for endpoint in endpoints_to_try:
            try:
                response = self.session.get(f"{BASE_URL}{endpoint}", timeout=5)
            except requests.exceptions.RequestException as e:
                self.log_step(f"✗ Endpoint {endpoint}: {e}", "WARNING")
                continue

            if response.status_code == 200:
                self.log_step(f"✓ Endpoint {endpoint}: {response.status_code}")
                return True
            # Reachable but not healthy: do not report it with a check mark.
            self.log_step(f"✗ Endpoint {endpoint}: {response.status_code}", "WARNING")

        self.log_step("✗ No working endpoints found", "ERROR")
        return False

    def verify_ocr_pdf_exists(self):
        """Verify that the configured PDF exists on disk and is not empty.

        Returns:
            True if the file exists with a non-zero size, False otherwise.
        """
        self.log_step("Verifying OCR PDF file...")
        if not os.path.exists(OCR_PDF_PATH):
            self.log_step(f"✗ OCR PDF file not found: {OCR_PDF_PATH}", "ERROR")
            return False

        file_size = os.path.getsize(OCR_PDF_PATH)
        if file_size == 0:
            self.log_step("✗ OCR PDF file is empty", "ERROR")
            return False

        self.log_step(f"✓ OCR PDF file verified ({file_size} bytes)")
        return True

    def upload_ocr_pdf(self):
        """Upload the PDF as multipart form data to the first accepting endpoint.

        On success the document ID from the response (if any) is stored in
        ``self.doc_id``.

        Returns:
            True if an endpoint answered 200/201 with a JSON body, False if the
            file could not be read or every endpoint failed.
        """
        self.log_step("Uploading OCR PDF file...")

        if not self.verify_ocr_pdf_exists():
            return False

        # Read the file once.  Passing the open handle to every attempt left it
        # at EOF after the first POST, so later endpoints received an empty file.
        try:
            with open(OCR_PDF_PATH, 'rb') as file:
                pdf_bytes = file.read()
        except OSError as e:
            self.log_step(f"✗ Upload failed: {e}", "ERROR")
            return False

        filename = os.path.basename(OCR_PDF_PATH)

        # Try multiple upload endpoints
        upload_endpoints = [
            f"{BASE_URL}/documents/upload",
            f"{API_URL}/documents/upload",
            f"{BASE_URL}/upload",
            f"{API_URL}/upload"
        ]

        for endpoint in upload_endpoints:
            self.log_step(f"Trying upload endpoint: {endpoint}")
            files = {'file': (filename, pdf_bytes, 'application/pdf')}
            try:
                response = self.session.post(
                    endpoint,
                    files=files,
                    timeout=30
                )
            except requests.exceptions.RequestException as e:
                self.log_step(f"Upload endpoint {endpoint} failed: {e}", "WARNING")
                continue

            if response.status_code not in (200, 201):
                self.log_step(f"Upload endpoint {endpoint} returned {response.status_code}: {response.text}", "WARNING")
                continue

            try:
                result = response.json()
            except ValueError:
                self.log_step(f"Upload endpoint {endpoint} returned a non-JSON response", "WARNING")
                continue

            self.log_step("✓ OCR PDF upload successful")

            # Extract document ID from response
            doc_id = self._extract_doc_id(result)
            if doc_id is not None:
                self.doc_id = doc_id

            if self.doc_id:
                self.log_step(f"Document ID: {self.doc_id}")
            return True

        self.log_step("✗ All upload endpoints failed", "ERROR")
        return False

    def wait_for_indexing(self):
        """Poll the server until the uploaded document is reported as indexed.

        Polls the candidate status endpoints and the document listing every
        ``POLL_INTERVAL`` seconds for up to ``MAX_WAIT_TIME`` seconds.

        Returns:
            True once indexing is reported complete; False if no document ID is
            known, the server reports a failure, or the wait times out.
        """
        self.log_step("Waiting for document indexing to complete...")

        if not self.doc_id:
            self.log_step("✗ No document ID available to check indexing status", "ERROR")
            return False

        # The candidate URLs do not change between polls, so build them once.
        status_endpoints = [
            f"{BASE_URL}/documents/{self.doc_id}/status",
            f"{API_URL}/documents/{self.doc_id}/status",
            f"{BASE_URL}/status/{self.doc_id}",
            f"{API_URL}/status/{self.doc_id}"
        ]

        start_time = time.time()

        while time.time() - start_time < MAX_WAIT_TIME:
            for endpoint in status_endpoints:
                try:
                    response = self.session.get(endpoint, timeout=10)
                    if response.status_code != 200:
                        continue
                    status_data = response.json()
                except (requests.exceptions.RequestException, ValueError):
                    # Unreachable endpoint or non-JSON body: try the next one.
                    continue

                if not isinstance(status_data, dict):
                    continue

                # Check various status indicators; str() guards against a
                # non-string status value crashing the poll loop.
                status = str(status_data.get('status', '')).lower()
                if status in self._DONE_STATUSES:
                    self.log_step("✓ Document indexing completed")
                    return True
                if status in self._FAILED_STATUSES:
                    self.log_step(f"✗ Indexing failed: {status_data.get('message', 'Unknown error')}", "ERROR")
                    return False
                if status in self._BUSY_STATUSES:
                    self.log_step(f"Indexing in progress... ({status})")

                # Alternative status check
                if status_data.get('indexed'):
                    self.log_step("✓ Document indexing completed")
                    return True

            # Also check documents list for status
            if self.check_document_status_in_list():
                self.log_step("✓ Document indexed (from list)")
                return True

            elapsed = int(time.time() - start_time)
            self.log_step(f"Waiting... ({elapsed}s elapsed)")
            time.sleep(POLL_INTERVAL)

        self.log_step("✗ Indexing timeout reached", "ERROR")
        return False

    def check_document_status_in_list(self):
        """Look up the uploaded document in the server's document listing.

        Each candidate listing endpoint is tried in turn; an endpoint that is
        unreachable, answers non-200, or returns something other than a JSON
        list is skipped rather than ending the search.

        Returns:
            True if an entry matching ``self.doc_id`` is marked as indexed,
            False otherwise.
        """
        list_endpoints = [
            f"{BASE_URL}/documents",
            f"{API_URL}/documents"
        ]

        for endpoint in list_endpoints:
            try:
                response = self.session.get(endpoint, timeout=10)
                if response.status_code != 200:
                    continue
                documents = response.json()
            except (requests.exceptions.RequestException, ValueError):
                continue

            if not isinstance(documents, list):
                continue

            for doc in documents:
                if not isinstance(doc, dict):
                    continue
                if not any(doc.get(key) == self.doc_id for key in self._DOC_ID_KEYS):
                    continue
                # Check if document has indexing status
                status = str(doc.get('status', '')).lower()
                if status in self._DONE_STATUSES or doc.get('indexed') is True:
                    return True

        return False

    def test_search_functionality(self):
        """POST the test query to the candidate search endpoints.

        Returns:
            True on the first endpoint that answers 200 with a non-empty JSON
            list (a preview of up to three results is logged), False otherwise.
        """
        self.log_step("Testing search functionality...")

        search_payload = {
            "query": TEST_QUERY,
            "top_k": 5
        }

        search_endpoints = [
            f"{BASE_URL}/search",
            f"{API_URL}/search",
            f"{BASE_URL}/query",
            f"{API_URL}/query",
            f"{BASE_URL}/documents/search",
            f"{API_URL}/documents/search"
        ]

        for endpoint in search_endpoints:
            self.log_step(f"Testing search endpoint: {endpoint}")
            try:
                response = self.session.post(
                    endpoint,
                    json=search_payload,
                    timeout=15
                )
            except requests.exceptions.RequestException as e:
                self.log_step(f"Search endpoint {endpoint} failed: {e}", "WARNING")
                continue

            if response.status_code != 200:
                self.log_step(f"Search endpoint {endpoint} returned {response.status_code}", "WARNING")
                continue

            try:
                results = response.json()
            except ValueError:
                self.log_step(f"Search endpoint {endpoint} returned a non-JSON response", "WARNING")
                continue

            self.log_step("✓ Search request successful")

            # Validate search results structure
            if not (isinstance(results, list) and results):
                self.log_step("✗ Search returned no results", "WARNING")
                # Continue to next endpoint
                continue

            self.log_step(f"✓ Search returned {len(results)} results")

            # Check if results contain relevant content
            for i, result in enumerate(results[:3]):
                if isinstance(result, dict):
                    content = result.get('content', result.get('text', str(result)))
                else:
                    content = result
                # Payload values may be non-strings; slicing needs text.
                content = str(content)
                content_preview = content[:100] + "..." if len(content) > 100 else content
                self.log_step(f"Result {i+1}: {content_preview}")

            return True

        self.log_step("✗ All search endpoints failed", "ERROR")
        return False

    def run_complete_validation(self):
        """Run every workflow step in order, stopping at the first failure.

        A summary with per-step results and the success rate is logged at the
        end.

        Returns:
            True if every executed step passed, False otherwise.
        """
        self.log_step("Starting OCR PDF Upload and Search Validation")
        self.log_step("=" * 50)

        steps = [
            ("Server Status Check", self.check_server_status),
            ("Authentication", self.authenticate),
            ("OCR PDF Verification", self.verify_ocr_pdf_exists),
            ("PDF Upload", self.upload_ocr_pdf),
            ("Indexing Wait", self.wait_for_indexing),
            ("Search Test", self.test_search_functionality)
        ]

        results = []
        for step_name, step_func in steps:
            self.log_step(f"Executing: {step_name}")
            success = step_func()
            results.append((step_name, success))

            if not success:
                # Later steps depend on earlier ones, so there is no point continuing.
                self.log_step(f"✗ Workflow failed at: {step_name}", "ERROR")
                break

        # Generate final report
        self.log_step("=" * 50)
        self.log_step("VALIDATION RESULTS SUMMARY")
        self.log_step("=" * 50)

        passed = 0
        total = len(results)

        for step_name, success in results:
            status = "✓ PASS" if success else "✗ FAIL"
            self.log_step(f"{step_name}: {status}")
            if success:
                passed += 1

        success_rate = (passed / total) * 100 if total else 0.0
        self.log_step(f"Success Rate: {passed}/{total} ({success_rate:.1f}%)")

        if passed == total:
            self.log_step("🎉 COMPLETE WORKFLOW VALIDATION SUCCESSFUL!", "SUCCESS")
            return True
        else:
            self.log_step("❌ WORKFLOW VALIDATION FAILED", "ERROR")
            return False

def main():
    """Run the validation workflow and exit with a shell-style status code.

    The process exits with 0 when the whole workflow passes and with 1 when it
    fails, is interrupted with Ctrl+C, or raises an unexpected error.
    """
    validator = OCRWorkflowValidator()

    # Assume failure; only a fully successful run flips this to 0.
    exit_code = 1
    try:
        if validator.run_complete_validation():
            exit_code = 0
    except KeyboardInterrupt:
        validator.log_step("Validation interrupted by user", "WARNING")
    except Exception as e:
        validator.log_step(f"Unexpected error: {e}", "ERROR")

    sys.exit(exit_code)


if __name__ == "__main__":
    main()