# railseek6/test_webui_ocr_workflow.py

#!/usr/bin/env python3
"""
Test OCR PDF upload, indexing, and search through Web UI simulation
This script simulates the complete web UI workflow for OCR PDF processing
"""

import json
import os
import sys
import time
from pathlib import Path

import requests

# Configuration
# Every setting can be overridden through an environment variable so the test
# can target another server, account or document without editing this file.
# The fallbacks are the values that used to be hard-coded here.
# NOTE(review): the fallback credentials are committed in plain text; consider
# dropping them once the environment variables are set wherever this runs.
BASE_URL = os.environ.get("WEBUI_TEST_BASE_URL", "http://localhost:3015")
USERNAME = os.environ.get("WEBUI_TEST_USERNAME", "jleu3482")
PASSWORD = os.environ.get("WEBUI_TEST_PASSWORD", "jleu1212")
OCR_PDF_PATH = os.environ.get("WEBUI_TEST_OCR_PDF", "ocr.pdf")

class WebUITester:
    def __init__(self):
        """Prepare an unauthenticated client bound to ``BASE_URL``.

        One ``requests.Session`` is created here and kept on the instance;
        ``access_token`` starts out as ``None``.
        """
        self.access_token = None
        self.session = requests.Session()
        self.base_url = BASE_URL

    def login(self, username=None, password=None, timeout=30):
        """Log in and store the returned JWT on ``self.access_token``.

        Args:
            username: Account name; the module-level ``USERNAME`` when omitted.
            password: Account password; the module-level ``PASSWORD`` when
                omitted.
            timeout: Seconds to wait for the server before giving up, so a
                hung server cannot block the test forever.

        Returns:
            True when the server answered 200 and supplied an access token;
            False on any other status, a missing token, or a request error.
        """
        print("🔐 Logging in to Web UI...")
        login_data = {
            "username": USERNAME if username is None else username,
            "password": PASSWORD if password is None else password,
        }

        try:
            response = self.session.post(
                f"{self.base_url}/login",
                data=login_data,
                timeout=timeout,
            )

            if response.status_code != 200:
                print(f"❌ Login failed: {response.status_code} - {response.text}")
                return False

            result = response.json()
            token = result.get("access_token")
            if not token:
                # Without a token every later request would send
                # "Bearer None", so this must not count as a login.
                print("❌ Login failed: response did not contain an access token")
                return False

            self.access_token = token
            print("✅ Login successful")
            print(f"   Auth Mode: {result.get('auth_mode', 'unknown')}")
            return True

        except Exception as e:
            # Script boundary: report transport/JSON problems and fail the step.
            print(f"❌ Login error: {e}")
            return False

    def check_server_health(self, timeout=30):
        """Query ``/health`` and print the configured bindings.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            True when the endpoint answers with status 200, False on any
            other status or when the request/JSON decoding fails.
        """
        print("\n🏥 Checking server health...")
        headers = {"Authorization": f"Bearer {self.access_token}"}

        try:
            response = self.session.get(
                f"{self.base_url}/health",
                headers=headers,
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f"❌ Health check failed: {response.status_code}")
                return False

            health_data = response.json()
            # The binding names are informational only, so a payload without
            # them must not turn a healthy server into a failed check.
            configuration = health_data.get("configuration") or {}
            print("✅ Server is healthy")
            for label, key in (
                ("LLM", "llm_binding"),
                ("Embedding", "embedding_binding"),
                ("Rerank", "rerank_binding"),
            ):
                print(f"   {label}: {configuration.get(key, 'unknown')}")
            return True
        except Exception as e:
            # Script boundary: report transport/JSON problems and fail the step.
            print(f"❌ Health check error: {e}")
            return False

    def upload_ocr_pdf(self, pdf_path=None, timeout=120):
        """Upload a PDF to ``/documents/upload`` as multipart form data.

        Args:
            pdf_path: File to upload; the module-level ``OCR_PDF_PATH`` when
                omitted.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            True when the server answers with status 200, False when the file
            is missing, the server rejects the upload, or the request fails.
        """
        pdf_file = Path(OCR_PDF_PATH if pdf_path is None else pdf_path)
        print(f"\n📤 Uploading OCR PDF: {pdf_file}")

        # is_file() also rejects directories, which exists() would let through.
        if not pdf_file.is_file():
            print(f"❌ OCR PDF file not found: {pdf_file}")
            return False

        headers = {"Authorization": f"Bearer {self.access_token}"}

        try:
            with pdf_file.open('rb') as file:
                # Send only the base name so local directory components do
                # not end up in the uploaded document's file name.
                files = {'file': (pdf_file.name, file, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    headers=headers,
                    timeout=timeout,
                )

            if response.status_code != 200:
                print(f"❌ Upload failed: {response.status_code} - {response.text}")
                return False

            result = response.json()
            print("✅ Upload successful")
            print(f"   Status: {result.get('status', 'unknown')}")
            print(f"   Message: {result.get('message', 'No message')}")
            track_id = result.get('track_id')
            if track_id:
                print(f"   Track ID: {track_id}")
            return True

        except Exception as e:
            # Script boundary: report I/O, transport or JSON problems and fail.
            print(f"❌ Upload error: {e}")
            return False

    def wait_for_indexing(self, max_wait=180, poll_interval=1):
        """Poll ``/health`` until the pipeline is idle or ``max_wait`` elapses.

        Args:
            max_wait: Upper bound on the wait, in seconds of wall-clock time.
            poll_interval: Seconds to sleep between two polls.

        Returns:
            True as soon as a 200 response reports ``pipeline_busy`` as falsy
            (a missing key counts as not busy). False when the deadline
            passes first or a request/JSON error occurs.
        """
        print(f"\n⏳ Waiting for indexing (max {max_wait}s)...")

        headers = {"Authorization": f"Bearer {self.access_token}"}

        # Measure real elapsed time: counting loop iterations would let
        # request latency stretch the wait well beyond max_wait.
        started = time.monotonic()
        deadline = started + max_wait
        next_report = 0  # elapsed-seconds mark at which to print progress

        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break

            try:
                # Bound each poll so one hung request cannot defeat max_wait.
                response = self.session.get(
                    f"{self.base_url}/health",
                    headers=headers,
                    timeout=max(1, min(10, remaining)),
                )
                if response.status_code == 200:
                    if not response.json().get('pipeline_busy', False):
                        print("✅ Indexing completed!")
                        return True
                    state = "Still indexing..."
                else:
                    # Keep retrying, but say why no progress is visible.
                    state = f"Health endpoint returned {response.status_code}, retrying..."
            except Exception as e:
                print(f"❌ Error checking indexing status: {e}")
                return False

            elapsed = int(time.monotonic() - started)
            if elapsed >= next_report:  # roughly one status line per 10 seconds
                print(f"   {state} ({elapsed}s)")
                next_report = elapsed + 10

            # Never sleep past the deadline.
            time.sleep(max(0, min(poll_interval, deadline - time.monotonic())))

        print("❌ Indexing timeout reached")
        return False

    def check_document_status(self, timeout=30):
        """Fetch and print the per-status document counts.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The ``status_counts`` mapping from the response (an empty dict
            when the key is absent or null), or None when the server does not
            answer 200 or the request/JSON decoding fails.
        """
        print("\n📊 Checking document status...")
        headers = {"Authorization": f"Bearer {self.access_token}"}

        try:
            response = self.session.get(
                f"{self.base_url}/documents/status_counts",
                headers=headers,
                timeout=timeout,
            )

            if response.status_code != 200:
                print(f"❌ Failed to get status: {response.status_code}")
                return None

            # "or {}" keeps a null status_counts from crashing the listing.
            status_counts = response.json().get('status_counts') or {}
            print("📈 Document Status Counts:")
            for status, count in status_counts.items():
                print(f"   {status}: {count}")
            return status_counts

        except Exception as e:
            # Script boundary: report transport/JSON problems and fail the step.
            print(f"❌ Error checking document status: {e}")
            return None

    def search_ocr_content(self, queries=None, timeout=120):
        """Run a set of queries against ``/query`` and report which succeed.

        A query counts as successful when the response contains a non-empty
        ``chunks`` list, or a non-blank ``response`` text without the
        ``[no-context]`` marker.

        Args:
            queries: Iterable of query strings; a built-in list of OCR-related
                phrases is used when omitted.
            timeout: Seconds to wait for each query before giving up.

        Returns:
            True when at least one query succeeded, otherwise False.
        """
        print("\n🔍 Testing search functionality...")

        if queries is None:
            test_queries = [
                "LightRAG",
                "OCR technology",
                "document processing",
                "text extraction",
                "Retrieval-Augmented Generation",
            ]
        else:
            test_queries = list(queries)

        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json"
        }

        successful_searches = 0

        for query in test_queries:
            print(f"\n   Testing query: '{query}'")
            try:
                payload = {
                    "query": query,
                    "top_k": 5,
                    "only_need_context": True
                }

                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=timeout,
                )

                if response.status_code != 200:
                    print(f"   ❌ Search failed: {response.status_code} - {response.text}")
                    continue

                results = response.json()

                # Check if we got actual content
                if 'chunks' in results and len(results['chunks']) > 0:
                    print(f"   ✅ Found {len(results['chunks'])} results")
                    successful_searches += 1

                    # Show first result preview
                    first_chunk = results['chunks'][0]
                    content = first_chunk.get('text', '')[:150] + "..."
                    score = first_chunk.get('score', 0)
                    # A missing/non-numeric score must not break the preview.
                    if isinstance(score, (int, float)):
                        score_text = f"{score:.3f}"
                    else:
                        score_text = str(score)
                    print(f"      📄 Preview: {content}")
                    print(f"      📊 Score: {score_text}")

                elif 'response' in results:
                    # A blank or null response carries no content, so it is
                    # treated like the explicit "[no-context]" marker.
                    response_text = results['response'] or ""
                    if response_text.strip() and "[no-context]" not in response_text:
                        print("   ✅ LLM generated response")
                        successful_searches += 1
                        print(f"      🤖 Response: {response_text[:150]}...")
                    else:
                        print("   ⚠️  No context found for query")
                else:
                    print("   ⚠️  No results found")

            except Exception as e:
                # One failing query must not abort the remaining ones.
                print(f"   ❌ Search error: {e}")

        print(f"\n📊 Search Summary: {successful_searches}/{len(test_queries)} queries successful")
        return successful_searches > 0

    def run_complete_test(self):
        """Drive the whole workflow and print a final verdict.

        Login, health check, upload and the indexing wait are gating steps:
        the first one that fails ends the run. The document-status check and
        the search test always both run afterwards and feed the summary.

        Returns:
            True only when status counts were obtained, at least one document
            is reported as ``PROCESSED`` and the search test succeeded.
        """
        banner = "=" * 70
        print("🚀 Starting Complete Web UI OCR PDF Workflow Test")
        print(banner)

        # Steps 1-4: login, health, upload, indexing; stop at the first failure.
        gating_steps = (
            self.login,
            self.check_server_health,
            self.upload_ocr_pdf,
            self.wait_for_indexing,
        )
        for step in gating_steps:
            if not step():
                return False

        # Steps 5-6: collect the inputs for the summary.
        status_counts = self.check_document_status()
        search_success = self.search_ocr_content()

        print("\n" + banner)
        print("🎯 WEB UI OCR WORKFLOW TEST SUMMARY")
        print(banner)

        if not status_counts:
            print("❌ FAILED: Could not complete workflow")
            return False

        processed = status_counts.get('PROCESSED', 0)
        failed = status_counts.get('FAILED', 0)
        print(f"📊 Documents: {processed} processed, {failed} failed")

        if not (processed > 0 and search_success):
            print("⚠️  PARTIAL SUCCESS: Some steps completed but issues detected")
            return False

        print("✅ SUCCESS: OCR PDF workflow completed successfully!")
        print("   - Upload successful")
        print("   - Indexing completed")
        print("   - Search returning results")
        return True

def main():
    """Run the workflow test and exit with 0 on success, 1 on failure."""
    passed = WebUITester().run_complete_test()

    if not passed:
        print("\n💥 OCR PDF Web UI workflow test FAILED!")
        sys.exit(1)

    print("\n🎉 OCR PDF Web UI workflow test PASSED!")
    sys.exit(0)


# Only run the workflow when executed as a script, not on import.
if __name__ == "__main__":
    main()