# railseek6/test_ocr_direct_workflow.py

#!/usr/bin/env python3
"""
Direct OCR PDF upload, indexing, and search test.

Exercises the server's upload, health, and query endpoints without sending
authentication credentials, to check the core OCR workflow end to end.
"""

import requests
import time
import json
import sys
from pathlib import Path

# Configuration
# Base URL of the server under test; endpoint paths are appended to it.
BASE_URL = "http://localhost:3015"
# Path of the PDF to upload; being relative, it is resolved against the
# current working directory when the file is opened.
OCR_PDF_PATH = "ocr.pdf"

class OCRWorkflowTester:
    """Drive the upload -> processing -> search workflow against a live server.

    Every request goes through a single ``requests.Session`` and carries no
    authentication headers. Each step prints a human-readable report and
    signals its outcome through its return value rather than by raising.
    """

    # Seconds to wait for the lightweight GET endpoints before giving up.
    # Without a timeout a stalled server would block the script forever.
    REQUEST_TIMEOUT = 30
    # Seconds to wait for the upload and query calls, which are given a more
    # generous budget (presumably they do real work server-side -- confirm).
    LONG_REQUEST_TIMEOUT = 300

    def __init__(self, base_url=None):
        """Create the tester.

        Args:
            base_url: Root URL of the server; defaults to the module-level
                ``BASE_URL`` when omitted.
        """
        self.base_url = BASE_URL if base_url is None else base_url
        self.session = requests.Session()

    @staticmethod
    def _print_response_body(response, indent, json_label):
        """Print a response body: parsed JSON if possible, raw text otherwise.

        Args:
            response: Object exposing ``json()`` and ``text``.
            indent: Whitespace prefix for the printed line.
            json_label: Label used when the body parses as JSON.
        """
        try:
            body = response.json()
        except ValueError:
            # requests signals an undecodable body with a ValueError subclass.
            print(f"{indent}Response: {response.text}")
        else:
            print(f"{indent}{json_label}: {body}")

    @staticmethod
    def _report_search_results(results):
        """Print what a query returned and tell whether it counts as a hit.

        Args:
            results: Decoded JSON body of a successful query response.

        Returns:
            True when ``results`` holds at least one chunk, or a non-empty
            ``response`` string without the ``[no-context]`` marker.
        """
        if not isinstance(results, dict):
            print("   ⚠️  No results found")
            return False

        chunks = results.get('chunks')
        if isinstance(chunks, list) and chunks:
            print(f"   ✅ Found {len(chunks)} results")

            # Show first result preview
            first_chunk = chunks[0]
            if isinstance(first_chunk, dict):
                text = first_chunk.get('text', '')
                score = first_chunk.get('score', 0)
            else:
                text, score = first_chunk, None

            # A missing or non-numeric score must not turn a hit into an error.
            try:
                score_text = f"{float(score):.3f}"
            except (TypeError, ValueError):
                score_text = "n/a"

            print(f"      📄 Preview: {str(text)[:150]}...")
            print(f"      📊 Score: {score_text}")
            return True

        if 'response' in results:
            answer = results['response']
            if isinstance(answer, str) and answer and "[no-context]" not in answer:
                print("   ✅ LLM generated response")
                print(f"      🤖 Response: {answer[:150]}...")
                return True
            print("   ⚠️  No context found for query")
            return False

        print("   ⚠️  No results found")
        return False

    def check_server_status(self):
        """Check if server is accessible.

        Returns:
            True when ``GET /`` answers with status 200 or 307, else False
            (including when the request itself fails).
        """
        print("🔍 Checking server accessibility...")
        try:
            response = self.session.get(
                f"{self.base_url}/", timeout=self.REQUEST_TIMEOUT
            )
        except Exception as e:
            print(f"❌ Cannot connect to server: {e}")
            return False

        if response.status_code in (200, 307):  # 307 for redirect
            print("✅ Server is accessible")
            return True

        print(f"❌ Server returned status: {response.status_code}")
        return False

    def check_health_no_auth(self):
        """Try to check health without authentication.

        Returns:
            The decoded JSON body of ``GET /health`` on status 200, otherwise
            None (non-200 status, undecodable body, or request failure).
        """
        print("\n🏥 Checking server health (no auth)...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                health_data = response.json()
                print("✅ Server is healthy")
                print(f"   Status: {health_data.get('status', 'unknown')}")
                print(f"   Auth Mode: {health_data.get('auth_mode', 'unknown')}")
                return health_data

            print(f"⚠️  Health check returned: {response.status_code}")
            # Show the body anyway; it may explain the failure.
            self._print_response_body(response, "   ", "Response")
            return None
        except Exception as e:
            print(f"❌ Health check error: {e}")
            return None

    def upload_ocr_pdf_direct(self, pdf_path=None):
        """Try to upload OCR PDF without authentication.

        Args:
            pdf_path: File to upload; defaults to the module-level
                ``OCR_PDF_PATH`` when omitted.

        Returns:
            True when ``POST /documents/upload`` answers with status 200,
            False when the file is missing, the status differs, or an error
            occurs.
        """
        pdf_path = Path(OCR_PDF_PATH if pdf_path is None else pdf_path)
        print(f"\n📤 Attempting to upload OCR PDF: {pdf_path}")

        if not pdf_path.is_file():
            print(f"❌ OCR PDF file not found: {pdf_path}")
            return False

        try:
            with pdf_path.open('rb') as file:
                # Send only the basename so directory components of the local
                # path are not used as the uploaded file's name.
                files = {'file': (pdf_path.name, file, 'application/pdf')}
                response = self.session.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    timeout=self.LONG_REQUEST_TIMEOUT,
                )

            if response.status_code == 200:
                result = response.json()
                print("✅ Upload successful")
                print(f"   Status: {result.get('status', 'unknown')}")
                print(f"   Message: {result.get('message', 'No message')}")
                return True

            print(f"❌ Upload failed: {response.status_code}")
            self._print_response_body(response, "   ", "Error")
            return False

        except Exception as e:
            print(f"❌ Upload error: {e}")
            return False

    def monitor_processing(self, max_wait=120, poll_interval=1):
        """Monitor document processing status.

        Polls ``GET /health`` until its ``pipeline_busy`` field is falsy or
        absent. Non-200 answers are ignored and polling continues.

        Args:
            max_wait: Maximum number of seconds to keep polling, measured on
                a monotonic clock so slow requests count toward the budget.
            poll_interval: Seconds to sleep between polls.

        Returns:
            True once the pipeline is reported idle; False on timeout or if a
            poll raises.
        """
        print(f"\n⏳ Monitoring processing (max {max_wait}s)...")

        started = time.monotonic()
        next_report = 0  # elapsed seconds at which to print the next status
        while True:
            elapsed = time.monotonic() - started
            if elapsed >= max_wait:
                break

            try:
                # Try to get health status to check pipeline
                response = self.session.get(
                    f"{self.base_url}/health", timeout=self.REQUEST_TIMEOUT
                )
                if response.status_code == 200:
                    busy = response.json().get('pipeline_busy', False)

                    if not busy:
                        print("✅ Processing appears complete")
                        return True

                    if elapsed >= next_report:  # Print status every 10 seconds
                        print(f"   Still processing... ({int(elapsed)}s)")
                        next_report = (int(elapsed) // 10 + 1) * 10
            except Exception as e:
                print(f"❌ Error monitoring processing: {e}")
                return False

            # Never sleep past the deadline.
            remaining = max_wait - (time.monotonic() - started)
            time.sleep(max(0, min(poll_interval, remaining)))

        print("⚠️  Processing timeout reached")
        return False

    def test_search_without_auth(self):
        """Test search functionality without authentication.

        Sends a fixed list of queries to ``POST /query`` and stops at the
        first one that yields content.

        Returns:
            True as soon as one query produces a hit, False if none does.
        """
        print("\n🔍 Testing search without authentication...")

        test_queries = [
            "LightRAG",
            "OCR technology",
            "document processing",
            "text extraction",
            "Retrieval-Augmented Generation",
        ]

        headers = {"Content-Type": "application/json"}

        for query in test_queries:
            print(f"\n   Testing query: '{query}'")
            payload = {
                "query": query,
                "top_k": 5,
                "only_need_context": True,
            }
            try:
                response = self.session.post(
                    f"{self.base_url}/query",
                    json=payload,
                    headers=headers,
                    timeout=self.LONG_REQUEST_TIMEOUT,
                )

                if response.status_code != 200:
                    print(f"   ❌ Search failed: {response.status_code}")
                    self._print_response_body(response, "      ", "Error")
                    continue

                if self._report_search_results(response.json()):
                    return True  # At least one successful search

            except Exception as e:
                # One failing query must not stop the remaining ones.
                print(f"   ❌ Search error: {e}")

        return False

    def check_webui_access(self):
        """Check if Web UI is accessible.

        Returns:
            True when ``GET /webui/`` answers with status 200, else False.
        """
        print("\n🌐 Checking Web UI accessibility...")
        try:
            response = self.session.get(
                f"{self.base_url}/webui/", timeout=self.REQUEST_TIMEOUT
            )
        except Exception as e:
            print(f"❌ Web UI access error: {e}")
            return False

        if response.status_code == 200:
            print("✅ Web UI is accessible")
            return True

        print(f"⚠️  Web UI returned: {response.status_code}")
        return False

    def run_complete_test(self):
        """Run complete OCR workflow test.

        Returns:
            True only when both the upload and the search step succeed. The
            Web UI, health and processing steps are reported but do not
            affect the result; an unreachable server aborts early with False.
        """
        print("🚀 Starting Direct OCR PDF Workflow Test")
        print("=" * 60)

        # Step 1: Check server accessibility
        if not self.check_server_status():
            return False

        # Step 2: Check Web UI (informational only)
        self.check_webui_access()

        # Step 3: Check health (may fail due to auth; informational only)
        self.check_health_no_auth()

        # Step 4: Upload OCR PDF
        upload_success = self.upload_ocr_pdf_direct()

        # Step 5: Monitor processing, which only makes sense after an upload
        processing_success = self.monitor_processing() if upload_success else False

        # Step 6: Test search (runs even if the upload failed)
        search_success = self.test_search_without_auth()

        # Summary
        print("\n" + "=" * 60)
        print("🎯 DIRECT OCR WORKFLOW TEST SUMMARY")
        print("=" * 60)

        print(f"📊 Upload: {'✅ Success' if upload_success else '❌ Failed'}")
        print(f"📊 Processing: {'✅ Complete' if processing_success else '❌ Failed/Timeout'}")
        print(f"📊 Search: {'✅ Working' if search_success else '❌ No results'}")

        if upload_success and search_success:
            print("\n✅ SUCCESS: OCR PDF workflow is functional!")
            print("   - Upload successful")
            print("   - Search returning results")
            return True

        if upload_success:
            print("\n⚠️  PARTIAL SUCCESS: Upload worked but search issues")
            return False

        print("\n❌ FAILED: Could not complete workflow")
        return False

def main():
    """Run the workflow test, print follow-up hints, and exit.

    Exits the process with status 0 when the workflow test passes and with
    status 1 otherwise.
    """
    tester = OCRWorkflowTester()
    success = tester.run_complete_test()

    if success:
        print("\n🎉 OCR PDF direct workflow test PASSED!")
        print("\n📋 Next steps:")
        # Report the URL that was actually tested rather than a second
        # hard-coded copy of it.
        print(f"   1. Access Web UI at: {tester.base_url}/webui/")
        # Account credentials are deliberately not echoed: secrets do not
        # belong in source code or in test output.
        print("   2. Log in with the credentials configured for the server")
        print("   3. Upload documents and test search")
        sys.exit(0)
    else:
        print("\n💥 OCR PDF direct workflow test had issues.")
        print("\n🔧 Troubleshooting:")
        print("   - Check server authentication configuration")
        print("   - Verify .env file settings")
        print("   - Check database connections")
        sys.exit(1)

# Run the workflow test only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()