# railseek6/final_ocr_validation_test.py

#!/usr/bin/env python3
"""
Final OCR PDF Upload and Search Validation Test
Comprehensive test that handles authentication and tests the complete workflow
"""

import requests
import json
import time
import sys
import os
import base64
from pathlib import Path

# Configuration
# Each setting can be overridden through an environment variable so the script
# can target another deployment without editing the source. The defaults are
# the values this script has always used.
BASE_URL = os.environ.get("OCR_TEST_BASE_URL", "http://localhost:3015").rstrip("/")
OCR_PDF_PATH = os.environ.get("OCR_TEST_PDF_PATH", "ocr.pdf")
TEST_QUERY = os.environ.get("OCR_TEST_QUERY", "document processing")
MAX_WAIT_TIME = 300  # seconds to wait for document processing before giving up
POLL_INTERVAL = 10  # seconds to sleep between processing-status polls

# Authentication
# NOTE(review): the fallback credentials below are committed in source. Prefer
# supplying OCR_TEST_USERNAME / OCR_TEST_PASSWORD from the environment and
# consider rotating these values.
USERNAME = os.environ.get("OCR_TEST_USERNAME", "jleu3482")
PASSWORD = os.environ.get("OCR_TEST_PASSWORD", "jleu1212")

class OCRWorkflowValidator:
    """Run an upload -> processing -> search check against the server.

    Every public step method logs its progress through ``log_step`` and
    returns ``True`` on success or ``False`` on failure.
    ``run_complete_validation`` executes the steps in order and prints a
    summary.
    """

    # Response keys that may carry the document identifier, in lookup order.
    _DOC_ID_KEYS = ("document_id", "id", "doc_id")

    def __init__(self):
        self.session = requests.Session()
        self.doc_id = None  # set by upload_ocr_pdf_direct on success
        self.auth_token = None  # set by setup_authentication when a token is issued

    def log_step(self, message, status="INFO"):
        """Print ``message`` prefixed with a local timestamp and ``status``."""
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{status}] {message}")

    @classmethod
    def _extract_doc_id(cls, payload):
        """Return the first document-ID value found in ``payload``, else ``None``."""
        if not isinstance(payload, dict):
            return None
        for key in cls._DOC_ID_KEYS:
            if key in payload:
                return payload[key]
        return None

    @staticmethod
    def _result_preview(result, limit=100):
        """Return a printable preview of one search result.

        Dict results use their ``content`` (or ``text``) value; anything else
        is stringified. The value is always coerced to ``str`` so that a
        ``None`` or nested value cannot break slicing, and it is truncated to
        ``limit`` characters followed by ``"..."`` when longer.
        """
        if isinstance(result, dict):
            content = result.get('content', result.get('text', str(result)))
        else:
            content = result
        content = str(content)
        if len(content) > limit:
            return content[:limit] + "..."
        return content

    def _request_auth(self):
        """Return the per-request ``auth`` argument for upload/search calls.

        requests lets an explicit ``auth=`` override any ``Authorization``
        header, so basic credentials are only passed when no bearer token has
        been configured on the session.
        """
        if self.auth_token:
            return None
        return (USERNAME, PASSWORD)

    def _restore_headers(self, saved):
        """Put session headers back to the values captured in ``saved``.

        ``saved`` maps header name to its previous value, or ``None`` when the
        header was absent and must be removed again.
        """
        for name, value in saved.items():
            if value is None:
                self.session.headers.pop(name, None)
            else:
                self.session.headers[name] = value

    def setup_authentication(self):
        """Configure the session with the first authentication method that works.

        Tries, in order: HTTP basic auth, a bearer token from one of several
        candidate token endpoints, and a few custom header schemes.

        Returns:
            ``True`` as soon as one method is accepted, ``False`` otherwise.
        """
        self.log_step("Setting up authentication...")

        # Method 1: Basic Auth
        # NOTE(review): a 200 from "/" only shows the root is reachable with
        # these credentials attached; confirm the root actually requires auth.
        try:
            self.session.auth = (USERNAME, PASSWORD)
            response = self.session.get(f"{BASE_URL}/", timeout=5)
            if response.status_code == 200:
                self.log_step("✓ Basic authentication configured")
                return True
        except requests.exceptions.RequestException as e:
            self.log_step(f"Basic auth failed: {e}", "WARNING")

        # Method 2: Token-based auth (if available)
        token_endpoints = [
            f"{BASE_URL}/auth/token",
            f"{BASE_URL}/api/token",
            f"{BASE_URL}/token"
        ]
        auth_data = {"username": USERNAME, "password": PASSWORD}

        for endpoint in token_endpoints:
            try:
                response = self.session.post(endpoint, data=auth_data, timeout=5)
                if response.status_code != 200:
                    continue
                token_data = response.json()
            except (requests.exceptions.RequestException, ValueError):
                # Best effort: an unreachable endpoint or a non-JSON body just
                # means this candidate is not a token endpoint.
                continue

            if isinstance(token_data, dict) and 'access_token' in token_data:
                self.auth_token = token_data['access_token']
                # Drop the session-level basic auth, otherwise requests would
                # overwrite the bearer header on every request.
                self.session.auth = None
                self.session.headers.update({'Authorization': f'Bearer {self.auth_token}'})
                self.log_step("✓ Token authentication configured")
                return True

        # Method 3: Custom headers
        basic_value = base64.b64encode(f"{USERNAME}:{PASSWORD}".encode()).decode()
        auth_headers = [
            {'X-API-Key': PASSWORD},
            {'Authorization': f'Basic {basic_value}'},
            {'X-Username': USERNAME, 'X-Password': PASSWORD}
        ]

        try:
            for headers in auth_headers:
                # Remember what each header held so a rejected candidate can be
                # undone without touching the session's other headers.
                saved = {name: self.session.headers.get(name) for name in headers}
                self.session.headers.update(headers)
                accepted = False
                try:
                    response = self.session.get(f"{BASE_URL}/", timeout=5)
                    accepted = response.status_code == 200
                finally:
                    if not accepted:
                        self._restore_headers(saved)
                if accepted:
                    self.log_step("✓ Custom header authentication configured")
                    return True
        except requests.exceptions.RequestException as e:
            self.log_step(f"Custom header auth failed: {e}", "WARNING")

        self.log_step("✗ All authentication methods failed", "ERROR")
        return False

    def check_server_status(self):
        """Return ``True`` if any known endpoint answers with HTTP 200."""
        self.log_step("Checking server status...")

        endpoints_to_try = [
            "/",
            "/health",
            "/api/health"
        ]

        for endpoint in endpoints_to_try:
            try:
                response = self.session.get(f"{BASE_URL}{endpoint}", timeout=5)
            except requests.exceptions.RequestException as e:
                self.log_step(f"✗ Endpoint {endpoint}: {e}", "WARNING")
                continue

            self.log_step(f"✓ Endpoint {endpoint}: {response.status_code}")
            if response.status_code == 200:
                return True

        self.log_step("✗ No working endpoints found", "ERROR")
        return False

    def verify_ocr_pdf_exists(self):
        """Return ``True`` if ``OCR_PDF_PATH`` is an existing, non-empty file."""
        self.log_step("Verifying OCR PDF file...")
        # isfile (not exists) so a directory with that name is rejected here
        # instead of failing later when it is opened for upload.
        if not os.path.isfile(OCR_PDF_PATH):
            self.log_step(f"✗ OCR PDF file not found: {OCR_PDF_PATH}", "ERROR")
            return False

        file_size = os.path.getsize(OCR_PDF_PATH)
        if file_size == 0:
            self.log_step("✗ OCR PDF file is empty", "ERROR")
            return False

        self.log_step(f"✓ OCR PDF file verified ({file_size} bytes)")
        return True

    def test_webui_login(self):
        """Return ``True`` if ``GET /webui/`` answers with HTTP 200."""
        self.log_step("Testing web UI access...")
        try:
            response = self.session.get(f"{BASE_URL}/webui/", timeout=10)
        except requests.exceptions.RequestException as e:
            self.log_step(f"✗ Web UI access failed: {e}", "WARNING")
            return False

        if response.status_code == 200:
            self.log_step("✓ Web UI is accessible")
            return True

        self.log_step(f"✗ Web UI returned {response.status_code}", "WARNING")
        return False

    def upload_ocr_pdf_direct(self):
        """Upload the OCR PDF as multipart form data to the first endpoint that accepts it.

        On success the document ID found in the JSON response (if any) is
        stored in ``self.doc_id``.

        Returns:
            ``True`` when an endpoint answers 200/201 with a JSON body,
            ``False`` when the file is unusable or every endpoint fails.
        """
        self.log_step("Uploading OCR PDF file directly...")

        if not self.verify_ocr_pdf_exists():
            return False

        # Read the file once. Re-using an open handle across attempts would
        # leave it at EOF after the first POST and send empty uploads to every
        # later endpoint.
        try:
            with open(OCR_PDF_PATH, 'rb') as file:
                pdf_bytes = file.read()
        except OSError as e:
            self.log_step(f"✗ Upload failed: {e}", "ERROR")
            return False

        filename = os.path.basename(OCR_PDF_PATH)
        upload_endpoints = [
            f"{BASE_URL}/documents/upload",
            f"{BASE_URL}/upload",
            f"{BASE_URL}/api/upload"
        ]

        for endpoint in upload_endpoints:
            self.log_step(f"Trying upload endpoint: {endpoint}")
            files = {'file': (filename, pdf_bytes, 'application/pdf')}

            try:
                response = self.session.post(
                    endpoint,
                    files=files,
                    auth=self._request_auth(),
                    timeout=30
                )
            except requests.exceptions.RequestException as e:
                self.log_step(f"Upload endpoint {endpoint} failed: {e}", "WARNING")
                continue

            if response.status_code not in (200, 201):
                self.log_step(f"Upload endpoint {endpoint} returned {response.status_code}: {response.text}", "WARNING")
                continue

            try:
                result = response.json()
            except ValueError:
                # A success status without JSON is treated as "not the upload
                # API" so the remaining endpoints still get a chance.
                self.log_step(f"Upload endpoint {endpoint} returned a non-JSON body", "WARNING")
                continue

            self.log_step("✓ OCR PDF upload successful")

            doc_id = self._extract_doc_id(result)
            if doc_id is not None:
                self.doc_id = doc_id

            if self.doc_id:
                self.log_step(f"Document ID: {self.doc_id}")
            return True

        self.log_step("✗ All upload endpoints failed", "ERROR")
        return False

    def check_document_processing(self):
        """Poll ``/documents`` until the uploaded document finishes processing.

        Polls every ``POLL_INTERVAL`` seconds for at most ``MAX_WAIT_TIME``
        seconds, matching list entries on their ``id`` field.

        Returns:
            ``True`` when the document reaches a completed status, ``False``
            when no document ID is known, a failed status is reported, or the
            wait times out.
        """
        self.log_step("Checking document processing status...")

        if not self.doc_id:
            self.log_step("✗ No document ID available", "ERROR")
            return False

        # Monotonic clock: the timeout must not be affected by wall-clock jumps.
        start_time = time.monotonic()

        while time.monotonic() - start_time < MAX_WAIT_TIME:
            try:
                response = self.session.get(f"{BASE_URL}/documents", timeout=10)
                documents = response.json() if response.status_code == 200 else None
            except (requests.exceptions.RequestException, ValueError) as e:
                self.log_step(f"Error checking status: {e}", "WARNING")
                time.sleep(POLL_INTERVAL)
                continue

            if isinstance(documents, list):
                for doc in documents:
                    if not isinstance(doc, dict):
                        continue
                    if str(doc.get('id')) != str(self.doc_id):
                        continue

                    status = doc.get('status', 'unknown')
                    self.log_step(f"Document status: {status}")

                    if status in ('completed', 'processed', 'indexed'):
                        self.log_step("✓ Document processing completed")
                        return True
                    if status in ('processing', 'indexing'):
                        self.log_step(f"Still processing... ({status})")
                    elif status in ('failed', 'error'):
                        self.log_step(f"✗ Processing failed: {status}", "ERROR")
                        return False

            elapsed = int(time.monotonic() - start_time)
            self.log_step(f"Waiting for processing... ({elapsed}s elapsed)")
            time.sleep(POLL_INTERVAL)

        self.log_step("✗ Processing timeout reached", "ERROR")
        return False

    def test_search_functionality(self):
        """POST ``TEST_QUERY`` to each search endpoint until one returns results.

        Returns:
            ``True`` when an endpoint answers 200 with a non-empty JSON list
            (the first three results are logged as previews), ``False`` when
            every endpoint fails or returns nothing.
        """
        self.log_step("Testing search functionality...")

        search_payload = {
            "query": TEST_QUERY,
            "top_k": 5
        }

        search_endpoints = [
            f"{BASE_URL}/search",
            f"{BASE_URL}/query",
            f"{BASE_URL}/api/search"
        ]

        for endpoint in search_endpoints:
            self.log_step(f"Testing search endpoint: {endpoint}")

            try:
                response = self.session.post(
                    endpoint,
                    json=search_payload,
                    auth=self._request_auth(),
                    timeout=15
                )
            except requests.exceptions.RequestException as e:
                self.log_step(f"Search endpoint {endpoint} failed: {e}", "WARNING")
                continue

            if response.status_code != 200:
                self.log_step(f"Search endpoint returned {response.status_code}: {response.text}", "WARNING")
                continue

            try:
                results = response.json()
            except ValueError:
                self.log_step(f"Search endpoint {endpoint} returned a non-JSON body", "WARNING")
                continue

            self.log_step("✓ Search request successful")

            if isinstance(results, list) and len(results) > 0:
                self.log_step(f"✓ Search returned {len(results)} results")

                for i, result in enumerate(results[:3]):
                    self.log_step(f"Result {i+1}: {self._result_preview(result)}")

                return True

            # Empty or unexpected shape: fall through to the next endpoint.
            self.log_step("✗ Search returned no results", "WARNING")

        self.log_step("✗ All search endpoints failed", "ERROR")
        return False

    def verify_database_integration(self):
        """Return ``True`` if ``GET /documents/<doc_id>`` answers 200 with JSON.

        The returned document metadata is logged. Without a known document ID
        the check is reported as incomplete.
        """
        self.log_step("Verifying database integration...")

        if self.doc_id:
            try:
                response = self.session.get(f"{BASE_URL}/documents/{self.doc_id}", timeout=10)
                if response.status_code == 200:
                    doc_details = response.json()
                    self.log_step("✓ Document details accessible")
                    self.log_step(f"Document metadata: {json.dumps(doc_details, indent=2)}")
                    return True
                self.log_step(f"Document details returned {response.status_code}", "WARNING")
            except (requests.exceptions.RequestException, ValueError) as e:
                self.log_step(f"Document details check failed: {e}", "WARNING")

        self.log_step("✗ Database integration verification incomplete", "WARNING")
        return False

    def run_complete_validation(self):
        """Run every validation step in order and log a summary report.

        A failing step does not stop the run. A step that raises is logged
        and counted as failed, so the summary is always produced.

        Returns:
            ``True`` when all steps pass or at most two fail, ``False``
            otherwise.
        """
        self.log_step("Starting Final OCR PDF Upload and Search Validation")
        self.log_step("=" * 60)

        steps = [
            ("Server Status Check", self.check_server_status),
            ("Authentication Setup", self.setup_authentication),
            ("Web UI Access Test", self.test_webui_login),
            ("OCR PDF Verification", self.verify_ocr_pdf_exists),
            ("PDF Upload", self.upload_ocr_pdf_direct),
            ("Document Processing", self.check_document_processing),
            ("Search Functionality", self.test_search_functionality),
            ("Database Integration", self.verify_database_integration)
        ]

        results = []
        for step_name, step_func in steps:
            self.log_step(f"Executing: {step_name}")
            try:
                success = bool(step_func())
            except Exception as e:
                # Boundary handler: keep going so later steps and the final
                # report still run after an unexpected error in one step.
                self.log_step(f"✗ {step_name} raised {type(e).__name__}: {e}", "ERROR")
                success = False
            results.append((step_name, success))

            if not success:
                self.log_step(f"✗ Workflow failed at: {step_name}", "ERROR")
                # Don't break, continue to gather more information

        # Generate final report
        self.log_step("=" * 60)
        self.log_step("FINAL VALIDATION RESULTS SUMMARY")
        self.log_step("=" * 60)

        for step_name, success in results:
            status = "✓ PASS" if success else "✗ FAIL"
            self.log_step(f"{step_name}: {status}")

        total = len(results)
        passed = sum(1 for _, success in results if success)

        success_rate = (passed / total) * 100
        self.log_step(f"Success Rate: {passed}/{total} ({success_rate:.1f}%)")

        if passed == total:
            self.log_step("🎉 COMPLETE WORKFLOW VALIDATION SUCCESSFUL!", "SUCCESS")
            return True
        # NOTE(review): any two failures are tolerated here, not only
        # non-critical ones (e.g. upload and search could both fail) —
        # confirm whether specific steps should be mandatory.
        if passed >= total - 2:
            self.log_step("⚠️ PARTIAL WORKFLOW VALIDATION - Most functionality working", "WARNING")
            return True

        self.log_step("❌ WORKFLOW VALIDATION FAILED - Major issues detected", "ERROR")
        return False

def main():
    """Run the validator and exit with 0 on success, 1 on any failure.

    A keyboard interrupt or an unexpected exception is logged through the
    validator and also results in exit status 1.
    """
    validator = OCRWorkflowValidator()
    exit_code = 1

    try:
        if validator.run_complete_validation():
            exit_code = 0
    except KeyboardInterrupt:
        validator.log_step("Validation interrupted by user", "WARNING")
    except Exception as e:
        validator.log_step(f"Unexpected error: {e}", "ERROR")

    # Single exit point; every failure path above leaves exit_code at 1.
    sys.exit(exit_code)

if __name__ == "__main__":
    main()