#!/usr/bin/env python3
"""
Comprehensive OCR PDF Upload and Search Validation Script
Tests the complete workflow: upload → indexing → search verification
No fallbacks allowed - validates production-grade functionality
"""

import requests
import json
import time
import sys
import os
from pathlib import Path

# Configuration
# The target server, input file and query can be overridden through
# environment variables so the script can be pointed at another deployment
# without editing the source. The defaults are the original hard-coded values.
BASE_URL = os.environ.get("OCR_VALIDATION_BASE_URL", "http://localhost:3015").rstrip("/")
WEBUI_URL = f"{BASE_URL}/webui/"
API_URL = f"{BASE_URL}/api"
OCR_PDF_PATH = os.environ.get("OCR_VALIDATION_PDF_PATH", "ocr.pdf")
# Should match content in OCR PDF
TEST_QUERY = os.environ.get("OCR_VALIDATION_QUERY", "document processing")
MAX_WAIT_TIME = 300  # 5 minutes maximum wait for indexing
POLL_INTERVAL = 10   # Check every 10 seconds

# Authentication
# NOTE(review): the fallback credentials below are committed in source. Prefer
# supplying OCR_VALIDATION_USERNAME / OCR_VALIDATION_PASSWORD from the
# environment and rotate these values if they are real.
USERNAME = os.environ.get("OCR_VALIDATION_USERNAME", "jleu3482")
PASSWORD = os.environ.get("OCR_VALIDATION_PASSWORD", "jleu1212")

class OCRWorkflowValidator:
    """Validate the OCR PDF workflow end to end against a running server.

    The workflow is: health check, authentication, local file check, upload,
    polling until indexing completes, a search query, and a storage check.
    Every step method returns ``True`` on success and ``False`` on failure,
    and reports progress through :meth:`log_step`.
    """

    # Timeouts in seconds, so no request can block the script indefinitely.
    REQUEST_TIMEOUT = 10
    UPLOAD_TIMEOUT = 30
    SEARCH_TIMEOUT = 15

    # Keys under which an upload response may carry the document identifier,
    # in priority order.
    DOC_ID_KEYS = ("document_id", "id", "doc_id")

    # Recognised values of the ``status`` field in a status payload.
    _DONE_STATES = frozenset({"completed", "done", "indexed"})
    _BUSY_STATES = frozenset({"processing", "indexing"})
    _FAILED_STATES = frozenset({"failed", "error"})

    def __init__(self):
        # One session is shared by all steps so that cookies and the
        # Authorization header set during authentication are reused.
        self.session = requests.Session()
        # Identifier returned by the upload step; required by later steps.
        self.doc_id = None
        # Bearer token returned by the login endpoint, if any.
        self.auth_token = None

    def log_step(self, message, status="INFO"):
        """Print *message* prefixed with a timestamp and the *status* label."""
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] [{status}] {message}")

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_json(response):
        """Return the decoded JSON body of *response*, or ``None`` if invalid.

        A JSON ``null`` body also yields ``None``; callers treat both cases
        as "no usable payload".
        """
        try:
            return response.json()
        except ValueError:
            # requests' JSON decoding errors derive from ValueError.
            return None

    @classmethod
    def _extract_doc_id(cls, payload):
        """Return the first non-``None`` document id in *payload*, else ``None``.

        Only dict payloads are inspected; keys are tried in ``DOC_ID_KEYS``
        order.
        """
        if not isinstance(payload, dict):
            return None
        for key in cls.DOC_ID_KEYS:
            value = payload.get(key)
            if value is not None:
                return value
        return None

    @classmethod
    def _classify_index_status(cls, payload):
        """Classify a status payload.

        Returns:
            A ``(outcome, state)`` tuple. ``outcome`` is ``"completed"``,
            ``"failed"``, ``"in_progress"`` or ``None`` (nothing conclusive).
            ``state`` is the lower-cased ``status`` string, or ``None`` when
            the payload has no string ``status``.
        """
        if not isinstance(payload, dict):
            return None, None

        raw_state = payload.get("status")
        state = raw_state.strip().lower() if isinstance(raw_state, str) else None

        if state in cls._DONE_STATES:
            return "completed", state
        if state in cls._FAILED_STATES:
            return "failed", state
        # A truthy ``indexed`` flag counts as completion even while the
        # status string still reports work in progress.
        if payload.get("indexed"):
            return "completed", state
        if state in cls._BUSY_STATES:
            return "in_progress", state
        return None, state

    @staticmethod
    def _preview(result, limit=100):
        """Return ``str(result)`` truncated to *limit* characters plus ``...``."""
        text = str(result)
        return text[:limit] + "..." if len(text) > limit else text

    # ------------------------------------------------------------------
    # Workflow steps
    # ------------------------------------------------------------------
    def check_server_health(self):
        """Return ``True`` if ``GET {API_URL}/health`` answers with HTTP 200."""
        self.log_step("Checking server health...")
        try:
            response = self.session.get(f"{API_URL}/health", timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            self.log_step(f"✗ Server connection failed: {e}", "ERROR")
            return False

        if response.status_code == 200:
            self.log_step("✓ Server is healthy and responsive")
            return True
        self.log_step(f"✗ Server returned status {response.status_code}", "ERROR")
        return False

    def authenticate(self):
        """Log in with ``USERNAME``/``PASSWORD`` and keep the token if given.

        The login page is fetched first (it must answer HTTP 200), then the
        credentials are posted as JSON to ``{API_URL}/auth/login``. When the
        JSON response contains a ``token``, it is stored in ``auth_token`` and
        sent as a ``Bearer`` Authorization header on later requests.

        Returns:
            ``True`` when the login endpoint answers HTTP 200 with a JSON
            body, ``False`` otherwise.
        """
        self.log_step("Authenticating with web UI...")

        # Fetching the login page also lets the session pick up any cookies.
        try:
            login_response = self.session.get(
                f"{WEBUI_URL}login", timeout=self.REQUEST_TIMEOUT
            )
        except requests.exceptions.RequestException as e:
            self.log_step(f"✗ Failed to access login page: {e}", "ERROR")
            return False
        if login_response.status_code != 200:
            self.log_step(f"✗ Failed to access login page: {login_response.status_code}", "ERROR")
            return False

        auth_payload = {
            "username": USERNAME,
            "password": PASSWORD
        }

        try:
            auth_response = self.session.post(
                f"{API_URL}/auth/login",
                json=auth_payload,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.exceptions.RequestException as e:
            self.log_step(f"✗ Authentication request failed: {e}", "ERROR")
            return False

        if auth_response.status_code != 200:
            self.log_step(f"✗ Authentication failed: {auth_response.status_code}", "ERROR")
            return False

        # Parse once, and before reporting success, so the log never claims
        # success for a response that is then rejected.
        body = self._parse_json(auth_response)
        if body is None:
            self.log_step("✗ Authentication request failed: response body is not valid JSON", "ERROR")
            return False

        self.log_step("✓ Authentication successful")
        token = body.get('token') if isinstance(body, dict) else None
        if token:
            self.auth_token = token
            self.session.headers.update({'Authorization': f'Bearer {self.auth_token}'})
        return True

    def verify_ocr_pdf_exists(self):
        """Return ``True`` if ``OCR_PDF_PATH`` is an existing, non-empty file."""
        self.log_step("Verifying OCR PDF file...")
        # isfile() rather than exists(): a directory must not pass this check.
        if not os.path.isfile(OCR_PDF_PATH):
            self.log_step(f"✗ OCR PDF file not found: {OCR_PDF_PATH}", "ERROR")
            return False

        file_size = os.path.getsize(OCR_PDF_PATH)
        if file_size == 0:
            self.log_step("✗ OCR PDF file is empty", "ERROR")
            return False

        self.log_step(f"✓ OCR PDF file verified ({file_size} bytes)")
        return True

    def upload_ocr_pdf(self):
        """Upload the PDF to the first candidate endpoint that accepts it.

        Each candidate endpoint is tried in order. An attempt succeeds when
        the server answers HTTP 200/201 with a JSON body; the document id
        found in that body (see ``DOC_ID_KEYS``) is stored in ``doc_id``.

        Returns:
            ``True`` after the first successful upload, ``False`` if the file
            is missing/unreadable or every endpoint fails.
        """
        self.log_step("Uploading OCR PDF file...")

        if not self.verify_ocr_pdf_exists():
            return False

        # Try multiple possible upload endpoints
        upload_endpoints = [
            f"{API_URL}/upload",
            f"{API_URL}/documents/upload",
            f"{WEBUI_URL}api/upload"
        ]
        file_name = os.path.basename(OCR_PDF_PATH)

        try:
            with open(OCR_PDF_PATH, 'rb') as file:
                for endpoint in upload_endpoints:
                    # A previous attempt leaves the handle at EOF; rewind so
                    # every endpoint receives the full document.
                    file.seek(0)
                    files = {'file': (file_name, file, 'application/pdf')}

                    self.log_step(f"Trying upload endpoint: {endpoint}")
                    try:
                        response = self.session.post(
                            endpoint,
                            files=files,
                            timeout=self.UPLOAD_TIMEOUT
                        )
                    except requests.exceptions.RequestException as e:
                        self.log_step(f"Upload endpoint {endpoint} failed: {e}", "WARNING")
                        continue

                    if response.status_code not in (200, 201):
                        self.log_step(
                            f"Upload endpoint {endpoint} returned status {response.status_code}",
                            "WARNING",
                        )
                        continue

                    result = self._parse_json(response)
                    if result is None:
                        self.log_step(
                            f"Upload endpoint {endpoint} returned a non-JSON body",
                            "WARNING",
                        )
                        continue

                    self.log_step("✓ OCR PDF upload successful")
                    doc_id = self._extract_doc_id(result)
                    if doc_id is not None:
                        self.doc_id = doc_id
                    if self.doc_id:
                        self.log_step(f"Document ID: {self.doc_id}")
                    else:
                        self.log_step("Upload response did not include a document ID", "WARNING")
                    return True
        except OSError as e:
            self.log_step(f"✗ Upload failed: {e}", "ERROR")
            return False

        self.log_step("✗ All upload endpoints failed", "ERROR")
        return False

    def wait_for_indexing(self):
        """Poll the status endpoints until the document is indexed.

        Polls every ``POLL_INTERVAL`` seconds for at most ``MAX_WAIT_TIME``
        seconds. Requires ``doc_id`` to have been set by the upload step.

        Returns:
            ``True`` once an endpoint reports completion; ``False`` when an
            endpoint reports failure, no document id is available, or the
            time limit is reached.
        """
        self.log_step("Waiting for document indexing to complete...")

        if not self.doc_id:
            self.log_step("✗ No document ID available to check indexing status", "ERROR")
            return False

        check_endpoints = [
            f"{API_URL}/documents/{self.doc_id}/status",
            f"{API_URL}/status/{self.doc_id}",
            f"{API_URL}/indexing/status"
        ]

        # Monotonic clock: elapsed time is unaffected by wall-clock changes.
        started = time.monotonic()
        deadline = started + MAX_WAIT_TIME

        while True:
            for endpoint in check_endpoints:
                try:
                    response = self.session.get(endpoint, timeout=self.REQUEST_TIMEOUT)
                except requests.exceptions.RequestException as e:
                    self.log_step(f"Status endpoint {endpoint} failed: {e}", "WARNING")
                    continue

                if response.status_code != 200:
                    continue

                status_data = self._parse_json(response)
                outcome, state = self._classify_index_status(status_data)

                if outcome == "completed":
                    self.log_step("✓ Document indexing completed")
                    return True
                if outcome == "failed":
                    self.log_step(f"✗ Indexing failed: {status_data.get('message', 'Unknown error')}", "ERROR")
                    return False
                if outcome == "in_progress":
                    self.log_step(f"Indexing in progress... ({state})")

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break

            self.log_step(f"Waiting... ({int(time.monotonic() - started)}s elapsed)")
            # Never sleep past the deadline.
            time.sleep(min(POLL_INTERVAL, remaining))

        self.log_step("✗ Indexing timeout reached", "ERROR")
        return False

    def test_search_functionality(self):
        """Post ``TEST_QUERY`` to the candidate search endpoints.

        Returns:
            ``True`` as soon as one endpoint answers HTTP 200 with a
            non-empty JSON list (the first three results are logged);
            ``False`` if no endpoint does.
        """
        self.log_step("Testing search functionality...")

        search_payload = {
            "query": TEST_QUERY,
            "top_k": 5
        }

        search_endpoints = [
            f"{API_URL}/search",
            f"{API_URL}/query",
            f"{API_URL}/documents/search"
        ]

        for endpoint in search_endpoints:
            self.log_step(f"Testing search endpoint: {endpoint}")
            try:
                response = self.session.post(
                    endpoint,
                    json=search_payload,
                    timeout=self.SEARCH_TIMEOUT
                )
            except requests.exceptions.RequestException as e:
                self.log_step(f"Search endpoint {endpoint} failed: {e}", "WARNING")
                continue

            if response.status_code != 200:
                self.log_step(
                    f"Search endpoint {endpoint} returned status {response.status_code}",
                    "WARNING",
                )
                continue

            results = self._parse_json(response)
            if results is None:
                self.log_step(f"Search endpoint {endpoint} returned a non-JSON body", "WARNING")
                continue

            self.log_step("✓ Search request successful")

            # Only a non-empty list counts as a valid result set.
            if isinstance(results, list) and results:
                self.log_step(f"✓ Search returned {len(results)} results")
                for i, result in enumerate(results[:3]):
                    self.log_step(f"Result {i+1}: {self._preview(result)}")
                return True

            # Fall through to the next endpoint.
            self.log_step("✗ Search returned no results", "WARNING")

        self.log_step("✗ All search endpoints failed", "ERROR")
        return False

    def verify_database_storage(self):
        """Check that the server's document and storage endpoints respond.

        This only verifies that each check endpoint answers HTTP 200 with a
        JSON-decodable body; it does not inspect individual databases.

        Returns:
            ``True`` when every check endpoint passes, ``False`` otherwise or
            when no document id is available.
        """
        self.log_step("Verifying database storage...")

        if not self.doc_id:
            self.log_step("✗ No document ID for database verification", "ERROR")
            return False

        check_endpoints = [
            f"{API_URL}/documents/{self.doc_id}",
            f"{API_URL}/storage/status"
        ]
        # Every listed endpoint has to pass.
        required_checks = len(check_endpoints)

        databases_verified = 0
        for endpoint in check_endpoints:
            try:
                response = self.session.get(endpoint, timeout=self.REQUEST_TIMEOUT)
                if response.status_code == 200:
                    # Decoding validates the body; the content is not used.
                    response.json()
                    self.log_step(f"✓ Database check successful for {endpoint}")
                    databases_verified += 1
                else:
                    self.log_step(
                        f"Database check {endpoint} returned status {response.status_code}",
                        "WARNING",
                    )
            except (requests.exceptions.RequestException, ValueError) as e:
                self.log_step(f"Database check {endpoint} failed: {e}", "WARNING")
                continue

        if databases_verified >= required_checks:
            self.log_step("✓ Document storage verified in multiple databases")
            return True

        # This result fails the workflow, so report it as an error.
        self.log_step("✗ Insufficient database verification", "ERROR")
        return False

    def run_complete_validation(self):
        """Run every workflow step in order and print a summary.

        Execution stops at the first failing step; the remaining steps are
        listed as skipped and count against the success rate.

        Returns:
            ``True`` only if every step ran and succeeded.
        """
        self.log_step("Starting OCR PDF Upload and Search Validation")
        self.log_step("=" * 50)

        steps = [
            ("Server Health Check", self.check_server_health),
            ("Authentication", self.authenticate),
            ("OCR PDF Verification", self.verify_ocr_pdf_exists),
            ("PDF Upload", self.upload_ocr_pdf),
            ("Indexing Wait", self.wait_for_indexing),
            ("Search Test", self.test_search_functionality),
            ("Database Verification", self.verify_database_storage)
        ]

        results = []
        for step_name, step_func in steps:
            self.log_step(f"Executing: {step_name}")
            success = step_func()
            results.append((step_name, success))

            if not success:
                self.log_step(f"✗ Workflow failed at: {step_name}", "ERROR")
                break

        # Steps after the first failure were never executed.
        skipped = [step_name for step_name, _ in steps[len(results):]]

        # Generate final report
        self.log_step("=" * 50)
        self.log_step("VALIDATION RESULTS SUMMARY")
        self.log_step("=" * 50)

        for step_name, success in results:
            status = "✓ PASS" if success else "✗ FAIL"
            self.log_step(f"{step_name}: {status}")
        for step_name in skipped:
            self.log_step(f"{step_name}: - SKIPPED")

        passed = sum(1 for _, success in results if success)
        # Rate is relative to the whole workflow, not just the executed steps.
        total = len(steps)
        success_rate = (passed / total) * 100
        self.log_step(f"Success Rate: {passed}/{total} ({success_rate:.1f}%)")

        if passed == total:
            self.log_step("🎉 COMPLETE WORKFLOW VALIDATION SUCCESSFUL!", "SUCCESS")
            return True

        self.log_step("❌ WORKFLOW VALIDATION FAILED", "ERROR")
        return False

def main():
    """Run the validator and exit with 0 on success, 1 otherwise.

    A user interrupt or any unexpected exception is logged through the
    validator and also results in exit status 1.
    """
    runner = OCRWorkflowValidator()
    exit_code = 1  # pessimistic default; only a fully successful run clears it

    try:
        if runner.run_complete_validation():
            exit_code = 0
    except KeyboardInterrupt:
        runner.log_step("Validation interrupted by user", "WARNING")
    except Exception as exc:
        runner.log_step(f"Unexpected error: {exc}", "ERROR")

    sys.exit(exit_code)

# Run the validation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()