# railseek6/final_ocr_test_validation.py

import requests
import json
import base64
import time
import logging
import os
from pathlib import Path

# Root logger setup: INFO and above, rendered as "[timestamp] [LEVEL] message".
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='[%(asctime)s] [%(levelname)s] %(message)s',
    level=logging.INFO,
)

class OCRWorkflowTester:
    """Exercise an OCR document server over HTTP: health, upload, indexing, search.

    Every public method logs its progress and reports the outcome through its
    return value instead of raising, so a caller can chain the steps with
    plain ``if`` checks.
    """

    # Seconds to wait between two polls of the document list in
    # ``monitor_indexing``.
    POLL_INTERVAL = 5

    # NOTE(review): the default credentials are hard-coded in source. They are
    # kept so existing callers of ``OCRWorkflowTester()`` keep working, but
    # they should come from configuration or the environment.
    def __init__(self, base_url="http://localhost:3015", username="jleu3482",
                 password="jleu1212", timeout=30):
        """Create the HTTP session and install the auth headers on it.

        Args:
            base_url: Root URL of the server, without a trailing slash.
            username: User name for HTTP Basic authentication.
            password: Password for HTTP Basic authentication.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.base_url = base_url
        self.username = username
        self.password = password
        self.timeout = timeout
        self.session = requests.Session()
        self._setup_auth()

    def _setup_auth(self):
        """Install Basic-auth and JSON content-type headers on the session."""
        credentials = f"{self.username}:{self.password}"
        encoded_credentials = base64.b64encode(credentials.encode()).decode()
        self.session.headers.update({
            "Authorization": f"Basic {encoded_credentials}",
            "Content-Type": "application/json"
        })

    def check_server_status(self):
        """Return True when ``GET /health`` answers with HTTP 200.

        Any other status code, or any exception raised while sending the
        request, is logged and reported as False.
        """
        logging.info("🔍 Checking server status...")
        try:
            response = self.session.get(
                f"{self.base_url}/health", timeout=self.timeout
            )
        except Exception as e:
            logging.error(f"❌ Server connection failed: {e}")
            return False

        if response.status_code == 200:
            logging.info("✅ Server is running and accessible")
            return True

        logging.error(f"❌ Server returned status: {response.status_code}")
        return False

    def upload_ocr_pdf(self, file_path="ocr.pdf"):
        """Upload a PDF to ``/documents/upload`` as a multipart form.

        Args:
            file_path: Path of the PDF to send.

        Returns:
            True when the server answers HTTP 200 and the body parses as
            JSON; False when the file is missing, the server answers with
            another status, or any exception occurs.
        """
        logging.info(f"📤 Uploading OCR PDF: {file_path}")

        path = Path(file_path)
        if not path.exists():
            logging.error(f"❌ File not found: {file_path}")
            return False

        try:
            with path.open('rb') as f:
                # Send only the base name so local directory names are not
                # leaked to the server as part of the multipart filename.
                files = {'file': (path.name, f, 'application/pdf')}
                # The session is bypassed on purpose: its default
                # "Content-Type: application/json" header would replace the
                # multipart/form-data header that requests generates for
                # ``files=``. Credentials are passed via ``auth`` instead.
                response = requests.post(
                    f"{self.base_url}/documents/upload",
                    files=files,
                    auth=(self.username, self.password),
                    timeout=self.timeout,
                )

            if response.status_code == 200:
                logging.info("✅ Upload successful")
                result = response.json()
                logging.info(f"📊 Upload result: {json.dumps(result, indent=2)}")
                return True

            logging.error(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        except Exception as e:
            logging.error(f"❌ Upload error: {e}")
            return False

    def monitor_indexing(self, max_wait=60):
        """Poll ``GET /documents`` until the first document finishes indexing.

        Args:
            max_wait: Polling budget in seconds; ``max_wait // POLL_INTERVAL``
                polls are made, so a value below ``POLL_INTERVAL`` makes none.

        Returns:
            True as soon as the first listed document reports status
            ``'completed'``; False when it reports ``'failed'`` or when the
            polls run out.
        """
        logging.info("⏳ Monitoring indexing progress...")

        attempts = max_wait // self.POLL_INTERVAL
        for attempt in range(attempts):
            try:
                response = self.session.get(
                    f"{self.base_url}/documents", timeout=self.timeout
                )
                if response.status_code == 200:
                    # Assumes the endpoint returns a JSON list of document
                    # dicts - TODO confirm against the server's schema. Any
                    # other shape raises and is logged by the handler below.
                    documents = response.json()
                    if documents:
                        status = documents[0].get('status', 'unknown')
                        logging.info(f"📄 Document status: {status}")

                        if status == 'completed':
                            logging.info("✅ Indexing completed successfully")
                            return True
                        if status == 'failed':
                            logging.error("❌ Indexing failed")
                            return False
                    else:
                        logging.info("📭 No documents found yet")
                else:
                    logging.error(f"❌ Status check failed: {response.status_code}")

            except Exception as e:
                logging.error(f"❌ Monitoring error: {e}")

            # Sleeping after the final poll would only delay the timeout report.
            if attempt < attempts - 1:
                time.sleep(self.POLL_INTERVAL)

        logging.warning("⚠️ Indexing timeout reached")
        return False

    def test_search(self, queries):
        """POST each query to ``/search`` and collect a per-query outcome.

        Args:
            queries: Iterable of query strings.

        Returns:
            Dict keyed by query. A successful entry holds ``'success': True``,
            ``'results_count'`` and ``'sample_content'`` (the first 200
            characters of the first hit's ``'content'``, or ``'No results'``).
            A failed entry holds ``'success': False`` and an ``'error'``
            string.
        """
        logging.info("🔍 Testing search functionality...")

        results = {}
        for query in queries:
            logging.info(f"🔎 Searching for: '{query}'")
            try:
                response = self.session.post(
                    f"{self.base_url}/search",
                    json={"query": query, "top_k": 3},
                    timeout=self.timeout,
                )

                if response.status_code == 200:
                    hits = response.json().get('results', [])
                    sample = hits[0].get('content', '')[:200] if hits else 'No results'
                    results[query] = {
                        'success': True,
                        'results_count': len(hits),
                        'sample_content': sample
                    }
                    logging.info(f"✅ Search successful - {len(hits)} results")
                    logging.info(f"📝 Sample: {sample}...")
                else:
                    results[query] = {
                        'success': False,
                        'error': f"Status {response.status_code}: {response.text}"
                    }
                    logging.error(f"❌ Search failed: {response.status_code} - {response.text}")

            except Exception as e:
                results[query] = {
                    'success': False,
                    'error': str(e)
                }
                logging.error(f"❌ Search error: {e}")

        return results

    def check_database_status(self):
        """Check that the document listing works and probe the search endpoint.

        Returns:
            True when ``GET /documents`` answers HTTP 200 and both requests
            complete without raising; a non-200 answer from the search probe
            only logs a warning. False when the document listing answers with
            another status or any exception occurs.
        """
        logging.info("🗄️ Checking database connections...")

        try:
            response = self.session.get(
                f"{self.base_url}/documents", timeout=self.timeout
            )
            if response.status_code != 200:
                # Previously this case fell through and was reported as success.
                logging.error(f"❌ Document listing failed: {response.status_code}")
                return False

            documents = response.json()
            logging.info(f"📊 Documents in system: {len(documents)}")

            test_response = self.session.post(
                f"{self.base_url}/search",
                json={"query": "test", "top_k": 1},
                timeout=self.timeout,
            )
            if test_response.status_code == 200:
                logging.info("✅ Search index is operational")
            else:
                logging.warning("⚠️ Search index may have issues")

            return True

        except Exception as e:
            logging.error(f"❌ Database check failed: {e}")
            return False

def _run_validation(tester):
    """Upload, wait for indexing, search and log a summary; stop at the first failed stage."""
    if not tester.upload_ocr_pdf():
        logging.error("❌ Upload failed - cannot proceed with testing")
        return

    # Give the server a head start before polling the document list.
    logging.info("⏳ Waiting for indexing to complete...")
    time.sleep(10)

    if not tester.monitor_indexing():
        logging.error("❌ Indexing failed or timed out")
        return

    # OCR-specific queries run against the freshly indexed document.
    search_queries = [
        "OCR",
        "text extraction",
        "document processing",
        "optical character recognition",
        "PDF conversion",
    ]
    search_results = tester.test_search(search_queries)

    tester.check_database_status()

    logging.info("=" * 70)
    logging.info("📋 TEST RESULTS SUMMARY")
    logging.info("=" * 70)

    successful_searches = sum(1 for result in search_results.values() if result['success'])
    logging.info("✅ Upload: SUCCESS")
    logging.info("✅ Indexing: SUCCESS")
    logging.info(f"🔍 Search: {successful_searches}/{len(search_queries)} queries successful")

    for query, result in search_results.items():
        status = "✅" if result['success'] else "❌"
        logging.info(f"   {status} '{query}': {result.get('results_count', 'N/A')} results")


def main():
    """Run the OCR PDF workflow validation and log the outcome.

    Checks that the server is reachable, then uploads the default PDF, waits
    for indexing, runs the search queries and logs a summary. When the server
    is not reachable the function returns right after logging the error;
    otherwise a footer with the connection details is logged whether or not
    the later stages succeeded.
    """
    logging.info("🚀 STARTING OCR PDF WORKFLOW VALIDATION")
    logging.info("=" * 70)

    tester = OCRWorkflowTester()

    if not tester.check_server_status():
        logging.error("❌ Cannot proceed - server not accessible")
        return

    _run_validation(tester)

    # The footer reads from the tester so it cannot drift from the values
    # that were actually used for the requests.
    logging.info("=" * 70)
    logging.info(f"🌐 Web UI: {tester.base_url}/webui/")
    logging.info(f"👤 Username: {tester.username}")
    # The password is masked: log output is routinely stored and shared.
    logging.info("🔑 Password: ********")
    logging.info("📁 Test file: ocr.pdf")

# Run the validation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()