railseek6/selenium_ocr_performance_test.py

#!/usr/bin/env python3
"""
Selenium OCR Performance Test
Tests OCR performance through the web UI to measure end-to-end performance
"""

import time
import os
import sys
import json
from datetime import datetime
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd

class OCRPerformanceTester:
    """Selenium-based OCR performance tester.

    Drives a Chrome browser against the web UI at ``base_url`` to upload
    documents, wait for processing to finish, run searches, and record
    wall-clock timings for each step as plain dictionaries.
    """

    def __init__(self, base_url="http://localhost:3015", headless=False):
        """
        Initialize the performance tester

        Args:
            base_url: Base URL of the LightRAG web UI
            headless: Run browser in headless mode
        """
        self.base_url = base_url
        self.headless = headless
        self.driver = None  # created by setup_driver()
        self.results = []  # metrics collected by the most recent suite run
        self.test_start_time = None  # set when a suite run begins

    def setup_driver(self):
        """Create the Chrome WebDriver and store it on ``self.driver``."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")

        self.driver = webdriver.Chrome(options=chrome_options)
        # NOTE(review): an implicit wait combined with the explicit
        # WebDriverWait calls below can stretch the effective timeouts;
        # kept as-is because the find_element calls rely on it.
        self.driver.implicitly_wait(10)

    def login(self, username="admin", password="admin"):
        """
        Login to the web UI

        Args:
            username: Value typed into the field named "username"
            password: Value typed into the field named "password"

        Returns:
            bool: False only when an unexpected error occurs. A timeout is
            treated as "already logged in or no auth required" and
            returns True.
        """
        print(f"🔐 Logging in to {self.base_url}")
        self.driver.get(f"{self.base_url}/login")

        try:
            # Wait for login form
            username_field = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.NAME, "username"))
            )
            password_field = self.driver.find_element(By.NAME, "password")
            login_button = self.driver.find_element(By.XPATH, "//button[contains(text(), 'Login')]")

            # Fill credentials
            username_field.send_keys(username)
            password_field.send_keys(password)
            login_button.click()

            # Wait for login to complete
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//h1[contains(text(), 'Dashboard')]"))
            )
            print("✅ Login successful")
            return True

        except TimeoutException:
            # Deliberately lenient: the UI may not require authentication.
            print("⚠️  Login timeout - assuming already logged in or no auth required")
            return True
        except Exception as e:
            print(f"❌ Login failed: {e}")
            return False

    @staticmethod
    def _extract_page_count(text):
        """Return the integer that precedes the word "page" in *text*, or None.

        Matching is case-insensitive so "12 Pages" and "12 pages" both parse.
        """
        import re  # local import: re is not imported at file level

        match = re.search(r'(\d+)\s+page', text or "", re.IGNORECASE)
        return int(match.group(1)) if match else None

    def _read_page_count(self):
        """Scan elements mentioning "page" and return the first parsed count, or None."""
        candidates = self.driver.find_elements(
            By.XPATH, "//*[contains(text(), 'page') or contains(text(), 'Page')]"
        )
        for element in candidates:
            count = self._extract_page_count(element.text)
            if count is not None:
                return count
        return None

    def upload_document(self, file_path, document_type="pdf"):
        """
        Upload a document and measure upload/processing time

        The upload timer starts before navigating to the upload page, so
        ``upload_time`` includes page load and waiting for the file input.

        Args:
            file_path: Path to document file
            document_type: Type of document (pdf, docx, etc.); only recorded
                in the metrics

        Returns:
            dict: Performance metrics; contains an "error" key when a step
            failed or timed out. None when ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            return None

        file_name = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)

        print(f"📤 Uploading {file_name} ({file_size:,} bytes)")

        metrics = {
            "file_name": file_name,
            "file_size": file_size,
            "document_type": document_type,
            "upload_start": time.time()
        }

        try:
            # Navigate to upload page
            self.driver.get(f"{self.base_url}/upload")

            # Wait for upload form
            file_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//input[@type='file']"))
            )

            # Upload file
            file_input.send_keys(os.path.abspath(file_path))
            metrics["upload_end"] = time.time()
            metrics["upload_time"] = metrics["upload_end"] - metrics["upload_start"]

            print(f"  📊 Upload time: {metrics['upload_time']:.2f}s")

            # Processing is timed from the moment the file was handed over.
            processing_start = time.time()

            # Wait for a processing indicator to appear
            try:
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'Processing') or contains(text(), 'processing')]"))
                )
            except TimeoutException:
                print("❌ Processing indicator not found")
                metrics["error"] = "Processing indicator not found"
                return metrics

            metrics["processing_start"] = processing_start

            # Wait for a success/completion message
            try:
                WebDriverWait(self.driver, 300).until(  # 5 minute timeout for OCR
                    EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'Success') or contains(text(), 'success') or contains(text(), 'Complete') or contains(text(), 'complete')]"))
                )
            except TimeoutException:
                print("❌ Processing timeout (5 minutes)")
                metrics["error"] = "Processing timeout"
                return metrics

            processing_end = time.time()
            metrics["processing_end"] = processing_end
            metrics["processing_time"] = processing_end - processing_start

            print(f"  📊 Processing time: {metrics['processing_time']:.2f}s")

            # Page count is best-effort: a failure here must not discard the
            # timings already collected, but it should not be silent either.
            try:
                pages = self._read_page_count()
            except Exception as exc:
                print(f"  ⚠️  Could not read page count: {exc}")
            else:
                if pages is not None:
                    metrics["pages_processed"] = pages

            # Total time
            metrics["total_time"] = metrics["upload_time"] + metrics["processing_time"]
            print(f"  📊 Total time: {metrics['total_time']:.2f}s")

            return metrics

        except Exception as e:
            print(f"❌ Upload failed: {e}")
            metrics["error"] = str(e)
            return metrics

    def search_document(self, query, expected_results=1):
        """
        Search for content in uploaded documents

        ``search_time`` runs from just before navigating to the search page
        until result elements are present. When no results appear within
        the 10 second wait, the recorded time includes that wait and
        ``results_found`` is 0.

        Args:
            query: Search query
            expected_results: Expected number of results (accepted for API
                compatibility; not currently checked)

        Returns:
            dict: Search performance metrics; contains an "error" key when
            the search could not be performed.
        """
        print(f"🔍 Searching for: '{query}'")

        metrics = {
            "query": query,
            "search_start": time.time()
        }

        try:
            # Navigate to search page
            self.driver.get(f"{self.base_url}/search")

            # Find search input
            search_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//input[@type='search' or contains(@placeholder, 'Search')]"))
            )

            # Enter search query
            search_input.clear()
            search_input.send_keys(query)

            # Find and click search button
            search_button = self.driver.find_element(By.XPATH, "//button[contains(text(), 'Search') or contains(@class, 'search')]")
            search_button.click()

            # Wait for results BEFORE stopping the clock; otherwise the
            # measurement only covers typing and clicking, not the search.
            try:
                results = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'result') or contains(@class, 'card')]"))
                )
            except TimeoutException:
                results = None

            search_end = time.time()
            metrics["search_end"] = search_end
            metrics["search_time"] = search_end - metrics["search_start"]

            print(f"  📊 Search time: {metrics['search_time']:.2f}s")

            if results is None:
                metrics["results_found"] = 0
                print("  ⚠️  No results found or timeout")
                return metrics

            metrics["results_found"] = len(results)
            print(f"  📊 Results found: {len(results)}")

            # Check if OCR content is in the first 3 results
            top_texts = (result.text.lower() for result in results[:3])
            metrics["ocr_content_found"] = any(
                "ocr" in text or "scanned" in text for text in top_texts
            )

            return metrics

        except Exception as e:
            print(f"❌ Search failed: {e}")
            metrics["error"] = str(e)
            return metrics

    def run_performance_test_suite(self, test_files):
        """
        Run comprehensive performance test suite

        Logs in, then for each existing file uploads it and runs a fixed
        set of search queries, pausing briefly between steps.

        Args:
            test_files: List of test file paths

        Returns:
            list: All performance metrics (upload and search dicts, in the
            order they were collected); also stored on ``self.results``.
        """
        print("🚀 STARTING OCR PERFORMANCE TEST SUITE")
        print("=" * 60)

        self.test_start_time = time.time()
        all_metrics = []
        self.results = all_metrics

        # Login first
        if not self.login():
            print("❌ Cannot proceed without login")
            return all_metrics

        search_queries = ["text", "document", "content", "scanned"]

        # Test each file
        for file_path in test_files:
            if not os.path.exists(file_path):
                print(f"⚠️  Skipping missing file: {file_path}")
                continue

            print(f"\n📄 TESTING FILE: {os.path.basename(file_path)}")
            print("-" * 40)

            # Upload and process
            upload_metrics = self.upload_document(file_path)
            if not upload_metrics:
                continue
            all_metrics.append(upload_metrics)

            # Wait a bit between tests
            time.sleep(2)

            # Search for OCR content
            for query in search_queries:
                search_metrics = self.search_document(query)
                if search_metrics:
                    search_metrics["test_file"] = os.path.basename(file_path)
                    all_metrics.append(search_metrics)
                time.sleep(1)

        # Calculate summary statistics
        self.calculate_summary(all_metrics)

        return all_metrics

    @staticmethod
    def _print_timing_line(label, values):
        """Print average/min/max for *values* after *label*; no-op when empty."""
        if not values:
            return
        average = sum(values) / len(values)
        print(f"{label}Avg={average:.2f}s, Min={min(values):.2f}s, Max={max(values):.2f}s")

    def calculate_summary(self, all_metrics):
        """Calculate and display summary statistics

        Args:
            all_metrics: List of metric dicts as produced by
                ``upload_document`` and ``search_document``
        """
        print("\n" + "=" * 60)
        print("📊 PERFORMANCE TEST SUMMARY")
        print("=" * 60)

        # Entries are told apart by which timing key they carry.
        upload_metrics = [m for m in all_metrics if "upload_time" in m]
        search_metrics = [m for m in all_metrics if "search_time" in m]

        if upload_metrics:
            print(f"\n📤 UPLOAD & PROCESSING ({len(upload_metrics)} tests):")
            self._print_timing_line(
                "  Upload Time:    ",
                [m["upload_time"] for m in upload_metrics],
            )
            self._print_timing_line(
                "  Processing Time: ",
                [m["processing_time"] for m in upload_metrics if "processing_time" in m],
            )
            self._print_timing_line(
                "  Total Time:      ",
                [m["total_time"] for m in upload_metrics if "total_time" in m],
            )

        if search_metrics:
            print(f"\n🔍 SEARCH ({len(search_metrics)} tests):")
            self._print_timing_line(
                "  Search Time:    ",
                [m["search_time"] for m in search_metrics],
            )

            ocr_found = [m for m in search_metrics if m.get("ocr_content_found", False)]
            print(f"  OCR Content Found: {len(ocr_found)}/{len(search_metrics)} searches")

        # Overall test duration
        if self.test_start_time:
            total_duration = time.time() - self.test_start_time
            print(f"\n⏱️  TOTAL TEST DURATION: {total_duration:.2f}s")

    def save_results(self, results, output_file="ocr_performance_results.json"):
        """Save test results to JSON file, plus a CSV next to it

        The CSV path is derived from ``output_file`` by swapping its suffix
        for ``.csv``, so separate runs with different outputs do not
        overwrite each other's CSV.

        Args:
            results: List of metric dicts
            output_file: Destination path of the JSON file
        """
        # Add timestamp
        results_data = {
            "timestamp": datetime.now().isoformat(),
            "base_url": self.base_url,
            "results": results
        }

        with open(output_file, 'w', encoding="utf-8") as f:
            json.dump(results_data, f, indent=2, default=str)

        print(f"\n💾 Results saved to: {output_file}")

        # Also save as CSV for easier analysis
        json_path = Path(output_file)
        csv_path = json_path.with_suffix(".csv")
        if csv_path == json_path:
            # The JSON output itself ends in .csv; avoid clobbering it.
            csv_path = json_path.with_name(json_path.stem + "_table.csv")
        self.save_results_csv(results, str(csv_path))

    def save_results_csv(self, results, output_file="ocr_performance_results.csv"):
        """Save test results to CSV file

        Nested dict/list values are stored as their string representation.
        Nothing is written when ``results`` is empty.

        Args:
            results: List of metric dicts
            output_file: Destination path of the CSV file
        """
        if not results:
            return

        # Flatten results for CSV
        flat_results = [
            {
                key: str(value) if isinstance(value, (dict, list)) else value
                for key, value in result.items()
            }
            for result in results
        ]

        df = pd.DataFrame(flat_results)
        df.to_csv(output_file, index=False)
        print(f"📊 CSV results saved to: {output_file}")

    def cleanup(self):
        """Quit the browser if one was started; safe to call repeatedly."""
        if not self.driver:
            return
        try:
            self.driver.quit()
            print("\n🧹 Browser closed")
        finally:
            # Drop the reference even if quit() raised so a second call
            # does not try to quit a dead session.
            self.driver = None

def main():
    """Parse command-line arguments, run the performance suite, and save results.

    Files that do not exist are reported and skipped. When none of the
    requested (or default) files exist, the function returns without
    starting a browser. The browser is always closed on exit, whether the
    run succeeded or raised.
    """
    import argparse
    import traceback

    parser = argparse.ArgumentParser(description="Selenium OCR Performance Test")
    parser.add_argument("--url", default="http://localhost:3015", help="Base URL of LightRAG web UI")
    parser.add_argument("--headless", action="store_true", help="Run browser in headless mode")
    parser.add_argument("--files", nargs="+", help="Test files to upload")
    parser.add_argument("--output", default="ocr_performance_results.json", help="Output file for results")

    args = parser.parse_args()

    # Default test files if none provided
    test_files = args.files or [
        "ocr.pdf",  # OCR test PDF
        "test_meaningful.pdf",  # Text-based PDF
        "test.docx"  # Word document
    ]

    # Filter to existing files, telling the user which ones were dropped
    # instead of discarding them silently.
    existing_files = []
    for candidate in test_files:
        if os.path.exists(candidate):
            existing_files.append(candidate)
        else:
            print(f"⚠️  Skipping missing file: {candidate}")

    if not existing_files:
        print("❌ No test files found. Please provide valid file paths.")
        return

    print(f"📁 Test files: {existing_files}")

    # Create tester instance
    tester = OCRPerformanceTester(base_url=args.url, headless=args.headless)

    try:
        # Setup and run tests
        tester.setup_driver()
        results = tester.run_performance_test_suite(existing_files)

        # Save results
        if results:
            tester.save_results(results, args.output)
        else:
            print("❌ No results collected")

    except Exception as e:
        print(f"❌ Test execution failed: {e}")
        traceback.print_exc()

    finally:
        tester.cleanup()

# Run the test suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()