#!/usr/bin/env python3
"""
|
|
Selenium OCR Performance Test
|
|
Tests OCR performance through the web UI to measure end-to-end performance
|
|
"""
|
|
|
|
import json
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class OCRPerformanceTester:
    """Selenium-based OCR performance tester.

    Drives a Chrome browser against the web UI at ``base_url`` to upload
    documents and run searches, recording wall-clock timings for each step
    as plain dictionaries that can be summarised and saved as JSON/CSV.
    """

    # Matches "<N> page", "<N> pages", "<N> Pages", ... in on-screen text.
    _PAGE_COUNT_RE = re.compile(r"(\d+)\s+page", re.IGNORECASE)

    def __init__(self, base_url="http://localhost:3015", headless=False):
        """
        Initialize the performance tester

        Args:
            base_url: Base URL of the LightRAG web UI
            headless: Run browser in headless mode
        """
        self.base_url = base_url
        self.headless = headless
        self.driver = None  # created by setup_driver()
        self.results = []  # metrics of the most recent suite run
        self.test_start_time = None  # set when a suite run starts

    def setup_driver(self):
        """Create the Chrome WebDriver and store it on ``self.driver``."""
        chrome_options = Options()
        if self.headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")

        self.driver = webdriver.Chrome(options=chrome_options)
        # NOTE(review): Selenium's documentation advises against mixing
        # implicit and explicit waits; the WebDriverWait calls in this class
        # may therefore wait longer than their nominal timeout - confirm
        # whether this implicit wait is still wanted.
        self.driver.implicitly_wait(10)

    def login(self, username="admin", password="admin"):
        """Login to the web UI.

        Args:
            username: Value typed into the ``username`` form field
            password: Value typed into the ``password`` form field

        Returns:
            bool: True when the dashboard heading appeared, or when a wait
            timed out (treated as "already logged in / no auth required");
            False on any other error.
        """
        print(f"🔐 Logging in to {self.base_url}")
        self.driver.get(f"{self.base_url}/login")

        try:
            # Wait for login form
            username_field = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.NAME, "username"))
            )
            password_field = self.driver.find_element(By.NAME, "password")
            login_button = self.driver.find_element(
                By.XPATH, "//button[contains(text(), 'Login')]"
            )

            # Fill credentials
            username_field.send_keys(username)
            password_field.send_keys(password)
            login_button.click()

            # Wait for login to complete
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//h1[contains(text(), 'Dashboard')]")
                )
            )
            print("✅ Login successful")
            return True

        except TimeoutException:
            # Deliberate best-effort: a missing form or dashboard heading is
            # not treated as a failure.
            print("⚠️ Login timeout - assuming already logged in or no auth required")
            return True
        except Exception as e:
            print(f"❌ Login failed: {e}")
            return False

    @classmethod
    def _parse_page_count(cls, text):
        """Return the integer N from the first "<N> page(s)" in *text*, else None."""
        match = cls._PAGE_COUNT_RE.search(text)
        return int(match.group(1)) if match else None

    def _read_page_count(self):
        """Best-effort scan of the current page for a "<N> page(s)" figure.

        Returns:
            int | None: The first page count found, or None when nothing
            matched or the page could not be inspected.
        """
        try:
            candidates = self.driver.find_elements(
                By.XPATH, "//*[contains(text(), 'page') or contains(text(), 'Page')]"
            )
            for element in candidates:
                pages = self._parse_page_count(element.text)
                if pages is not None:
                    return pages
        except Exception as e:
            # Page statistics are optional; never fail the upload over them.
            print(f" ⚠️ Could not read page count: {e}")
        return None

    def upload_document(self, file_path, document_type="pdf"):
        """
        Upload a document and measure upload/processing time

        Args:
            file_path: Path to document file
            document_type: Type of document (pdf, docx, etc.); only recorded
                in the returned metrics

        Returns:
            dict: Performance metrics, or None when ``file_path`` does not
            exist. On failure the dict carries an ``"error"`` entry and may
            lack the timing keys of the steps that were not reached.
        """
        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            return None

        file_name = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)

        print(f"📤 Uploading {file_name} ({file_size:,} bytes)")

        metrics = {
            "file_name": file_name,
            "file_size": file_size,
            "document_type": document_type,
            # Taken before navigation, so upload_time includes the page load.
            "upload_start": time.time(),
        }

        try:
            # Navigate to upload page
            self.driver.get(f"{self.base_url}/upload")

            # Wait for upload form
            file_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//input[@type='file']"))
            )

            # Upload file
            file_input.send_keys(os.path.abspath(file_path))
            metrics["upload_end"] = time.time()
            metrics["upload_time"] = metrics["upload_end"] - metrics["upload_start"]

            print(f" 📊 Upload time: {metrics['upload_time']:.2f}s")

            # The processing clock starts as soon as the file was handed over.
            processing_start = time.time()

            # Wait for a processing status element
            try:
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//*[contains(text(), 'Processing') or contains(text(), 'processing')]")
                    )
                )
            except TimeoutException:
                print("❌ Processing indicator not found")
                metrics["error"] = "Processing indicator not found"
                return metrics

            metrics["processing_start"] = processing_start

            # Wait for a success message (5 minute timeout for OCR)
            try:
                WebDriverWait(self.driver, 300).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//*[contains(text(), 'Success') or contains(text(), 'success') or contains(text(), 'Complete') or contains(text(), 'complete')]")
                    )
                )
            except TimeoutException:
                print("❌ Processing timeout (5 minutes)")
                metrics["error"] = "Processing timeout"
                return metrics

            processing_end = time.time()
            metrics["processing_end"] = processing_end
            metrics["processing_time"] = processing_end - processing_start

            print(f" 📊 Processing time: {metrics['processing_time']:.2f}s")

            # Try to get OCR-specific metrics
            pages = self._read_page_count()
            if pages is not None:
                metrics["pages_processed"] = pages

            # Total time
            metrics["total_time"] = metrics["upload_time"] + metrics["processing_time"]
            print(f" 📊 Total time: {metrics['total_time']:.2f}s")

            return metrics

        except Exception as e:
            print(f"❌ Upload failed: {e}")
            metrics["error"] = str(e)
            return metrics

    def search_document(self, query, expected_results=1):
        """
        Search for content in uploaded documents

        ``search_time`` spans from the start of the call until result
        elements are present on the page, or until the 10 second result
        wait gives up (in which case ``results_found`` is 0).

        Args:
            query: Search query
            expected_results: Expected number of results (accepted for
                interface compatibility; not currently checked)

        Returns:
            dict: Search performance metrics; carries an ``"error"`` entry
            when the search could not be performed.
        """
        print(f"🔍 Searching for: '{query}'")

        metrics = {
            "query": query,
            "search_start": time.time(),
        }

        try:
            # Navigate to search page
            self.driver.get(f"{self.base_url}/search")

            # Find search input
            search_input = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@type='search' or contains(@placeholder, 'Search')]")
                )
            )

            # Enter search query
            search_input.clear()
            search_input.send_keys(query)

            # Find and click search button
            search_button = self.driver.find_element(
                By.XPATH, "//button[contains(text(), 'Search') or contains(@class, 'search')]"
            )
            search_button.click()

            # Wait for results BEFORE stopping the clock, so the timing
            # covers the search itself and not just the button click.
            try:
                results = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, "//div[contains(@class, 'result') or contains(@class, 'card')]")
                    )
                )
            except TimeoutException:
                results = None

            search_end = time.time()
            metrics["search_end"] = search_end
            metrics["search_time"] = search_end - metrics["search_start"]

            print(f" 📊 Search time: {metrics['search_time']:.2f}s")

            if results is None:
                metrics["results_found"] = 0
                print(" ⚠️ No results found or timeout")
                return metrics

            # Count results
            metrics["results_found"] = len(results)
            print(f" 📊 Results found: {len(results)}")

            # Check if OCR content is in the first 3 results
            leading_texts = (result.text.lower() for result in results[:3])
            metrics["ocr_content_found"] = any(
                "ocr" in text or "scanned" in text for text in leading_texts
            )

            return metrics

        except Exception as e:
            print(f"❌ Search failed: {e}")
            metrics["error"] = str(e)
            return metrics

    @staticmethod
    def _infer_document_type(file_path, default="pdf"):
        """Return the lower-cased file extension without the dot, or *default*."""
        suffix = Path(file_path).suffix.lstrip(".").lower()
        return suffix or default

    def run_performance_test_suite(self, test_files):
        """
        Run comprehensive performance test suite

        Logs in, then for every existing file uploads it and runs a fixed
        set of search queries, and finally prints a summary.

        Args:
            test_files: List of test file paths

        Returns:
            list: All performance metrics (upload and search dictionaries in
            execution order); also stored on ``self.results``. Empty when
            login failed.
        """
        print("🚀 STARTING OCR PERFORMANCE TEST SUITE")
        print("=" * 60)

        self.test_start_time = time.time()
        all_metrics = []
        self.results = all_metrics

        # Login first
        if not self.login():
            print("❌ Cannot proceed without login")
            return all_metrics

        search_queries = ["text", "document", "content", "scanned"]

        # Test each file
        for file_path in test_files:
            if not os.path.exists(file_path):
                print(f"⚠️ Skipping missing file: {file_path}")
                continue

            print(f"\n📄 TESTING FILE: {os.path.basename(file_path)}")
            print("-" * 40)

            # Upload and process; record the real type instead of always "pdf".
            upload_metrics = self.upload_document(
                file_path, self._infer_document_type(file_path)
            )
            if upload_metrics:
                all_metrics.append(upload_metrics)

            # Wait a bit between tests
            time.sleep(2)

            # Search for OCR content
            for query in search_queries:
                search_metrics = self.search_document(query)
                if search_metrics:
                    search_metrics["test_file"] = os.path.basename(file_path)
                    all_metrics.append(search_metrics)
                time.sleep(1)

        # Calculate summary statistics
        self.calculate_summary(all_metrics)

        return all_metrics

    @staticmethod
    def _print_timing_stats(label, values):
        """Print average/min/max of *values* under *label*; no-op when empty."""
        if not values:
            return
        average = sum(values) / len(values)
        print(f" {label}: Avg={average:.2f}s, Min={min(values):.2f}s, Max={max(values):.2f}s")

    def calculate_summary(self, all_metrics):
        """Calculate and display summary statistics.

        Args:
            all_metrics: Metric dictionaries; entries with ``upload_time``
                count as uploads, entries with ``search_time`` as searches.
        """
        print("\n" + "=" * 60)
        print("📊 PERFORMANCE TEST SUMMARY")
        print("=" * 60)

        upload_metrics = [m for m in all_metrics if "upload_time" in m]
        search_metrics = [m for m in all_metrics if "search_time" in m]

        if upload_metrics:
            print(f"\n📤 UPLOAD & PROCESSING ({len(upload_metrics)} tests):")
            # Failed uploads may lack the later timings, hence the key filter.
            for label, key in (
                ("Upload Time", "upload_time"),
                ("Processing Time", "processing_time"),
                ("Total Time", "total_time"),
            ):
                self._print_timing_stats(
                    label, [m[key] for m in upload_metrics if key in m]
                )

        if search_metrics:
            print(f"\n🔍 SEARCH ({len(search_metrics)} tests):")
            self._print_timing_stats(
                "Search Time", [m["search_time"] for m in search_metrics]
            )

            ocr_found = sum(
                1 for m in search_metrics if m.get("ocr_content_found", False)
            )
            print(f" OCR Content Found: {ocr_found}/{len(search_metrics)} searches")

        # Overall test duration
        if self.test_start_time:
            total_duration = time.time() - self.test_start_time
            print(f"\n⏱️ TOTAL TEST DURATION: {total_duration:.2f}s")

    def save_results(self, results, output_file="ocr_performance_results.json"):
        """Save test results to a JSON file and a sibling CSV file.

        The CSV is written next to ``output_file`` with the extension
        replaced by ``.csv`` (for the default name this is
        ``ocr_performance_results.csv``).

        Args:
            results: List of metric dictionaries
            output_file: Path of the JSON file to write
        """
        # Add timestamp
        results_data = {
            "timestamp": datetime.now().isoformat(),
            "base_url": self.base_url,
            "results": results,
        }

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results_data, f, indent=2, default=str)

        print(f"\n💾 Results saved to: {output_file}")

        # Also save as CSV for easier analysis
        json_path = Path(output_file)
        csv_path = json_path.with_suffix(".csv")
        if csv_path == json_path:
            # output_file already ends in .csv: do not overwrite the JSON.
            csv_path = Path(f"{output_file}.csv")
        self.save_results_csv(results, str(csv_path))

    def save_results_csv(self, results, output_file="ocr_performance_results.csv"):
        """Save test results to CSV file; does nothing when *results* is empty.

        Args:
            results: List of metric dictionaries
            output_file: Path of the CSV file to write
        """
        if not results:
            return

        # Flatten results for CSV: nested containers become their str() form.
        flat_results = [
            {
                key: str(value) if isinstance(value, (dict, list)) else value
                for key, value in result.items()
            }
            for result in results
        ]

        df = pd.DataFrame(flat_results)
        df.to_csv(output_file, index=False)
        print(f"📊 CSV results saved to: {output_file}")

    def cleanup(self):
        """Quit the browser, if one was started; safe to call repeatedly."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the handle so a second call does not quit twice.
                self.driver = None
            print("\n🧹 Browser closed")


def main():
    """Main test execution.

    Parses the command line, runs the performance suite against the files
    that exist, and saves the collected results.

    Returns:
        int: Process exit status - 0 when results were collected and saved,
        1 when no test file exists, no results were collected, or the run
        raised an exception.
    """
    import argparse
    import traceback

    parser = argparse.ArgumentParser(description="Selenium OCR Performance Test")
    parser.add_argument("--url", default="http://localhost:3015", help="Base URL of LightRAG web UI")
    parser.add_argument("--headless", action="store_true", help="Run browser in headless mode")
    parser.add_argument("--files", nargs="+", help="Test files to upload")
    parser.add_argument("--output", default="ocr_performance_results.json", help="Output file for results")

    args = parser.parse_args()

    # Default test files if none provided
    test_files = args.files or [
        "ocr.pdf",  # OCR test PDF
        "test_meaningful.pdf",  # Text-based PDF
        "test.docx",  # Word document
    ]

    # Filter to existing files
    existing_files = [f for f in test_files if os.path.exists(f)]
    if not existing_files:
        print("❌ No test files found. Please provide valid file paths.")
        return 1

    print(f"📁 Test files: {existing_files}")

    # Create tester instance
    tester = OCRPerformanceTester(base_url=args.url, headless=args.headless)

    exit_code = 0
    try:
        # Setup and run tests
        tester.setup_driver()
        results = tester.run_performance_test_suite(existing_files)

        # Save results
        if results:
            tester.save_results(results, args.output)
        else:
            print("❌ No results collected")
            exit_code = 1

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit status.
        print(f"❌ Test execution failed: {e}")
        traceback.print_exc()
        exit_code = 1

    finally:
        tester.cleanup()

    return exit_code


if __name__ == "__main__":
    # Propagate failures to the shell / CI instead of always exiting 0.
    sys.exit(main())