#!/usr/bin/env python3
"""
OCR Performance Benchmark
Benchmarks OCR processing performance before and after optimizations
"""
import json
import os
import statistics
import sys
import time
from datetime import datetime
from typing import Any, Dict, List

# Add parent directory to path to import LightRAG modules
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "LightRAG-main"))
class OCRPerformanceBenchmark:
"""Benchmark OCR processing performance"""
def __init__(self):
self.results = []
self.benchmark_start = None

    def benchmark_original_ocr(self, image_paths: List[str], iterations: int = 3):
        """Benchmark original OCR implementation"""
        print("🔍 BENCHMARKING ORIGINAL OCR IMPLEMENTATION")
        print("=" * 60)
        try:
            # Try to import original document processor
            from lightrag.document_processor import DocumentProcessor

            processor = DocumentProcessor()
            results = []
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    print(f"⚠️ Skipping missing image: {image_path}")
                    continue
                print(f"\n📊 Testing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")

                # Warm up
                print(" Warming up...")
                try:
                    for _ in range(2):
                        result = processor.extract_text_from_image(image_path)
                except Exception as e:
                    print(f" ⚠️ Warm-up failed: {e}")
                    continue

                # Run benchmark
                times = []
                for iter_num in range(iterations):
                    start_time = time.time()
                    try:
                        result = processor.extract_text_from_image(image_path)
                        elapsed = time.time() - start_time
                        times.append(elapsed)
                        print(f" Iteration {iter_num+1}: {elapsed:.3f}s, {len(result.get('text', ''))} chars")
                    except Exception as e:
                        print(f" ❌ Iteration {iter_num+1} failed: {e}")
                        continue

                if times:
                    avg_time = statistics.mean(times)
                    min_time = min(times)
                    max_time = max(times)
                    result_data = {
                        "implementation": "original",
                        "image": os.path.basename(image_path),
                        "iterations": len(times),
                        "avg_time": avg_time,
                        "min_time": min_time,
                        "max_time": max_time,
                        "times": times
                    }
                    results.append(result_data)
                    print(f" 📈 Average: {avg_time:.3f}s, Min: {min_time:.3f}s, Max: {max_time:.3f}s")

            return results
        except ImportError as e:
            print(f"❌ Could not import original DocumentProcessor: {e}")
            return []
        except Exception as e:
            print(f"❌ Benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def benchmark_optimized_ocr(self, image_paths: List[str], iterations: int = 3):
        """Benchmark optimized OCR implementation"""
        print("\n🔍 BENCHMARKING OPTIMIZED OCR IMPLEMENTATION")
        print("=" * 60)
        try:
            # Try to import optimized OCR processor
            from lightrag.optimized_ocr_processor import OptimizedOCRProcessor

            processor = OptimizedOCRProcessor()
            results = []

            # Test single image processing
            print("\n📊 SINGLE IMAGE PROCESSING")
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    print(f"⚠️ Skipping missing image: {image_path}")
                    continue
                print(f"\n Testing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")

                # Warm up
                print(" Warming up...")
                try:
                    for _ in range(2):
                        result = processor.extract_text_from_image(image_path)
                except Exception as e:
                    print(f" ⚠️ Warm-up failed: {e}")
                    continue

                # Run benchmark
                times = []
                for iter_num in range(iterations):
                    start_time = time.time()
                    try:
                        result = processor.extract_text_from_image(image_path)
                        elapsed = time.time() - start_time
                        times.append(elapsed)
                        print(f" Iteration {iter_num+1}: {elapsed:.3f}s, {len(result.get('text', ''))} chars")
                    except Exception as e:
                        print(f" ❌ Iteration {iter_num+1} failed: {e}")
                        continue

                if times:
                    avg_time = statistics.mean(times)
                    min_time = min(times)
                    max_time = max(times)
                    result_data = {
                        "implementation": "optimized_single",
                        "image": os.path.basename(image_path),
                        "iterations": len(times),
                        "avg_time": avg_time,
                        "min_time": min_time,
                        "max_time": max_time,
                        "times": times
                    }
                    results.append(result_data)
                    print(f" 📈 Average: {avg_time:.3f}s, Min: {min_time:.3f}s, Max: {max_time:.3f}s")

            # Test batch processing
            print("\n📊 BATCH PROCESSING")
            if len(image_paths) >= 2:
                # Create batches
                batch_sizes = [2, 3, 5]
                for batch_size in batch_sizes:
                    if batch_size > len(image_paths):
                        continue
                    batch_images = image_paths[:batch_size]
                    batch_names = [os.path.basename(img) for img in batch_images]
                    print(f"\n Testing batch of {batch_size} images: {', '.join(batch_names)}")

                    # Warm up
                    print(" Warming up...")
                    try:
                        for _ in range(2):
                            result = processor.extract_text_from_images_batch(batch_images)
                    except Exception as e:
                        print(f" ⚠️ Batch warm-up failed: {e}")
                        continue

                    # Run benchmark
                    times = []
                    for iter_num in range(iterations):
                        start_time = time.time()
                        try:
                            results_list = processor.extract_text_from_images_batch(batch_images)
                            elapsed = time.time() - start_time
                            times.append(elapsed)
                            total_chars = sum(len(r.get('text', '')) for r in results_list)
                            print(f" Iteration {iter_num+1}: {elapsed:.3f}s, {total_chars} total chars")
                        except Exception as e:
                            print(f" ❌ Iteration {iter_num+1} failed: {e}")
                            continue

                    if times:
                        avg_time = statistics.mean(times)
                        min_time = min(times)
                        max_time = max(times)
                        # Calculate per-image time
                        per_image_time = avg_time / batch_size
                        result_data = {
                            "implementation": "optimized_batch",
                            "batch_size": batch_size,
                            "images": batch_names,
                            "iterations": len(times),
                            "avg_time": avg_time,
                            "per_image_time": per_image_time,
                            "min_time": min_time,
                            "max_time": max_time,
                            "times": times
                        }
                        results.append(result_data)
                        print(f" 📈 Batch average: {avg_time:.3f}s ({per_image_time:.3f}s per image)")

            return results
        except ImportError as e:
            print(f"❌ Could not import OptimizedOCRProcessor: {e}")
            return []
        except Exception as e:
            print(f"❌ Benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def benchmark_pdf_processing(self, pdf_path: str, iterations: int = 2):
        """Benchmark PDF processing performance"""
        print("\n🔍 BENCHMARKING PDF PROCESSING")
        print("=" * 60)
        if not os.path.exists(pdf_path):
            print(f"❌ PDF not found: {pdf_path}")
            return []
        pdf_name = os.path.basename(pdf_path)
        print(f"📊 Testing PDF: {pdf_name}")
        results = []
        try:
            # Try to import optimized document processor
            from lightrag.optimized_document_processor import OptimizedDocumentProcessor

            processor = OptimizedDocumentProcessor()

            # Warm up
            print(" Warming up...")
            try:
                # Extract first page to warm up
                from pdf2image import convert_from_path
                images = convert_from_path(pdf_path, first_page=1, last_page=1)
                if images:
                    temp_path = "temp_warmup.png"
                    images[0].save(temp_path)
                    processor.extract_text_from_image(temp_path)
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
            except Exception as e:
                print(f" ⚠️ Warm-up failed: {e}")

            # Run benchmark
            times = []
            for iter_num in range(iterations):
                print(f" Iteration {iter_num+1}/{iterations}...")
                start_time = time.time()
                try:
                    # Process the PDF
                    result = processor.process_document(pdf_path)
                    elapsed = time.time() - start_time
                    times.append(elapsed)
                    # Extract metrics
                    pages = result.get('pages', [])
                    total_chars = sum(len(p.get('text', '')) for p in pages)
                    print(f" Time: {elapsed:.3f}s, Pages: {len(pages)}, Chars: {total_chars}")
                except Exception as e:
                    print(f" ❌ Iteration {iter_num+1} failed: {e}")
                    continue

            if times:
                avg_time = statistics.mean(times)
                min_time = min(times)
                max_time = max(times)
                result_data = {
                    "implementation": "pdf_processing",
                    "pdf": pdf_name,
                    "iterations": len(times),
                    "avg_time": avg_time,
                    "min_time": min_time,
                    "max_time": max_time,
                    "times": times
                }
                results.append(result_data)
                print(f" 📈 PDF processing average: {avg_time:.3f}s")

            return results
        except ImportError as e:
            print(f"❌ Could not import OptimizedDocumentProcessor: {e}")
            return []
        except Exception as e:
            print(f"❌ PDF benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def run_comprehensive_benchmark(self, test_images: List[str], test_pdf: str = None,
                                    iterations: int = 3,
                                    output_file: str = "ocr_benchmark_results.json"):
        """Run comprehensive benchmark suite"""
        print("🚀 STARTING COMPREHENSIVE OCR PERFORMANCE BENCHMARK")
        print("=" * 60)
        self.benchmark_start = time.time()
        all_results = []

        # Benchmark original OCR
        original_results = self.benchmark_original_ocr(test_images, iterations)
        all_results.extend(original_results)

        # Benchmark optimized OCR
        optimized_results = self.benchmark_optimized_ocr(test_images, iterations)
        all_results.extend(optimized_results)

        # Benchmark PDF processing if provided
        if test_pdf and os.path.exists(test_pdf):
            pdf_results = self.benchmark_pdf_processing(test_pdf)
            all_results.extend(pdf_results)

        # Calculate summary
        self.calculate_summary(all_results)

        # Save results
        self.save_results(all_results, output_file)
        return all_results

    def calculate_summary(self, results: List[Dict[str, Any]]):
        """Calculate and display summary statistics"""
        print("\n" + "=" * 60)
        print("📊 BENCHMARK SUMMARY")
        print("=" * 60)

        # Group by implementation
        implementations = {}
        for result in results:
            impl = result.get('implementation', 'unknown')
            if impl not in implementations:
                implementations[impl] = []
            implementations[impl].append(result)

        # Calculate statistics for each implementation
        for impl, impl_results in implementations.items():
            print(f"\n📈 {impl.upper().replace('_', ' ')}:")
            # Extract times
            all_times = []
            for result in impl_results:
                if 'avg_time' in result:
                    all_times.append(result['avg_time'])
            if all_times:
                avg = statistics.mean(all_times)
                min_val = min(all_times)
                max_val = max(all_times)
                count = len(all_times)
                print(f" Tests: {count}, Avg: {avg:.3f}s, Min: {min_val:.3f}s, Max: {max_val:.3f}s")
            # Special handling for batch processing
            if impl == 'optimized_batch':
                per_image_times = [r.get('per_image_time', 0) for r in impl_results if 'per_image_time' in r]
                if per_image_times:
                    avg_per_image = statistics.mean(per_image_times)
                    print(f" Per-image avg: {avg_per_image:.3f}s")

        # Compare original vs optimized single
        original_times = [r['avg_time'] for r in results if r.get('implementation') == 'original' and 'avg_time' in r]
        optimized_single_times = [r['avg_time'] for r in results if r.get('implementation') == 'optimized_single' and 'avg_time' in r]
        if original_times and optimized_single_times:
            orig_avg = statistics.mean(original_times)
            opt_avg = statistics.mean(optimized_single_times)
            if orig_avg > 0:
                improvement = ((orig_avg - opt_avg) / orig_avg) * 100
                print(f"\n🎯 PERFORMANCE IMPROVEMENT:")
                print(f" Original: {orig_avg:.3f}s per image")
                print(f" Optimized: {opt_avg:.3f}s per image")
                print(f" Improvement: {improvement:.1f}% faster")

        # Total benchmark duration
        if self.benchmark_start:
            total_duration = time.time() - self.benchmark_start
            print(f"\n⏱️ TOTAL BENCHMARK DURATION: {total_duration:.2f}s")

    def save_results(self, results: List[Dict[str, Any]], output_file: str = "ocr_benchmark_results.json"):
        """Save benchmark results to JSON file"""
        # Add metadata
        results_data = {
            "timestamp": datetime.now().isoformat(),
            "system": {
                "platform": sys.platform,
                "python_version": sys.version
            },
            "results": results
        }
        with open(output_file, 'w') as f:
            json.dump(results_data, f, indent=2, default=str)
        print(f"\n💾 Results saved to: {output_file}")

        # Also save summary CSV
        self.save_results_csv(results, "ocr_benchmark_summary.csv")

    def save_results_csv(self, results: List[Dict[str, Any]], output_file: str = "ocr_benchmark_summary.csv"):
        """Save benchmark summary to CSV file"""
        import csv

        # Prepare CSV data
        csv_data = []
        for result in results:
            row = {
                'implementation': result.get('implementation', ''),
                'test_type': result.get('image', result.get('pdf', result.get('batch_size', ''))),
                'iterations': result.get('iterations', 0),
                'avg_time': result.get('avg_time', 0),
                'min_time': result.get('min_time', 0),
                'max_time': result.get('max_time', 0),
            }
            # Add batch-specific fields
            if result.get('implementation') == 'optimized_batch':
                row['batch_size'] = result.get('batch_size', '')
                row['per_image_time'] = result.get('per_image_time', 0)
            csv_data.append(row)

        # Write CSV
        if csv_data:
            with open(output_file, 'w', newline='') as f:
                fieldnames = ['implementation', 'test_type', 'iterations', 'avg_time',
                              'min_time', 'max_time', 'batch_size', 'per_image_time']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(csv_data)
            print(f"📊 CSV summary saved to: {output_file}")


def main():
    """Main benchmark execution"""
    import argparse

    parser = argparse.ArgumentParser(description="OCR Performance Benchmark")
    parser.add_argument("--images", nargs="+", help="Test image files")
    parser.add_argument("--pdf", help="Test PDF file")
    parser.add_argument("--iterations", type=int, default=3, help="Iterations per test")
    parser.add_argument("--output", default="ocr_benchmark_results.json", help="Output file")
    args = parser.parse_args()

    # Default test images if none provided
    default_images = [
        "ocr_high_res.png",
        "ocr_page1_rendered.png",
        "ocr_page1_preview.png"
    ]
    test_images = args.images or [img for img in default_images if os.path.exists(img)]
    if not test_images:
        print("❌ No test images found. Please provide valid image paths.")
        return
    print(f"📁 Test images: {test_images}")
    if args.pdf and os.path.exists(args.pdf):
        print(f"📄 Test PDF: {args.pdf}")

    # Create benchmark instance and run
    benchmark = OCRPerformanceBenchmark()
    results = benchmark.run_comprehensive_benchmark(
        test_images=test_images,
        test_pdf=args.pdf,
        iterations=args.iterations,
        output_file=args.output
    )
    print("\n✅ Benchmark completed!")
if __name__ == "__main__":
main()