#!/usr/bin/env python3
"""
OCR Performance Benchmark

Benchmarks OCR processing performance before and after optimizations.
"""

import time
import os
import sys
import json
from datetime import datetime
from pathlib import Path
import statistics
from typing import List, Dict, Any

# Add parent directory to path to import LightRAG modules
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "LightRAG-main"))


class OCRPerformanceBenchmark:
    """Benchmark OCR processing performance"""

    def __init__(self):
        self.results = []
        self.benchmark_start = None

    def benchmark_original_ocr(self, image_paths: List[str], iterations: int = 3):
        """Benchmark the original OCR implementation"""
        print("šŸ” BENCHMARKING ORIGINAL OCR IMPLEMENTATION")
        print("=" * 60)

        try:
            # Try to import the original document processor
            from lightrag.document_processor import DocumentProcessor
            processor = DocumentProcessor()

            results = []
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    print(f"āš ļø Skipping missing image: {image_path}")
                    continue

                print(f"\nšŸ“Š Testing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")

                # Warm up
                print("   Warming up...")
                try:
                    for _ in range(2):
                        result = processor.extract_text_from_image(image_path)
                except Exception as e:
                    print(f"   āš ļø Warm-up failed: {e}")
                    continue

                # Run benchmark
                times = []
                for iter_num in range(iterations):
                    start_time = time.time()
                    try:
                        result = processor.extract_text_from_image(image_path)
                        elapsed = time.time() - start_time
                        times.append(elapsed)
                        print(f"   Iteration {iter_num+1}: {elapsed:.3f}s, {len(result.get('text', ''))} chars")
                    except Exception as e:
                        print(f"   āŒ Iteration {iter_num+1} failed: {e}")
                        continue

                if times:
                    avg_time = statistics.mean(times)
                    min_time = min(times)
                    max_time = max(times)

                    result_data = {
                        "implementation": "original",
                        "image": os.path.basename(image_path),
                        "iterations": len(times),
                        "avg_time": avg_time,
                        "min_time": min_time,
                        "max_time": max_time,
                        "times": times,
                    }
                    results.append(result_data)

                    print(f"   šŸ“ˆ Average: {avg_time:.3f}s, Min: {min_time:.3f}s, Max: {max_time:.3f}s")

            return results

        except ImportError as e:
            print(f"āŒ Could not import original DocumentProcessor: {e}")
            return []
        except Exception as e:
            print(f"āŒ Benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def benchmark_optimized_ocr(self, image_paths: List[str], iterations: int = 3):
        """Benchmark the optimized OCR implementation (single-image and batch)"""
        print("\nšŸ” BENCHMARKING OPTIMIZED OCR IMPLEMENTATION")
        print("=" * 60)

        try:
            # Try to import the optimized OCR processor
            from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
            processor = OptimizedOCRProcessor()

            results = []

            # Test single image processing
            print("\nšŸ“Š SINGLE IMAGE PROCESSING")
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    print(f"āš ļø Skipping missing image: {image_path}")
                    continue

                print(f"\n   Testing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")

                # Warm up
                print("   Warming up...")
                try:
                    for _ in range(2):
                        result = processor.extract_text_from_image(image_path)
                except Exception as e:
                    print(f"   āš ļø Warm-up failed: {e}")
                    continue

                # Run benchmark
                times = []
                for iter_num in range(iterations):
                    start_time = time.time()
                    try:
                        result = processor.extract_text_from_image(image_path)
                        elapsed = time.time() - start_time
                        times.append(elapsed)
                        print(f"      Iteration {iter_num+1}: {elapsed:.3f}s, {len(result.get('text', ''))} chars")
                    except Exception as e:
                        print(f"      āŒ Iteration {iter_num+1} failed: {e}")
                        continue

                if times:
                    avg_time = statistics.mean(times)
                    min_time = min(times)
                    max_time = max(times)

                    result_data = {
                        "implementation": "optimized_single",
                        "image": os.path.basename(image_path),
                        "iterations": len(times),
                        "avg_time": avg_time,
                        "min_time": min_time,
                        "max_time": max_time,
                        "times": times,
                    }
                    results.append(result_data)

                    print(f"   šŸ“ˆ Average: {avg_time:.3f}s, Min: {min_time:.3f}s, Max: {max_time:.3f}s")

            # Test batch processing
            print("\nšŸ“Š BATCH PROCESSING")
            if len(image_paths) >= 2:
                # Create batches
                batch_sizes = [2, 3, 5]
                for batch_size in batch_sizes:
                    if batch_size > len(image_paths):
                        continue

                    batch_images = image_paths[:batch_size]
                    batch_names = [os.path.basename(img) for img in batch_images]

                    print(f"\n   Testing batch of {batch_size} images: {', '.join(batch_names)}")

                    # Warm up
                    print("   Warming up...")
                    try:
                        for _ in range(2):
                            result = processor.extract_text_from_images_batch(batch_images)
                    except Exception as e:
                        print(f"   āš ļø Batch warm-up failed: {e}")
                        continue

                    # Run benchmark
                    times = []
                    for iter_num in range(iterations):
                        start_time = time.time()
                        try:
                            results_list = processor.extract_text_from_images_batch(batch_images)
                            elapsed = time.time() - start_time
                            times.append(elapsed)
                            total_chars = sum(len(r.get('text', '')) for r in results_list)
                            print(f"      Iteration {iter_num+1}: {elapsed:.3f}s, {total_chars} total chars")
                        except Exception as e:
                            print(f"      āŒ Iteration {iter_num+1} failed: {e}")
                            continue

                    if times:
                        avg_time = statistics.mean(times)
                        min_time = min(times)
                        max_time = max(times)

                        # Calculate per-image time
                        per_image_time = avg_time / batch_size

                        result_data = {
                            "implementation": "optimized_batch",
                            "batch_size": batch_size,
                            "images": batch_names,
                            "iterations": len(times),
                            "avg_time": avg_time,
                            "per_image_time": per_image_time,
                            "min_time": min_time,
                            "max_time": max_time,
                            "times": times,
                        }
                        results.append(result_data)

                        print(f"   šŸ“ˆ Batch average: {avg_time:.3f}s ({per_image_time:.3f}s per image)")

            return results

        except ImportError as e:
            print(f"āŒ Could not import OptimizedOCRProcessor: {e}")
            return []
        except Exception as e:
            print(f"āŒ Benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []
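
    # Note: benchmark_pdf_processing (below) imports the optional pdf2image
    # package (which requires poppler) only for the warm-up page render; if that
    # import fails, the warm-up is skipped but the timed runs still execute.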
    def benchmark_pdf_processing(self, pdf_path: str, iterations: int = 2):
        """Benchmark PDF processing performance"""
        print("\nšŸ” BENCHMARKING PDF PROCESSING")
        print("=" * 60)

        if not os.path.exists(pdf_path):
            print(f"āŒ PDF not found: {pdf_path}")
            return []

        pdf_name = os.path.basename(pdf_path)
        print(f"šŸ“Š Testing PDF: {pdf_name}")

        results = []

        try:
            # Try to import the optimized document processor
            from lightrag.optimized_document_processor import OptimizedDocumentProcessor
            processor = OptimizedDocumentProcessor()

            # Warm up
            print("   Warming up...")
            try:
                # Render the first page to warm up the OCR pipeline
                from pdf2image import convert_from_path
                images = convert_from_path(pdf_path, first_page=1, last_page=1)
                if images:
                    temp_path = "temp_warmup.png"
                    images[0].save(temp_path)
                    processor.extract_text_from_image(temp_path)
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
            except Exception as e:
                print(f"   āš ļø Warm-up failed: {e}")

            # Run benchmark
            times = []
            for iter_num in range(iterations):
                print(f"   Iteration {iter_num+1}/{iterations}...")
                start_time = time.time()
                try:
                    # Process the PDF
                    result = processor.process_document(pdf_path)
                    elapsed = time.time() - start_time
                    times.append(elapsed)

                    # Extract metrics
                    pages = result.get('pages', [])
                    total_chars = sum(len(p.get('text', '')) for p in pages)
                    print(f"      Time: {elapsed:.3f}s, Pages: {len(pages)}, Chars: {total_chars}")
                except Exception as e:
                    print(f"      āŒ Iteration {iter_num+1} failed: {e}")
                    continue

            if times:
                avg_time = statistics.mean(times)
                min_time = min(times)
                max_time = max(times)

                result_data = {
                    "implementation": "pdf_processing",
                    "pdf": pdf_name,
                    "iterations": len(times),
                    "avg_time": avg_time,
                    "min_time": min_time,
                    "max_time": max_time,
                    "times": times,
                }
                results.append(result_data)

                print(f"   šŸ“ˆ PDF processing average: {avg_time:.3f}s")

            return results

        except ImportError as e:
            print(f"āŒ Could not import OptimizedDocumentProcessor: {e}")
            return []
        except Exception as e:
            print(f"āŒ PDF benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def run_comprehensive_benchmark(self, test_images: List[str], test_pdf: str = None,
                                    iterations: int = 3,
                                    output_file: str = "ocr_benchmark_results.json"):
        """Run the comprehensive benchmark suite and persist the results"""
        print("šŸš€ STARTING COMPREHENSIVE OCR PERFORMANCE BENCHMARK")
        print("=" * 60)

        self.benchmark_start = time.time()
        all_results = []

        # Benchmark original OCR
        original_results = self.benchmark_original_ocr(test_images, iterations=iterations)
        all_results.extend(original_results)

        # Benchmark optimized OCR
        optimized_results = self.benchmark_optimized_ocr(test_images, iterations=iterations)
        all_results.extend(optimized_results)

        # Benchmark PDF processing if provided (keeps its own lighter iteration default)
        if test_pdf and os.path.exists(test_pdf):
            pdf_results = self.benchmark_pdf_processing(test_pdf)
            all_results.extend(pdf_results)

        # Calculate summary
        self.calculate_summary(all_results)

        # Save results
        self.save_results(all_results, output_file=output_file)

        return all_results

    def calculate_summary(self, results: List[Dict[str, Any]]):
        """Calculate and display summary statistics"""
        print("\n" + "=" * 60)
        print("šŸ“Š BENCHMARK SUMMARY")
        print("=" * 60)

        # Group by implementation
        implementations = {}
        for result in results:
            impl = result.get('implementation', 'unknown')
            if impl not in implementations:
                implementations[impl] = []
            implementations[impl].append(result)

        # Calculate statistics for each implementation
        for impl, impl_results in implementations.items():
            print(f"\nšŸ“ˆ {impl.upper().replace('_', ' ')}:")

            # Extract times
            all_times = []
            for result in impl_results:
                if 'avg_time' in result:
                    all_times.append(result['avg_time'])

            if all_times:
                avg = statistics.mean(all_times)
                min_val = min(all_times)
                max_val = max(all_times)
                count = len(all_times)

                print(f"   Tests: {count}, Avg: {avg:.3f}s, Min: {min_val:.3f}s, Max: {max_val:.3f}s")

                # Special handling for batch processing
                if impl == 'optimized_batch':
                    per_image_times = [r.get('per_image_time', 0) for r in impl_results if 'per_image_time' in r]
                    if per_image_times:
                        avg_per_image = statistics.mean(per_image_times)
                        print(f"   Per-image avg: {avg_per_image:.3f}s")

        # Compare original vs optimized single-image processing
        original_times = [r['avg_time'] for r in results
                          if r.get('implementation') == 'original' and 'avg_time' in r]
        optimized_single_times = [r['avg_time'] for r in results
                                  if r.get('implementation') == 'optimized_single' and 'avg_time' in r]

        if original_times and optimized_single_times:
            orig_avg = statistics.mean(original_times)
            opt_avg = statistics.mean(optimized_single_times)

            if orig_avg > 0:
                improvement = ((orig_avg - opt_avg) / orig_avg) * 100
                print("\nšŸŽÆ PERFORMANCE IMPROVEMENT:")
                print(f"   Original: {orig_avg:.3f}s per image")
                print(f"   Optimized: {opt_avg:.3f}s per image")
                print(f"   Improvement: {improvement:.1f}% faster")

        # Total benchmark duration
        if self.benchmark_start:
            total_duration = time.time() - self.benchmark_start
            print(f"\nā±ļø TOTAL BENCHMARK DURATION: {total_duration:.2f}s")
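
    # Persistence helpers: the JSON file keeps the full per-iteration timing
    # lists, while the CSV keeps only the per-test summary columns.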
    def save_results(self, results: List[Dict[str, Any]], output_file: str = "ocr_benchmark_results.json"):
        """Save benchmark results to a JSON file"""
        # Add metadata
        results_data = {
            "timestamp": datetime.now().isoformat(),
            "system": {
                "platform": sys.platform,
                "python_version": sys.version,
            },
            "results": results,
        }

        with open(output_file, 'w') as f:
            json.dump(results_data, f, indent=2, default=str)

        print(f"\nšŸ’¾ Results saved to: {output_file}")

        # Also save summary CSV
        self.save_results_csv(results, "ocr_benchmark_summary.csv")

    def save_results_csv(self, results: List[Dict[str, Any]], output_file: str = "ocr_benchmark_summary.csv"):
        """Save a benchmark summary to a CSV file"""
        import csv

        # Prepare CSV data
        csv_data = []
        for result in results:
            row = {
                'implementation': result.get('implementation', ''),
                'test_type': result.get('image', result.get('pdf', result.get('batch_size', ''))),
                'iterations': result.get('iterations', 0),
                'avg_time': result.get('avg_time', 0),
                'min_time': result.get('min_time', 0),
                'max_time': result.get('max_time', 0),
            }

            # Add batch-specific fields
            if result.get('implementation') == 'optimized_batch':
                row['batch_size'] = result.get('batch_size', '')
                row['per_image_time'] = result.get('per_image_time', 0)

            csv_data.append(row)

        # Write CSV
        if csv_data:
            with open(output_file, 'w', newline='') as f:
                fieldnames = ['implementation', 'test_type', 'iterations', 'avg_time',
                              'min_time', 'max_time', 'batch_size', 'per_image_time']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(csv_data)

            print(f"šŸ“Š CSV summary saved to: {output_file}")


def main():
    """Main benchmark execution"""
    import argparse

    parser = argparse.ArgumentParser(description="OCR Performance Benchmark")
    parser.add_argument("--images", nargs="+", help="Test image files")
    parser.add_argument("--pdf", help="Test PDF file")
    parser.add_argument("--iterations", type=int, default=3, help="Iterations per test")
    parser.add_argument("--output", default="ocr_benchmark_results.json", help="Output file")

    args = parser.parse_args()

    # Default test images if none provided
    default_images = [
        "ocr_high_res.png",
        "ocr_page1_rendered.png",
        "ocr_page1_preview.png",
    ]

    test_images = args.images or [img for img in default_images if os.path.exists(img)]

    if not test_images:
        print("āŒ No test images found. Please provide valid image paths.")
        return

    print(f"šŸ“ Test images: {test_images}")
    if args.pdf and os.path.exists(args.pdf):
        print(f"šŸ“„ Test PDF: {args.pdf}")

    # Create benchmark instance and run
    benchmark = OCRPerformanceBenchmark()
    benchmark.run_comprehensive_benchmark(
        test_images=test_images,
        test_pdf=args.pdf,
        iterations=args.iterations,
        output_file=args.output,
    )

    print("\nāœ… Benchmark completed!")


if __name__ == "__main__":
    main()
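
# Example invocation (the script filename and test file paths are assumed here;
# adjust them to the actual file names):
#   python ocr_performance_benchmark.py --images ocr_high_res.png ocr_page1_rendered.png \
#       --pdf sample.pdf --iterations 5 --output my_results.json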