#!/usr/bin/env python3
"""
OCR Performance Benchmark

Benchmarks OCR processing performance before and after optimizations
"""

import time
import os
import sys
import json
from datetime import datetime
from pathlib import Path
import statistics
from typing import List, Dict, Any

# Add parent directory to path to import LightRAG modules
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "LightRAG-main"))
|
class OCRPerformanceBenchmark:
    """Benchmark OCR processing performance.

    Times the "original" and "optimized" OCR implementations on the same
    inputs and reports comparative statistics.  Heavy dependencies (the
    LightRAG processors, pdf2image) are imported lazily inside each
    benchmark method, so a missing dependency degrades that benchmark to
    an empty result list instead of crashing the whole run.
    """

    def __init__(self):
        # Accumulated result dicts (kept for external callers; the
        # benchmark methods return their results directly).
        self.results = []
        # perf_counter() timestamp set by run_comprehensive_benchmark;
        # None until a run starts.
        self.benchmark_start = None

    def _time_single_image(self, processor, image_path: str, iterations: int,
                           implementation: str):
        """Warm up, then time repeated single-image OCR calls.

        Shared timing core for the original and optimized single-image
        benchmarks (previously duplicated in both methods).

        Returns a stats dict (implementation, image, iterations,
        avg/min/max_time, raw times) or None when warm-up fails or every
        timed iteration raises.
        """
        print("   Warming up...")
        try:
            for _ in range(2):
                processor.extract_text_from_image(image_path)
        except Exception as e:
            print(f"   ⚠️ Warm-up failed: {e}")
            return None

        times = []
        for iter_num in range(iterations):
            # perf_counter() is monotonic and high-resolution -- the right
            # clock for interval measurement (time.time() can jump).
            start_time = time.perf_counter()
            try:
                result = processor.extract_text_from_image(image_path)
            except Exception as e:
                print(f"   ❌ Iteration {iter_num+1} failed: {e}")
                continue
            elapsed = time.perf_counter() - start_time
            times.append(elapsed)
            print(f"   Iteration {iter_num+1}: {elapsed:.3f}s, {len(result.get('text', ''))} chars")

        if not times:
            return None
        return {
            "implementation": implementation,
            "image": os.path.basename(image_path),
            "iterations": len(times),
            "avg_time": statistics.mean(times),
            "min_time": min(times),
            "max_time": max(times),
            "times": times,
        }

    def benchmark_original_ocr(self, image_paths: List[str], iterations: int = 3):
        """Benchmark the original OCR implementation on each image.

        Returns a list of per-image stats dicts; empty when the original
        DocumentProcessor cannot be imported or nothing could be timed.
        """
        print("🔍 BENCHMARKING ORIGINAL OCR IMPLEMENTATION")
        print("=" * 60)

        try:
            # Lazy import: keeps this script runnable even when LightRAG
            # is not installed.
            from lightrag.document_processor import DocumentProcessor
            processor = DocumentProcessor()

            results = []
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    print(f"⚠️ Skipping missing image: {image_path}")
                    continue

                print(f"\n📊 Testing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")
                data = self._time_single_image(processor, image_path, iterations, "original")
                if data:
                    results.append(data)
                    print(f"   📈 Average: {data['avg_time']:.3f}s, Min: {data['min_time']:.3f}s, Max: {data['max_time']:.3f}s")

            return results

        except ImportError as e:
            print(f"❌ Could not import original DocumentProcessor: {e}")
            return []
        except Exception as e:
            print(f"❌ Benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def benchmark_optimized_ocr(self, image_paths: List[str], iterations: int = 3):
        """Benchmark the optimized OCR implementation.

        Runs single-image timings for every image, then batch timings at
        several batch sizes (when at least two images are available).
        Returns a list of stats dicts; empty on import failure.
        """
        print("\n🔍 BENCHMARKING OPTIMIZED OCR IMPLEMENTATION")
        print("=" * 60)

        try:
            from lightrag.optimized_ocr_processor import OptimizedOCRProcessor
            processor = OptimizedOCRProcessor()

            results = []

            print("\n📊 SINGLE IMAGE PROCESSING")
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    print(f"⚠️ Skipping missing image: {image_path}")
                    continue

                print(f"\n   Testing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")
                data = self._time_single_image(processor, image_path, iterations, "optimized_single")
                if data:
                    results.append(data)
                    print(f"   📈 Average: {data['avg_time']:.3f}s, Min: {data['min_time']:.3f}s, Max: {data['max_time']:.3f}s")

            print("\n📊 BATCH PROCESSING")
            if len(image_paths) >= 2:
                results.extend(self._benchmark_batches(processor, image_paths, iterations))

            return results

        except ImportError as e:
            print(f"❌ Could not import OptimizedOCRProcessor: {e}")
            return []
        except Exception as e:
            print(f"❌ Benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def _benchmark_batches(self, processor, image_paths: List[str], iterations: int):
        """Time batch OCR at batch sizes 2, 3 and 5 (where possible).

        Returns a list of stats dicts including per-image amortized time.
        """
        results = []
        for batch_size in (2, 3, 5):
            if batch_size > len(image_paths):
                continue

            batch_images = image_paths[:batch_size]
            batch_names = [os.path.basename(img) for img in batch_images]
            print(f"\n   Testing batch of {batch_size} images: {', '.join(batch_names)}")

            print("   Warming up...")
            try:
                for _ in range(2):
                    processor.extract_text_from_images_batch(batch_images)
            except Exception as e:
                print(f"   ⚠️ Batch warm-up failed: {e}")
                continue

            times = []
            for iter_num in range(iterations):
                start_time = time.perf_counter()
                try:
                    batch_results = processor.extract_text_from_images_batch(batch_images)
                except Exception as e:
                    print(f"   ❌ Iteration {iter_num+1} failed: {e}")
                    continue
                elapsed = time.perf_counter() - start_time
                times.append(elapsed)
                total_chars = sum(len(r.get('text', '')) for r in batch_results)
                print(f"   Iteration {iter_num+1}: {elapsed:.3f}s, {total_chars} total chars")

            if not times:
                continue

            avg_time = statistics.mean(times)
            # Amortized cost per image -- the headline number for batching.
            per_image_time = avg_time / batch_size
            results.append({
                "implementation": "optimized_batch",
                "batch_size": batch_size,
                "images": batch_names,
                "iterations": len(times),
                "avg_time": avg_time,
                "per_image_time": per_image_time,
                "min_time": min(times),
                "max_time": max(times),
                "times": times,
            })
            print(f"   📈 Batch average: {avg_time:.3f}s ({per_image_time:.3f}s per image)")
        return results

    def benchmark_pdf_processing(self, pdf_path: str, iterations: int = 2):
        """Benchmark full-PDF processing with the optimized document processor.

        Returns a single-element list with the PDF stats dict, or an empty
        list when the PDF is missing, the processor cannot be imported, or
        every iteration fails.
        """
        print("\n🔍 BENCHMARKING PDF PROCESSING")
        print("=" * 60)

        if not os.path.exists(pdf_path):
            print(f"❌ PDF not found: {pdf_path}")
            return []

        pdf_name = os.path.basename(pdf_path)
        print(f"📊 Testing PDF: {pdf_name}")

        results = []

        try:
            from lightrag.optimized_document_processor import OptimizedDocumentProcessor
            processor = OptimizedDocumentProcessor()

            # Warm up: render just the first page and OCR it once to prime
            # caches / model loading before the timed runs.
            print("   Warming up...")
            temp_path = "temp_warmup.png"
            try:
                from pdf2image import convert_from_path
                images = convert_from_path(pdf_path, first_page=1, last_page=1)
                if images:
                    images[0].save(temp_path)
                    processor.extract_text_from_image(temp_path)
            except Exception as e:
                print(f"   ⚠️ Warm-up failed: {e}")
            finally:
                # Always remove the temp page, even if OCR raised between
                # save() and the old unconditional remove (previous leak).
                if os.path.exists(temp_path):
                    os.remove(temp_path)

            times = []
            for iter_num in range(iterations):
                print(f"   Iteration {iter_num+1}/{iterations}...")
                start_time = time.perf_counter()
                try:
                    result = processor.process_document(pdf_path)
                except Exception as e:
                    print(f"   ❌ Iteration {iter_num+1} failed: {e}")
                    continue
                elapsed = time.perf_counter() - start_time
                times.append(elapsed)

                pages = result.get('pages', [])
                total_chars = sum(len(p.get('text', '')) for p in pages)
                print(f"      Time: {elapsed:.3f}s, Pages: {len(pages)}, Chars: {total_chars}")

            if times:
                avg_time = statistics.mean(times)
                results.append({
                    "implementation": "pdf_processing",
                    "pdf": pdf_name,
                    "iterations": len(times),
                    "avg_time": avg_time,
                    "min_time": min(times),
                    "max_time": max(times),
                    "times": times,
                })
                print(f"   📈 PDF processing average: {avg_time:.3f}s")

            return results

        except ImportError as e:
            print(f"❌ Could not import OptimizedDocumentProcessor: {e}")
            return []
        except Exception as e:
            print(f"❌ PDF benchmark failed: {e}")
            import traceback
            traceback.print_exc()
            return []

    def run_comprehensive_benchmark(self, test_images: List[str], test_pdf: str = None,
                                    iterations: int = 3,
                                    output_file: str = "ocr_benchmark_results.json"):
        """Run the full benchmark suite and persist the results.

        Args:
            test_images: image paths to benchmark.
            test_pdf: optional PDF path; skipped when missing.
            iterations: iterations per image test (new, default matches the
                previous hard-coded value, so existing callers are unaffected).
            output_file: JSON results path forwarded to save_results()
                (new, default matches the previous behavior).

        Returns the combined list of result dicts.
        """
        print("🚀 STARTING COMPREHENSIVE OCR PERFORMANCE BENCHMARK")
        print("=" * 60)

        self.benchmark_start = time.perf_counter()
        all_results = []

        all_results.extend(self.benchmark_original_ocr(test_images, iterations))
        all_results.extend(self.benchmark_optimized_ocr(test_images, iterations))

        if test_pdf and os.path.exists(test_pdf):
            all_results.extend(self.benchmark_pdf_processing(test_pdf))

        self.calculate_summary(all_results)
        self.save_results(all_results, output_file)

        return all_results

    def calculate_summary(self, results: List[Dict[str, Any]]):
        """Print summary statistics grouped by implementation.

        Also prints the original-vs-optimized single-image improvement and
        the total wall-clock duration when a run is in progress.
        """
        print("\n" + "=" * 60)
        print("📊 BENCHMARK SUMMARY")
        print("=" * 60)

        # Group results by implementation label.
        implementations: Dict[str, List[Dict[str, Any]]] = {}
        for result in results:
            implementations.setdefault(result.get('implementation', 'unknown'), []).append(result)

        for impl, impl_results in implementations.items():
            print(f"\n📈 {impl.upper().replace('_', ' ')}:")

            all_times = [r['avg_time'] for r in impl_results if 'avg_time' in r]
            if all_times:
                print(f"   Tests: {len(all_times)}, Avg: {statistics.mean(all_times):.3f}s, "
                      f"Min: {min(all_times):.3f}s, Max: {max(all_times):.3f}s")

            # Batch results additionally carry an amortized per-image time.
            if impl == 'optimized_batch':
                per_image_times = [r['per_image_time'] for r in impl_results if 'per_image_time' in r]
                if per_image_times:
                    print(f"   Per-image avg: {statistics.mean(per_image_times):.3f}s")

        # Head-to-head: original vs optimized single-image processing.
        original_times = [r['avg_time'] for r in results
                          if r.get('implementation') == 'original' and 'avg_time' in r]
        optimized_single_times = [r['avg_time'] for r in results
                                  if r.get('implementation') == 'optimized_single' and 'avg_time' in r]

        if original_times and optimized_single_times:
            orig_avg = statistics.mean(original_times)
            opt_avg = statistics.mean(optimized_single_times)

            if orig_avg > 0:
                improvement = ((orig_avg - opt_avg) / orig_avg) * 100
                print("\n🎯 PERFORMANCE IMPROVEMENT:")
                print(f"   Original: {orig_avg:.3f}s per image")
                print(f"   Optimized: {opt_avg:.3f}s per image")
                print(f"   Improvement: {improvement:.1f}% faster")

        if self.benchmark_start:
            total_duration = time.perf_counter() - self.benchmark_start
            print(f"\n⏱️ TOTAL BENCHMARK DURATION: {total_duration:.2f}s")

    def save_results(self, results: List[Dict[str, Any]], output_file: str = "ocr_benchmark_results.json"):
        """Save benchmark results plus run metadata to a JSON file.

        Also writes a companion CSV summary via save_results_csv().
        """
        results_data = {
            "timestamp": datetime.now().isoformat(),
            "system": {
                "platform": sys.platform,
                "python_version": sys.version,
            },
            "results": results,
        }

        # default=str keeps the dump robust if a non-JSON-native value
        # (e.g. a Path) ever lands in a result dict.
        with open(output_file, 'w') as f:
            json.dump(results_data, f, indent=2, default=str)

        print(f"\n💾 Results saved to: {output_file}")

        self.save_results_csv(results, "ocr_benchmark_summary.csv")

    def save_results_csv(self, results: List[Dict[str, Any]], output_file: str = "ocr_benchmark_summary.csv"):
        """Save a flat benchmark summary to a CSV file.

        One row per result; batch rows carry batch_size/per_image_time,
        other rows leave those columns empty (DictWriter fills missing
        keys with its '' restval). No file is written for empty results.
        """
        import csv

        csv_data = []
        for result in results:
            row = {
                'implementation': result.get('implementation', ''),
                # Identify the test subject: image name, PDF name, or
                # batch size -- whichever this result kind carries.
                'test_type': result.get('image', result.get('pdf', result.get('batch_size', ''))),
                'iterations': result.get('iterations', 0),
                'avg_time': result.get('avg_time', 0),
                'min_time': result.get('min_time', 0),
                'max_time': result.get('max_time', 0),
            }

            if result.get('implementation') == 'optimized_batch':
                row['batch_size'] = result.get('batch_size', '')
                row['per_image_time'] = result.get('per_image_time', 0)

            csv_data.append(row)

        if csv_data:
            # newline='' is required by the csv module on all platforms.
            with open(output_file, 'w', newline='') as f:
                fieldnames = ['implementation', 'test_type', 'iterations', 'avg_time',
                              'min_time', 'max_time', 'batch_size', 'per_image_time']
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(csv_data)

            print(f"📊 CSV summary saved to: {output_file}")
|
|
|
|
def main():
    """Parse CLI arguments and run the comprehensive benchmark suite."""
    import argparse

    parser = argparse.ArgumentParser(description="OCR Performance Benchmark")
    parser.add_argument("--images", nargs="+", help="Test image files")
    parser.add_argument("--pdf", help="Test PDF file")
    parser.add_argument("--iterations", type=int, default=3, help="Iterations per test")
    parser.add_argument("--output", default="ocr_benchmark_results.json", help="Output file")

    args = parser.parse_args()

    # Fall back to any known sample images present in the working directory.
    default_images = [
        "ocr_high_res.png",
        "ocr_page1_rendered.png",
        "ocr_page1_preview.png",
    ]
    test_images = args.images or [img for img in default_images if os.path.exists(img)]
    if not test_images:
        print("❌ No test images found. Please provide valid image paths.")
        return

    print(f"📁 Test images: {test_images}")
    if args.pdf:
        if os.path.exists(args.pdf):
            print(f"📄 Test PDF: {args.pdf}")
        else:
            # Fix: a missing --pdf path used to be silently ignored.
            print(f"⚠️ Test PDF not found, skipping: {args.pdf}")

    # NOTE(review): --iterations and --output are parsed but
    # run_comprehensive_benchmark's signature does not accept them; they
    # take effect only once that method is extended to receive them.
    benchmark = OCRPerformanceBenchmark()
    benchmark.run_comprehensive_benchmark(
        test_images=test_images,
        test_pdf=args.pdf,
    )

    print("\n✅ Benchmark completed!")
|
|
|
|
# Entry point guard: run the CLI benchmark only when executed as a script.
if __name__ == "__main__":
    main()