Files
railseek6/test_ocr_performance.py

225 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
Test OCR performance to identify bottlenecks
"""
import time
import os
import sys
from simple_ocr_processor import get_simple_ocr_processor
def test_ocr_performance():
    """Benchmark OCR extraction speed on sample images.

    For each test image that exists on disk: run two warm-up passes
    (so model-loading overhead does not skew the numbers), then three
    timed runs, printing per-run and average timings.  Finally, time a
    single OCR call on a freshly generated blank 800x600 image as a
    per-call-overhead baseline.  Prints results; returns None.
    """
    print("🔍 OCR PERFORMANCE TEST")
    print("=" * 50)

    processor = get_simple_ocr_processor()
    if not processor.available:
        print("❌ OCR processor not available")
        return
    print("✅ OCR processor available")

    # (path, human-readable label) pairs; missing files are skipped.
    test_images = [
        ('ocr_high_res.png', 'High resolution image'),
        ('ocr_page1_rendered.png', 'Rendered page'),
        ('ocr_page1_preview.png', 'Preview image'),
    ]

    for image_file, description in test_images:
        if not os.path.exists(image_file):
            print(f"\n⚠️ Test image not found: {image_file}")
            continue

        print(f"\n📊 Testing {description} ({image_file})")

        # Warm up: result intentionally discarded.
        print(" Warming up...")
        for _ in range(2):
            processor.extract_text_from_image(image_file)

        # Timed runs.
        print(" Running performance test...")
        times = []
        result = None
        for i in range(3):
            start = time.time()
            result = processor.extract_text_from_image(image_file)
            elapsed = time.time() - start
            times.append(elapsed)
            print(f" Run {i+1}: {elapsed:.3f}s, {len(result['text'])} chars")

        avg_time = sum(times) / len(times)
        print(f" 📈 Average time: {avg_time:.3f}s")
        print(f" 📊 Text extracted: {len(result['text'])} characters")
        print(f" 🎯 Confidence: {result['confidence']:.3f}")

    # Baseline: OCR on a trivial generated image (no text content).
    print("\n📊 Testing with simple generated image...")
    try:
        from PIL import Image
        import tempfile

        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            img_path = f.name
        try:
            img = Image.new('RGB', (800, 600), color='white')
            img.save(img_path)

            start = time.time()
            result = processor.extract_text_from_image(img_path)
            elapsed = time.time() - start
            print(f" Simple image OCR: {elapsed:.3f}s")
            print(f" Text length: {len(result['text'])}")
        finally:
            # BUG FIX: previously the temp file leaked whenever save/OCR
            # raised, because unlink only ran on the success path.
            os.unlink(img_path)
    except Exception as e:
        print(f" ⚠️ Could not test with generated image: {e}")
def analyze_bottlenecks():
    """Print a checklist of likely performance bottlenecks in the OCR pipeline.

    Purely informational — emits a fixed report to stdout and returns None.
    """
    banner = "=" * 50
    report = """
Potential OCR Performance Bottlenecks:
1. **MODEL LOADING TIME**
- PaddleOCR loads detection, recognition, and classification models
- First call includes model loading overhead
- GPU memory allocation and initialization
2. **IMAGE PREPROCESSING**
- Image loading and decoding
- Resizing and normalization
- Color space conversion
3. **DETECTION PHASE**
- Text region detection (most computationally intensive)
- Bounding box generation
- Non-maximum suppression
4. **RECOGNITION PHASE**
- Character recognition for each detected region
- Language model processing
- Confidence scoring
5. **POST-PROCESSING**
- Text line reconstruction
- Confidence averaging
- Result formatting
6. **GPU MEMORY MANAGEMENT**
- Memory allocation/deallocation per image
- GPU-CPU data transfer
- Batch size limitations
7. **PYTHON OVERHEAD**
- Subprocess communication (if using separate process)
- Data serialization/deserialization
- GIL contention
"""
    print("\n🔍 BOTTLENECK ANALYSIS")
    print(banner)
    print(report)
def check_gpu_usage():
    """Report whether PaddlePaddle can use CUDA and show current GPU memory.

    Best-effort diagnostic: if paddle is missing, CUDA is unavailable, or
    ``nvidia-smi`` cannot be run/parsed, a warning is printed instead of
    raising.  Returns None.
    """
    print("\n🔍 GPU USAGE ANALYSIS")
    print("=" * 50)
    try:
        import paddle
        if paddle.device.is_compiled_with_cuda():
            print("✅ PaddlePaddle compiled with CUDA support")
            gpu_count = paddle.device.cuda.device_count()
            print(f"📊 Available GPUs: {gpu_count}")
            if gpu_count > 0:
                current_device = paddle.device.get_device()
                print(f"📊 Current device: {current_device}")
                # Query memory via nvidia-smi; the tool may be absent.
                import subprocess
                try:
                    result = subprocess.run(
                        ['nvidia-smi',
                         '--query-gpu=memory.used,memory.total',
                         '--format=csv,noheader,nounits'],
                        capture_output=True, text=True,
                        timeout=10)  # don't hang the diagnostic if the driver stalls
                    if result.returncode == 0:
                        memory_info = result.stdout.strip().split(',')
                        if memory_info and len(memory_info) >= 2:
                            used = int(memory_info[0].strip())
                            total = int(memory_info[1].strip())
                            print(f"📊 GPU Memory: {used}MB / {total}MB ({used/total*100:.1f}%)")
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt.  Catch only the expected
                # failures: missing binary (OSError), run/timeout errors,
                # and unparseable output (ValueError from int()).
                except (OSError, subprocess.SubprocessError, ValueError):
                    print("⚠️ Could not query GPU memory (nvidia-smi not available)")
        else:
            print("⚠️ PaddlePaddle not compiled with CUDA support (using CPU)")
    except Exception as e:
        print(f"⚠️ Could not check GPU status: {e}")
def provide_recommendations():
    """Print OCR optimization recommendations, from quick wins to advanced.

    Purely informational — emits a fixed report to stdout and returns None.
    """
    banner = "=" * 50
    advice = """
🎯 IMMEDIATE IMPROVEMENTS (Quick Wins):
1. **BATCH PROCESSING**
- Process multiple images in a single OCR call
- Reduces per-image overhead significantly
- Implementation: Modify OCR processor to accept batch input
2. **MODEL WARM-UP**
- Load models once at startup
- Keep models in memory between requests
- Implementation: Singleton pattern with lazy loading
3. **IMAGE OPTIMIZATION**
- Resize large images before OCR
- Convert to optimal format (PNG > JPEG for text)
- Remove unnecessary color channels
- Implementation: Preprocessing pipeline
4. **CONCURRENT PROCESSING**
- Use threading for multiple documents
- Async processing for I/O operations
- Implementation: asyncio or thread pool
5. **MEMORY MANAGEMENT**
- Reuse GPU memory buffers
- Limit maximum image size
- Implement memory monitoring
- Implementation: Context managers, size limits
🔧 ADVANCED OPTIMIZATIONS:
1. **MODEL SELECTION**
- Use lighter OCR models (PP-OCRv3 vs PP-OCRv4)
- Disable angle classification if not needed
- Reduce dictionary size for specific languages
2. **HARDWARE UTILIZATION**
- Ensure proper GPU driver installation
- Monitor GPU utilization during processing
- Consider mixed precision (FP16) if supported
3. **PIPELINE OPTIMIZATION**
- Parallelize detection and recognition
- Stream processing for large documents
- Cache intermediate results
4. **MONITORING AND PROFILING**
- Add detailed timing metrics
- Profile CPU/GPU usage
- Identify specific slow functions
🚀 EXPECTED PERFORMANCE GAINS:
- Batch processing: 2-5x speedup
- Model warm-up: 30-50% reduction in first call
- Image optimization: 10-30% speedup
- Concurrent processing: Scale with core count
"""
    print("\n💡 OPTIMIZATION RECOMMENDATIONS")
    print(banner)
    print(advice)
if __name__ == "__main__":
test_ocr_performance()
analyze_bottlenecks()
check_gpu_usage()
provide_recommendations()