Files
railseek6/test_ocr_performance.py

225 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
Test OCR performance to identify bottlenecks
"""
import time
import os
import sys
from simple_ocr_processor import get_simple_ocr_processor
def test_ocr_performance():
    """Benchmark OCR extraction speed on sample images.

    For each test image that exists on disk: run two warm-up passes
    (so model-loading overhead does not skew the numbers), then three
    timed runs, printing per-run and average timings.  Finally, time a
    single OCR call on a freshly generated blank 800x600 image as a
    per-call-overhead baseline.  Prints results; returns None.
    """
    print("🔍 OCR PERFORMANCE TEST")
    print("=" * 50)

    processor = get_simple_ocr_processor()
    if not processor.available:
        print("❌ OCR processor not available")
        return
    print("✅ OCR processor available")

    # (path, human-readable label) pairs; missing files are skipped.
    test_images = [
        ('ocr_high_res.png', 'High resolution image'),
        ('ocr_page1_rendered.png', 'Rendered page'),
        ('ocr_page1_preview.png', 'Preview image'),
    ]

    for image_file, description in test_images:
        if not os.path.exists(image_file):
            print(f"\n⚠️ Test image not found: {image_file}")
            continue

        print(f"\n📊 Testing {description} ({image_file})")

        # Warm up: result intentionally discarded.
        print(" Warming up...")
        for _ in range(2):
            processor.extract_text_from_image(image_file)

        # Timed runs.
        print(" Running performance test...")
        times = []
        result = None
        for i in range(3):
            start = time.time()
            result = processor.extract_text_from_image(image_file)
            elapsed = time.time() - start
            times.append(elapsed)
            print(f" Run {i+1}: {elapsed:.3f}s, {len(result['text'])} chars")

        avg_time = sum(times) / len(times)
        print(f" 📈 Average time: {avg_time:.3f}s")
        print(f" 📊 Text extracted: {len(result['text'])} characters")
        print(f" 🎯 Confidence: {result['confidence']:.3f}")

    # Baseline: OCR on a trivial generated image (no text content).
    print("\n📊 Testing with simple generated image...")
    try:
        from PIL import Image
        import tempfile

        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            img_path = f.name
        try:
            img = Image.new('RGB', (800, 600), color='white')
            img.save(img_path)

            start = time.time()
            result = processor.extract_text_from_image(img_path)
            elapsed = time.time() - start
            print(f" Simple image OCR: {elapsed:.3f}s")
            print(f" Text length: {len(result['text'])}")
        finally:
            # BUG FIX: previously the temp file leaked whenever save/OCR
            # raised, because unlink only ran on the success path.
            os.unlink(img_path)
    except Exception as e:
        print(f" ⚠️ Could not test with generated image: {e}")
def analyze_bottlenecks():
    """Print a checklist of likely performance bottlenecks in the OCR pipeline.

    Purely informational — emits a fixed report to stdout and returns None.
    """
    banner = "=" * 50
    report = """
Potential OCR Performance Bottlenecks:
1. **MODEL LOADING TIME**
- PaddleOCR loads detection, recognition, and classification models
- First call includes model loading overhead
- GPU memory allocation and initialization
2. **IMAGE PREPROCESSING**
- Image loading and decoding
- Resizing and normalization
- Color space conversion
3. **DETECTION PHASE**
- Text region detection (most computationally intensive)
- Bounding box generation
- Non-maximum suppression
4. **RECOGNITION PHASE**
- Character recognition for each detected region
- Language model processing
- Confidence scoring
5. **POST-PROCESSING**
- Text line reconstruction
- Confidence averaging
- Result formatting
6. **GPU MEMORY MANAGEMENT**
- Memory allocation/deallocation per image
- GPU-CPU data transfer
- Batch size limitations
7. **PYTHON OVERHEAD**
- Subprocess communication (if using separate process)
- Data serialization/deserialization
- GIL contention
"""
    print("\n🔍 BOTTLENECK ANALYSIS")
    print(banner)
    print(report)
def check_gpu_usage():
    """Report whether PaddlePaddle can use CUDA and show current GPU memory.

    Best-effort diagnostic: if paddle is missing, CUDA is unavailable, or
    ``nvidia-smi`` cannot be run/parsed, a warning is printed instead of
    raising.  Returns None.
    """
    print("\n🔍 GPU USAGE ANALYSIS")
    print("=" * 50)
    try:
        import paddle
        if paddle.device.is_compiled_with_cuda():
            print("✅ PaddlePaddle compiled with CUDA support")
            gpu_count = paddle.device.cuda.device_count()
            print(f"📊 Available GPUs: {gpu_count}")
            if gpu_count > 0:
                current_device = paddle.device.get_device()
                print(f"📊 Current device: {current_device}")
                # Query memory via nvidia-smi; the tool may be absent.
                import subprocess
                try:
                    result = subprocess.run(
                        ['nvidia-smi',
                         '--query-gpu=memory.used,memory.total',
                         '--format=csv,noheader,nounits'],
                        capture_output=True, text=True,
                        timeout=10)  # don't hang the diagnostic if the driver stalls
                    if result.returncode == 0:
                        memory_info = result.stdout.strip().split(',')
                        if memory_info and len(memory_info) >= 2:
                            used = int(memory_info[0].strip())
                            total = int(memory_info[1].strip())
                            print(f"📊 GPU Memory: {used}MB / {total}MB ({used/total*100:.1f}%)")
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt.  Catch only the expected
                # failures: missing binary (OSError), run/timeout errors,
                # and unparseable output (ValueError from int()).
                except (OSError, subprocess.SubprocessError, ValueError):
                    print("⚠️ Could not query GPU memory (nvidia-smi not available)")
        else:
            print("⚠️ PaddlePaddle not compiled with CUDA support (using CPU)")
    except Exception as e:
        print(f"⚠️ Could not check GPU status: {e}")
def provide_recommendations():
    """Print OCR optimization recommendations, from quick wins to advanced.

    Purely informational — emits a fixed report to stdout and returns None.
    """
    banner = "=" * 50
    advice = """
🎯 IMMEDIATE IMPROVEMENTS (Quick Wins):
1. **BATCH PROCESSING**
- Process multiple images in a single OCR call
- Reduces per-image overhead significantly
- Implementation: Modify OCR processor to accept batch input
2. **MODEL WARM-UP**
- Load models once at startup
- Keep models in memory between requests
- Implementation: Singleton pattern with lazy loading
3. **IMAGE OPTIMIZATION**
- Resize large images before OCR
- Convert to optimal format (PNG > JPEG for text)
- Remove unnecessary color channels
- Implementation: Preprocessing pipeline
4. **CONCURRENT PROCESSING**
- Use threading for multiple documents
- Async processing for I/O operations
- Implementation: asyncio or thread pool
5. **MEMORY MANAGEMENT**
- Reuse GPU memory buffers
- Limit maximum image size
- Implement memory monitoring
- Implementation: Context managers, size limits
🔧 ADVANCED OPTIMIZATIONS:
1. **MODEL SELECTION**
- Use lighter OCR models (PP-OCRv3 vs PP-OCRv4)
- Disable angle classification if not needed
- Reduce dictionary size for specific languages
2. **HARDWARE UTILIZATION**
- Ensure proper GPU driver installation
- Monitor GPU utilization during processing
- Consider mixed precision (FP16) if supported
3. **PIPELINE OPTIMIZATION**
- Parallelize detection and recognition
- Stream processing for large documents
- Cache intermediate results
4. **MONITORING AND PROFILING**
- Add detailed timing metrics
- Profile CPU/GPU usage
- Identify specific slow functions
🚀 EXPECTED PERFORMANCE GAINS:
- Batch processing: 2-5x speedup
- Model warm-up: 30-50% reduction in first call
- Image optimization: 10-30% speedup
- Concurrent processing: Scale with core count
"""
    print("\n💡 OPTIMIZATION RECOMMENDATIONS")
    print(banner)
    print(advice)
if __name__ == "__main__":
test_ocr_performance()
analyze_bottlenecks()
check_gpu_usage()
provide_recommendations()