225 lines
7.3 KiB
Python
225 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test OCR performance to identify bottlenecks
|
|
"""
|
|
import time
|
|
import os
|
|
import sys
|
|
from simple_ocr_processor import get_simple_ocr_processor
|
|
|
|
def test_ocr_performance():
    """Benchmark OCR latency on sample images found on disk, plus one generated image.

    For each known sample image that exists, runs warm-up passes followed by
    timed passes and prints per-run latency, average latency, extracted-text
    length, and confidence. Returns early (after printing a notice) when the
    OCR processor reports itself unavailable.
    """
    print("🔍 OCR PERFORMANCE TEST")
    print("=" * 50)

    processor = get_simple_ocr_processor()
    if not processor.available:
        print("❌ OCR processor not available")
        return

    print("✅ OCR processor available")

    # (filename, human-readable label) pairs; missing files are reported and skipped.
    test_images = [
        ('ocr_high_res.png', 'High resolution image'),
        ('ocr_page1_rendered.png', 'Rendered page'),
        ('ocr_page1_preview.png', 'Preview image')
    ]

    for image_file, description in test_images:
        if os.path.exists(image_file):
            _benchmark_image(processor, image_file, description)
        else:
            print(f"\n⚠️ Test image not found: {image_file}")

    _benchmark_generated_image(processor)


def _benchmark_image(processor, image_file, description, warmup_runs=2, timed_runs=3):
    """Warm up, then time OCR over one image and print summary statistics."""
    print(f"\n📊 Testing {description} ({image_file})")

    # Warm-up runs absorb one-time costs (model load, cache fill) so the
    # timed runs measure steady-state latency.
    print("   Warming up...")
    for _ in range(warmup_runs):
        processor.extract_text_from_image(image_file)

    print("   Running performance test...")
    times = []
    result = None
    for i in range(timed_runs):
        # perf_counter is monotonic and higher-resolution than time.time(),
        # which can jump backwards on clock adjustments mid-measurement.
        start = time.perf_counter()
        result = processor.extract_text_from_image(image_file)
        elapsed = time.perf_counter() - start
        times.append(elapsed)
        print(f"   Run {i+1}: {elapsed:.3f}s, {len(result['text'])} chars")

    avg_time = sum(times) / len(times)
    print(f"   📈 Average time: {avg_time:.3f}s")
    print(f"   📊 Text extracted: {len(result['text'])} characters")
    print(f"   🎯 Confidence: {result['confidence']:.3f}")


def _benchmark_generated_image(processor):
    """Time OCR on a simple generated image, always removing the temp file."""
    print("\n📊 Testing with simple generated image...")
    try:
        from PIL import Image
        import tempfile

        # delete=False so the path outlives the handle; we unlink manually.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            img_path = f.name

        try:
            # A blank white canvas exercises pipeline overhead (model
            # invocation, pre/post-processing) without any real text.
            img = Image.new('RGB', (800, 600), color='white')
            img.save(img_path)

            start = time.perf_counter()
            result = processor.extract_text_from_image(img_path)
            elapsed = time.perf_counter() - start

            print(f"   Simple image OCR: {elapsed:.3f}s")
            print(f"   Text length: {len(result['text'])}")
        finally:
            # BUG FIX: the original leaked the temp file when OCR raised;
            # finally guarantees cleanup on every path after creation.
            os.unlink(img_path)
    except Exception as e:
        print(f"   ⚠️ Could not test with generated image: {e}")
|
|
|
|
def analyze_bottlenecks():
    """Print a static checklist of likely OCR performance bottlenecks."""
    # Header block, then the full report in a single write.
    print("\n🔍 BOTTLENECK ANALYSIS")
    print("=" * 50)

    report = """
Potential OCR Performance Bottlenecks:

1. **MODEL LOADING TIME**
   - PaddleOCR loads detection, recognition, and classification models
   - First call includes model loading overhead
   - GPU memory allocation and initialization

2. **IMAGE PREPROCESSING**
   - Image loading and decoding
   - Resizing and normalization
   - Color space conversion

3. **DETECTION PHASE**
   - Text region detection (most computationally intensive)
   - Bounding box generation
   - Non-maximum suppression

4. **RECOGNITION PHASE**
   - Character recognition for each detected region
   - Language model processing
   - Confidence scoring

5. **POST-PROCESSING**
   - Text line reconstruction
   - Confidence averaging
   - Result formatting

6. **GPU MEMORY MANAGEMENT**
   - Memory allocation/deallocation per image
   - GPU-CPU data transfer
   - Batch size limitations

7. **PYTHON OVERHEAD**
   - Subprocess communication (if using separate process)
   - Data serialization/deserialization
   - GIL contention
"""
    print(report)
|
|
|
|
def check_gpu_usage():
    """Check whether PaddlePaddle sees a CUDA GPU and report its memory use.

    Prints CUDA availability, GPU count, the current device, and (via
    nvidia-smi, when present) used/total GPU memory. All failures — paddle
    missing, nvidia-smi absent, unparseable output — are reported as
    warnings rather than raised.
    """
    print("\n🔍 GPU USAGE ANALYSIS")
    print("=" * 50)

    try:
        import paddle
        if paddle.device.is_compiled_with_cuda():
            print("✅ PaddlePaddle compiled with CUDA support")
            gpu_count = paddle.device.cuda.device_count()
            print(f"📊 Available GPUs: {gpu_count}")

            if gpu_count > 0:
                current_device = paddle.device.get_device()
                print(f"📊 Current device: {current_device}")

                # Query physical memory usage via the driver tool; paddle's
                # own allocator stats would not show other processes' usage.
                import subprocess
                try:
                    result = subprocess.run(
                        ['nvidia-smi',
                         '--query-gpu=memory.used,memory.total',
                         '--format=csv,noheader,nounits'],
                        capture_output=True, text=True, timeout=10)
                    if result.returncode == 0:
                        memory_info = result.stdout.strip().split(',')
                        if len(memory_info) >= 2:
                            used = int(memory_info[0].strip())
                            total = int(memory_info[1].strip())
                            print(f"📊 GPU Memory: {used}MB / {total}MB ({used/total*100:.1f}%)")
                # BUG FIX: the original bare `except:` also swallowed
                # SystemExit/KeyboardInterrupt. Catch only what the
                # subprocess call and int() parsing can actually raise
                # (including TimeoutExpired, a SubprocessError subclass).
                except (OSError, ValueError, subprocess.SubprocessError):
                    print("⚠️ Could not query GPU memory (nvidia-smi not available)")
        else:
            print("⚠️ PaddlePaddle not compiled with CUDA support (using CPU)")
    except Exception as e:
        # Deliberate best-effort: this is a diagnostic, never fatal.
        print(f"⚠️ Could not check GPU status: {e}")
|
|
|
|
def provide_recommendations():
    """Print a static list of OCR optimization recommendations."""
    # Emit the section header followed by the full recommendation text.
    print("\n💡 OPTIMIZATION RECOMMENDATIONS")
    print("=" * 50)

    recommendations = """
🎯 IMMEDIATE IMPROVEMENTS (Quick Wins):

1. **BATCH PROCESSING**
   - Process multiple images in a single OCR call
   - Reduces per-image overhead significantly
   - Implementation: Modify OCR processor to accept batch input

2. **MODEL WARM-UP**
   - Load models once at startup
   - Keep models in memory between requests
   - Implementation: Singleton pattern with lazy loading

3. **IMAGE OPTIMIZATION**
   - Resize large images before OCR
   - Convert to optimal format (PNG > JPEG for text)
   - Remove unnecessary color channels
   - Implementation: Preprocessing pipeline

4. **CONCURRENT PROCESSING**
   - Use threading for multiple documents
   - Async processing for I/O operations
   - Implementation: asyncio or thread pool

5. **MEMORY MANAGEMENT**
   - Reuse GPU memory buffers
   - Limit maximum image size
   - Implement memory monitoring
   - Implementation: Context managers, size limits

🔧 ADVANCED OPTIMIZATIONS:

1. **MODEL SELECTION**
   - Use lighter OCR models (PP-OCRv3 vs PP-OCRv4)
   - Disable angle classification if not needed
   - Reduce dictionary size for specific languages

2. **HARDWARE UTILIZATION**
   - Ensure proper GPU driver installation
   - Monitor GPU utilization during processing
   - Consider mixed precision (FP16) if supported

3. **PIPELINE OPTIMIZATION**
   - Parallelize detection and recognition
   - Stream processing for large documents
   - Cache intermediate results

4. **MONITORING AND PROFILING**
   - Add detailed timing metrics
   - Profile CPU/GPU usage
   - Identify specific slow functions

🚀 EXPECTED PERFORMANCE GAINS:
- Batch processing: 2-5x speedup
- Model warm-up: 30-50% reduction in first call
- Image optimization: 10-30% speedup
- Concurrent processing: Scale with core count
"""
    print(recommendations)
|
|
|
|
if __name__ == "__main__":
    # Run the full diagnostic suite in order: the live benchmark first,
    # then the static bottleneck write-up, the GPU environment probe, and
    # finally the optimization recommendations.
    test_ocr_performance()
    analyze_bottlenecks()
    check_gpu_usage()
    provide_recommendations()