# railseek6/fix_ocr_processing.py

"""
Fix OCR Processing with GPU Mode
Ensures GPU-accelerated OCR works for scanned PDF tables
"""

import os
import sys
import requests
import time
import json
from pathlib import Path

# Configure environment for GPU acceleration.
# NOTE: these values are forced (any pre-existing CUDA_PATH / CUDA_HOME is
# overwritten) and point at a Windows CUDA 11.8 install location.
os.environ['CUDA_PATH'] = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'
os.environ['CUDA_HOME'] = os.environ['CUDA_PATH']
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # expose only the first GPU

# Prepend the CUDA bin directory using the platform's PATH separator. A
# hard-coded ';' would glue this entry onto the first existing entry on
# hosts where the separator is ':', effectively dropping that entry.
_cuda_bin = f"{os.environ['CUDA_PATH']}\\bin"
_existing_path = os.environ.get('PATH', '')
os.environ['PATH'] = (
    f"{_cuda_bin}{os.pathsep}{_existing_path}" if _existing_path else _cuda_bin
)

# Server configuration
BASE_URL = 'http://localhost:3015'
# SECURITY: default credentials are committed in source. They can be
# overridden with the OCR_TEST_USERNAME / OCR_TEST_PASSWORD environment
# variables so real secrets do not have to live in this file.
AUTH_CREDENTIALS = {
    'username': os.environ.get('OCR_TEST_USERNAME', 'jleu3482'),
    'password': os.environ.get('OCR_TEST_PASSWORD', 'jleu1212'),
}
OCR_PDF_PATH = 'ocr.pdf'

def _print_ocr_lines(text_boxes, preview_limit=5):
    """Print up to ``preview_limit`` recognised lines and a count of the rest."""
    preview = text_boxes[:preview_limit]
    for index, (_bbox, (text, confidence)) in enumerate(preview, start=1):
        print(f"   {index}. '{text}' (conf: {confidence:.2f})")

    remaining = len(text_boxes) - preview_limit
    if remaining > 0:
        print(f"   ... and {remaining} more lines")


def test_gpu_ocr_directly():
    """Run PaddleOCR in GPU mode against every page of ``OCR_PDF_PATH``.

    Each page is rendered at 2x scale to a PNG inside a temporary directory,
    passed to ``PaddleOCR.ocr``, and the first five detected text lines are
    printed together with the time the OCR call took.

    Returns:
        bool: ``True`` when all pages were processed without raising (a page
        with no detected text is reported but does not fail the run);
        ``False`` when the PDF is missing or any step raised, including a
        failed import of paddle / paddleocr / PyMuPDF.
    """
    print("🧪 Testing GPU OCR directly...")

    try:
        # Imported lazily so that a missing GPU/OCR stack is reported as a
        # failed check by the handler below instead of breaking module import.
        import tempfile

        import paddle
        from paddleocr import PaddleOCR
        import fitz  # PyMuPDF

        print(f"✅ PaddlePaddle version: {paddle.__version__}")
        print(f"✅ CUDA available: {paddle.is_compiled_with_cuda()}")
        print(f"✅ GPU devices: {paddle.device.cuda.device_count()}")

        # Initialize PaddleOCR with GPU
        print("🔄 Initializing PaddleOCR with GPU...")
        ocr_engine = PaddleOCR(use_gpu=True, lang='en', show_log=False)
        print("✅ PaddleOCR GPU initialization successful")

        # Test with OCR PDF
        if not os.path.exists(OCR_PDF_PATH):
            print(f"❌ OCR PDF not found: {OCR_PDF_PATH}")
            return False

        print(f"📄 Testing with {OCR_PDF_PATH}")
        pdf_document = fitz.open(OCR_PDF_PATH)
        try:
            # Rendered pages go into a private temp directory that is removed
            # even when OCR raises, so no temp_page_*.png files are left
            # behind in the working directory.
            with tempfile.TemporaryDirectory() as temp_dir:
                zoom = fitz.Matrix(2, 2)  # 2x resolution for better OCR
                for page_number, page in enumerate(pdf_document, start=1):
                    pix = page.get_pixmap(matrix=zoom)
                    temp_path = os.path.join(temp_dir, f"temp_page_{page_number}.png")
                    with open(temp_path, 'wb') as image_file:
                        image_file.write(pix.tobytes("png"))

                    # Perform OCR
                    print(f"🔄 Performing OCR on page {page_number}...")
                    start_time = time.perf_counter()
                    result = ocr_engine.ocr(temp_path, cls=True)
                    ocr_time = time.perf_counter() - start_time

                    # result[0] holds the detections for the single image
                    # passed in; it is empty/None when nothing was found.
                    if result and result[0]:
                        print(f"✅ OCR completed in {ocr_time:.2f} seconds")
                        print(f"📝 Extracted {len(result[0])} text boxes:")
                        _print_ocr_lines(result[0])
                    else:
                        print(f"❌ No text detected on page {page_number}")
        finally:
            # Release the PDF handle on both the success and the error path.
            pdf_document.close()

        return True

    except Exception as e:
        # Top-level boundary of this check: report and signal failure.
        print(f"❌ GPU OCR test failed: {e}")
        return False

def test_server_upload_with_gpu():
    """Upload ``OCR_PDF_PATH`` to the server and wait for it to be processed.

    Steps: log in with ``AUTH_CREDENTIALS``, delete existing documents,
    upload the PDF, then poll ``GET /documents`` every 2 seconds (up to 60
    polls, i.e. roughly two minutes) until a document shows up under the
    ``completed`` or ``failed`` status.

    Returns:
        bool: ``True`` as soon as at least one document is reported as
        completed; ``False`` on login/upload failure, when a document is
        reported as failed, on timeout, or when any request raises.
    """
    print("\n🌐 Testing server upload with GPU OCR...")

    poll_interval = 2  # seconds between status checks
    max_polls = 60     # 60 polls x 2 s => wait up to ~120 seconds in total

    # Login
    try:
        login_response = requests.post(f'{BASE_URL}/login', data=AUTH_CREDENTIALS, timeout=10)
        if login_response.status_code != 200:
            print(f"❌ Login failed: {login_response.status_code} - {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        if not token:
            # Keep going as before, but make the likely cause of later
            # authorization errors visible.
            print("⚠️ Login response did not contain an access_token")
        headers = {'Authorization': f'Bearer {token}'}
        print("✅ Login successful")

        # Clear existing documents
        clear_response = requests.delete(f'{BASE_URL}/documents', headers=headers, timeout=10)
        if clear_response.status_code == 200:
            print("✅ Cleared existing documents")
        else:
            # Not fatal, but leftover documents can make the completion
            # check below succeed for the wrong document.
            print(f"⚠️ Could not clear existing documents: {clear_response.status_code}")

        # Upload OCR PDF
        print(f"📤 Uploading {OCR_PDF_PATH}...")
        with open(OCR_PDF_PATH, 'rb') as f:
            files = {'file': (OCR_PDF_PATH, f, 'application/pdf')}
            upload_response = requests.post(f'{BASE_URL}/documents/upload', files=files, headers=headers, timeout=30)

        if upload_response.status_code != 200:
            print(f"❌ Upload failed: {upload_response.status_code} - {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"✅ Upload successful: {upload_data}")

        # Monitor processing
        print("🔄 Monitoring OCR processing...")
        for attempt in range(1, max_polls + 1):
            time.sleep(poll_interval)
            # Time slept so far; the sleep happens before the check, so the
            # first report is after one full interval, not after 0 s.
            elapsed = attempt * poll_interval

            docs_response = requests.get(f'{BASE_URL}/documents', headers=headers, timeout=10)
            if docs_response.status_code != 200:
                print(f"⚠️ Status check after {elapsed}s returned {docs_response.status_code}")
                continue

            statuses = docs_response.json().get('statuses', {})
            failed_docs = statuses.get('failed', [])
            completed = len(statuses.get('completed', []))
            processing = len(statuses.get('processing', []))
            failed = len(failed_docs)

            print(f"⏰ Progress after {elapsed}s: Processing={processing}, Completed={completed}, Failed={failed}")

            # Check for completed documents
            if completed > 0:
                print("🎉 OCR processing completed successfully!")
                return True

            # Check for failed documents
            if failed > 0:
                for doc in failed_docs:
                    print(f"❌ Failed document: {doc.get('file_path')} - {doc.get('error_msg', 'Unknown error')}")
                return False

        print("⏰ Processing timeout - check server logs for details")
        return False

    except Exception as e:
        # Top-level boundary of this check: report and signal failure.
        print(f"❌ Server test failed: {e}")
        return False

def test_search_functionality():
    """Run a fixed set of search queries against ``POST /api/search``.

    Logs in with ``AUTH_CREDENTIALS``, then issues each query and prints the
    number of entries under the ``results`` key of the JSON response.

    Returns:
        bool: ``True`` only when login succeeded and every query returned
        HTTP 200; ``False`` when login failed, any query returned a
        non-200 status, or a request raised.
    """
    print("\n🔍 Testing search functionality...")

    try:
        # Login
        login_response = requests.post(f'{BASE_URL}/login', data=AUTH_CREDENTIALS, timeout=10)
        if login_response.status_code != 200:
            print("❌ Login failed for search test")
            return False

        token = login_response.json().get('access_token')
        if not token:
            # Keep going as before, but make the likely cause of later
            # authorization errors visible.
            print("⚠️ Login response did not contain an access_token")
        headers = {'Authorization': f'Bearer {token}'}

        # Test search queries
        test_queries = [
            "safety precautions",
            "minimum safe distance",
            "high voltage",
            "traction voltage",
        ]

        # Every query is attempted even after a failure so the output shows
        # the full picture; the overall verdict is False if any one failed.
        all_queries_ok = True
        for query in test_queries:
            search_response = requests.post(
                f'{BASE_URL}/api/search', json={'query': query}, headers=headers, timeout=10
            )

            if search_response.status_code == 200:
                search_results = search_response.json()
                print(f"✅ Search for '{query}': Found {len(search_results.get('results', []))} results")
            else:
                print(f"❌ Search for '{query}' failed: {search_response.status_code}")
                all_queries_ok = False

        return all_queries_ok

    except Exception as e:
        # Top-level boundary of this check: report and signal failure.
        print(f"❌ Search test failed: {e}")
        return False

def main():
    """Run the GPU OCR, server upload and search checks in order.

    The run stops early (with a message) if the direct GPU OCR check or the
    server upload check fails, because the later steps depend on them. When
    all three steps ran, a summary is printed and the overall verdict is
    SUCCESS only if every step, including search, passed.
    """
    print("🚀 Fixing OCR Processing with GPU Mode")
    print("=" * 50)

    # Step 1: Test GPU OCR directly
    gpu_ok = test_gpu_ocr_directly()
    if not gpu_ok:
        print("❌ GPU OCR test failed - cannot proceed")
        return

    # Step 2: Test server upload with GPU OCR
    upload_ok = test_server_upload_with_gpu()
    if not upload_ok:
        print("❌ Server upload test failed")
        return

    # Step 3: Test search functionality
    search_ok = test_search_functionality()

    # Final results
    print("\n" + "=" * 50)
    print("📊 FINAL RESULTS:")
    print(f"   GPU OCR: {'✅' if gpu_ok else '❌'}")
    print(f"   Upload & Processing: {'✅' if upload_ok else '❌'}")
    print(f"   Search: {'✅' if search_ok else '❌'}")

    # The success message claims search works, so the search result has to
    # be part of the verdict; gpu_ok and upload_ok are always True here
    # because of the early returns above.
    if gpu_ok and upload_ok and search_ok:
        print("\n🎉 SUCCESS: OCR PDF upload, indexing, and search working with GPU mode!")
    else:
        print("\n❌ FAILED: Some tests did not pass")


if __name__ == "__main__":
    main()