# railseek6/test_complete_ocr_workflow.py

"""
Complete OCR Workflow Test with GPU Mode
Tests upload, indexing, and search functionality for scanned PDF tables
"""

import os
import sys
import requests
import time
import json
from pathlib import Path

# GPU environment: point the CUDA variables at the local v11.8 toolkit,
# expose only device 0, and put the toolkit's bin directory first on PATH
# (Windows-style path separators are used throughout).
os.environ.update({
    'CUDA_PATH': r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8',
    'CUDA_VISIBLE_DEVICES': '0',
})
os.environ['CUDA_HOME'] = os.environ['CUDA_PATH']
os.environ['PATH'] = os.environ['CUDA_PATH'] + '\\bin;' + os.environ['PATH']

# Server endpoint, login form fields, and the PDF uploaded by this script.
BASE_URL = 'http://localhost:3015'
AUTH_CREDENTIALS = dict(username='jleu3482', password='jleu1212')
OCR_PDF_PATH = 'ocr.pdf'

def verify_gpu_environment():
    """Check that PaddlePaddle reports a CUDA GPU and that PaddleOCR starts on it.

    Prints the PaddlePaddle version, whether the build was compiled with
    CUDA, and the number of GPU devices it reports, then constructs a
    ``PaddleOCR`` engine with ``use_gpu=True``.

    Returns:
        bool: True only when the build has CUDA support, at least one GPU
        device is reported, and the PaddleOCR engine can be constructed.
        False otherwise, including when ``paddle`` or ``paddleocr`` cannot
        be imported or any of the calls raises.
    """
    print("🔍 Verifying GPU Environment...")

    try:
        import paddle
        print(f"✅ PaddlePaddle version: {paddle.__version__}")

        cuda_compiled = paddle.is_compiled_with_cuda()
        print(f"{'✅' if cuda_compiled else '❌'} CUDA compiled: {cuda_compiled}")

        gpu_count = paddle.device.cuda.device_count()
        print(f"{'✅' if gpu_count > 0 else '❌'} GPU devices: {gpu_count}")

        # Without CUDA support or a visible device there is no GPU mode to
        # test, so report failure instead of continuing with a green mark.
        if not cuda_compiled or gpu_count < 1:
            print("❌ GPU environment verification failed: "
                  "no CUDA-enabled GPU is available to PaddlePaddle")
            return False

        # Test PaddleOCR GPU initialization; only construction is checked,
        # so the engine instance itself is not kept.
        from paddleocr import PaddleOCR
        PaddleOCR(use_gpu=True, lang='en', show_log=False)
        print("✅ PaddleOCR GPU initialization successful")

        return True
    except Exception as e:
        print(f"❌ GPU environment verification failed: {e}")
        return False

def test_server_connectivity():
    """Check that the server responds and that login yields an access token.

    Sends a GET to the server root, then POSTs ``AUTH_CREDENTIALS`` as form
    data to ``/login``.

    Returns:
        The ``access_token`` value from the login response, or None when a
        request raises, the login status is not 200, or the response carries
        no token.
    """
    print("\n🌐 Testing Server Connectivity...")

    try:
        # Test basic connectivity
        response = requests.get(f'{BASE_URL}/', timeout=5)
        print(f"✅ Server is running (status: {response.status_code})")

        # Test authentication
        login_response = requests.post(f'{BASE_URL}/login', data=AUTH_CREDENTIALS, timeout=10)
        if login_response.status_code != 200:
            print(f"❌ Authentication failed: {login_response.status_code} - {login_response.text}")
            return None

        token = login_response.json().get('access_token')
        if not token:
            # A 200 without a token cannot authenticate later requests, so
            # do not report it as a successful login.
            print("❌ Authentication failed: login response did not include an access_token")
            return None

        print("✅ Authentication successful")
        return token

    except Exception as e:
        print(f"❌ Server connectivity test failed: {e}")
        return None

def clear_existing_documents(token):
    """Ask the server to delete all documents without ever blocking the run.

    Issues an authenticated DELETE to ``/documents`` and prints the outcome.
    The step is best-effort: True is returned for a 200 response, for any
    other status code, and when the request raises.
    """
    print("\n🗑️ Clearing existing documents...")

    try:
        auth_header = {'Authorization': f'Bearer {token}'}
        reply = requests.delete(f'{BASE_URL}/documents', headers=auth_header, timeout=30)

        outcome = (
            "✅ Documents cleared successfully"
            if reply.status_code == 200
            else f"⚠️ Clear documents response: {reply.status_code}"
        )
        print(outcome)
    except Exception as e:
        print(f"⚠️ Clear documents failed: {e}")

    # Every path continues the workflow, whether or not the clear worked.
    return True

def upload_ocr_pdf(token):
    """Send the PDF at ``OCR_PDF_PATH`` to the server's upload endpoint.

    Returns the decoded JSON body of a 200 response. Returns False when the
    file does not exist, the server answers with another status, or an
    exception is raised along the way.
    """
    print(f"\n📤 Uploading {OCR_PDF_PATH}...")

    try:
        auth_header = {'Authorization': f'Bearer {token}'}

        if not os.path.exists(OCR_PDF_PATH):
            print(f"❌ OCR PDF not found: {OCR_PDF_PATH}")
            return False

        # The file stays open only for the duration of the request, which
        # uses a longer timeout than the other calls in this script.
        with open(OCR_PDF_PATH, 'rb') as pdf_file:
            reply = requests.post(
                f'{BASE_URL}/documents/upload',
                files={'file': (OCR_PDF_PATH, pdf_file, 'application/pdf')},
                headers=auth_header,
                timeout=60,
            )

        if reply.status_code != 200:
            print(f"❌ Upload failed: {reply.status_code} - {reply.text}")
            return False

        payload = reply.json()
        print(f"✅ Upload successful: {payload}")
        return payload

    except Exception as e:
        print(f"❌ Upload failed: {e}")
        return False

def monitor_processing(token, max_wait=120, poll_interval=5):
    """Poll the document list until a document completes, fails, or time runs out.

    Before each poll the function sleeps ``poll_interval`` seconds, then
    queries ``/documents`` and prints the sizes of the ``completed``,
    ``processing`` and ``failed`` lists found under the response's
    ``statuses`` key.

    Args:
        token: Bearer token sent in the Authorization header.
        max_wait: Time budget in seconds; ``max_wait // poll_interval`` polls
            are made.
        poll_interval: Seconds to sleep before each poll. Must be positive.

    Returns:
        bool: True as soon as a poll reports at least one completed document.
        False when a poll reports a failed document (and none completed), or
        when the polls are used up without either.

    Raises:
        ValueError: If ``poll_interval`` is not positive.
    """
    if poll_interval <= 0:
        raise ValueError("poll_interval must be positive")

    print(f"\n🔄 Monitoring OCR processing (max {max_wait}s)...")

    headers = {'Authorization': f'Bearer {token}'}

    for attempt in range(1, int(max_wait // poll_interval) + 1):
        time.sleep(poll_interval)
        elapsed = attempt * poll_interval

        # Each poll is guarded on its own: a single timeout or malformed
        # response is reported and the next poll still runs, instead of
        # aborting the whole monitoring loop.
        try:
            docs_response = requests.get(f'{BASE_URL}/documents', headers=headers, timeout=10)
            if docs_response.status_code != 200:
                print(f"⚠️ Status check after {elapsed}s returned {docs_response.status_code}")
                continue

            statuses = docs_response.json().get('statuses', {})
            completed_docs = statuses.get('completed', [])
            failed_docs = statuses.get('failed', [])
            processing = len(statuses.get('processing', []))

            print(f"⏰ Progress after {elapsed}s: Processing={processing}, "
                  f"Completed={len(completed_docs)}, Failed={len(failed_docs)}")

            # Completed documents are checked first, so a completed entry
            # wins over a failed one seen in the same poll.
            if completed_docs:
                for doc in completed_docs:
                    print(f"🎉 Completed: {doc.get('file_path')}")
                    print(f"   Content length: {doc.get('content_length', 0)}")
                    print(f"   Chunks: {doc.get('chunks_count', 0)}")
                return True

            if failed_docs:
                for doc in failed_docs:
                    print(f"❌ Failed: {doc.get('file_path')}")
                    print(f"   Error: {doc.get('error_msg', 'Unknown error')}")
                return False
        except Exception as e:
            print(f"⚠️ Status check after {elapsed}s failed: {e}")

    print("⏰ Processing timeout - check server logs for details")
    return False

def test_search_functionality(token):
    """Run a fixed list of search queries and report how many succeed.

    Each query is POSTed as JSON to ``/api/search``. A query counts as
    successful when the response status is 200, whatever the number of
    results; the first result's ``content`` is previewed when there is one.

    Returns True if at least one query succeeded, False otherwise.
    """
    print("\n🔍 Testing Search Functionality...")

    try:
        auth_header = {'Authorization': f'Bearer {token}'}

        # Phrases to look up in the indexed OCR content
        queries = (
            "safety precautions",
            "minimum safe distance",
            "high voltage work",
            "traction voltage",
            "conductive tools",
            "live parts",
        )

        passed = 0
        for query in queries:
            try:
                reply = requests.post(
                    f'{BASE_URL}/api/search',
                    json={'query': query},
                    headers=auth_header,
                    timeout=15,
                )

                if reply.status_code != 200:
                    print(f"❌ Search '{query}' failed: {reply.status_code}")
                    continue

                hits = reply.json().get('results', [])
                print(f"✅ Search '{query}': Found {len(hits)} results")

                # Preview at most the first 100 characters of the top hit
                if hits:
                    preview = hits[0].get('content', '')[:100] + '...'
                    print(f"   📄 First result: {preview}")

                passed += 1

            except Exception as e:
                print(f"❌ Search '{query}' error: {e}")

        print(f"\n📊 Search test: {passed}/{len(queries)} queries successful")
        return passed > 0

    except Exception as e:
        print(f"❌ Search functionality test failed: {e}")
        return False

def main():
    """Run the OCR workflow end to end and report the outcome.

    Steps, in order: verify the GPU environment, log in to the server, clear
    existing documents (best-effort), upload the PDF, wait for processing,
    and run the search queries. The run stops at the first failing step up
    to and including processing; the search result is only reported.

    Returns:
        int: 0 when processing and search both succeeded, 1 otherwise, so
        the value can be passed directly to ``sys.exit``.
    """
    print("🚀 Complete OCR Workflow Test with GPU Mode")
    print("=" * 60)

    # Step 1: Verify GPU environment
    if not verify_gpu_environment():
        print("❌ Cannot proceed - GPU environment not ready")
        return 1

    # Step 2: Test server connectivity
    token = test_server_connectivity()
    if not token:
        print("❌ Cannot proceed - server connectivity failed")
        return 1

    # Step 3: Clear existing documents (a failure here does not stop the run)
    if not clear_existing_documents(token):
        print("⚠️ Clear documents failed, but continuing...")

    # Step 4: Upload OCR PDF
    upload_result = upload_ocr_pdf(token)
    if not upload_result:
        print("❌ OCR PDF upload failed")
        return 1

    # Step 5: Monitor processing
    processing_ok = monitor_processing(token)
    if not processing_ok:
        print("❌ OCR processing failed")
        return 1

    # Step 6: Test search functionality
    search_ok = test_search_functionality(token)

    # Final results; the first three steps must have passed to reach here
    print("\n" + "=" * 60)
    print("📊 FINAL OCR WORKFLOW RESULTS:")
    print("   GPU Environment: ✅")
    print("   Server Connectivity: ✅")
    print("   OCR PDF Upload: ✅")
    print(f"   Processing: {'✅' if processing_ok else '❌'}")
    print(f"   Search: {'✅' if search_ok else '❌'}")

    if processing_ok and search_ok:
        print("\n🎉 SUCCESS: OCR PDF upload, indexing, and search working with GPU mode!")
        print("   The scanned table document has been successfully processed and is searchable.")
        return 0

    print("\n⚠️ PARTIAL SUCCESS: Some workflow steps completed, but issues remain.")
    print("   Check server logs for detailed error information.")
    return 1


if __name__ == "__main__":
    # Propagate the outcome as the process exit status so callers can detect failure.
    sys.exit(main())