# railseek6/test_ocr_cpu_mode.py

import requests
import time
import os

base_url = 'http://localhost:3015'

def wait_for_server(timeout=60):
    """Poll the server root URL until it answers with HTTP 200.

    Args:
        timeout: Maximum number of seconds to keep polling before giving up.

    Returns:
        True as soon as a GET on ``base_url`` returns status 200, False if
        the timeout elapses first.
    """
    print("Waiting for server to start...")
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    start_time = time.monotonic()
    while time.monotonic() - start_time < timeout:
        try:
            response = requests.get(f'{base_url}/', timeout=5)
        except requests.exceptions.RequestException:
            # Server is not accepting connections yet; retry after the pause.
            # Only request errors are swallowed so Ctrl-C still interrupts.
            pass
        else:
            if response.status_code == 200:
                print("✓ Server is ready")
                return True
        time.sleep(2)
    print("✗ Server did not start within timeout")
    return False

def _find_document(documents, file_path):
    """Return the first entry of *documents* whose 'file_path' equals *file_path*, else None."""
    for doc in documents:
        if doc.get('file_path') == file_path:
            return doc
    return None


def _run_search_queries(headers):
    """POST a fixed set of queries to /search and print a summary of each response."""
    print("\n=== Testing Search ===")
    for query in ("table", "data", "information", "document"):
        print(f"Searching for: '{query}'")
        search_response = requests.post(
            f'{base_url}/search',
            json={'query': query, 'top_k': 5},
            headers=headers,
            timeout=10,
        )
        if search_response.status_code != 200:
            print(f"  Search failed: {search_response.text}")
            continue

        results = search_response.json().get('results', [])
        print(f"  Results: {len(results)}")
        for rank, result in enumerate(results[:2], start=1):  # Show first 2 results
            score = result.get('score')
            # A missing or non-numeric score must not raise here: the caller's
            # broad handler would report a completed OCR run as failed.
            if isinstance(score, (int, float)):
                score_text = f"{score:.3f}"
            else:
                score_text = str(score)
            print(f"    {rank}. Score: {score_text}")
            print(f"       Text: {(result.get('text') or '')[:100]}...")


def _monitor_processing(headers, file_path='ocr.pdf', max_attempts=60, poll_interval=10):
    """Poll /documents until *file_path* is completed or failed, or attempts run out.

    Returns True when the file shows up under 'completed' (after running the
    search checks), False when it shows up under 'failed' or polling times out.
    """
    for attempt in range(max_attempts):
        try:
            docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=10)
            if docs_response.status_code == 200:
                statuses = docs_response.json().get('statuses', {})

                finished = _find_document(statuses.get('completed', []), file_path)
                if finished is not None:
                    print("✓ OCR processing completed!")
                    print(f"  File: {finished.get('file_path')}")
                    print(f"  Size: {finished.get('file_size')}")
                    print(f"  Chunks: {finished.get('chunk_count')}")
                    # Kept inside this try so a transient request error during
                    # search is retried on the next poll rather than aborting.
                    _run_search_queries(headers)
                    return True

                if _find_document(statuses.get('processing', []), file_path) is not None:
                    print(f"  Processing... ({attempt + 1}/{max_attempts})")
                else:
                    # Not processing any more: see whether it ended up failed.
                    broken = _find_document(statuses.get('failed', []), file_path)
                    if broken is not None:
                        print("✗ OCR processing failed!")
                        print(f"  Error: {broken.get('error_msg', 'Unknown error')}")
                        return False

            time.sleep(poll_interval)

        except requests.exceptions.RequestException as e:
            print(f"  Connection error (attempt {attempt + 1}/{max_attempts}): {e}")
            time.sleep(poll_interval)

    print("✗ OCR processing timed out")
    return False


def test_ocr_with_cpu_mode():
    """Upload ocr.pdf to the server and wait for OCR processing to finish.

    Steps: wait for the server, log in, delete existing documents, upload
    ``ocr.pdf`` from the current directory, poll ``/documents`` (60 attempts,
    10 seconds apart) and, once the file is completed, run a few searches.

    Returns:
        True if the document reached the 'completed' status, False on any
        failure (server unavailable, login/upload error, processing failure,
        timeout, or an unexpected exception).
    """
    print("=== TESTING OCR PDF WITH CPU-ONLY MODE ===")
    print("Target file: ocr.pdf (scanned table document)")

    # Wait for server
    if not wait_for_server():
        return False

    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to environment variables or a local, untracked config file.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    try:
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=10)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Clear existing documents first
        print("=== Clearing existing documents ===")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=10)
        print(f"Clear status: {clear_response.status_code}")
        if clear_response.status_code == 200:
            print("✓ Documents cleared")

        # Upload OCR PDF
        print("\n=== Uploading OCR PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize('ocr.pdf')} bytes)")

        with open('ocr.pdf', 'rb') as f:
            files = {'file': ('ocr.pdf', f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=30
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== Monitoring OCR Processing (CPU Mode) ===")
        print("Processing will be slower but more reliable...")

        # 60 attempts x 10 s = 10 minutes, a longer budget for CPU processing.
        return _monitor_processing(headers, 'ocr.pdf', max_attempts=60, poll_interval=10)
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected
        # (missing file, malformed JSON, ...) as a test failure.
        print(f"✗ Error during OCR test: {e}")
        return False

if __name__ == "__main__":
    # Script entry point: run the end-to-end OCR check and print the verdict.
    success = test_ocr_with_cpu_mode()
    print(
        "\n🎉 SUCCESS: OCR PDF with scanned table processed successfully!"
        if success
        else "\n❌ FAILED: OCR processing did not complete successfully"
    )