# railseek6/test_ocr_simple.py

import requests
import time
import os

# Root URL of the HTTP service under test; endpoint paths such as /login,
# /documents/upload, /documents and /search are appended to it.
base_url = 'http://localhost:3015'

def _find_document(docs, file_path):
    """Return the first dict in *docs* whose 'file_path' equals *file_path*, or None.

    A missing or null list is treated as empty.
    """
    return next((doc for doc in (docs or ()) if doc.get('file_path') == file_path), None)


def _print_search_results(headers):
    """Run one sample query against /search and print up to three hits.

    This is best-effort: any HTTP, network or JSON problem is printed as
    "Search failed: ..." and the function simply returns, so a search
    hiccup never changes the outcome of the OCR test itself.
    """
    print("\n=== Testing Search ===")
    search_data = {'query': 'table data information', 'top_k': 10}
    try:
        search_response = requests.post(
            f'{base_url}/search', json=search_data, headers=headers, timeout=30
        )
        if search_response.status_code != 200:
            print(f"Search failed: {search_response.text}")
            return
        results = search_response.json().get('results') or []
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Search failed: {e}")
        return

    print(f"Found {len(results)} search results")
    for rank, result in enumerate(results[:3], start=1):  # Show top 3
        # A hit without a numeric score must not crash the report.
        score = result.get('score')
        score_text = f"{score:.3f}" if isinstance(score, (int, float)) else str(score)
        text = result.get('text') or ''
        source = result.get('source') or {}

        print(f"\nResult {rank}:")
        print(f"  Score: {score_text}")
        print(f"  Text: {text[:200]}{'...' if len(text) > 200 else ''}")
        print(f"  Source: {source.get('file_path', 'Unknown')}")


def _wait_for_ocr(headers, file_path, max_wait, poll_interval):
    """Poll /documents until *file_path* completes, fails, or *max_wait* elapses.

    The response is expected to carry a 'statuses' mapping with 'completed',
    'processing' and 'failed' lists of document dicts. Progress is printed
    on every poll.

    Returns:
        True once the document shows up in the 'completed' list; False when
        it shows up in 'failed' or when *max_wait* seconds pass first.
    """
    # Monotonic clock: elapsed time stays correct if the wall clock is adjusted.
    start_time = time.monotonic()

    while time.monotonic() - start_time < max_wait:
        try:
            docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
            if docs_response.status_code != 200:
                print(f"  Status check returned HTTP {docs_response.status_code}, retrying...")
                time.sleep(poll_interval)
                continue
            statuses = docs_response.json().get('statuses') or {}
        except (requests.exceptions.RequestException, ValueError) as e:
            # Transient network / decode problems are retried until the deadline.
            print(f"  Connection error: {e}")
            time.sleep(poll_interval)
            continue

        elapsed = int(time.monotonic() - start_time)

        finished = _find_document(statuses.get('completed'), file_path)
        if finished is not None:
            print(f"\n✓ OCR processing completed in {elapsed} seconds!")
            print(f"  File: {finished.get('file_path')}")
            print(f"  Size: {finished.get('file_size')}")
            print(f"  Chunks: {finished.get('chunk_count')}")
            return True

        if _find_document(statuses.get('processing'), file_path) is not None:
            print(f"  Still processing... ({elapsed}s elapsed)")
        else:
            broken = _find_document(statuses.get('failed'), file_path)
            if broken is not None:
                print(f"\n✗ OCR processing failed after {elapsed}s!")
                print(f"  Error: {broken.get('error_msg', 'Unknown error')}")
                return False

            # Not in any list yet, might be queued
            print(f"  Waiting for processing to start... ({elapsed}s)")

        time.sleep(poll_interval)

    print(f"\n✗ OCR processing timed out after {max_wait} seconds")
    return False


def test_ocr_pdf_simple():
    """Upload ocr.pdf, wait for OCR processing to finish, then try a search.

    Existing documents are not cleared first. The steps are: log in,
    upload ocr.pdf from the current working directory, poll the document
    list for up to 15 minutes, and finally run one sample search whose
    results are only printed.

    Returns:
        True when the uploaded document reaches the 'completed' status.
        False on login or upload failure, a missing track id, a reported
        processing failure, a timeout, or any unexpected exception.
    """
    pdf_name = 'ocr.pdf'
    max_wait = 900  # 15 minutes
    poll_interval = 10  # seconds between status checks

    print("=== SIMPLE OCR PDF TEST ===")
    print("Testing ocr.pdf upload and processing")

    # NOTE(review): credentials are hard-coded in the test; consider moving
    # them to configuration if this file is shared.
    login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
    try:
        # Login
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=30)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Upload OCR PDF directly (skip clearing to avoid timeout)
        print("\n=== Uploading OCR PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize(pdf_name)} bytes)")

        with open(pdf_name, 'rb') as f:
            files = {'file': (pdf_name, f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=60
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== Monitoring OCR Processing ===")
        print("Started OCR processing...")
        print("This may take several minutes for CPU-based OCR...")

        if not _wait_for_ocr(headers, pdf_name, max_wait, poll_interval):
            return False

        # Test search immediately; its outcome does not affect the result.
        _print_search_results(headers)
        return True
    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"✗ Error during OCR test: {e}")
        return False

if __name__ == "__main__":
    # Introductory banner, one entry per printed line.
    for banner_line in (
        "Starting OCR PDF test...",
        "Note: This test uploads ocr.pdf with the scanned table",
        "      and monitors processing for up to 15 minutes.",
        "      OCR processing on CPU may be slow but more reliable.\n",
    ):
        print(banner_line)

    # Run the test once and report the matching verdict line.
    verdict = (
        "\n🎉 SUCCESS: OCR PDF with scanned table processed and searchable!"
        if test_ocr_pdf_simple()
        else "\n❌ OCR processing failed or timed out"
    )
    print(verdict)