# railseek6/fix_and_test_ocr.py

import os
import sys
import subprocess
import requests
import time
import json

def fix_cudnn_issue():
    """Point the process environment at the CUDA 12.9 toolkit and pre-load cuDNN.

    Sets ``CUDA_VISIBLE_DEVICES``, ``CUDA_PATH`` and ``CUDA_HOME``, makes sure
    the toolkit's ``bin`` directory is an entry of ``PATH``, and then tries to
    load ``cudnn64_8.dll`` from that directory via ``ctypes``.

    The DLL pre-load is best-effort: a missing or unloadable DLL only prints a
    warning.

    Returns:
        bool: Always ``True``.
    """
    print("=== FIXING cuDNN VERSION DETECTION ===")

    # NOTE(review): the original comment says these variables "bypass the
    # cuDNN version check"; that effect is not verifiable from this file.
    cuda_path = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9'
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    os.environ['CUDA_PATH'] = cuda_path
    os.environ['CUDA_HOME'] = cuda_path
    print(f"✓ Set CUDA_PATH: {cuda_path}")

    # Compare whole PATH entries rather than doing a substring search, so a
    # neighbouring directory such as "...\bin_old" is not mistaken for "bin".
    cuda_bin = os.path.join(cuda_path, 'bin')
    current_path = os.environ.get('PATH', '')
    known_entries = {
        os.path.normcase(entry) for entry in current_path.split(os.pathsep)
    }
    if os.path.normcase(cuda_bin) in known_entries:
        print(f"✓ Already on PATH: {cuda_bin}")
    else:
        # Avoid a dangling separator (an empty PATH entry) when PATH is empty.
        if current_path:
            os.environ['PATH'] = cuda_bin + os.pathsep + current_path
        else:
            os.environ['PATH'] = cuda_bin
        print(f"✓ Added to PATH: {cuda_bin}")

    # Pre-load the cuDNN library without calling any of its version functions.
    import ctypes

    cudnn_path = os.path.join(cuda_bin, 'cudnn64_8.dll')
    if not os.path.isfile(cudnn_path):
        print(f"⚠ cuDNN DLL not found: {cudnn_path}")
        return True

    print(f"✓ cuDNN DLL found: {cudnn_path}")
    try:
        ctypes.WinDLL(cudnn_path)
    except (OSError, AttributeError) as e:
        # OSError: the DLL could not be loaded.
        # AttributeError: ctypes.WinDLL does not exist on non-Windows hosts.
        print(f"⚠ cuDNN pre-load warning: {e}")
    else:
        print("✓ cuDNN pre-loaded successfully")

    return True

def _stop_process(process, grace_seconds=10):
    """Terminate *process*, escalating to kill if it does not exit in time."""
    if process.poll() is not None:
        return
    process.terminate()
    try:
        process.wait(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()


def start_server_with_fixes():
    """Launch ``lightrag-server`` on port 3015 and wait until it answers HTTP 200.

    Sets ``LIGHTRAG_OCR_ENGINE=paddleocr`` in this process's environment
    (inherited by the child), starts the server, then polls
    ``http://localhost:3015/`` up to 30 times, one second apart.

    Returns:
        subprocess.Popen | None: The running server process once it responds
        with status 200. ``None`` if the executable could not be launched, the
        process exited before becoming ready, or it never became ready; in the
        last case the process is stopped before returning.
    """
    print("\n=== STARTING SERVER WITH FIXES ===")

    # Set environment for LightRAG
    os.environ['LIGHTRAG_OCR_ENGINE'] = 'paddleocr'

    # Start the server using the original zrun.bat approach
    cmd = [
        'lightrag-server',
        '--port', '3015',
        '--embedding-binding', 'ollama',
        '--rerank-binding', 'null',
        '--host', '0.0.0.0',
    ]
    print(f"Starting server with command: {' '.join(cmd)}")

    try:
        # The child inherits this process's stdout/stderr. Capturing them with
        # PIPE without ever reading would block the server as soon as the OS
        # pipe buffer fills up.
        process = subprocess.Popen(cmd)
    except OSError as e:
        # Typically the executable is not installed or not on PATH.
        print(f"✗ Failed to start server: {e}")
        return None

    print("Waiting for server to start...")
    for _ in range(30):
        exit_code = process.poll()
        if exit_code is not None:
            # No point in polling a server that has already died.
            print(f"✗ Server exited early with code {exit_code}")
            return None
        try:
            response = requests.get('http://localhost:3015/', timeout=5)
            if response.status_code == 200:
                print("✓ Server started successfully!")
                return process
        except requests.exceptions.RequestException:
            # Not accepting connections yet; retry after a short pause.
            pass
        time.sleep(1)

    print("✗ Server failed to start within timeout")
    # Do not leave an orphaned server behind when giving up.
    _stop_process(process)
    return None

def _wait_for_document(base_url, headers, file_name, max_wait, poll_interval=10):
    """Poll ``/documents`` until *file_name* shows up as completed or failed.

    Returns:
        tuple: ``(outcome, doc, elapsed)`` where ``outcome`` is ``'completed'``,
        ``'failed'`` or ``'timeout'``, ``doc`` is the matching document entry
        (``None`` on timeout) and ``elapsed`` is the whole number of seconds
        waited.
    """
    start_time = time.monotonic()
    last_counts = None

    while time.monotonic() - start_time < max_wait:
        try:
            docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
            if docs_response.status_code == 200:
                statuses = docs_response.json().get('statuses', {})
                completed = statuses.get('completed') or []
                processing = statuses.get('processing') or []
                failed = statuses.get('failed') or []

                elapsed = int(time.monotonic() - start_time)

                # De-duplicate on the counts only. Including the elapsed time
                # in the comparison would make every poll look like a change.
                counts = (len(completed), len(processing), len(failed))
                if counts != last_counts:
                    print(
                        f"  Elapsed: {elapsed}s | Completed: {counts[0]}, "
                        f"Processing: {counts[1]}, Failed: {counts[2]}"
                    )
                    last_counts = counts

                for doc in completed:
                    if doc.get('file_path') == file_name:
                        return 'completed', doc, elapsed

                for doc in failed:
                    if doc.get('file_path') == file_name:
                        return 'failed', doc, elapsed
        except requests.exceptions.RequestException as e:
            print(f"  Connection error: {e}")

        time.sleep(poll_interval)

    return 'timeout', None, int(max_wait)


def _run_search_checks(base_url, headers):
    """Post a fixed set of sample queries to ``/search`` and print hit counts."""
    test_searches = [
        "table", "data", "information", "document",
        "scanned", "content", "text", "analysis",
    ]

    search_results = {}
    for query in test_searches:
        print(f"Searching: '{query}'")
        search_data = {'query': query, 'top_k': 5}
        try:
            search_response = requests.post(
                f'{base_url}/search', json=search_data, headers=headers, timeout=30
            )
            if search_response.status_code != 200:
                print(f"  Search failed: {search_response.text}")
                continue

            results = search_response.json().get('results', [])
            search_results[query] = len(results)
            print(f"  Found {len(results)} results")

            # Show top result for this query
            if results:
                top_result = results[0]
                score = top_result.get('score')
                if isinstance(score, (int, float)):
                    print(f"    Top result score: {score:.3f}")
                else:
                    # A missing or non-numeric score cannot take the float format.
                    print(f"    Top result score: {score}")
                text_preview = (top_result.get('text') or '')[:100]
                print(f"    Text preview: {text_preview}...")
        except Exception as e:
            # Deliberately broad: one bad query or an unexpected response
            # shape must not abort the remaining sample searches.
            print(f"  Search error: {e}")

    print("\n=== SEARCH SUMMARY ===")
    total_results = sum(search_results.values())
    print(f"Total search results across all queries: {total_results}")
    for query, count in search_results.items():
        print(f"  '{query}': {count} results")


def test_ocr_pdf_indexing():
    """Upload ``ocr.pdf`` to the local server, wait for processing, run sample searches.

    Steps: check that ``ocr.pdf`` exists in the current directory, log in,
    delete the existing documents, upload the PDF, poll ``/documents`` for up
    to 10 minutes until the file is reported as completed or failed, and on
    completion run the sample searches.

    Login credentials may be overridden with the ``LIGHTRAG_USERNAME`` and
    ``LIGHTRAG_PASSWORD`` environment variables; the previous hard-coded
    values remain the defaults.

    Returns:
        bool: ``True`` if the document reached the completed state, ``False``
        on any failure (missing file, login/upload error, processing failure,
        timeout, or an unexpected exception).
    """
    print("\n=== TESTING OCR PDF INDEXING ===")
    base_url = 'http://localhost:3015'
    pdf_name = 'ocr.pdf'
    max_wait = 600  # 10 minutes for OCR processing

    # NOTE(review): credentials are still embedded as defaults; prefer
    # supplying them through the environment.
    login_data = {
        'username': os.environ.get('LIGHTRAG_USERNAME', 'jleu3482'),
        'password': os.environ.get('LIGHTRAG_PASSWORD', 'jleu1212'),
    }

    try:
        # Fail fast before touching the server: the next steps delete the
        # existing documents, which must not happen if there is nothing to
        # upload afterwards.
        if not os.path.isfile(pdf_name):
            print(f"✗ OCR PDF not found: {os.path.abspath(pdf_name)}")
            return False

        # Login
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=30)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        if not token:
            # Without this check the requests below would send "Bearer None".
            print("✗ Login failed: no access token in response")
            return False
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Clear existing documents first
        print("Clearing existing documents...")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=30)
        print(f"Clear status: {clear_response.status_code}")

        # Upload OCR PDF
        print(f"\nUploading OCR PDF: {pdf_name} ({os.path.getsize(pdf_name)} bytes)")
        with open(pdf_name, 'rb') as f:
            files = {'file': (pdf_name, f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=60
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        # The track id is only checked for presence; progress is followed by
        # matching the file name in the /documents listing.
        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== MONITORING OCR PROCESSING ===")
        print("Processing OCR PDF with scanned table...")
        outcome, doc, elapsed = _wait_for_document(base_url, headers, pdf_name, max_wait)

        if outcome == 'failed':
            print(f"\n✗ OCR PROCESSING FAILED after {elapsed}s!")
            print(f"  Error: {doc.get('error_msg', 'Unknown error')}")
            return False

        if outcome == 'timeout':
            print(f"\n✗ OCR processing timed out after {max_wait} seconds")
            return False

        print(f"\n✓ OCR PROCESSING COMPLETED in {elapsed} seconds!")
        print(f"  File: {doc.get('file_path')}")
        print(f"  Size: {doc.get('file_size')}")
        print(f"  Chunks: {doc.get('chunk_count')}")
        print(f"  Processing time: {doc.get('processing_time', 'N/A')}")

        # Test search functionality
        print("\n=== TESTING SEARCH FUNCTIONALITY ===")
        _run_search_checks(base_url, headers)
        return True

    except Exception as e:
        # Top-level boundary of this test step: report and signal failure
        # instead of crashing the caller.
        print(f"✗ Error during OCR test: {e}")
        return False

def main():
    """Apply the CUDA/cuDNN environment setup, start the server and run the OCR test.

    The server process is always stopped before returning once it has been
    started, whether the test passes, fails or raises.

    Returns:
        int: ``0`` if the OCR PDF indexing test succeeded, ``1`` otherwise.
    """
    print("COMPREHENSIVE OCR FIX AND TEST")
    print("=" * 60)

    # Step 1: Fix cuDNN issues
    if not fix_cudnn_issue():
        print("✗ Failed to fix cuDNN issues")
        return 1

    # Step 2: Start server with fixes
    server_process = start_server_with_fixes()
    if not server_process:
        print("✗ Failed to start server")
        return 1

    try:
        # Step 3: Test OCR PDF indexing
        success = test_ocr_pdf_indexing()

        if success:
            print("\n🎉 SUCCESS: OCR PDF indexing completed successfully!")
            print("   The scanned table document has been processed and is searchable.")
        else:
            print("\n❌ FAILED: OCR PDF indexing did not complete successfully")

    finally:
        # Clean up - stop server
        print("\nStopping server...")
        server_process.terminate()
        try:
            # Bound the wait so a server that ignores terminate() cannot hang
            # this script forever.
            server_process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            server_process.kill()
            server_process.wait()

    return 0 if success else 1


if __name__ == "__main__":
    # Propagate the outcome as the process exit status so callers such as
    # batch files or CI can detect a failed run.
    sys.exit(main())