# railseek6/test_ocr_pdf_fixed.py

import os
import sys
import subprocess
import requests
import time
import fitz  # PyMuPDF
from PIL import Image
import io
import numpy as np

def setup_cuda_environment():
    """Build a process environment that points at the CUDA 11.8 toolkit.

    Returns:
        A copy of the current environment with the CUDA variables, the OCR
        engine selection, UTF-8 encoding hints and a PATH that starts with the
        CUDA ``bin`` directory, or None when the toolkit directory is missing.
    """
    print("=== SETTING UP CUDA 11.8 ENVIRONMENT ===")

    toolkit_root = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'
    if not os.path.exists(toolkit_root):
        print(f"✗ CUDA 11.8 not found at: {toolkit_root}")
        return None
    print(f"✓ CUDA 11.8 found at: {toolkit_root}")

    toolkit_bin = os.path.join(toolkit_root, 'bin')
    env = dict(os.environ)

    # Drop every inherited PATH entry that mentions CUDA or NVIDIA so that the
    # 11.8 bin directory placed in front cannot be shadowed by another install.
    kept_entries = [
        entry
        for entry in env.get('PATH', '').split(';')
        if not ('CUDA' in entry or 'NVIDIA' in entry)
    ]

    env.update({
        'CUDA_PATH': toolkit_root,
        'CUDA_HOME': toolkit_root,
        'CUDA_VISIBLE_DEVICES': '0',
        'LIGHTRAG_OCR_ENGINE': 'paddleocr',
        'PATH': f"{toolkit_bin};{';'.join(kept_entries)}",
        # Encoding hints for the child process.
        'PYTHONIOENCODING': 'utf-8',
        'LANG': 'en_US.UTF-8',
        'LC_ALL': 'en_US.UTF-8',
    })

    print("✓ Environment configured for CUDA 11.8")
    print(f"✓ PATH includes CUDA bin: {toolkit_bin}")

    # Informational only: a missing cuDNN DLL is reported but does not abort.
    cudnn_dll = os.path.join(toolkit_bin, 'cudnn_ops_infer64_8.dll')
    verdict = "✓ cuDNN DLL found" if os.path.exists(cudnn_dll) else "✗ cuDNN DLL not found"
    print(f"{verdict}: {cudnn_dll}")

    return env

def test_paddleocr_gpu_direct():
    """Run PaddleOCR directly on every page of ``ocr.pdf``.

    Each page is rasterized with PyMuPDF, converted to a NumPy array through
    Pillow and passed to a single PaddleOCR instance created with
    ``use_gpu=True``. Every recognized line is printed with its confidence.

    Returns:
        True when text was extracted from at least one page. False when no
        text was found or when any exception occurred (including a missing
        ``paddle`` / ``paddleocr`` installation); the traceback is printed.
    """
    print("\n=== TESTING PADDLEOCR GPU DIRECTLY ON OCR.PDF ===")

    try:
        import paddle
        from paddleocr import PaddleOCR

        print(f"✓ PaddlePaddle version: {paddle.__version__}")
        print(f"✓ GPU available: {paddle.is_compiled_with_cuda()}")

        if paddle.is_compiled_with_cuda():
            paddle.device.set_device('gpu')
            print("✓ Using GPU for PaddleOCR")

        # Method 1: Convert PDF to images first, then run OCR
        print("\n--- Method 1: Converting PDF to images first ---")

        all_text = []

        pdf_document = fitz.open('ocr.pdf')
        try:
            print(f"✓ PDF opened successfully, {pdf_document.page_count} pages")

            # Build the OCR engine once: construction loads the models, so
            # doing it per page only repeats identical, expensive work.
            ocr = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=True)

            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                png_bytes = page.get_pixmap().tobytes("png")

                # Convert to PIL Image then to numpy array
                image_np = np.array(Image.open(io.BytesIO(png_bytes)))

                result = ocr.ocr(image_np, cls=False)
                if not (result and result[0]):
                    print(f"  Page {page_num+1}: No text detected")
                    continue

                page_words = []
                for line in result[0]:
                    # Each entry is indexed as line[1] == (text, confidence).
                    text, confidence = line[1][0], line[1][1]
                    page_words.append(text)
                    print(f"  Page {page_num+1}: '{text}' (confidence: {confidence:.3f})")

                all_text.append(" ".join(page_words).strip())
        finally:
            # Release the document even if rasterizing or OCR fails mid-way.
            pdf_document.close()

        if not all_text:
            print("✗ No text extracted from PDF")
            return False

        print(f"\n✓ Successfully extracted text from {len(all_text)} pages")
        full_text = " ".join(all_text)
        print(f"Total text length: {len(full_text)} characters")
        print(f"Text preview: {full_text[:200]}...")
        return True

    except Exception as e:
        print(f"✗ Error in direct PaddleOCR test: {e}")
        import traceback
        traceback.print_exc()
        return False

def start_lightrag_server_with_ocr_fix(env):
    """Launch ``lightrag-server`` on port 3015 and wait until it answers HTTP.

    Args:
        env: Environment mapping handed to the server process.

    Returns:
        The running ``subprocess.Popen`` handle once ``GET /`` returns 200.
        None when the process cannot be launched, exits during startup, or is
        not ready after 60 polling attempts; in those cases any process that
        was spawned has been stopped before returning.
    """
    print("\n=== STARTING LIGHTRAG SERVER WITH OCR FIX ===")

    cmd = [
        'lightrag-server',
        '--port', '3015',
        '--embedding-binding', 'ollama',
        '--rerank-binding', 'null',
        '--host', '0.0.0.0'
    ]
    print(f"Starting server: {' '.join(cmd)}")

    try:
        # Output is discarded instead of piped: nothing ever drains the pipes,
        # so a chatty server would block as soon as the OS pipe buffer filled.
        process = subprocess.Popen(
            cmd,
            env=env,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception as e:
        print(f"✗ Failed to start server: {e}")
        return None

    # Wait for server to start
    print("Waiting for server to start...")
    ready = False
    try:
        for _ in range(60):
            # No point polling HTTP if the process has already died.
            if process.poll() is not None:
                print(f"✗ Server exited during startup (exit code {process.returncode})")
                return None

            try:
                response = requests.get('http://localhost:3015/', timeout=5)
            except requests.exceptions.RequestException:
                pass  # not accepting connections yet; retry after a pause
            else:
                if response.status_code == 200:
                    print("✓ Server started successfully!")
                    ready = True
                    return process

            time.sleep(1)

        print("✗ Server failed to start within timeout")
        return None
    finally:
        # Never leave an orphaned server behind on failure or interruption.
        if not ready and process.poll() is None:
            process.terminate()
            try:
                process.wait(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                process.wait()

def test_ocr_upload_workflow():
    """Log in, clear documents, upload ``ocr.pdf`` and wait for processing.

    Talks to the server at ``http://localhost:3015``. The login credentials
    default to the values committed in this file and can be overridden with
    the ``LIGHTRAG_TEST_USERNAME`` / ``LIGHTRAG_TEST_PASSWORD`` environment
    variables.

    Returns:
        True when ``ocr.pdf`` shows up in the completed list within five
        minutes. False on login/upload failure, a missing track id, a failed
        document, a timeout, or any unexpected exception.
    """
    print("\n=== TESTING OCR UPLOAD WORKFLOW ===")
    base_url = 'http://localhost:3015'
    pdf_name = 'ocr.pdf'

    try:
        # Login
        # NOTE(review): credentials are committed in source; kept as defaults
        # for compatibility, but prefer supplying them via the environment.
        login_data = {
            'username': os.environ.get('LIGHTRAG_TEST_USERNAME', 'jleu3482'),
            'password': os.environ.get('LIGHTRAG_TEST_PASSWORD', 'jleu1212'),
        }
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=30)

        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Clear existing documents (status is reported, not enforced)
        print("Clearing existing documents...")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=30)
        print(f"Clear status: {clear_response.status_code}")

        # Upload OCR PDF
        print("\n=== UPLOADING OCR.PDF ===")
        print(f"File: {pdf_name} ({os.path.getsize(pdf_name)} bytes)")

        with open(pdf_name, 'rb') as f:
            files = {'file': (pdf_name, f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=60
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")

        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        # Monitor processing
        print("\n=== MONITORING OCR PROCESSING ===")
        print("OCR processing with GPU acceleration...")

        max_wait = 300  # 5 minutes
        poll_interval = 10
        report_interval = 30
        # Monotonic clock: the timeout must not be affected by wall-clock jumps.
        start_time = time.monotonic()
        # Threshold-based reporting; an exact `elapsed % 30 == 0` test is
        # missed as soon as request latency makes the loop period drift.
        next_report = 0

        while time.monotonic() - start_time < max_wait:
            try:
                # Check document status
                docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
                if docs_response.status_code == 200:
                    # NOTE(review): the 'completed'/'processing'/'failed' keys
                    # and the per-document field names are taken as-is from
                    # this script; confirm them against the server's schema.
                    statuses = docs_response.json().get('statuses', {})
                    elapsed = int(time.monotonic() - start_time)

                    # Check for our file in completed
                    for doc in statuses.get('completed', []):
                        if doc.get('file_path') == pdf_name:
                            print(f"\n🎉 OCR PROCESSING COMPLETED in {elapsed} seconds!")
                            print(f"  File: {doc.get('file_path')}")
                            print(f"  Size: {doc.get('file_size')}")
                            print(f"  Chunks: {doc.get('chunk_count')}")
                            return True

                    # Check if failed
                    for doc in statuses.get('failed', []):
                        if doc.get('file_path') == pdf_name:
                            print(f"✗ OCR processing failed: {doc.get('error_msg', 'Unknown error')}")
                            return False

                    # Still processing
                    if elapsed >= next_report:
                        processing = statuses.get('processing', [])
                        print(f"  Still processing... ({elapsed}s elapsed, {len(processing)} files processing)")
                        next_report = elapsed + report_interval

            except requests.exceptions.RequestException as e:
                # Transient connection problems are tolerated until the deadline.
                print(f"  Connection error: {e}")

            time.sleep(poll_interval)

        print(f"✗ OCR processing timed out after {max_wait} seconds")
        return False

    except Exception as e:
        print(f"✗ Error during OCR workflow test: {e}")
        return False

def main():
    """Run the full OCR test sequence against ``ocr.pdf``.

    Steps, each aborting the run on failure: configure the CUDA environment,
    run PaddleOCR directly on the PDF, start the LightRAG server, then run the
    upload workflow. The server process is always stopped afterwards.
    """
    print("OCR PDF TEST WITH GPU PADDLEOCR")
    print("=" * 50)
    print("Testing: Direct OCR → Server Upload → Processing")
    print("CUDA 11.8: Enabled")
    print("Document: ocr.pdf")
    print("=" * 50)

    # Step 1: Setup CUDA environment
    env = setup_cuda_environment()
    if not env:
        print("\n❌ CUDA setup failed")
        return

    # Step 2: Test PaddleOCR GPU directly on ocr.pdf
    if not test_paddleocr_gpu_direct():
        print("\n❌ Direct PaddleOCR test failed")
        return

    # Step 3: Start server
    server_process = start_lightrag_server_with_ocr_fix(env)
    if not server_process:
        print("\n❌ Failed to start server")
        return

    try:
        # Step 4: Test complete upload workflow
        success = test_ocr_upload_workflow()

        if success:
            print("\n" + "=" * 50)
            print("🎉 SUCCESS: OCR PDF WORKFLOW COMPLETED!")
            print("=" * 50)
            print("The ocr.pdf document has been:")
            print("✓ Successfully processed with GPU-accelerated OCR")
            print("✓ Uploaded to the LightRAG server")
            print("✓ Indexed and made searchable")
            print("\nYou can now access the web UI at: http://localhost:3015")
        else:
            print("\n❌ OCR workflow failed")

    finally:
        # Clean up: ask the server to stop, force-kill if it does not comply.
        print("\nStopping server...")
        server_process.terminate()
        try:
            server_process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # Only the wait timeout warrants a kill; other exceptions
            # (e.g. KeyboardInterrupt) must propagate.
            server_process.kill()
            server_process.wait()

        print("Test completed.")

# Run the test sequence only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()