# railseek6/force_paddleocr_fix.py

import os
import sys

# Source of the generated PaddleOCR smoke-test script (written to disk verbatim).
_PADDLEOCR_TEST_SCRIPT = """
import paddleocr
import cv2
import os

def test_paddleocr():
    print("Testing PaddleOCR installation...")
    try:
        # Initialize PaddleOCR
        ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')

        # Test with a simple image if available
        test_image = "test_ocr_image.png"
        if os.path.exists(test_image):
            result = ocr.ocr(test_image, cls=True)
            print(f"PaddleOCR test successful: {len(result)} results")
        else:
            print("PaddleOCR initialized successfully (no test image available)")

    except Exception as e:
        print(f"PaddleOCR test failed: {e}")

if __name__ == "__main__":
    test_paddleocr()
"""

# Source of the generated document-clearing script (written to disk verbatim).
# NOTE: when at least one failed document is reported, the generated script
# issues a DELETE on /documents, i.e. it clears ALL documents, not just failed ones.
_CLEAR_FAILED_DOCS_SCRIPT = """
import requests
import json

def clear_failed_documents():
    base_url = "http://localhost:3015"
    api_key = "lightrag-test-key"

    headers = {
        "X-API-Key": api_key
    }

    try:
        # Get current documents
        docs_response = requests.get(f"{base_url}/documents", headers=headers)
        if docs_response.status_code == 200:
            documents = docs_response.json()
            failed_docs = documents.get('statuses', {}).get('failed', [])

            if failed_docs:
                print(f"Found {len(failed_docs)} failed documents to clear")

                # Clear all documents
                clear_response = requests.delete(f"{base_url}/documents", headers=headers)
                if clear_response.status_code == 200:
                    print("Successfully cleared all documents")
                else:
                    print(f"Failed to clear documents: {clear_response.status_code}")
            else:
                print("No failed documents found")
        else:
            print(f"Could not fetch documents: {docs_response.status_code}")

    except Exception as e:
        print(f"Error clearing documents: {e}")

if __name__ == "__main__":
    clear_failed_documents()
"""


def _ensure_paddleocr_installed():
    """Check that PaddleOCR is importable, trying a pip install when it is not.

    Returns:
        True when PaddleOCR imported successfully or pip exited with code 0,
        False when the installation attempt failed.
    """
    try:
        import paddleocr  # noqa: F401 -- imported only to probe availability
    except ImportError:
        pass
    else:
        print("PaddleOCR is installed")
        return True

    import subprocess

    print("PaddleOCR is not installed. Installing...")
    # Invoke pip through the running interpreter so the package lands in the
    # same environment this script runs in, not whichever "pip" is on PATH.
    install_cmd = [sys.executable, "-m", "pip", "install", "paddleocr"]
    try:
        completed = subprocess.run(install_cmd, check=False)
    except OSError as exc:
        print(f"PaddleOCR installation failed: {exc}")
        return False

    if completed.returncode != 0:
        print(f"PaddleOCR installation failed (pip exit code {completed.returncode})")
        return False

    print("PaddleOCR installed")
    return True


def _update_lightrag_config(config_path):
    """Switch ``ocr_engine`` from aquaforest to paddleocr in ``config_path``.

    Does nothing when the file does not exist. The file is rewritten only when
    the replacement actually changed its content, so success is never reported
    for a no-op.
    """
    if not os.path.exists(config_path):
        return

    with open(config_path, 'r') as f:
        content = f.read()

    if 'ocr_engine = paddleocr' in content:
        print("LightRAG config.ini already set to paddleocr")
        return

    print("LightRAG config.ini not set to paddleocr")
    updated = content.replace('ocr_engine = aquaforest', 'ocr_engine = paddleocr')
    if updated == content:
        # No aquaforest entry to replace: leave the file alone and say so.
        print("No 'ocr_engine = aquaforest' entry found; LightRAG config.ini left unchanged")
        return

    with open(config_path, 'w') as f:
        f.write(updated)
    print("Updated LightRAG config.ini to use paddleocr")


def _write_script(path, source):
    """Write ``source`` to ``path`` as UTF-8, overwriting any existing file."""
    with open(path, "w", encoding='utf-8') as f:
        f.write(source)


def force_paddleocr_config():
    """Force PaddleOCR configuration by modifying environment and config files.

    Steps performed, in order:

    1. Set ``LIGHTRAG_OCR_ENGINE=paddleocr`` in this process's environment.
    2. Make sure PaddleOCR is importable, attempting ``pip install`` if not.
       A failed install is reported but does not abort the remaining steps.
    3. If ``LightRAG-main/config.ini`` exists, switch its ``ocr_engine`` entry
       from aquaforest to paddleocr.
    4. Write ``test_paddleocr_installation.py`` and ``clear_failed_docs.py``
       into the current working directory.
    5. Print the follow-up commands to run manually.

    Returns:
        None. Progress is reported on stdout.
    """
    print("Forcing PaddleOCR configuration...")

    # 1. Set the environment variable. This only affects the current process
    #    and the children it spawns; it is not persisted system-wide.
    os.environ['LIGHTRAG_OCR_ENGINE'] = 'paddleocr'
    print("Set LIGHTRAG_OCR_ENGINE=paddleocr environment variable")

    # 2. Check if PaddleOCR is available (best effort: keep going on failure)
    _ensure_paddleocr_installed()

    # 3. Verify configuration in LightRAG
    _update_lightrag_config("LightRAG-main/config.ini")

    # 4. Create a test script to verify OCR functionality
    _write_script("test_paddleocr_installation.py", _PADDLEOCR_TEST_SCRIPT)
    print("Created PaddleOCR test script")

    # 5. Create the script that clears failed documents; nothing is cleared
    #    here, the user runs the generated script afterwards.
    print("Clearing existing failed documents...")
    _write_script("clear_failed_docs.py", _CLEAR_FAILED_DOCS_SCRIPT)
    print("Created document clearing script")

    print("\nNext steps:")
    print("1. Run: python test_paddleocr_installation.py (to verify PaddleOCR)")
    print("2. Run: python clear_failed_docs.py (to clear failed documents)")
    print("3. Restart the LightRAG server")
    print("4. Run: python test_ocr_upload_final.py (to test OCR PDF upload)")

# Run the configuration routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    force_paddleocr_config()