"""Switch a local LightRAG checkout to PaddleOCR and generate helper scripts."""

import importlib.util
import os
import re
import subprocess
import sys
from pathlib import Path

DEFAULT_CONFIG_PATH = "LightRAG-main/config.ini"
TEST_SCRIPT_NAME = "test_paddleocr_installation.py"
CLEAR_SCRIPT_NAME = "clear_failed_docs.py"

# Matches a whole ``ocr_engine = <value>`` line. Group 1 is everything up to
# the value (so the original spacing is kept), group 2 is the value itself.
# ``[ \t]`` is used instead of ``\s`` so the match can never span lines.
_OCR_ENGINE_LINE = re.compile(
    r"^([ \t]*ocr_engine[ \t]*=[ \t]*)(.*?)[ \t]*$", re.MULTILINE
)

# Source of the generated script that smoke-tests the PaddleOCR installation.
_TEST_SCRIPT = """
import paddleocr
import cv2
import os

def test_paddleocr():
    print("Testing PaddleOCR installation...")

    try:
        # Initialize PaddleOCR
        ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')

        # Test with a simple image if available
        test_image = "test_ocr_image.png"
        if os.path.exists(test_image):
            result = ocr.ocr(test_image, cls=True)
            print(f"PaddleOCR test successful: {len(result)} results")
        else:
            print("PaddleOCR initialized successfully (no test image available)")

    except Exception as e:
        print(f"PaddleOCR test failed: {e}")

if __name__ == "__main__":
    test_paddleocr()
"""

# Source of the generated script that talks to the LightRAG HTTP API.
# NOTE: when at least one failed document exists, the generated script issues
# DELETE /documents, which clears ALL documents, not only the failed ones.
_CLEAR_SCRIPT = """
import requests
import json

def clear_failed_documents():
    base_url = "http://localhost:3015"
    api_key = "lightrag-test-key"
    headers = {
        "X-API-Key": api_key
    }

    try:
        # Get current documents
        docs_response = requests.get(f"{base_url}/documents", headers=headers)
        if docs_response.status_code == 200:
            documents = docs_response.json()
            failed_docs = documents.get('statuses', {}).get('failed', [])

            if failed_docs:
                print(f"Found {len(failed_docs)} failed documents to clear")

                # Clear all documents
                clear_response = requests.delete(f"{base_url}/documents", headers=headers)
                if clear_response.status_code == 200:
                    print("Successfully cleared all documents")
                else:
                    print(f"Failed to clear documents: {clear_response.status_code}")
            else:
                print("No failed documents found")
        else:
            print(f"Could not fetch documents: {docs_response.status_code}")

    except Exception as e:
        print(f"Error clearing documents: {e}")

if __name__ == "__main__":
    clear_failed_documents()
"""


def _paddleocr_available() -> bool:
    """Return True when the ``paddleocr`` package can be located for import."""
    try:
        return importlib.util.find_spec("paddleocr") is not None
    except (ImportError, ValueError):
        # find_spec raises ValueError for a half-initialised sys.modules entry;
        # treat that the same as "not importable".
        return False


def _ensure_paddleocr(install_missing: bool) -> None:
    """Report whether PaddleOCR is installed, optionally installing it via pip."""
    if _paddleocr_available():
        print("PaddleOCR is installed")
        return

    if not install_missing:
        print("PaddleOCR is not installed. Skipping installation")
        return

    print("PaddleOCR is not installed. Installing...")
    # Use this interpreter's pip (not whatever "pip" is first on PATH) and an
    # argument list rather than a shell string.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "paddleocr"], check=False
    )
    if result.returncode == 0:
        print("PaddleOCR installed")
    else:
        print(f"PaddleOCR installation failed (pip exit code {result.returncode})")


def _update_lightrag_config(config_path) -> None:
    """Point every ``ocr_engine`` line in the LightRAG config at paddleocr."""
    path = Path(config_path)
    if not path.is_file():
        print(f"LightRAG config not found at {path}; skipping config update")
        return

    content = path.read_text(encoding="utf-8")
    engines = [match.group(2) for match in _OCR_ENGINE_LINE.finditer(content)]

    if not engines:
        print("LightRAG config.ini has no ocr_engine setting; leaving it unchanged")
        return

    if all(engine == "paddleocr" for engine in engines):
        print("LightRAG config.ini already set to paddleocr")
        return

    print("LightRAG config.ini not set to paddleocr")
    # Replace whatever engine is configured, not just one known value.
    updated = _OCR_ENGINE_LINE.sub(r"\g<1>paddleocr", content)
    path.write_text(updated, encoding="utf-8")
    print("Updated LightRAG config.ini to use paddleocr")


def _write_script(directory: Path, name: str, source: str) -> None:
    """Write *source* to ``directory / name`` as UTF-8 text."""
    (directory / name).write_text(source, encoding="utf-8")


def force_paddleocr_config(
    *,
    config_path: str = DEFAULT_CONFIG_PATH,
    output_dir: str = ".",
    install_missing: bool = True,
) -> None:
    """Force PaddleOCR as the LightRAG OCR engine and write helper scripts.

    Steps performed, in order:

    1. Set ``LIGHTRAG_OCR_ENGINE=paddleocr`` in ``os.environ``. This affects
       only the current process and the children it spawns; it is not
       persisted anywhere.
    2. Check that the ``paddleocr`` package is importable and, when it is not
       and *install_missing* is true, install it with this interpreter's pip.
    3. Rewrite the ``ocr_engine`` setting in the LightRAG config file, if the
       file exists.
    4. Write ``test_paddleocr_installation.py`` into *output_dir*.
    5. Write ``clear_failed_docs.py`` into *output_dir*.

    Progress is reported on stdout.

    Args:
        config_path: Path of the LightRAG ``config.ini`` to update (string or
            path-like).
        output_dir: Existing directory that receives the two generated
            scripts (string or path-like).
        install_missing: When false, a missing PaddleOCR is only reported and
            pip is never invoked.
    """
    print("Forcing PaddleOCR configuration...")

    # 1. Set the environment variable for this process and its children.
    os.environ['LIGHTRAG_OCR_ENGINE'] = 'paddleocr'
    print("Set LIGHTRAG_OCR_ENGINE=paddleocr environment variable")

    # 2. Check whether PaddleOCR is available.
    _ensure_paddleocr(install_missing)

    # 3. Verify (and if needed fix) the configuration in LightRAG.
    _update_lightrag_config(config_path)

    target_dir = Path(output_dir)

    # 4. Create a test script to verify OCR functionality.
    _write_script(target_dir, TEST_SCRIPT_NAME, _TEST_SCRIPT)
    print("Created PaddleOCR test script")

    # 5. Generate the script that clears failed documents. Nothing is cleared
    #    here; the user has to run the generated script.
    print("Clearing existing failed documents...")
    _write_script(target_dir, CLEAR_SCRIPT_NAME, _CLEAR_SCRIPT)
    print("Created document clearing script")

    print("\nNext steps:")
    print("1. Run: python test_paddleocr_installation.py (to verify PaddleOCR)")
    print("2. Run: python clear_failed_docs.py (to clear failed documents)")
    print("3. Restart the LightRAG server")
    print("4. Run: python test_ocr_upload_final.py (to test OCR PDF upload)")


if __name__ == "__main__":
    force_paddleocr_config()