Auto-commit: OCR workflow improvements, performance optimizations, and bug fixes

2026-01-11 18:21:16 +08:00
parent 642dd0ea5f
commit 1ddd49f913
97 changed files with 5909 additions and 451 deletions
--- a/start_server_fixed_improved.py
+++ b/start_server_fixed_improved.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+"""
+Improved server starter that fixes all identified issues:
+1. Better port conflict handling
+2. Proper LLM configuration loading
+3. Correct .env file path handling
+4. Better error reporting
+"""
+
+import os
+import sys
+import subprocess
+import time
+import socket
+import signal
+
+def kill_process_on_port(port: int) -> bool:
+    """Terminate the process that is listening on a local TCP port.
+
+    psutil is used when it can be imported. If it is missing, fails, or finds
+    no listener, the Windows ``netstat``/``taskkill`` commands are tried.
+
+    Args:
+        port: Local TCP port number to free.
+
+    Returns:
+        True if a listening process was found and terminated, else False.
+    """
+    own_pid = os.getpid()
+
+    try:
+        import psutil
+    except ImportError:
+        psutil = None
+
+    if psutil is not None:
+        try:
+            for proc in psutil.process_iter(['pid', 'name']):
+                if proc.pid == own_pid:
+                    continue  # never terminate the launcher itself
+                try:
+                    # psutil 6.0 deprecated connections() in favour of
+                    # net_connections(); use whichever this version offers.
+                    list_conns = getattr(proc, 'net_connections', None) or proc.connections
+                    for conn in list_conns(kind='inet'):
+                        # laddr is empty for unbound sockets, and only a
+                        # listener on exactly this port blocks a new bind.
+                        if not conn.laddr or conn.laddr.port != port:
+                            continue
+                        if conn.status != psutil.CONN_LISTEN:
+                            continue
+                        print(f"Found process {proc.pid} ({proc.name()}) using port {port}")
+                        proc.terminate()
+                        try:
+                            proc.wait(timeout=5)
+                        except psutil.TimeoutExpired:
+                            # Graceful termination was ignored; force it.
+                            proc.kill()
+                            proc.wait(timeout=5)
+                        print(f"Terminated process {proc.pid}")
+                        return True
+                except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+                    continue
+        except Exception as e:
+            print(f"psutil method failed: {e}")
+
+    # Fallback to netstat method (works on Windows without psutil)
+    if os.name != 'nt':
+        return False  # "netstat -ano" + taskkill below are Windows commands
+
+    try:
+        result = subprocess.run(
+            ['netstat', '-ano'],
+            capture_output=True,
+            text=True,
+            errors='replace'
+        )
+        tried = set()
+        for line in result.stdout.splitlines():
+            parts = line.split()
+            # Expected columns: proto, local address, foreign address, state, PID
+            if len(parts) < 5 or parts[0].upper() != 'TCP':
+                continue
+            # Compare the exact local port so 3015 does not match 30150 or
+            # a remote endpoint that happens to use the same number.
+            if parts[1].rsplit(':', 1)[-1] != str(port):
+                continue
+            # Listener rows carry a foreign port of 0; accepting that as well
+            # avoids relying on the state text alone.
+            if parts[3].upper() != 'LISTENING' and not parts[2].endswith(':0'):
+                continue
+            pid = parts[-1]
+            if not pid.isdigit() or int(pid) in (0, own_pid) or pid in tried:
+                continue
+            tried.add(pid)
+            print(f"Found process {pid} using port {port}")
+            killed = subprocess.run(
+                ['taskkill', '/F', '/PID', pid],
+                capture_output=True,
+                text=True,
+                errors='replace'
+            )
+            if killed.returncode == 0:
+                print(f"Killed process {pid}")
+                return True
+            print(f"taskkill failed for process {pid}: {killed.stderr.strip()}")
+    except Exception as e:
+        print(f"netstat method failed: {e}")
+
+    return False
+
+def is_port_in_use(port):
+    """Return True when a TCP socket cannot be bound to the port on 0.0.0.0."""
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    with probe:
+        try:
+            probe.settimeout(1)
+            probe.bind(('0.0.0.0', port))
+        except OSError:  # socket.error is an alias of OSError
+            return True
+    # The bind succeeded and the probe socket has been closed again.
+    return False
+
+def load_env_file(env_path: str) -> dict:
+    """Parse a .env file into a dict of string keys and string values.
+
+    Blank lines, lines starting with ``#``, lines without ``=`` and lines
+    with an empty key are skipped. A leading ``export `` is dropped, and one
+    pair of matching single or double quotes around a value is removed.
+
+    Args:
+        env_path: Path of the .env file to read.
+
+    Returns:
+        The parsed variables, or an empty dict when the file is missing or
+        cannot be read (a message is printed in both cases).
+    """
+    config = {}
+    try:
+        # utf-8-sig drops a leading BOM, which would otherwise become part
+        # of the first key.
+        with open(env_path, 'r', encoding='utf-8-sig') as f:
+            for raw_line in f:
+                line = raw_line.strip()
+                if not line or line.startswith('#'):
+                    continue
+                if line.startswith('export '):
+                    line = line[len('export '):].lstrip()
+                key, sep, value = line.partition('=')
+                key = key.strip()
+                if not sep or not key:
+                    continue
+                value = value.strip()
+                # KEY="value" / KEY='value' -> value
+                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
+                    value = value[1:-1]
+                config[key] = value
+    except FileNotFoundError:
+        print(f"Warning: .env file not found at {env_path}")
+        return {}
+    except (OSError, UnicodeError) as e:
+        print(f"Error reading .env file: {e}")
+        return {}
+    print(f"Loaded {len(config)} configuration variables from {env_path}")
+    return config
+
+def main() -> int:
+    """Start the LightRAG server as a child process and relay its output.
+
+    Reads the first non-empty .env among ``.env`` and ``LightRAG-main/.env``,
+    fills in LLM defaults, frees the configured port if it is occupied,
+    copies selected settings into the child environment, changes into
+    ``LightRAG-main`` and runs ``python -m lightrag.api.lightrag_server``,
+    printing its output until it exits.
+
+    Returns:
+        0 when the server exits cleanly or is stopped with Ctrl-C; 1 on an
+        invalid PORT, a port-binding failure, a start-up error, or a
+        non-zero server exit status.
+
+    Raises:
+        SystemExit: With status 1 when no LLM API key is configured.
+    """
+    print("Starting LightRAG server with improved configuration...")
+
+    # The child writes UTF-8 so its output decodes cleanly in the relay loop.
+    env = os.environ.copy()
+    env['PYTHONIOENCODING'] = 'utf-8'
+    env['PYTHONUTF8'] = '1'
+
+    # First try current directory, then LightRAG-main directory
+    config = {}
+    for env_path in ('.env', 'LightRAG-main/.env'):
+        if os.path.exists(env_path):
+            config = load_env_file(env_path)
+            if config:
+                print(f"Using .env file from: {env_path}")
+                break
+
+    if not config:
+        print("Warning: No .env file found, using defaults")
+
+    # Ensure critical LLM settings have defaults to prevent accidental OpenAI usage
+    if 'LLM_BINDING_HOST' not in config:
+        config['LLM_BINDING_HOST'] = 'https://api.deepseek.com/v1'
+        print("Warning: LLM_BINDING_HOST not set, defaulting to DeepSeek API")
+    config.setdefault('OPENAI_API_BASE', config['LLM_BINDING_HOST'])
+    config.setdefault('LLM_MODEL', 'deepseek-chat')
+
+    # An empty value ("KEY=") counts as missing, not as a configured key.
+    if not config.get('LLM_BINDING_API_KEY') and config.get('OPENAI_API_KEY'):
+        config['LLM_BINDING_API_KEY'] = config['OPENAI_API_KEY']
+        print("Info: Set LLM_BINDING_API_KEY from OPENAI_API_KEY")
+
+    if not config.get('LLM_BINDING_API_KEY'):
+        print("ERROR: LLM_BINDING_API_KEY or OPENAI_API_KEY must be set in .env")
+        sys.exit(1)
+
+    # Get configuration values with defaults
+    raw_port = config.get('PORT', '3015')
+    try:
+        port = int(raw_port)
+    except ValueError:
+        print(f"ERROR: PORT must be an integer, got {raw_port!r}")
+        return 1
+    if not 0 < port < 65536:
+        print(f"ERROR: PORT must be between 1 and 65535, got {port}")
+        return 1
+    host = config.get('HOST', '0.0.0.0')
+    llm_binding = config.get('LLM_BINDING', 'openai')
+    embedding_binding = config.get('EMBEDDING_BINDING', 'ollama')
+    rerank_binding = config.get('RERANK_BINDING', 'jina')
+    # NOTE(review): the fallback below is a credential hard-coded in source.
+    # Set LIGHTRAG_API_KEY in .env (or the environment) to override it, and
+    # consider rotating the value and removing the fallback.
+    api_key = config.get('LIGHTRAG_API_KEY') or env.get('LIGHTRAG_API_KEY') or 'jleu1212'
+
+    # Check and kill any process using the port
+    print(f"\nChecking port {port}...")
+    if is_port_in_use(port):
+        print(f"Port {port} is in use. Attempting to kill existing process...")
+        if kill_process_on_port(port):
+            print(f"Successfully cleared port {port}")
+            time.sleep(2)  # Wait for port to be released
+        else:
+            print(f"Warning: Could not kill process on port {port}")
+            print("Trying to start server anyway...")
+
+    # LLM settings go to the child and to this process's own environment.
+    llm_keys = (
+        'LLM_BINDING_HOST',
+        'LLM_BINDING_API_KEY',
+        'LLM_MODEL',
+        'OPENAI_API_KEY',
+        'OPENAI_API_BASE',
+        'ENABLE_LLM_CACHE',
+        'ENABLE_LLM_CACHE_FOR_EXTRACT',
+        'TIMEOUT',
+        'TEMPERATURE',
+        'MAX_ASYNC',
+        'MAX_TOKENS',
+        'OPTIMIZE_ENTITY_EXTRACTION',
+    )
+    for key in llm_keys:
+        if key in config:
+            env[key] = config[key]
+            os.environ[key] = config[key]
+
+    # Embedding and rerank settings are passed to the child only.
+    child_only_keys = (
+        'EMBEDDING_MODEL',
+        'EMBEDDING_DIM',
+        'EMBEDDING_BINDING_HOST',
+        'EMBEDDING_BATCH_NUM',
+        'EMBEDDING_FUNC_MAX_ASYNC',
+        'RERANK_MODEL',
+    )
+    for key in child_only_keys:
+        if key in config:
+            env[key] = config[key]
+
+    # Build command
+    cmd = [
+        sys.executable, '-m', 'lightrag.api.lightrag_server',
+        '--port', str(port),
+        '--host', host,
+        '--working-dir', 'rag_storage',
+        '--input-dir', '../inputs',
+        '--key', api_key,
+        '--auto-scan-at-startup',
+        '--llm-binding', llm_binding,
+        '--embedding-binding', embedding_binding,
+        '--rerank-binding', rerank_binding
+    ]
+    # Never echo the API key to the console or to captured logs.
+    printable_cmd = list(cmd)
+    printable_cmd[cmd.index('--key') + 1] = '***'
+
+    print("\nServer Configuration:")
+    print(f"  Port: {port}")
+    print(f"  Host: {host}")
+    print(f"  LLM Binding: {llm_binding}")
+    print(f"  LLM Host: {config.get('LLM_BINDING_HOST', 'Not set')}")
+    print(f"  LLM Model: {config.get('LLM_MODEL', 'Not set')}")
+    print(f"  API Key: {'Set' if config.get('LLM_BINDING_API_KEY') else 'Not set'}")
+    print(f"\nCommand: {' '.join(printable_cmd)}")
+    print(f"Starting server on http://{host}:{port}")
+
+    process = None
+    try:
+        # Change to LightRAG-main directory BEFORE starting the server
+        os.chdir('LightRAG-main')
+        print(f"Changed to directory: {os.getcwd()}")
+
+        # Start the server
+        process = subprocess.Popen(
+            cmd,
+            env=env,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            encoding='utf-8',
+            errors='replace'
+        )
+
+        print("\nServer output:")
+        print("-" * 50)
+
+        try:
+            for line in process.stdout:
+                # Replace non-ASCII characters before printing to the console
+                cleaned_line = ''.join(c if ord(c) < 128 else '?' for c in line)
+                print(cleaned_line.rstrip())
+
+                # Check for common errors
+                if "Errno 10048" in line or "address already in use" in line.lower():
+                    print("\nERROR: Port binding failed. Another process may be using the port.")
+                    print(f"Try running 'netstat -ano | findstr :{port}' to find the process.")
+                    return 1
+        except KeyboardInterrupt:
+            print("\nServer stopped by user")
+            return 0
+
+        exit_code = process.wait()
+        if exit_code != 0:
+            print(f"Server exited with code {exit_code}")
+            return 1
+        return 0
+
+    except Exception as e:
+        print(f"Error starting server: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+    finally:
+        # Runs on every exit path: stop the child if it is still alive, reap
+        # it, and close the pipe so nothing is leaked.
+        if process is not None:
+            if process.poll() is None:
+                process.terminate()
+                try:
+                    process.wait(timeout=10)
+                except subprocess.TimeoutExpired:
+                    process.kill()
+                    process.wait()
+            if process.stdout is not None:
+                process.stdout.close()
+
+if __name__ == "__main__":
+    # Use main()'s return value as the process exit status.
+    raise SystemExit(main())