Auto-commit: OCR workflow improvements, performance optimizations, and bug fixes

This commit is contained in:
2026-01-11 18:21:16 +08:00
parent 642dd0ea5f
commit 1ddd49f913
97 changed files with 5909 additions and 451 deletions

View File

@@ -0,0 +1,273 @@
#!/usr/bin/env python3
"""
Improved server starter that fixes all identified issues:
1. Better port conflict handling
2. Proper LLM configuration loading
3. Correct .env file path handling
4. Better error reporting
"""
import os
import sys
import subprocess
import time
import socket
import signal
def kill_process_on_port(port):
    """Try to terminate whatever process owns a socket on local port `port`.

    Two strategies are attempted in order:

    1. psutil, when it is importable: scan every process' inet sockets and
       terminate the first process whose local port equals `port`.
    2. ``netstat -ano`` + ``taskkill`` (the Windows tools): parse the netstat
       table and force-kill every PID that owns a TCP socket on that port.

    Args:
        port: TCP port number (int) that should be freed.

    Returns:
        True if at least one process was terminated, False otherwise
        (nothing found, access denied, or the tools are unavailable).
    """
    try:
        import psutil
    except ImportError:
        psutil = None

    if psutil is not None:
        try:
            for proc in psutil.process_iter(['pid', 'name']):
                try:
                    # psutil 6 deprecates connections() in favour of
                    # net_connections(); use whichever this version offers.
                    list_conns = getattr(proc, 'net_connections', None) or proc.connections
                    for conn in list_conns(kind='inet'):
                        # laddr is an empty tuple for sockets that are not bound yet.
                        if not conn.laddr or conn.laddr.port != port:
                            continue
                        print(f"Found process {proc.pid} ({proc.name()}) using port {port}")
                        proc.terminate()
                        try:
                            proc.wait(timeout=5)
                        except psutil.TimeoutExpired:
                            # Graceful termination was ignored: escalate.
                            proc.kill()
                            proc.wait(timeout=5)
                        print(f"Terminated process {proc.pid}")
                        return True
                except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                    # Process vanished or is not inspectable: try the next one.
                    continue
        except Exception as e:
            # Best effort only: fall through to the netstat strategy.
            print(f"psutil method failed: {e}")

    # Fallback to netstat method (works on Windows without psutil)
    killed_any = False
    try:
        listing = subprocess.run(
            ['netstat', '-ano'],
            capture_output=True,
            text=True,
            errors='replace'
        )
        for pid in _netstat_pids_for_port(listing.stdout or '', port):
            print(f"Found process {pid} using port {port}")
            outcome = subprocess.run(
                ['taskkill', '/F', '/PID', pid],
                capture_output=True
            )
            if outcome.returncode == 0:
                print(f"Killed process {pid}")
                killed_any = True
            else:
                print(f"Failed to kill process {pid}")
    except Exception as e:
        # Includes FileNotFoundError when netstat/taskkill do not exist.
        print(f"netstat method failed: {e}")
    return killed_any


def _netstat_pids_for_port(netstat_output, port):
    """Return the distinct PIDs owning a TCP socket on local port `port`.

    Expects rows shaped like Windows ``netstat -ano`` TCP rows, i.e.
    ``Proto  Local-Address  Foreign-Address  State  PID``. Only the local
    address column is compared, and the comparison is exact, so port 3015
    matches neither ``:30150`` nor a remote endpoint on 3015.
    """
    wanted = str(port)
    pids = []
    for row in netstat_output.splitlines():
        cols = row.split()
        if len(cols) < 5:
            continue
        local_port = cols[1].rpartition(':')[2]
        pid = cols[-1]
        # PID 0 is reported for sockets without an owning process
        # (e.g. TIME_WAIT); it cannot be killed.
        if local_port != wanted or not pid.isdigit() or pid == '0':
            continue
        if pid not in pids:
            pids.append(pid)
    return pids
def is_port_in_use(port):
    """Report whether `port` is already taken on this machine.

    A throw-away TCP socket is bound to ``0.0.0.0:<port>``; if the bind is
    refused the port is considered in use.

    Returns:
        True when binding fails with a socket error, False when it succeeds.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with probe:
        probe.settimeout(1)
        try:
            probe.bind(('0.0.0.0', port))
        except OSError:  # socket.error is an alias of OSError
            return True
    return False
def load_env_file(env_path):
    """Parse a ``.env`` file into a dict of strings.

    Parsing rules:
      * blank lines and lines starting with ``#`` are ignored;
      * an optional leading ``export `` is dropped;
      * each remaining line is split on the first ``=``; lines without ``=``
        or with an empty key are ignored;
      * whitespace around key and value is stripped, and one pair of matching
        surrounding quotes (single or double) is removed from the value.

    Args:
        env_path: Path of the file to read.

    Returns:
        Dict mapping variable names to values. An empty dict is returned when
        the file is missing or cannot be read (a message is printed).
    """
    config = {}
    try:
        # utf-8-sig silently drops a UTF-8 BOM, which would otherwise become
        # part of the first key.
        with open(env_path, 'r', encoding='utf-8-sig') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                if line.startswith('export '):
                    line = line[len('export '):].lstrip()
                key, sep, value = line.partition('=')
                key = key.strip()
                if not sep or not key:
                    continue
                value = value.strip()
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                    value = value[1:-1]
                config[key] = value
    except FileNotFoundError:
        print(f"Warning: .env file not found at {env_path}")
        return {}
    except (OSError, UnicodeError) as e:
        print(f"Error reading .env file: {e}")
        return {}
    print(f"Loaded {len(config)} configuration variables from {env_path}")
    return config
def main():
    """Start the LightRAG API server as a child process and relay its output.

    Steps:
      1. Load settings from ``.env`` (current directory first, then
         ``LightRAG-main/.env``) and fill in DeepSeek defaults for the LLM
         host, base URL and model.
      2. Require an API key (``LLM_BINDING_API_KEY`` or ``OPENAI_API_KEY``);
         the process exits with status 1 via ``sys.exit`` if neither is set.
      3. Free the configured port if something is already bound to it.
      4. Launch ``lightrag.api.lightrag_server`` from the ``LightRAG-main``
         directory and print its output line by line until it exits, Ctrl+C
         is pressed, or a port-binding error is detected.

    Returns:
        0 when the server exits cleanly or is stopped with Ctrl+C;
        1 on invalid configuration, start-up failure, port-binding failure,
        or a non-zero server exit status.
    """
    print("Starting LightRAG server with improved configuration...")

    # Force UTF-8 I/O in the child process.
    env = os.environ.copy()
    env['PYTHONIOENCODING'] = 'utf-8'
    env['PYTHONUTF8'] = '1'

    # Determine the .env file: first the current directory, then LightRAG-main.
    env_paths = ['.env', 'LightRAG-main/.env']
    config = {}
    for env_path in env_paths:
        if os.path.exists(env_path):
            config = load_env_file(env_path)
            if config:
                print(f"Using .env file from: {env_path}")
                break
    if not config:
        print("Warning: No .env file found, using defaults")

    # Ensure critical LLM settings have defaults to prevent accidental OpenAI usage
    if 'LLM_BINDING_HOST' not in config:
        config['LLM_BINDING_HOST'] = 'https://api.deepseek.com/v1'
        print("Warning: LLM_BINDING_HOST not set, defaulting to DeepSeek API")
    config.setdefault('OPENAI_API_BASE', config['LLM_BINDING_HOST'])
    config.setdefault('LLM_MODEL', 'deepseek-chat')

    # LLM_BINDING_API_KEY falls back to OPENAI_API_KEY; one of them is mandatory.
    if 'LLM_BINDING_API_KEY' not in config:
        if 'OPENAI_API_KEY' in config:
            config['LLM_BINDING_API_KEY'] = config['OPENAI_API_KEY']
            print("Info: Set LLM_BINDING_API_KEY from OPENAI_API_KEY")
        else:
            print("ERROR: LLM_BINDING_API_KEY or OPENAI_API_KEY must be set in .env")
            sys.exit(1)

    # Get configuration values with defaults
    raw_port = config.get('PORT', '3015')
    try:
        port = int(raw_port)
    except ValueError:
        print(f"ERROR: PORT must be an integer, got {raw_port!r}")
        return 1
    if not 0 < port < 65536:
        print(f"ERROR: PORT must be between 1 and 65535, got {port}")
        return 1
    host = config.get('HOST', '0.0.0.0')
    llm_binding = config.get('LLM_BINDING', 'openai')
    embedding_binding = config.get('EMBEDDING_BINDING', 'ollama')
    rerank_binding = config.get('RERANK_BINDING', 'jina')

    # Check and kill any process using the port
    print(f"\nChecking port {port}...")
    if is_port_in_use(port):
        print(f"Port {port} is in use. Attempting to kill existing process...")
        if kill_process_on_port(port):
            print(f"Successfully cleared port {port}")
            time.sleep(2)  # Wait for port to be released
        else:
            print(f"Warning: Could not kill process on port {port}")
            print("Trying to start server anyway...")

    # LLM-related variables go to the child environment AND to this process.
    llm_keys = [
        'LLM_BINDING_HOST',
        'LLM_BINDING_API_KEY',
        'LLM_MODEL',
        'OPENAI_API_KEY',
        'OPENAI_API_BASE',
        'ENABLE_LLM_CACHE',
        'ENABLE_LLM_CACHE_FOR_EXTRACT',
        'TIMEOUT',
        'TEMPERATURE',
        'MAX_ASYNC',
        'MAX_TOKENS',
        'OPTIMIZE_ENTITY_EXTRACTION',
    ]
    for key in llm_keys:
        if key in config:
            env[key] = config[key]
            os.environ[key] = config[key]

    # Embedding- and rerank-related variables go to the child environment only.
    embedding_keys = [
        'EMBEDDING_MODEL',
        'EMBEDDING_DIM',
        'EMBEDDING_BINDING_HOST',
        'EMBEDDING_BATCH_NUM',
        'EMBEDDING_FUNC_MAX_ASYNC',
    ]
    rerank_keys = [
        'RERANK_MODEL',
    ]
    for key in embedding_keys + rerank_keys:
        if key in config:
            env[key] = config[key]

    # NOTE(review): the server API key is hard-coded in source; consider
    # moving it to configuration. Kept as-is so existing clients keep working.
    api_key = 'jleu1212'

    # Build command
    cmd = [
        sys.executable, '-m', 'lightrag.api.lightrag_server',
        '--port', str(port),
        '--host', host,
        '--working-dir', 'rag_storage',
        '--input-dir', '../inputs',
        '--key', api_key,
        '--auto-scan-at-startup',
        '--llm-binding', llm_binding,
        '--embedding-binding', embedding_binding,
        '--rerank-binding', rerank_binding,
    ]
    # Never echo the API key to the console/logs.
    printable_cmd = [
        '***' if i > 0 and cmd[i - 1] == '--key' else arg
        for i, arg in enumerate(cmd)
    ]

    print("\nServer Configuration:")
    print(f" Port: {port}")
    print(f" Host: {host}")
    print(f" LLM Binding: {llm_binding}")
    print(f" LLM Host: {config.get('LLM_BINDING_HOST', 'Not set')}")
    print(f" LLM Model: {config.get('LLM_MODEL', 'Not set')}")
    print(f" API Key: {'Set' if 'LLM_BINDING_API_KEY' in config else 'Not set'}")
    print(f"\nCommand: {' '.join(printable_cmd)}")
    print(f"Starting server on http://{host}:{port}")

    try:
        # Change to LightRAG-main directory BEFORE starting the server; the
        # child inherits this working directory (presumably what the relative
        # --working-dir/--input-dir paths are resolved against).
        os.chdir('LightRAG-main')
        print(f"Changed to directory: {os.getcwd()}")

        # Start the server with stderr merged into stdout.
        process = subprocess.Popen(
            cmd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            encoding='utf-8',
            errors='replace'
        )
        print("\nServer output:")
        print("-" * 50)

        try:
            for line in iter(process.stdout.readline, ''):
                # Replace non-ASCII characters with '?' so printing cannot
                # fail on consoles with a limited encoding.
                cleaned_line = line.encode('ascii', 'replace').decode('ascii')
                print(cleaned_line.rstrip())

                # Errno 10048 is the Windows "address already in use" error.
                if "Errno 10048" in line or "address already in use" in line.lower():
                    print("\nERROR: Port binding failed. Another process may be using the port.")
                    print(f"Try running 'netstat -ano | findstr :{port}' to find the process.")
                    process.terminate()
                    process.wait()
                    return 1
        except KeyboardInterrupt:
            print("\nServer stopped by user")
            process.terminate()
            process.wait()
            return 0

        # Output ended: reap the child and surface a failing exit status.
        exit_code = process.wait()
        if exit_code != 0:
            print(f"\nServer exited with code {exit_code}")
            return 1
    except Exception as e:
        print(f"Error starting server: {e}")
        import traceback
        traceback.print_exc()
        return 1
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())