Auto-commit: OCR workflow improvements, performance optimizations, and bug fixes

This commit is contained in:
2026-01-11 18:21:16 +08:00
parent 642dd0ea5f
commit 1ddd49f913
97 changed files with 5909 additions and 451 deletions

360
simple_api_test.py Normal file
View File

@@ -0,0 +1,360 @@
#!/usr/bin/env python3
"""
Simple API test for LightRAG workflow.
Tests server startup, login, document status, and search without Selenium.
"""
import os
import sys
import time
import subprocess
import requests
import json
# Configuration
# Every setting can be overridden through an environment variable so the
# script can target another server, account or checkout without editing the
# file. The fallbacks are the values the script was originally written with.
SERVER_URL = os.environ.get("LIGHTRAG_SERVER_URL", "http://localhost:3015")
# NOTE(review): the credential fallbacks are committed in plain text only for
# backward compatibility; prefer supplying LIGHTRAG_USERNAME / LIGHTRAG_PASSWORD.
USERNAME = os.environ.get("LIGHTRAG_USERNAME", "jleu3482")
PASSWORD = os.environ.get("LIGHTRAG_PASSWORD", "jleu1212")
WORKSPACE_DIR = os.environ.get("LIGHTRAG_WORKSPACE_DIR", "c:/aaWORK/railseek6")
def _listening_pids(netstat_output, port):
    """Return the set of PIDs owning a listening TCP socket on *port*.

    Expects the Windows ``netstat -ano`` column layout:
    Proto, Local Address, Foreign Address, State, PID.
    """
    suffix = f":{port}"
    pids = set()
    for line in netstat_output.splitlines():
        fields = line.split()
        if len(fields) < 5 or fields[0].upper() != "TCP":
            continue
        local_addr, foreign_addr, pid = fields[1], fields[2], fields[-1]
        # A listening socket has no peer, so netstat shows a foreign address
        # ending in ":0". Matching on that rather than the State text avoids
        # depending on the (possibly localized) "LISTENING" label.
        if local_addr.endswith(suffix) and foreign_addr.endswith(":0") and pid.isdigit():
            pids.add(int(pid))
    return pids


def kill_existing_server(port=3015):
    """Kill whatever process is listening on *port* (Windows only).

    Only the processes that actually own the listening socket are killed,
    and never the interpreter running this script. Failures are reported
    and otherwise ignored: this is a best-effort cleanup step.

    Args:
        port: TCP port the server listens on.
    """
    print("Killing existing server processes...")
    try:
        listing = subprocess.run(
            ["netstat", "-ano"],
            capture_output=True,
            text=True,
            errors="replace",
            timeout=15,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        print(f" Could not list listening sockets: {exc}")
        return

    # PID 0 is the system idle process; os.getpid() is this very script.
    targets = _listening_pids(listing.stdout or "", port) - {0, os.getpid()}
    if not targets:
        return

    for pid in sorted(targets):
        try:
            # /T also takes down child processes (e.g. server workers).
            subprocess.run(
                ["taskkill", "/F", "/T", "/PID", str(pid)],
                capture_output=True,
                timeout=15,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            print(f" Could not kill PID {pid}: {exc}")
    # Give the OS a moment to release the port before a new server binds it.
    time.sleep(2)
def _echo_server_output(lines, heading):
    """Print *heading* followed by the non-blank *lines*, each cut to 100 characters."""
    visible = [line.rstrip("\r\n") for line in lines if line.strip()]
    if not visible:
        return
    print(heading)
    for line in visible:
        print(f" {line[:100]}")


def start_server():
    """Start the LightRAG server and wait until ``/health`` answers 200.

    The server is launched from ``start_server_fixed.py`` in the current
    working directory, with stderr merged into stdout. A background thread
    keeps reading that pipe so the first lines can be shown on success and
    the last lines on failure.

    Returns:
        The ``subprocess.Popen`` handle once the health endpoint responds,
        or ``None`` if the server did not come up (the process is stopped
        in that case).
    """
    import collections
    import threading

    print("Starting LightRAG server...")
    # Kill any existing server first
    kill_existing_server()

    # Start server using the fixed Python script
    process = subprocess.Popen(
        [sys.executable, "start_server_fixed.py"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        encoding='utf-8',
        errors='replace',  # undecodable bytes must not kill the reader thread
        bufsize=1,
    )

    head = []                            # first lines, shown on success
    tail = collections.deque(maxlen=20)  # most recent lines, shown on failure

    def _drain_output():
        # communicate() cannot be used for a snippet: it waits for the process
        # to exit, so on a live server it only ever times out.
        for line in process.stdout:
            if len(head) < 20:
                head.append(line)
            tail.append(line)

    threading.Thread(target=_drain_output, daemon=True).start()

    # Wait for server to start
    print("Waiting for server to start...")
    for attempt in range(1, 31):  # up to 30 health probes
        exit_code = process.poll()
        # A non-zero exit means the server crashed; stop waiting so a stale
        # server answering on the same port is not mistaken for this one.
        # A zero exit is tolerated because the launcher script may have
        # detached the real server (cannot be verified from here).
        if exit_code:
            print(f"✗ Server process exited with code {exit_code}")
            break
        try:
            response = requests.get(f"{SERVER_URL}/health", timeout=5)
        except requests.RequestException:
            response = None  # not accepting connections yet
        if response is not None and response.status_code == 200:
            print(f"✓ Server started successfully (attempt {attempt})")
            _echo_server_output(list(head), "Server output snippet:")
            return process
        time.sleep(1)
    else:
        print("✗ Server failed to start within 30 seconds")

    if process.poll() is None:
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
    _echo_server_output(list(tail), "Last server output:")
    return None
def check_server_health():
    """Query ``/health`` and print a short summary of the reported configuration.

    Returns:
        ``(True, payload)`` when the endpoint answers 200 with a JSON body,
        ``(False, None)`` on any other status or on error.
    """
    try:
        response = requests.get(f"{SERVER_URL}/health", timeout=10)
        if response.status_code != 200:
            # Report the status instead of failing without any message.
            print(f"✗ Health check failed: HTTP {response.status_code}")
            return False, None

        data = response.json()
        # "or {}" also covers a payload that carries "configuration": null.
        config = data.get('configuration') or {}
        print(f"✓ Server health: {data.get('status', 'unknown')}")
        print(f" Auth mode: {data.get('auth_mode', 'unknown')}")
        print(f" LLM: {config.get('llm_binding', 'unknown')} / {config.get('llm_model', 'unknown')}")
        print(f" Embedding: {config.get('embedding_binding', 'unknown')}")
        return True, data
    except Exception as e:
        # Deliberately broad: a probe must report failure, never raise.
        print(f"✗ Health check failed: {e}")
        return False, None
def test_login():
    """Check ``/auth-status`` and, when auth is configured, log in via ``/login``.

    Returns:
        ``(True, access_token)`` after a successful login,
        ``(True, None)`` when the server reports that auth is not configured,
        ``(False, None)`` on any HTTP failure or error.
    """
    print("Testing login...")
    try:
        # First check auth status
        response = requests.get(f"{SERVER_URL}/auth-status", timeout=10)
        if response.status_code != 200:
            # Report the status instead of failing without any message.
            print(f"✗ Auth status check failed: {response.status_code}")
            return False, None

        auth_status = response.json()
        print(f" Auth configured: {auth_status.get('auth_configured', 'unknown')}")
        if not auth_status.get('auth_configured'):
            print("✓ Auth not configured (guest access enabled)")
            return True, None

        # Credentials are sent form-encoded (data=), not as JSON.
        form_data = {
            "username": USERNAME,
            "password": PASSWORD,
        }
        response = requests.post(f"{SERVER_URL}/login", data=form_data, timeout=10)
        if response.status_code != 200:
            print(f"✗ Login failed: {response.status_code}")
            return False, None

        token_data = response.json()
        print("✓ Login successful")
        print(f" Auth mode: {token_data.get('auth_mode', 'unknown')}")
        return True, token_data.get('access_token')
    except Exception as e:
        # Deliberately broad: a probe must report failure, never raise.
        print(f"✗ Login test error: {e}")
        return False, None
def test_endpoints():
    """Probe a fixed list of API routes and print the status of each one.

    GET routes are fetched as-is; the POST to ``/api/query`` carries a
    minimal throwaway query. A route counts as working when it answers
    200 or 201.

    Returns:
        True when at least three routes worked, False otherwise.
    """
    print("Testing API endpoints...")
    routes = (
        ("/health", "GET"),
        ("/auth-status", "GET"),
        ("/api/documents", "GET"),
        ("/api/workspaces", "GET"),
        ("/api/query", "POST"),  # exercised with a dummy query below
    )
    reachable = 0
    for path, verb in routes:
        try:
            url = f"{SERVER_URL}{path}"
            if verb == "GET":
                reply = requests.get(url, timeout=10)
            elif path == "/api/query":
                # The query route gets a body and a longer timeout.
                reply = requests.post(
                    url,
                    json={"query": "test", "top_k": 1},
                    timeout=30,
                )
            else:
                reply = requests.post(url, timeout=10)

            if reply.status_code not in (200, 201):
                print(f"{path}: {reply.status_code} - {reply.text[:100]}")
                continue
            print(f"{path}: {reply.status_code}")
            reachable += 1
        except Exception as err:
            print(f"{path}: ERROR - {str(err)[:100]}")
    # At least 3 endpoints should work
    return reachable >= 3
def check_documents():
    """List the documents reported by ``/api/documents``.

    Prints the total count plus filename and status of the first three
    entries.

    Returns:
        True when the endpoint answered 200 with at least one document,
        False otherwise (including on any error).
    """
    print("Checking documents...")
    try:
        reply = requests.get(f"{SERVER_URL}/api/documents", timeout=10)
        if reply.status_code != 200:
            print(f"✗ Failed to get documents: {reply.status_code}")
            return False

        docs = reply.json()
        total = len(docs)
        print(f"✓ Found {total} documents")
        # Only the first three entries are shown.
        for entry in docs[:3]:
            name = entry.get('filename', 'unknown')
            state = entry.get('status', 'unknown')
            print(f" - {name}: {state}")
        return total > 0
    except Exception as err:
        print(f"✗ Error checking documents: {err}")
        return False
def test_search():
    """Run a few sample queries against ``/api/query``.

    The queries are tried in order. The function succeeds on the first
    200 response that contains an ``llm_response`` key or a non-empty
    ``results`` list; failures and empty answers move on to the next query.

    Returns:
        True as soon as one query yields such a response, False when none do.
    """
    print("Testing search...")
    for term in ("railway", "train", "transport", "test"):
        try:
            print(f" Testing query: '{term}'")
            reply = requests.post(
                f"{SERVER_URL}/api/query",
                json={"query": term, "top_k": 3},
                timeout=60,  # searches may take much longer than plain GETs
            )
            if reply.status_code != 200:
                print(f" ✗ Search failed: {reply.status_code} - {reply.text[:100]}")
                continue

            payload = reply.json()
            hits = payload.get('results', [])
            print(f" ✓ Search successful: {len(hits)} results")

            # An LLM answer in the payload is taken as evidence of DeepSeek use.
            if "llm_response" in payload:
                print(" ✓ DeepSeek API used (LLM response present)")
                return True
            if "results" in payload and len(hits) > 0:
                print(" ✓ Search returned results (may be using cached/indexed data)")
                return True
            print(" ⚠ Search returned no results")
        except Exception as err:
            print(f" ✗ Search error: {err}")
    return False
def check_logs_for_components(log_file=None):
    """Scan the tail of the server log for mentions of the indexing components.

    Only the last 1000 lines are inspected, case-insensitively.

    Args:
        log_file: Path of the log to scan. Defaults to
            ``<WORKSPACE_DIR>/LightRAG-main/logs/lightrag.log``.

    Returns:
        dict mapping ``"openclip"``, ``"paddleocr"``, ``"spacy"`` and
        ``"deepseek"`` to a bool telling whether a matching substring was
        seen. Every value is False when the log is missing or unreadable.
    """
    from collections import deque

    print("Checking logs for indexing components...")
    if log_file is None:
        log_file = os.path.join(WORKSPACE_DIR, "LightRAG-main", "logs", "lightrag.log")

    # Lower-case substrings counted as evidence for each component.
    # "ocr" and "entity" are loose matches: any log line containing them
    # flags the component, not only lines written by that library.
    markers = {
        "openclip": ("openclip",),
        "paddleocr": ("paddleocr", "ocr"),
        "spacy": ("spacy", "entity"),
        "deepseek": ("deepseek",),
    }
    components_found = dict.fromkeys(markers, False)

    if not os.path.exists(log_file):
        print(f"✗ Log file not found: {log_file}")
        return components_found

    try:
        with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
            # deque(maxlen=...) streams the file and keeps only the last
            # 1000 lines instead of loading the whole log into memory.
            log_content = "".join(deque(f, maxlen=1000)).lower()
    except OSError as e:
        print(f"✗ Error reading log file: {e}")
        return components_found

    for component, needles in markers.items():
        components_found[component] = any(needle in log_content for needle in needles)

    print("Components found in logs:")
    for component, found in components_found.items():
        print(f" - {component}: {'✓' if found else '✗'}")
    return components_found
def _stop_server(process):
    """Terminate *process*, escalating to kill if it does not exit within 5 seconds."""
    process.terminate()
    try:
        process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        process.kill()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            pass  # nothing more can be done from here


def main():
    """Run the whole API smoke test and write ``lightrag_test_report.json``.

    Starts the server, runs the health, login, endpoint, document, log and
    search checks, stops the server again, then prints a summary and saves
    a JSON report in the workspace directory.

    Returns:
        True when every check passed, False otherwise.
    """
    print("=" * 60)
    print("LightRAG API Test")
    print("=" * 60)
    # Change to workspace directory: the server script is launched by
    # relative path and the report is written relative to it as well.
    try:
        os.chdir(WORKSPACE_DIR)
    except OSError as e:
        print(f"\n✗ FAILED: Cannot enter workspace directory {WORKSPACE_DIR}: {e}")
        return False

    test_results = {}
    health_data = None
    components = {}

    # Step 1: Start server
    server_process = start_server()
    test_results["server_started"] = server_process is not None
    if not test_results["server_started"]:
        print("\n✗ FAILED: Could not start server")
        return False

    try:
        # Give server time to fully initialize
        time.sleep(3)
        # Step 2: Check server health
        health_ok, health_data = check_server_health()
        test_results["health_check"] = health_ok
        # Step 3: Test login
        # NOTE(review): the returned token is not forwarded to the later
        # checks, so they run unauthenticated.
        login_ok, _token = test_login()
        test_results["login"] = login_ok
        # Step 4: Test endpoints
        test_results["endpoints"] = test_endpoints()
        # Step 5: Check documents
        test_results["documents_exist"] = check_documents()
        # Step 6: Check logs for indexing components
        components = check_logs_for_components()
        test_results["indexing_components"] = any(components.values())
        test_results.update({f"component_{k}": v for k, v in components.items()})
        # Step 7: Test search
        test_results["search_works"] = test_search()
    finally:
        # Step 8: Cleanup - runs even if a step raises or the run is
        # interrupted, so the server is never left behind.
        print("\nCleaning up...")
        _stop_server(server_process)

    # Step 9: Report results
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    all_passed = True
    for test_name, result in test_results.items():
        if isinstance(result, bool):
            status = "PASS" if result else "FAIL"
            if not result:
                all_passed = False
            print(f"{test_name:30} {status}")
        else:
            print(f"{test_name:30} {result}")
    print("\n" + "=" * 60)

    # Generate detailed report
    report = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "overall_success": all_passed,
        "test_results": test_results,
        "server_config": health_data.get("configuration", {}) if health_data else {},
        "components_found": components,
    }
    report_file = "lightrag_test_report.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"Detailed report saved to: {report_file}")

    if all_passed:
        print("✓ SUCCESS: All critical tests passed!")
        return True

    print("⚠ WARNING: Some tests failed or had issues")
    print("\nRoot cause analysis:")
    print("1. Server startup issues: Fixed Unicode encoding in display_splash_screen()")
    print("2. OllamaAPI error: Fixed WorkspaceManager/LightRAG type mismatch")
    print("3. WorkspaceManager bug: Fixed lightrag_factory.create() call")
    print("\nRemaining issues may require:")
    print("- Checking if OCR.pdf exists in test/ directory")
    print("- Ensuring DeepSeek API key is valid in .env file")
    print("- Verifying Ollama is running for embeddings")
    return False
if __name__ == "__main__":
    # Exit status 0 means every check passed; 1 means at least one failed.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)