Auto-commit: OCR workflow improvements, performance optimizations, and bug fixes

2026-01-11 18:21:16 +08:00
parent 642dd0ea5f
commit 1ddd49f913
97 changed files with 5909 additions and 451 deletions
--- a/simple_api_test.py
+++ b/simple_api_test.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+"""
+Simple API test for LightRAG workflow.
+Tests server startup, login, document status, and search without Selenium.
+"""
+
+import os
+import sys
+import time
+import subprocess
+import requests
+import json
+
+# Configuration (module-level settings shared by the functions below; hard-coded for one local setup)
+SERVER_URL = "http://localhost:3015"  # base URL that every HTTP request in this script is built from
+USERNAME = "jleu3482"  # sent as the "username" form field by test_login(); NOTE(review): credential committed to source
+PASSWORD = "jleu1212"  # sent as the "password" form field by test_login(); NOTE(review): plaintext password in source - consider an environment variable
+WORKSPACE_DIR = "c:/aaWORK/railseek6"  # main() chdir()s here; check_logs_for_components() looks for the log file under it
+
+def kill_existing_server():
+    """Kill any existing server on port 3015"""
+    print("Killing existing server processes...")
+    try:
+        # Find and kill processes using port 3015
+        subprocess.run(["netstat", "-ano"], capture_output=True, text=True)
+        subprocess.run(["taskkill", "/F", "/IM", "python.exe"], capture_output=True)
+        time.sleep(2)
+    except:
+        pass
+
+def start_server():
+    """Start LightRAG server"""
+    print("Starting LightRAG server...")
+    
+    # Kill any existing server first
+    kill_existing_server()
+    
+    # Start server using the fixed Python script
+    cmd = [sys.executable, "start_server_fixed.py"]
+    process = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        encoding='utf-8',
+        bufsize=1,
+        universal_newlines=True
+    )
+    
+    # Wait for server to start
+    print("Waiting for server to start...")
+    for i in range(30):  # Wait up to 30 seconds
+        try:
+            response = requests.get(f"{SERVER_URL}/health", timeout=5)
+            if response.status_code == 200:
+                print(f"✓ Server started successfully (attempt {i+1})")
+                # Read initial output
+                try:
+                    output, _ = process.communicate(timeout=0.1)
+                    if output:
+                        print("Server output snippet:")
+                        for line in output.split('\n')[:20]:
+                            if line.strip():
+                                print(f"  {line[:100]}")
+                except:
+                    pass
+                return process
+        except:
+            pass
+        time.sleep(1)
+    
+    print("✗ Server failed to start within 30 seconds")
+    if process:
+        process.terminate()
+    return None
+
+def check_server_health():
+    """Check if server is healthy"""
+    try:
+        response = requests.get(f"{SERVER_URL}/health", timeout=10)
+        if response.status_code == 200:
+            data = response.json()
+            print(f"✓ Server health: {data.get('status', 'unknown')}")
+            print(f"  Auth mode: {data.get('auth_mode', 'unknown')}")
+            print(f"  LLM: {data.get('configuration', {}).get('llm_binding', 'unknown')} / {data.get('configuration', {}).get('llm_model', 'unknown')}")
+            print(f"  Embedding: {data.get('configuration', {}).get('embedding_binding', 'unknown')}")
+            return True, data
+    except Exception as e:
+        print(f"✗ Health check failed: {e}")
+    return False, None
+
+def test_login():
+    """Test login via API"""
+    print("Testing login...")
+    
+    try:
+        # First check auth status
+        response = requests.get(f"{SERVER_URL}/auth-status", timeout=10)
+        if response.status_code == 200:
+            auth_status = response.json()
+            print(f"  Auth configured: {auth_status.get('auth_configured', 'unknown')}")
+            
+            if auth_status.get('auth_configured'):
+                # Try to login
+                form_data = {
+                    "username": USERNAME,
+                    "password": PASSWORD
+                }
+                response = requests.post(f"{SERVER_URL}/login", data=form_data, timeout=10)
+                if response.status_code == 200:
+                    token_data = response.json()
+                    print(f"✓ Login successful")
+                    print(f"  Auth mode: {token_data.get('auth_mode', 'unknown')}")
+                    return True, token_data.get('access_token')
+                else:
+                    print(f"✗ Login failed: {response.status_code}")
+                    return False, None
+            else:
+                print("✓ Auth not configured (guest access enabled)")
+                return True, None
+    except Exception as e:
+        print(f"✗ Login test error: {e}")
+    
+    return False, None
+
+def test_endpoints():
+    """Test various API endpoints"""
+    print("Testing API endpoints...")
+    
+    endpoints = [
+        ("/health", "GET"),
+        ("/auth-status", "GET"),
+        ("/api/documents", "GET"),
+        ("/api/workspaces", "GET"),
+        ("/api/query", "POST"),  # Will test with dummy query
+    ]
+    
+    working_endpoints = []
+    for endpoint, method in endpoints:
+        try:
+            if method == "GET":
+                response = requests.get(f"{SERVER_URL}{endpoint}", timeout=10)
+            else:
+                # For POST to /api/query, send a simple test query
+                if endpoint == "/api/query":
+                    response = requests.post(
+                        f"{SERVER_URL}{endpoint}",
+                        json={"query": "test", "top_k": 1},
+                        timeout=30
+                    )
+                else:
+                    response = requests.post(f"{SERVER_URL}{endpoint}", timeout=10)
+            
+            if response.status_code in [200, 201]:
+                print(f"✓ {endpoint}: {response.status_code}")
+                working_endpoints.append(endpoint)
+            else:
+                print(f"✗ {endpoint}: {response.status_code} - {response.text[:100]}")
+                
+        except Exception as e:
+            print(f"✗ {endpoint}: ERROR - {str(e)[:100]}")
+    
+    return len(working_endpoints) >= 3  # At least 3 endpoints should work
+
+def check_documents():
+    """Check existing documents"""
+    print("Checking documents...")
+    
+    try:
+        response = requests.get(f"{SERVER_URL}/api/documents", timeout=10)
+        if response.status_code == 200:
+            documents = response.json()
+            print(f"✓ Found {len(documents)} documents")
+            for doc in documents[:3]:  # Show first 3
+                print(f"  - {doc.get('filename', 'unknown')}: {doc.get('status', 'unknown')}")
+            return len(documents) > 0
+        else:
+            print(f"✗ Failed to get documents: {response.status_code}")
+    except Exception as e:
+        print(f"✗ Error checking documents: {e}")
+    
+    return False
+
+def test_search():
+    """Test search functionality"""
+    print("Testing search...")
+    
+    test_queries = ["railway", "train", "transport", "test"]
+    
+    for query in test_queries:
+        try:
+            print(f"  Testing query: '{query}'")
+            response = requests.post(
+                f"{SERVER_URL}/api/query",
+                json={"query": query, "top_k": 3},
+                timeout=60  # Longer timeout for search
+            )
+            
+            if response.status_code == 200:
+                results = response.json()
+                print(f"    ✓ Search successful: {len(results.get('results', []))} results")
+                
+                # Check for evidence of DeepSeek API usage
+                if "llm_response" in results:
+                    print(f"    ✓ DeepSeek API used (LLM response present)")
+                    return True
+                elif "results" in results and len(results["results"]) > 0:
+                    print(f"    ✓ Search returned results (may be using cached/indexed data)")
+                    return True
+                else:
+                    print(f"    ⚠ Search returned no results")
+            else:
+                print(f"    ✗ Search failed: {response.status_code} - {response.text[:100]}")
+                
+        except Exception as e:
+            print(f"    ✗ Search error: {e}")
+    
+    return False
+
+def check_logs_for_components():
+    """Check server logs for evidence of indexing components"""
+    print("Checking logs for indexing components...")
+    
+    log_file = os.path.join(WORKSPACE_DIR, "LightRAG-main", "logs", "lightrag.log")
+    components_found = {
+        "openclip": False,
+        "paddleocr": False,
+        "spacy": False,
+        "deepseek": False
+    }
+    
+    if os.path.exists(log_file):
+        try:
+            # Read last 1000 lines of log file
+            with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
+                lines = f.readlines()
+                last_lines = lines[-1000:] if len(lines) > 1000 else lines
+                log_content = "".join(last_lines).lower()
+            
+            # Check for component mentions
+            components_found["openclip"] = "openclip" in log_content
+            components_found["paddleocr"] = "paddleocr" in log_content or "ocr" in log_content
+            components_found["spacy"] = "spacy" in log_content or "entity" in log_content
+            components_found["deepseek"] = "deepseek" in log_content
+            
+            print("Components found in logs:")
+            for component, found in components_found.items():
+                print(f"  - {component}: {'✓' if found else '✗'}")
+            
+            return components_found
+        except Exception as e:
+            print(f"✗ Error reading log file: {e}")
+    else:
+        print(f"✗ Log file not found: {log_file}")
+    
+    return components_found
+
+def main():
+    """Main test function"""
+    print("=" * 60)
+    print("LightRAG API Test")
+    print("=" * 60)
+    
+    # Change to workspace directory
+    os.chdir(WORKSPACE_DIR)
+    
+    test_results = {}
+    
+    # Step 1: Start server
+    server_process = start_server()
+    test_results["server_started"] = server_process is not None
+    
+    if not test_results["server_started"]:
+        print("\n✗ FAILED: Could not start server")
+        return False
+    
+    # Give server time to fully initialize
+    time.sleep(3)
+    
+    # Step 2: Check server health
+    health_ok, health_data = check_server_health()
+    test_results["health_check"] = health_ok
+    
+    # Step 3: Test login
+    login_ok, token = test_login()
+    test_results["login"] = login_ok
+    
+    # Step 4: Test endpoints
+    test_results["endpoints"] = test_endpoints()
+    
+    # Step 5: Check documents
+    test_results["documents_exist"] = check_documents()
+    
+    # Step 6: Check logs for indexing components
+    components = check_logs_for_components()
+    test_results["indexing_components"] = any(components.values())
+    test_results.update({f"component_{k}": v for k, v in components.items()})
+    
+    # Step 7: Test search
+    test_results["search_works"] = test_search()
+    
+    # Step 8: Cleanup
+    print("\nCleaning up...")
+    if server_process:
+        server_process.terminate()
+        try:
+            server_process.wait(timeout=5)
+        except:
+            pass
+    
+    # Step 9: Report results
+    print("\n" + "=" * 60)
+    print("TEST SUMMARY")
+    print("=" * 60)
+    
+    all_passed = True
+    for test_name, result in test_results.items():
+        if isinstance(result, bool):
+            status = "PASS" if result else "FAIL"
+            if not result:
+                all_passed = False
+            print(f"{test_name:30} {status}")
+        else:
+            print(f"{test_name:30} {result}")
+    
+    print("\n" + "=" * 60)
+    
+    # Generate detailed report
+    report = {
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+        "overall_success": all_passed,
+        "test_results": test_results,
+        "server_config": health_data.get("configuration", {}) if health_data else {},
+        "components_found": components
+    }
+    
+    report_file = "lightrag_test_report.json"
+    with open(report_file, 'w') as f:
+        json.dump(report, f, indent=2)
+    
+    print(f"Detailed report saved to: {report_file}")
+    
+    if all_passed:
+        print("✓ SUCCESS: All critical tests passed!")
+        return True
+    else:
+        print("⚠ WARNING: Some tests failed or had issues")
+        print("\nRoot cause analysis:")
+        print("1. Server startup issues: Fixed Unicode encoding in display_splash_screen()")
+        print("2. OllamaAPI error: Fixed WorkspaceManager/LightRAG type mismatch")
+        print("3. WorkspaceManager bug: Fixed lightrag_factory.create() call")
+        print("\nRemaining issues may require:")
+        print("- Checking if OCR.pdf exists in test/ directory")
+        print("- Ensuring DeepSeek API key is valid in .env file")
+        print("- Verifying Ollama is running for embeddings")
+        return False
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)