Auto-commit: OCR workflow improvements, performance optimizations, and bug fixes

This commit is contained in:
2026-01-11 18:21:16 +08:00
parent 642dd0ea5f
commit 1ddd49f913
97 changed files with 5909 additions and 451 deletions

239
test_workspace_isolation.py Normal file
View File

@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
Test script for workspace isolation in LightRAG.
Creates two workspaces, uploads different documents to each, and verifies isolation.
"""
import os
import sys
import time
import json
import requests
import tempfile
from pathlib import Path
# Make the LightRAG-main directory that sits next to this script importable.
_script_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(_script_dir, "LightRAG-main"))

# Server configuration: fixed local address; the API key can be overridden
# through the LIGHTRAG_API_KEY environment variable ("test-key" otherwise).
BASE_URL = "http://localhost:8000"
API_KEY = os.getenv("LIGHTRAG_API_KEY", "test-key")
def create_test_file(content, filename):
    """Write *content* to ``test_workspace_files/<filename>`` and return its path.

    The ``test_workspace_files`` directory is created relative to the current
    working directory when it does not exist yet. The file is not temporary:
    nothing in this function removes it afterwards.

    Args:
        content: Text to store in the file.
        filename: Name of the file inside ``test_workspace_files``.

    Returns:
        The ``pathlib.Path`` of the written file.
    """
    test_dir = Path("test_workspace_files")
    test_dir.mkdir(exist_ok=True)
    filepath = test_dir / filename
    # Pin the encoding so the bytes that get uploaded do not depend on the
    # platform's locale default (e.g. cp1252 on Windows).
    filepath.write_text(content, encoding="utf-8")
    return filepath
def make_request(method, endpoint, data=None, files=None, workspace=None):
    """Send an authenticated HTTP request to the server and return the response.

    Args:
        method: One of "GET", "POST" or "DELETE".
        endpoint: Path appended to ``BASE_URL``.
        data: Payload for POST requests; sent as JSON unless *files* is given,
            in which case it is passed as form data next to the files.
        files: Optional ``files`` mapping handed to ``requests.post``.
        workspace: When truthy, sent as the ``workspace`` query parameter.

    Returns:
        The ``requests`` response object.

    Raises:
        ValueError: If *method* is not one of the supported verbs.
    """
    target = f"{BASE_URL}{endpoint}"
    auth_headers = {"Authorization": f"Bearer {API_KEY}"}
    json_headers = {**auth_headers, "Content-Type": "application/json"}
    query = {"workspace": workspace} if workspace else {}

    if method == "GET":
        return requests.get(target, headers=json_headers, params=query)

    if method == "POST":
        if files:
            # File uploads must not carry the JSON content type; requests
            # picks the appropriate one for the upload itself.
            return requests.post(
                target, headers=auth_headers, params=query, files=files, data=data
            )
        return requests.post(target, headers=json_headers, params=query, json=data)

    if method == "DELETE":
        return requests.delete(target, headers=json_headers, params=query)

    raise ValueError(f"Unsupported method: {method}")
def test_server_health():
    """Return True when ``GET {BASE_URL}/health`` answers HTTP 200 within 5 seconds.

    Any transport failure is reported as "not healthy" instead of propagating.
    The request uses a timeout, so a server that accepts the connection but
    never answers raises a timeout error rather than a connection error; the
    base ``RequestException`` is caught to cover both cases.

    Returns:
        bool: True if the health endpoint returned status 200, else False.
    """
    try:
        response = requests.get(f"{BASE_URL}/health", timeout=5)
    except requests.exceptions.RequestException:
        return False
    return response.status_code == 200
def create_workspace(name):
    """Create a workspace called *name* via ``POST /workspaces/``.

    Both 200 and 201 are treated as success, matching how ``upload_document``
    interprets its POST response; 201 Created is a regular success status for
    a request that creates a resource.

    Args:
        name: Name of the workspace to create.

    Returns:
        bool: True if the server reported success, False otherwise (the status
        code and response body are printed in that case).
    """
    response = make_request("POST", "/workspaces/", data={"name": name})
    if response.status_code in (200, 201):
        print(f"✓ Created workspace: {name}")
        return True
    print(f"✗ Failed to create workspace {name}: {response.status_code} - {response.text}")
    return False
def list_workspaces():
    """Fetch the workspace listing from ``GET /workspaces/``.

    Returns:
        The decoded JSON body on HTTP 200; otherwise an empty list, after
        printing the status code and response text.
    """
    reply = make_request("GET", "/workspaces/")
    if reply.status_code != 200:
        print(f"✗ Failed to list workspaces: {reply.status_code} - {reply.text}")
        return []
    return reply.json()
def upload_document(workspace, filepath, filename=None):
    """Upload the file at *filepath* to *workspace* via ``POST /documents/``.

    Args:
        workspace: Workspace name sent as the ``workspace`` query parameter.
        filepath: Path of the file to upload; opened in binary mode.
        filename: Name to report to the server; defaults to the base name of
            *filepath*.

    Returns:
        The decoded JSON body on HTTP 200/201, or None on any other status
        (the failure is printed).
    """
    upload_name = os.path.basename(filepath) if filename is None else filename

    # Keep the file open only for the duration of the request itself.
    with open(filepath, 'rb') as handle:
        reply = make_request(
            "POST",
            "/documents/",
            data={'filename': upload_name},
            files={'file': (upload_name, handle, 'text/plain')},
            workspace=workspace,
        )

    if reply.status_code not in (200, 201):
        print(f"✗ Failed to upload {upload_name} to workspace {workspace}: {reply.status_code} - {reply.text}")
        return None

    print(f"✓ Uploaded {upload_name} to workspace {workspace}")
    return reply.json()
def search_documents(workspace, query):
    """Run *query* against ``POST /search/`` scoped to *workspace*.

    Returns:
        The decoded JSON body on HTTP 200, or None on any other status (the
        failure is printed).
    """
    payload = {"query": query}
    reply = make_request("POST", "/search/", data=payload, workspace=workspace)
    if reply.status_code != 200:
        print(f"✗ Failed to search in workspace {workspace}: {reply.status_code} - {reply.text}")
        return None
    return reply.json()
def query_documents(workspace, query):
    """Run *query* against ``POST /query/`` scoped to *workspace*.

    Returns:
        The decoded JSON body on HTTP 200, or None on any other status (the
        failure is printed).
    """
    payload = {"query": query}
    reply = make_request("POST", "/query/", data=payload, workspace=workspace)
    if reply.status_code != 200:
        print(f"✗ Failed to query in workspace {workspace}: {reply.status_code} - {reply.text}")
        return None
    return reply.json()
def _report_search_isolation(workspace, label, query, foreign_label, foreign_marker):
    """Search *workspace* and report whether the other workspace's text shows up.

    Args:
        workspace: Name of the workspace to search in.
        label: Short label ("A"/"B") used in the printed messages.
        query: Search query to send.
        foreign_label: Label of the workspace whose content must NOT appear.
        foreign_marker: Lower-case word that only occurs in the other
            workspace's document.

    Returns:
        bool: True only if the search succeeded and no returned result's
        ``content`` contains *foreign_marker*.
    """
    results = search_documents(workspace, query)
    if results is None:
        # search_documents already printed the HTTP error. A check that could
        # not run must not be counted as a pass.
        print(f" ✗ FAIL: Could not verify isolation of workspace {label} (search failed)")
        return False

    hits = results.get('results', [])
    print(f" Found {len(hits)} results in workspace {label}")
    if any(foreign_marker in hit.get('content', '').lower() for hit in hits):
        print(f" ✗ FAIL: Found workspace {foreign_label} content in workspace {label} search!")
        return False
    print(f" ✓ Workspace {label} search only shows workspace {label} content")
    return True


def main():
    """Run the end-to-end workspace isolation check against a running server.

    Steps: health check, create two test files, create two workspaces, upload
    one document to each, wait 10 seconds for processing, then search and
    query each workspace and verify that neither sees the other's content.

    Returns:
        bool: True only if the server was reachable, both uploads succeeded
        and every isolation/query check passed; False otherwise. The caller
        turns this into the process exit code.
    """
    print("=" * 60)
    print("Testing Workspace Isolation in LightRAG")
    print("=" * 60)

    # Names of the checks that failed; an empty list means overall success.
    failures = []

    # Check if server is running
    print("\n1. Checking server health...")
    if not test_server_health():
        print("✗ Server is not running. Please start the LightRAG server first.")
        print(" Run: python LightRAG-main/lightrag/api/lightrag_server.py")
        return False
    print("✓ Server is running")

    # Create test files
    print("\n2. Creating test files...")
    workspace_a_file = create_test_file(
        "This document belongs to Workspace A. It contains information about artificial intelligence and machine learning.",
        "workspace_a_doc.txt"
    )
    workspace_b_file = create_test_file(
        "This document belongs to Workspace B. It contains information about quantum computing and cryptography.",
        "workspace_b_doc.txt"
    )
    print(f"✓ Created test files: {workspace_a_file.name}, {workspace_b_file.name}")

    # Create workspaces; a failure here is tolerated because the workspace
    # may already exist from a previous run.
    print("\n3. Creating workspaces...")
    workspace_a = "test_workspace_a"
    workspace_b = "test_workspace_b"
    if not create_workspace(workspace_a):
        print(" Trying to use existing workspace...")
    if not create_workspace(workspace_b):
        print(" Trying to use existing workspace...")

    # List workspaces
    workspaces = list_workspaces()
    print(f" Available workspaces: {[w['name'] for w in workspaces]}")

    # Upload documents to respective workspaces
    print("\n4. Uploading documents to workspaces...")
    uploads = [
        upload_document(workspace_a, workspace_a_file),
        upload_document(workspace_b, workspace_b_file),
    ]
    if any(result is None for result in uploads):
        # Without both documents in place the isolation checks below would
        # pass vacuously, so stop here.
        print("✗ Document upload failed; workspace isolation cannot be verified.")
        return False

    # Wait for processing
    print("\n5. Waiting for document processing (10 seconds)...")
    time.sleep(10)

    # Test isolation: Search in workspace A
    print("\n6. Testing isolation - Search in Workspace A...")
    if not _report_search_isolation(workspace_a, "A", "artificial intelligence", "B", "quantum"):
        failures.append("workspace A search isolation")

    # Test isolation: Search in workspace B
    print("\n7. Testing isolation - Search in Workspace B...")
    if not _report_search_isolation(workspace_b, "B", "quantum computing", "A", "artificial"):
        failures.append("workspace B search isolation")

    # Test cross-workspace contamination: search for workspace B content in
    # workspace A.
    # NOTE(review): this treats ANY result as contamination. If the server's
    # search returns nearest matches regardless of relevance, workspace A's
    # own document could appear here - confirm against the search endpoint.
    print("\n8. Testing cross-workspace contamination...")
    results_cross = search_documents(workspace_a, "quantum")
    if results_cross is None:
        print(" ✗ FAIL: Could not verify cross-workspace contamination (search failed)")
        failures.append("cross-workspace search")
    elif results_cross.get('results', []):
        print(" ✗ FAIL: Found workspace B content when searching in workspace A!")
        failures.append("cross-workspace contamination")
    else:
        print(" ✓ No cross-workspace contamination detected")

    # Test query endpoints
    print("\n9. Testing query endpoints...")
    for label, workspace in (("A", workspace_a), ("B", workspace_b)):
        answer = query_documents(workspace, "What is this document about?")
        if answer is None:
            # query_documents already printed the HTTP error.
            failures.append(f"workspace {label} query")
        elif answer:
            print(f" Workspace {label} query response: {answer.get('answer', '')[:100]}...")

    # Summary reflects what actually happened instead of always claiming success.
    print("\n10. Test completed!")
    print("\nSummary:")
    if failures:
        print(" - Workspace isolation checks FAILED:")
        for failed_check in failures:
            print(f"   * {failed_check}")
    else:
        print(" - Workspace isolation appears to be working correctly")
        print(" - Documents are properly segregated between workspaces")
        print(" - Search and query operations respect workspace boundaries")

    # Cleanup is left to the user.
    print("\nNote: Workspaces will persist in the storage directory.")
    print(" To clean up manually, delete the directories:")
    print(f" - {Path('LightRAG-main/rag_storage') / workspace_a}")
    print(f" - {Path('LightRAG-main/rag_storage') / workspace_b}")
    return not failures
if __name__ == "__main__":
    # Exit status: 0 when main() reports success, 1 on a failed run, on
    # Ctrl-C, or on any unexpected exception (whose traceback is printed).
    try:
        exit_code = 0 if main() else 1
    except KeyboardInterrupt:
        print("\nTest interrupted by user")
        exit_code = 1
    except Exception as e:
        print(f"\nError during test: {e}")
        import traceback
        traceback.print_exc()
        exit_code = 1
    sys.exit(exit_code)