workspace working

This commit is contained in:
2026-01-12 22:31:11 +08:00
parent 2738a822d1
commit 370fe6368a
149 changed files with 4648 additions and 660 deletions

View File

@@ -1,239 +1,207 @@
#!/usr/bin/env python3
"""
Test script for workspace isolation in LightRAG.
Creates two workspaces, uploads different documents to each, and verifies isolation.
"""
import os
import sys
import time
import json
import requests
import tempfile
import time
import sys
import json
from pathlib import Path
# Make the "LightRAG-main" directory that sits next to this script importable.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "LightRAG-main"))

# Server configuration.
# SERVER_URL is used by the header-based helpers (get_headers and friends),
# BASE_URL by the make_request-based helpers.
SERVER_URL = "http://localhost:3015"
BASE_URL = "http://localhost:8000"

# The key is read from the environment. A literal key used to be assigned to
# API_KEY just above and was overwritten by this line before anything could
# read it; it is removed so that no credential is kept in the source.
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "test-key")
def create_test_file(content: str, filename: str) -> Path:
    """Write ``content`` to ``test_workspace_files/<filename>`` and return its path.

    The ``test_workspace_files`` directory is created relative to the current
    working directory when it does not exist yet. An existing file with the
    same name is overwritten.
    """
    test_dir = Path("test_workspace_files")
    test_dir.mkdir(exist_ok=True)
    filepath = test_dir / filename
    # Pin the encoding: the platform default is not UTF-8 everywhere, and the
    # fixture text must be stored the same way on every machine.
    filepath.write_text(content, encoding="utf-8")
    return filepath
def make_request(method, endpoint, data=None, files=None, workspace=None):
    """Send a GET, POST or DELETE request to ``BASE_URL + endpoint``.

    Args:
        method: "GET", "POST" or "DELETE".
        endpoint: Path appended to BASE_URL.
        data: For POST, sent as the JSON body, or as form data when ``files``
            is given. Ignored for GET and DELETE.
        files: Optional ``files`` mapping for a multipart POST.
        workspace: When truthy, sent as the ``workspace`` query parameter.

    Returns:
        The ``requests`` response object.

    Raises:
        ValueError: If ``method`` is not one of the supported verbs.
    """
    # Reject unknown verbs first so a typo fails before anything is built or sent.
    if method not in ("GET", "POST", "DELETE"):
        raise ValueError(f"Unsupported method: {method}")

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    url = f"{BASE_URL}{endpoint}"
    params = {"workspace": workspace} if workspace else {}

    if method == "GET":
        return requests.get(url, headers=headers, params=params)
    if method == "DELETE":
        return requests.delete(url, headers=headers, params=params)
    if files:
        # For file uploads, don't use JSON content-type
        headers.pop("Content-Type", None)
        return requests.post(url, headers=headers, params=params, files=files, data=data)
    return requests.post(url, headers=headers, params=params, json=data)


def get_headers(workspace=None):
    """Build the request headers: the API key plus, when given, the workspace.

    Returns a new dict with ``X-API-Key`` and, if ``workspace`` is truthy,
    ``X-Workspace``.
    """
    headers = {"X-API-Key": API_KEY}
    if workspace:
        headers["X-Workspace"] = workspace
    return headers


def upload_file(file_path, workspace=None):
    """Upload file to workspace.

    Args:
        file_path: Path of the file to upload (``str`` or ``Path``).
        workspace: Workspace name sent in the request headers, if any.

    Returns:
        The ``track_id`` from the server's JSON reply, or None when the
        server does not answer with status 200.
    """
    file_path = Path(file_path)  # accept plain strings as well as Path objects
    headers = get_headers(workspace)
    with open(file_path, 'rb') as f:
        files = {'file': (file_path.name, f)}
        resp = requests.post(f"{SERVER_URL}/documents/upload", files=files, headers=headers)
    if resp.status_code != 200:
        print(f"Upload failed: {resp.status_code} {resp.text}")
        return None
    data = resp.json()
    track_id = data.get('track_id')
    print(f"Uploaded {file_path.name} to workspace {workspace}: track_id {track_id}")
    return track_id


def wait_for_indexing(timeout=120):
    """Wait until pipeline is not busy.

    Polls ``/documents/pipeline_status`` every 5 seconds.

    Args:
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as the status reply reports ``busy`` as false (or omits
        it); False when ``timeout`` elapses first.
    """
    # A monotonic clock keeps the deadline correct if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # Bound each poll so a hung connection cannot outlive the deadline.
            resp = requests.get(
                f"{SERVER_URL}/documents/pipeline_status",
                headers=get_headers(),
                timeout=10,
            )
        except requests.exceptions.RequestException as exc:
            # A transient network failure is just another unsuccessful poll.
            print(f"Failed to get pipeline status: {exc}")
        else:
            if resp.status_code != 200:
                print(f"Failed to get pipeline status: {resp.status_code}")
            else:
                data = resp.json()
                if not data.get('busy', False):
                    print("Pipeline idle, indexing likely complete")
                    return True
                print(f"Pipeline busy: {data.get('job_name')} {data.get('cur_batch')}/{data.get('batchs')}")
        time.sleep(5)
    print("Timeout waiting for indexing")
    return False
def test_server_health():
    """Check if server is running.

    Returns True when ``GET BASE_URL/health`` answers with status 200 within
    5 seconds, and False on any other status or on any request failure.
    """
    try:
        response = requests.get(f"{BASE_URL}/health", timeout=5)
    except requests.exceptions.RequestException:
        # Catch the whole requests error family: with a timeout set, a slow
        # server raises ReadTimeout, which is not a ConnectionError.
        return False
    return response.status_code == 200
def create_workspace(name):
    """Ask the server to create workspace ``name`` via ``make_request``.

    Prints the outcome and returns True when the server answers with
    status 200, otherwise False.
    """
    # NOTE(review): another create_workspace is defined further down in this
    # file; being later, that definition is the one bound after import.
    reply = make_request("POST", "/workspaces/", data={"name": name})
    created = reply.status_code == 200
    if created:
        print(f"✓ Created workspace: {name}")
    else:
        print(f"✗ Failed to create workspace {name}: {reply.status_code} - {reply.text}")
    return created
def search(query, workspace=None):
    """POST ``query`` to ``SERVER_URL/search`` with the workspace headers.

    Returns the decoded JSON reply, or None (after printing the status and
    body) when the server does not answer with status 200.
    """
    reply = requests.post(
        f"{SERVER_URL}/search",
        json={"query": query},
        headers=get_headers(workspace),
    )
    if reply.status_code == 200:
        return reply.json()
    print(f"Search failed: {reply.status_code} {reply.text}")
    return None
def list_workspaces():
    """List all workspaces.

    Issues a single ``GET SERVER_URL/workspaces/`` with the API-key headers.

    Returns:
        The decoded JSON reply on status 200; otherwise an empty list, after
        printing the status code and response body.
    """
    # The span used to carry two request/response sequences for the same call
    # (one via make_request, one direct); only the direct one is kept so the
    # server is queried once and a failure is reported once.
    resp = requests.get(f"{SERVER_URL}/workspaces/", headers=get_headers())
    if resp.status_code == 200:
        return resp.json()
    print(f"Failed to list workspaces: {resp.status_code} {resp.text}")
    return []
def upload_document(workspace, filepath, filename=None):
    """Upload a document to a workspace.

    Args:
        workspace: Workspace name, passed on to ``make_request``.
        filepath: Path of the file to send.
        filename: Name reported to the server; defaults to the base name of
            ``filepath``.

    Returns:
        The decoded JSON reply on status 200 or 201, otherwise None.
    """
    if filename is None:
        filename = os.path.basename(filepath)

    # The request must be sent while the file handle is still open.
    with open(filepath, 'rb') as f:
        files = {'file': (filename, f, 'text/plain')}
        data = {'filename': filename}
        response = make_request("POST", "/documents/", data=data, files=files, workspace=workspace)

    if response.status_code in (200, 201):
        print(f"✓ Uploaded {filename} to workspace {workspace}")
        return response.json()
    print(f"Failed to upload {filename} to workspace {workspace}: {response.status_code} - {response.text}")
    return None


def create_workspace(name):
    """Create workspace ``name`` with a direct POST to ``SERVER_URL/workspaces/``.

    Prints the outcome and returns True on status 200, otherwise False.
    """
    resp = requests.post(f"{SERVER_URL}/workspaces/", json={"name": name}, headers=get_headers())
    if resp.status_code == 200:
        print(f"Created workspace {name}")
        return True
    print(f"Failed to create workspace: {resp.status_code} {resp.text}")
    return False
def search_documents(workspace, query):
    """Search for documents in a workspace.

    POSTs ``{"query": query}`` to ``/search/`` through ``make_request``.

    Returns:
        The decoded JSON reply on status 200, otherwise None (after printing
        the status code and response body).
    """
    response = make_request("POST", "/search/", data={"query": query}, workspace=workspace)
    if response.status_code == 200:
        return response.json()
    print(f"Failed to search in workspace {workspace}: {response.status_code} - {response.text}")
    return None


def delete_workspace(name):
    """Delete workspace ``name`` with ``DELETE SERVER_URL/workspaces/<name>``.

    Prints the outcome and returns True on status 200, otherwise False.
    """
    resp = requests.delete(f"{SERVER_URL}/workspaces/{name}", headers=get_headers())
    if resp.status_code == 200:
        print(f"Deleted workspace {name}")
        return True
    print(f"Failed to delete workspace: {resp.status_code} {resp.text}")
    return False
def query_documents(workspace, query):
    """Query documents in a workspace.

    POSTs ``{"query": query}`` to ``/query/`` through ``make_request``.

    Returns:
        The decoded JSON reply on status 200, otherwise None (after printing
        the status code and response body).
    """
    response = make_request("POST", "/query/", data={"query": query}, workspace=workspace)
    if response.status_code == 200:
        return response.json()
    print(f"Failed to query in workspace {workspace}: {response.status_code} - {response.text}")
    return None


def get_documents(workspace=None):
    """Fetch ``SERVER_URL/documents`` using the headers for ``workspace``.

    Returns:
        The decoded JSON reply on status 200, otherwise None (after printing
        the status code and response body).
    """
    resp = requests.get(f"{SERVER_URL}/documents", headers=get_headers(workspace))
    if resp.status_code == 200:
        return resp.json()
    print(f"Failed to get documents: {resp.status_code} {resp.text}")
    return None
def _print_document_counts(workspace):
    """Print, per status, how many documents ``get_documents`` reports for ``workspace``."""
    docs = get_documents(workspace=workspace)
    if docs:
        for status, doc_list in docs.get('statuses', {}).items():
            print(f"{status}: {len(doc_list)}")


def _print_search_summary(results, preview=0):
    """Print the total of a search reply and a preview of its first ``preview`` hits."""
    if not results:
        return
    print(f"Total results: {results.get('total_results')}")
    for i, r in enumerate(results.get('results', [])[:preview], start=1):
        # A hit without content (missing or None) must not crash the report.
        content = r.get('content') or ''
        print(f" {i}. {r.get('type')}: {content[:80]}...")


def main():
    """Run the workspace-isolation scenario against a running server.

    Steps: recreate two workspaces, upload ``test/test.docx`` to the first and
    ``test/ocr.pdf`` to the second, wait for indexing, print document counts
    and search results for both, delete the first workspace and check that the
    second still answers searches.

    Exits the process with status 1 when the test files are missing or an
    upload fails. Returns True when the scenario runs to the end, so callers
    that turn the return value into an exit code keep working.
    """
    print("=" * 60)
    print("Testing Workspace Isolation in LightRAG")
    print("=" * 60)

    # Ensure test files exist
    test_dir = Path("test")
    if not test_dir.exists():
        print("Test directory not found")
        sys.exit(1)
    file1 = test_dir / "test.docx"
    file2 = test_dir / "ocr.pdf"
    if not file1.exists() or not file2.exists():
        print("Test files missing")
        sys.exit(1)

    # Create fresh workspaces
    ws1 = "isolated_ws1"
    ws2 = "isolated_ws2"

    # Delete if they already exist (cleanup)
    for ws in list_workspaces():
        if ws['name'] in (ws1, ws2):
            delete_workspace(ws['name'])
    create_workspace(ws1)
    create_workspace(ws2)

    # Upload file1 to ws1
    track1 = upload_file(file1, workspace=ws1)
    if not track1:
        print("Failed to upload file1")
        sys.exit(1)

    # Upload file2 to ws2
    track2 = upload_file(file2, workspace=ws2)
    if not track2:
        print("Failed to upload file2")
        sys.exit(1)

    # Wait for indexing
    print("Waiting for indexing...")
    if not wait_for_indexing():
        print("Indexing timed out, but continuing")

    # Give extra time for processing
    time.sleep(10)

    # Check documents in each workspace
    print("\n=== Documents in ws1 ===")
    _print_document_counts(ws1)
    print("\n=== Documents in ws2 ===")
    _print_document_counts(ws2)

    # Search for content in each workspace using a generic term.
    query = "test"
    print(f"\n=== Search for '{query}' in ws1 ===")
    _print_search_summary(search(query, workspace=ws1), preview=3)
    print(f"\n=== Search for '{query}' in ws2 ===")
    _print_search_summary(search(query, workspace=ws2), preview=3)

    # "OCR" is expected in ocr.pdf (ws2) but not in test.docx (ws1). The
    # totals are only printed, not asserted, because the content of the
    # fixture files is not guaranteed.
    query2 = "OCR"
    print(f"\n=== Search for '{query2}' in ws1 (should be none) ===")
    _print_search_summary(search(query2, workspace=ws1))
    print(f"\n=== Search for '{query2}' in ws2 (should have results) ===")
    _print_search_summary(search(query2, workspace=ws2))

    # Now delete workspace ws1
    print(f"\n=== Deleting workspace {ws1} ===")
    delete_workspace(ws1)

    # Wait a bit for cleanup
    time.sleep(5)

    # Try to search in ws1 (should fail or return zero results)
    print(f"\n=== Search in deleted workspace {ws1} (should fail) ===")
    results_deleted = search(query, workspace=ws1)
    if results_deleted:
        print(f"Unexpectedly got results: {results_deleted.get('total_results')}")
    else:
        print("Search failed as expected (workspace not found)")

    # Verify ws2 still works
    print(f"\n=== Search in remaining workspace {ws2} ===")
    results_ws2 = search(query, workspace=ws2)
    if results_ws2:
        print(f"Workspace still functional: {results_ws2.get('total_results')} results")

    print("\n=== Test completed ===")
    return True
if __name__ == "__main__":
    # Run the scenario exactly once and turn its outcome into an exit code.
    try:
        success = main()
    except KeyboardInterrupt:
        print("\nTest interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure with its traceback, then
        # exit non-zero.
        print(f"\nError during test: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    # Only an explicit False means failure; a main() that finishes without
    # returning a value (None) counts as success.
    sys.exit(1 if success is False else 0)