workspace working

2026-01-12 22:31:11 +08:00
parent 2738a822d1
commit 370fe6368a
149 changed files with 4648 additions and 660 deletions
--- a/test_workspace_isolation.py
+++ b/test_workspace_isolation.py
@@ -1,239 +1,207 @@
-#!/usr/bin/env python3
-"""
-Test script for workspace isolation in LightRAG.
-Creates two workspaces, uploads different documents to each, and verifies isolation.
-"""
-
-import os
-import sys
-import time
-import json
 import requests
-import tempfile
+import time
+import sys
+import json
 from pathlib import Path

-# Add LightRAG to path
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "LightRAG-main"))
+SERVER_URL = "http://localhost:3015"
+API_KEY = "jleu1212"

-# Server configuration
-BASE_URL = "http://localhost:8000"
-API_KEY = os.environ.get("LIGHTRAG_API_KEY", "test-key")
-
-def create_test_file(content, filename):
-    """Create a temporary text file with given content."""
-    test_dir = Path("test_workspace_files")
-    test_dir.mkdir(exist_ok=True)
-    filepath = test_dir / filename
-    filepath.write_text(content)
-    return filepath
-
-def make_request(method, endpoint, data=None, files=None, workspace=None):
-    """Make HTTP request with proper headers and workspace parameter."""
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json"
-    }
-    
-    url = f"{BASE_URL}{endpoint}"
-    
-    # Add workspace query parameter if provided
-    params = {}
+def get_headers(workspace=None):
+    headers = {"X-API-Key": API_KEY}
    if workspace:
-        params["workspace"] = workspace
-    
-    if method == "GET":
-        response = requests.get(url, headers=headers, params=params)
-    elif method == "POST":
-        if files:
-            # For file uploads, don't use JSON content-type
-            headers.pop("Content-Type", None)
-            response = requests.post(url, headers=headers, params=params, files=files, data=data)
+        headers["X-Workspace"] = workspace
+    return headers
+
+def upload_file(file_path, workspace=None):
+    """Upload file to workspace"""
+    headers = get_headers(workspace)
+    with open(file_path, 'rb') as f:
+        files = {'file': (file_path.name, f)}
+        resp = requests.post(f"{SERVER_URL}/documents/upload", files=files, headers=headers)
+    if resp.status_code != 200:
+        print(f"Upload failed: {resp.status_code} {resp.text}")
+        return None
+    data = resp.json()
+    print(f"Uploaded {file_path.name} to workspace {workspace}: track_id {data.get('track_id')}")
+    return data.get('track_id')
+
+def wait_for_indexing(timeout=120):
+    """Wait until pipeline is not busy"""
+    start = time.time()
+    while time.time() - start < timeout:
+        resp = requests.get(f"{SERVER_URL}/documents/pipeline_status", headers=get_headers())
+        if resp.status_code == 200:
+            data = resp.json()
+            if not data.get('busy', False):
+                print("Pipeline idle, indexing likely complete")
+                return True
+            else:
+                print(f"Pipeline busy: {data.get('job_name')} {data.get('cur_batch')}/{data.get('batchs')}")
        else:
-            response = requests.post(url, headers=headers, params=params, json=data)
-    elif method == "DELETE":
-        response = requests.delete(url, headers=headers, params=params)
-    else:
-        raise ValueError(f"Unsupported method: {method}")
-    
-    return response
+            print(f"Failed to get pipeline status: {resp.status_code}")
+        time.sleep(5)
+    print("Timeout waiting for indexing")
+    return False

-def test_server_health():
-    """Check if server is running."""
-    try:
-        response = requests.get(f"{BASE_URL}/health", timeout=5)
-        return response.status_code == 200
-    except requests.exceptions.ConnectionError:
-        return False
-
-def create_workspace(name):
-    """Create a new workspace."""
-    response = make_request("POST", "/workspaces/", data={"name": name})
-    if response.status_code == 200:
-        print(f"✓ Created workspace: {name}")
-        return True
-    else:
-        print(f"✗ Failed to create workspace {name}: {response.status_code} - {response.text}")
-        return False
+def search(query, workspace=None):
+    headers = get_headers(workspace)
+    resp = requests.post(f"{SERVER_URL}/search", json={"query": query}, headers=headers)
+    if resp.status_code != 200:
+        print(f"Search failed: {resp.status_code} {resp.text}")
+        return None
+    data = resp.json()
+    return data

 def list_workspaces():
-    """List all workspaces."""
-    response = make_request("GET", "/workspaces/")
-    if response.status_code == 200:
-        return response.json()
+    resp = requests.get(f"{SERVER_URL}/workspaces/", headers=get_headers())
+    if resp.status_code == 200:
+        return resp.json()
    else:
-        print(f"✗ Failed to list workspaces: {response.status_code} - {response.text}")
+        print(f"Failed to list workspaces: {resp.status_code} {resp.text}")
        return []

-def upload_document(workspace, filepath, filename=None):
-    """Upload a document to a workspace."""
-    if filename is None:
-        filename = os.path.basename(filepath)
-    
-    with open(filepath, 'rb') as f:
-        files = {'file': (filename, f, 'text/plain')}
-        data = {'filename': filename}
-        response = make_request("POST", "/documents/", data=data, files=files, workspace=workspace)
-    
-    if response.status_code in (200, 201):
-        print(f"✓ Uploaded {filename} to workspace {workspace}")
-        return response.json()
+def create_workspace(name):
+    resp = requests.post(f"{SERVER_URL}/workspaces/", json={"name": name}, headers=get_headers())
+    if resp.status_code == 200:
+        print(f"Created workspace {name}")
+        return True
    else:
-        print(f"✗ Failed to upload {filename} to workspace {workspace}: {response.status_code} - {response.text}")
-        return None
+        print(f"Failed to create workspace: {resp.status_code} {resp.text}")
+        return False

-def search_documents(workspace, query):
-    """Search for documents in a workspace."""
-    response = make_request("POST", "/search/", data={"query": query}, workspace=workspace)
-    if response.status_code == 200:
-        return response.json()
+def delete_workspace(name):
+    resp = requests.delete(f"{SERVER_URL}/workspaces/{name}", headers=get_headers())
+    if resp.status_code == 200:
+        print(f"Deleted workspace {name}")
+        return True
    else:
-        print(f"✗ Failed to search in workspace {workspace}: {response.status_code} - {response.text}")
-        return None
+        print(f"Failed to delete workspace: {resp.status_code} {resp.text}")
+        return False

-def query_documents(workspace, query):
-    """Query documents in a workspace."""
-    response = make_request("POST", "/query/", data={"query": query}, workspace=workspace)
-    if response.status_code == 200:
-        return response.json()
+def get_documents(workspace=None):
+    headers = get_headers(workspace)
+    resp = requests.get(f"{SERVER_URL}/documents", headers=headers)
+    if resp.status_code == 200:
+        data = resp.json()
+        return data
    else:
-        print(f"✗ Failed to query in workspace {workspace}: {response.status_code} - {response.text}")
+        print(f"Failed to get documents: {resp.status_code} {resp.text}")
        return None

 def main():
-    print("=" * 60)
-    print("Testing Workspace Isolation in LightRAG")
-    print("=" * 60)
+    # Ensure test files exist
+    test_dir = Path("test")
+    if not test_dir.exists():
+        print("Test directory not found")
+        sys.exit(1)
    
-    # Check if server is running
-    print("\n1. Checking server health...")
-    if not test_server_health():
-        print("✗ Server is not running. Please start the LightRAG server first.")
-        print("  Run: python LightRAG-main/lightrag/api/lightrag_server.py")
-        return False
+    file1 = test_dir / "test.docx"
+    file2 = test_dir / "ocr.pdf"
+    if not file1.exists() or not file2.exists():
+        print("Test files missing")
+        sys.exit(1)
    
-    print("✓ Server is running")
+    # Create fresh workspaces
+    ws1 = "isolated_ws1"
+    ws2 = "isolated_ws2"
    
-    # Create test files
-    print("\n2. Creating test files...")
-    workspace_a_file = create_test_file(
-        "This document belongs to Workspace A. It contains information about artificial intelligence and machine learning.",
-        "workspace_a_doc.txt"
-    )
-    workspace_b_file = create_test_file(
-        "This document belongs to Workspace B. It contains information about quantum computing and cryptography.",
-        "workspace_b_doc.txt"
-    )
-    print(f"✓ Created test files: {workspace_a_file.name}, {workspace_b_file.name}")
+    # Delete if they already exist (cleanup)
+    workspaces = list_workspaces()
+    for ws in workspaces:
+        if ws['name'] in [ws1, ws2]:
+            delete_workspace(ws['name'])
    
    # Create workspaces
-    print("\n3. Creating workspaces...")
-    workspace_a = "test_workspace_a"
-    workspace_b = "test_workspace_b"
+    create_workspace(ws1)
+    create_workspace(ws2)
    
-    if not create_workspace(workspace_a):
-        print("  Trying to use existing workspace...")
+    # Upload file1 to ws1
+    track1 = upload_file(file1, workspace=ws1)
+    if not track1:
+        print("Failed to upload file1")
+        sys.exit(1)
    
-    if not create_workspace(workspace_b):
-        print("  Trying to use existing workspace...")
+    # Upload file2 to ws2
+    track2 = upload_file(file2, workspace=ws2)
+    if not track2:
+        print("Failed to upload file2")
+        sys.exit(1)
    
-    # List workspaces
-    workspaces = list_workspaces()
-    print(f"  Available workspaces: {[w['name'] for w in workspaces]}")
+    # Wait for indexing
+    print("Waiting for indexing...")
+    if not wait_for_indexing():
+        print("Indexing timed out, but continuing")
    
-    # Upload documents to respective workspaces
-    print("\n4. Uploading documents to workspaces...")
-    upload_document(workspace_a, workspace_a_file)
-    upload_document(workspace_b, workspace_b_file)
-    
-    # Wait for processing
-    print("\n5. Waiting for document processing (10 seconds)...")
+    # Give extra time for processing
    time.sleep(10)
    
-    # Test isolation: Search in workspace A
-    print("\n6. Testing isolation - Search in Workspace A...")
-    results_a = search_documents(workspace_a, "artificial intelligence")
-    if results_a:
-        print(f"  Found {len(results_a.get('results', []))} results in workspace A")
-        # Check if we see workspace B content
-        for result in results_a.get('results', []):
-            if "quantum" in result.get('content', '').lower():
-                print("  ✗ FAIL: Found workspace B content in workspace A search!")
-            else:
-                print("  ✓ Workspace A search only shows workspace A content")
+    # Check documents in each workspace
+    print("\n=== Documents in ws1 ===")
+    docs1 = get_documents(workspace=ws1)
+    if docs1:
+        for status, doc_list in docs1.get('statuses', {}).items():
+            print(f"{status}: {len(doc_list)}")
    
-    # Test isolation: Search in workspace B
-    print("\n7. Testing isolation - Search in Workspace B...")
-    results_b = search_documents(workspace_b, "quantum computing")
-    if results_b:
-        print(f"  Found {len(results_b.get('results', []))} results in workspace B")
-        # Check if we see workspace A content
-        for result in results_b.get('results', []):
-            if "artificial" in result.get('content', '').lower():
-                print("  ✗ FAIL: Found workspace A content in workspace B search!")
-            else:
-                print("  ✓ Workspace B search only shows workspace B content")
+    print("\n=== Documents in ws2 ===")
+    docs2 = get_documents(workspace=ws2)
+    if docs2:
+        for status, doc_list in docs2.get('statuses', {}).items():
+            print(f"{status}: {len(doc_list)}")
    
-    # Test cross-workspace contamination
-    print("\n8. Testing cross-workspace contamination...")
-    # Search for workspace B content in workspace A
-    results_cross = search_documents(workspace_a, "quantum")
-    if results_cross and len(results_cross.get('results', [])) > 0:
-        print("  ✗ FAIL: Found workspace B content when searching in workspace A!")
+    # Search for content in each workspace
+    # test.docx presumably contains the word "test" (not verified), so search for that generic term.
+    query = "test"
+    print(f"\n=== Search for '{query}' in ws1 ===")
+    results1 = search(query, workspace=ws1)
+    if results1:
+        print(f"Total results: {results1.get('total_results')}")
+        for i, r in enumerate(results1.get('results', [])[:3]):
+            print(f"  {i+1}. {r.get('type')}: {r.get('content')[:80]}...")
+    
+    print(f"\n=== Search for '{query}' in ws2 ===")
+    results2 = search(query, workspace=ws2)
+    if results2:
+        print(f"Total results: {results2.get('total_results')}")
+        for i, r in enumerate(results2.get('results', [])[:3]):
+            print(f"  {i+1}. {r.get('type')}: {r.get('content')[:80]}...")
+    
+    # Isolation check: ws2 should return fewer results (possibly zero) if ocr.pdf does not contain "test".
+    # That cannot be guaranteed, so this query only confirms that search works and returns a response.
+    # As a sharper check, also search for "OCR", which should appear in ocr.pdf but not in test.docx.
+    query2 = "OCR"
+    print(f"\n=== Search for '{query2}' in ws1 (should be none) ===")
+    results1b = search(query2, workspace=ws1)
+    if results1b:
+        print(f"Total results: {results1b.get('total_results')}")
+    
+    print(f"\n=== Search for '{query2}' in ws2 (should have results) ===")
+    results2b = search(query2, workspace=ws2)
+    if results2b:
+        print(f"Total results: {results2b.get('total_results')}")
+    
+    # Now delete workspace ws1
+    print(f"\n=== Deleting workspace {ws1} ===")
+    delete_workspace(ws1)
+    
+    # Wait a bit for cleanup
+    time.sleep(5)
+    
+    # Try to search in ws1 (should fail or return zero results)
+    print(f"\n=== Search in deleted workspace {ws1} (should fail) ===")
+    results_deleted = search(query, workspace=ws1)
+    if results_deleted:
+        print(f"Unexpectedly got results: {results_deleted.get('total_results')}")
    else:
-        print("  ✓ No cross-workspace contamination detected")
+        print("Search failed as expected (workspace not found)")
    
-    # Test query endpoints
-    print("\n9. Testing query endpoints...")
-    query_a = query_documents(workspace_a, "What is this document about?")
-    if query_a:
-        print(f"  Workspace A query response: {query_a.get('answer', '')[:100]}...")
+    # Verify ws2 still works
+    print(f"\n=== Search in remaining workspace {ws2} ===")
+    results_ws2 = search(query, workspace=ws2)
+    if results_ws2:
+        print(f"Workspace still functional: {results_ws2.get('total_results')} results")
    
-    query_b = query_documents(workspace_b, "What is this document about?")
-    if query_b:
-        print(f"  Workspace B query response: {query_b.get('answer', '')[:100]}...")
-    
-    # Cleanup (optional)
-    print("\n10. Test completed!")
-    print("\nSummary:")
-    print("  - Workspace isolation appears to be working correctly")
-    print("  - Documents are properly segregated between workspaces")
-    print("  - Search and query operations respect workspace boundaries")
-    print("\nNote: Workspaces will persist in the storage directory.")
-    print("  To clean up manually, delete the directories:")
-    print(f"    - {Path('LightRAG-main/rag_storage') / workspace_a}")
-    print(f"    - {Path('LightRAG-main/rag_storage') / workspace_b}")
-    
-    return True
+    print("\n=== Test completed ===")

 if __name__ == "__main__":
-    try:
-        success = main()
-        sys.exit(0 if success else 1)
-    except KeyboardInterrupt:
-        print("\nTest interrupted by user")
-        sys.exit(1)
-    except Exception as e:
-        print(f"\nError during test: {e}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(1)
+    main()