# File: railseek6/test_workspace_isolation.py
# Timestamp: 2026-01-12 22:31:11 +08:00
#
# Size: 207 lines, 7.0 KiB
# Language: Python

import requests
import time
import sys
import json
from pathlib import Path
# Base URL of the server under test; every endpoint path below is appended to it.
SERVER_URL = "http://localhost:3015"
# Value sent in the X-API-Key header of every request (see get_headers).
# NOTE(review): the credential is hard-coded in source — consider reading it
# from the environment instead.
API_KEY = "jleu1212"
def get_headers(workspace=None):
    """Build the request headers: the API key plus an optional workspace selector.

    A falsy ``workspace`` (``None`` or an empty string) leaves the
    ``X-Workspace`` header out entirely.
    """
    workspace_header = {"X-Workspace": workspace} if workspace else {}
    return {"X-API-Key": API_KEY, **workspace_header}
def upload_file(file_path, workspace=None):
    """Upload a file to the server, optionally into a specific workspace.

    Args:
        file_path: Location of the file to upload. Accepts a ``str`` or any
            ``os.PathLike``; it is coerced to ``Path`` so the bare file name
            can be sent as the multipart filename.
        workspace: Workspace name sent in the ``X-Workspace`` header; the
            header is omitted when this is falsy.

    Returns:
        The ``track_id`` field of the JSON response, or ``None`` when the
        server answers with a non-200 status (or the field is absent).
    """
    # Plain string paths have no ``.name`` attribute; normalise up front.
    file_path = Path(file_path)
    headers = get_headers(workspace)
    with open(file_path, 'rb') as f:
        files = {'file': (file_path.name, f)}
        resp = requests.post(f"{SERVER_URL}/documents/upload", files=files, headers=headers)
    if resp.status_code != 200:
        print(f"Upload failed: {resp.status_code} {resp.text}")
        return None
    data = resp.json()
    print(f"Uploaded {file_path.name} to workspace {workspace}: track_id {data.get('track_id')}")
    return data.get('track_id')
def wait_for_indexing(timeout=120, poll_interval=5):
    """Poll the pipeline status endpoint until the pipeline reports idle.

    Args:
        timeout: Overall budget in seconds for the whole wait.
        poll_interval: Seconds to pause between two status polls.

    Returns:
        ``True`` as soon as a status response has a falsy ``busy`` field,
        ``False`` when the budget runs out first.
    """
    # A monotonic clock keeps the budget immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            # Bound each request by the remaining budget so a stalled
            # connection cannot block far beyond ``timeout``.
            resp = requests.get(
                f"{SERVER_URL}/documents/pipeline_status",
                headers=get_headers(),
                timeout=remaining,
            )
        except requests.RequestException as exc:
            # A transient network error is just a failed poll, like a non-200.
            print(f"Failed to get pipeline status: {exc}")
        else:
            if resp.status_code == 200:
                data = resp.json()
                if not data.get('busy', False):
                    print("Pipeline idle, indexing likely complete")
                    return True
                print(f"Pipeline busy: {data.get('job_name')} {data.get('cur_batch')}/{data.get('batchs')}")
            else:
                print(f"Failed to get pipeline status: {resp.status_code}")
        # Never sleep past the deadline.
        time.sleep(max(0.0, min(poll_interval, deadline - time.monotonic())))
    print("Timeout waiting for indexing")
    return False
def search(query, workspace=None):
    """POST ``query`` to the search endpoint, scoped to ``workspace`` if given.

    Returns the decoded JSON body on HTTP 200; on any other status the
    failure is printed and ``None`` is returned.
    """
    response = requests.post(
        f"{SERVER_URL}/search",
        json={"query": query},
        headers=get_headers(workspace),
    )
    if response.status_code == 200:
        return response.json()
    print(f"Search failed: {response.status_code} {response.text}")
    return None
def list_workspaces():
    """Fetch the workspace listing from the server.

    Returns the decoded JSON body on HTTP 200; otherwise prints the failure
    and returns an empty list.
    """
    response = requests.get(f"{SERVER_URL}/workspaces/", headers=get_headers())
    if response.status_code != 200:
        print(f"Failed to list workspaces: {response.status_code} {response.text}")
        return []
    return response.json()
def create_workspace(name):
    """Ask the server to create workspace ``name``.

    Prints the outcome and returns ``True`` on HTTP 200, ``False`` otherwise.
    """
    response = requests.post(
        f"{SERVER_URL}/workspaces/",
        json={"name": name},
        headers=get_headers(),
    )
    created = response.status_code == 200
    if created:
        print(f"Created workspace {name}")
    else:
        print(f"Failed to create workspace: {response.status_code} {response.text}")
    return created
def delete_workspace(name):
    """Ask the server to delete workspace ``name``.

    Prints the outcome and returns ``True`` on HTTP 200, ``False`` otherwise.
    """
    response = requests.delete(f"{SERVER_URL}/workspaces/{name}", headers=get_headers())
    if response.status_code != 200:
        print(f"Failed to delete workspace: {response.status_code} {response.text}")
        return False
    print(f"Deleted workspace {name}")
    return True
def get_documents(workspace=None):
    """Fetch the document listing, scoped to ``workspace`` if given.

    Returns the decoded JSON body on HTTP 200; otherwise prints the failure
    and returns ``None``.
    """
    response = requests.get(f"{SERVER_URL}/documents", headers=get_headers(workspace))
    if response.status_code != 200:
        print(f"Failed to get documents: {response.status_code} {response.text}")
        return None
    return response.json()
def _count_results(data):
    """Return how many hits a search response reports (0 for a failed/empty one)."""
    if not data:
        return 0
    total = data.get('total_results')
    if total is None:
        # Fall back to the length of the hit list when no total is reported.
        total = len(data.get('results') or [])
    return total


def _print_document_counts(label, workspace):
    """Print the number of documents per status for one workspace."""
    print(f"\n=== Documents in {label} ===")
    docs = get_documents(workspace=workspace)
    if docs:
        for status, doc_list in docs.get('statuses', {}).items():
            print(f"{status}: {len(doc_list)}")


def _print_search_results(query, workspace, label, preview=0):
    """Run one search, print its total and up to ``preview`` hits, return the response."""
    print(f"\n=== Search for '{query}' in {label} ===")
    results = search(query, workspace=workspace)
    if results:
        print(f"Total results: {results.get('total_results')}")
        for i, r in enumerate((results.get('results') or [])[:preview]):
            # A hit without content must not crash the report.
            content = r.get('content') or ''
            print(f" {i+1}. {r.get('type')}: {content[:80]}...")
    return results


def main():
    """Run the workspace-isolation scenario against the live server.

    Steps: verify the fixture files exist, recreate two workspaces, upload
    one file into each, wait for indexing, print per-workspace document
    counts and search results, delete the first workspace, then check that
    searching it no longer yields hits while the second workspace still
    answers.

    The outcome is reported through printed output only. The process exits
    with status 1 when the fixtures are missing or an upload fails.
    """
    # Ensure test files exist
    test_dir = Path("test")
    if not test_dir.exists():
        print("Test directory not found")
        sys.exit(1)
    file1 = test_dir / "test.docx"
    file2 = test_dir / "ocr.pdf"
    if not file1.exists() or not file2.exists():
        print("Test files missing")
        sys.exit(1)

    # Create fresh workspaces
    ws1 = "isolated_ws1"
    ws2 = "isolated_ws2"

    # Delete if they already exist (cleanup)
    # NOTE(review): assumes the listing is a sequence of dicts with a 'name'
    # key — confirm against the server's response schema.
    for ws in list_workspaces():
        if ws['name'] in (ws1, ws2):
            delete_workspace(ws['name'])

    # Create workspaces
    create_workspace(ws1)
    create_workspace(ws2)

    # Upload file1 to ws1
    track1 = upload_file(file1, workspace=ws1)
    if not track1:
        print("Failed to upload file1")
        sys.exit(1)

    # Upload file2 to ws2
    track2 = upload_file(file2, workspace=ws2)
    if not track2:
        print("Failed to upload file2")
        sys.exit(1)

    # Wait for indexing
    print("Waiting for indexing...")
    if not wait_for_indexing():
        print("Indexing timed out, but continuing")

    # Give extra time for processing
    time.sleep(10)

    # Check documents in each workspace
    _print_document_counts("ws1", ws1)
    _print_document_counts("ws2", ws2)

    # Search for a generic term in each workspace and preview the top hits.
    query = "test"
    _print_search_results(query, ws1, "ws1", preview=3)
    _print_search_results(query, ws2, "ws2", preview=3)

    # "OCR" is expected to appear in ocr.pdf (ws2) but not in test.docx (ws1).
    query2 = "OCR"
    _print_search_results(query2, ws1, "ws1 (should be none)")
    _print_search_results(query2, ws2, "ws2 (should have results)")

    # Now delete workspace ws1
    print(f"\n=== Deleting workspace {ws1} ===")
    delete_workspace(ws1)

    # Wait a bit for cleanup
    time.sleep(5)

    # Searching the deleted workspace should fail or return zero results;
    # only actual hits count as a leak.
    print(f"\n=== Search in deleted workspace {ws1} (should fail) ===")
    results_deleted = search(query, workspace=ws1)
    leaked = _count_results(results_deleted)
    if leaked:
        print(f"Unexpectedly got results: {leaked}")
    elif results_deleted is None:
        print("Search failed as expected (workspace not found)")
    else:
        print("Search returned no results, as expected for a deleted workspace")

    # Verify ws2 still works
    print(f"\n=== Search in remaining workspace {ws2} ===")
    results_ws2 = search(query, workspace=ws2)
    if results_ws2:
        print(f"Workspace still functional: {results_ws2.get('total_results')} results")

    print("\n=== Test completed ===")
# Run the scenario only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()