207 lines
7.0 KiB
Python
207 lines
7.0 KiB
Python
import requests
|
|
import time
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Base URL of the server under test; endpoint paths are appended to it.
SERVER_URL = "http://localhost:3015"
|
|
# Value sent in the X-API-Key header of every request.
# NOTE(review): credential is hard-coded in source; consider loading it from
# the environment instead of committing it.
API_KEY = "jleu1212"
|
|
|
|
def get_headers(workspace=None):
    """Build the HTTP headers sent with an API request.

    The ``X-API-Key`` header is always included. The ``X-Workspace`` header
    is added only when ``workspace`` is truthy.
    """
    if not workspace:
        return {"X-API-Key": API_KEY}
    return {"X-API-Key": API_KEY, "X-Workspace": workspace}
|
|
|
|
def upload_file(file_path, workspace=None, timeout=60):
    """Upload a file to the server, optionally into a specific workspace.

    Args:
        file_path: Path of the file to upload (``str`` or ``pathlib.Path``).
        workspace: Workspace name sent via the ``X-Workspace`` header, or
            ``None`` to omit that header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``track_id`` value from the JSON response, or ``None`` when the
        request could not be sent, the status was not 200, or the response
        body was not valid JSON.

    Raises:
        OSError: If the local file cannot be opened.
    """
    # Accept plain strings as well as Path objects; ``.name`` is needed below.
    file_path = Path(file_path)
    headers = get_headers(workspace)

    try:
        # The file must stay open while requests streams it to the server.
        with open(file_path, 'rb') as f:
            files = {'file': (file_path.name, f)}
            resp = requests.post(
                f"{SERVER_URL}/documents/upload",
                files=files,
                headers=headers,
                timeout=timeout,
            )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure.
        print(f"Upload failed: {exc}")
        return None

    if resp.status_code != 200:
        print(f"Upload failed: {resp.status_code} {resp.text}")
        return None

    try:
        data = resp.json()
    except ValueError:
        print(f"Upload failed: response is not valid JSON: {resp.text}")
        return None

    track_id = data.get('track_id')
    print(f"Uploaded {file_path.name} to workspace {workspace}: track_id {track_id}")
    return track_id
|
|
|
|
def wait_for_indexing(timeout=120, poll_interval=5):
    """Poll the pipeline status endpoint until the pipeline is not busy.

    Args:
        timeout: Maximum total number of seconds to keep polling.
        poll_interval: Seconds to sleep between two polls.

    Returns:
        ``True`` as soon as a 200 response reports a falsy (or missing)
        ``busy`` field, ``False`` if the deadline passes first.
    """
    request_timeout = 10  # per-request cap so one stalled call cannot hang the loop
    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            resp = requests.get(
                f"{SERVER_URL}/documents/pipeline_status",
                headers=get_headers(),
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            # A transient network error should not abort the wait; retry.
            print(f"Failed to get pipeline status: {exc}")
        else:
            if resp.status_code == 200:
                try:
                    data = resp.json()
                except ValueError:
                    print("Failed to get pipeline status: invalid JSON response")
                else:
                    if not data.get('busy', False):
                        print("Pipeline idle, indexing likely complete")
                        return True
                    print(f"Pipeline busy: {data.get('job_name')} {data.get('cur_batch')}/{data.get('batchs')}")
            else:
                print(f"Failed to get pipeline status: {resp.status_code}")

        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(poll_interval, remaining))

    print("Timeout waiting for indexing")
    return False
|
|
|
|
def search(query, workspace=None, timeout=30):
    """Run a search query against the server.

    Args:
        query: Query string sent as the ``query`` field of the JSON body.
        workspace: Workspace name sent via the ``X-Workspace`` header, or
            ``None`` to omit that header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response, or ``None`` when the request could not be
        sent, the status was not 200, or the body was not valid JSON.
    """
    try:
        resp = requests.post(
            f"{SERVER_URL}/search",
            json={"query": query},
            headers=get_headers(workspace),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Search failed: {exc}")
        return None

    if resp.status_code != 200:
        print(f"Search failed: {resp.status_code} {resp.text}")
        return None

    try:
        return resp.json()
    except ValueError:
        print(f"Search failed: response is not valid JSON: {resp.text}")
        return None
|
|
|
|
def list_workspaces(timeout=30):
    """Fetch the workspaces known to the server.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response on a 200 status; an empty list when the
        request could not be sent, the status was not 200, or the body was
        not valid JSON.
    """
    try:
        resp = requests.get(
            f"{SERVER_URL}/workspaces/",
            headers=get_headers(),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Failed to list workspaces: {exc}")
        return []

    if resp.status_code != 200:
        print(f"Failed to list workspaces: {resp.status_code} {resp.text}")
        return []

    try:
        return resp.json()
    except ValueError:
        print(f"Failed to list workspaces: response is not valid JSON: {resp.text}")
        return []
|
|
|
|
def create_workspace(name, timeout=30):
    """Ask the server to create a workspace.

    Args:
        name: Workspace name sent as the ``name`` field of the JSON body.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` if the server answered with a success status, ``False`` if
        the request could not be sent or was rejected.
    """
    try:
        resp = requests.post(
            f"{SERVER_URL}/workspaces/",
            json={"name": name},
            headers=get_headers(),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Failed to create workspace: {exc}")
        return False

    # 201 Created is the conventional HTTP success status for resource
    # creation, so accept it alongside 200.
    if resp.status_code in (200, 201):
        print(f"Created workspace {name}")
        return True

    print(f"Failed to create workspace: {resp.status_code} {resp.text}")
    return False
|
|
|
|
def delete_workspace(name, timeout=30):
    """Ask the server to delete a workspace.

    Args:
        name: Name of the workspace; it is placed in the URL path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` if the server answered with a success status, ``False`` if
        the request could not be sent or was rejected.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # Escape the name so characters such as "/" or spaces cannot alter the path.
    url = f"{SERVER_URL}/workspaces/{quote(str(name), safe='')}"

    try:
        resp = requests.delete(url, headers=get_headers(), timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to delete workspace: {exc}")
        return False

    # 204 No Content is a conventional HTTP success status for DELETE, so
    # accept it alongside 200.
    if resp.status_code in (200, 204):
        print(f"Deleted workspace {name}")
        return True

    print(f"Failed to delete workspace: {resp.status_code} {resp.text}")
    return False
|
|
|
|
def get_documents(workspace=None, timeout=30):
    """Fetch the document listing from the server.

    Args:
        workspace: Workspace name sent via the ``X-Workspace`` header, or
            ``None`` to omit that header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response on a 200 status; ``None`` when the request
        could not be sent, the status was not 200, or the body was not valid
        JSON.
    """
    try:
        resp = requests.get(
            f"{SERVER_URL}/documents",
            headers=get_headers(workspace),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Failed to get documents: {exc}")
        return None

    if resp.status_code != 200:
        print(f"Failed to get documents: {resp.status_code} {resp.text}")
        return None

    try:
        return resp.json()
    except ValueError:
        print(f"Failed to get documents: response is not valid JSON: {resp.text}")
        return None
|
|
|
|
def _print_document_counts(label, workspace):
    """Print, per status, how many documents the server lists for a workspace."""
    print(f"\n=== Documents in {label} ===")
    docs = get_documents(workspace=workspace)
    if docs:
        for status, doc_list in docs.get('statuses', {}).items():
            print(f"{status}: {len(doc_list)}")


def _print_search_results(heading, query, workspace, show_hits=False):
    """Run a search, print its total (and optionally the top three hits), return the response."""
    print(f"\n=== {heading} ===")
    results = search(query, workspace=workspace)
    if results:
        print(f"Total results: {results.get('total_results')}")
        if show_hits:
            hits = results.get('results') or []
            for i, r in enumerate(hits[:3], start=1):
                # A hit may lack 'content' (or carry None); avoid slicing None.
                content = r.get('content') or ""
                print(f" {i}. {r.get('type')}: {content[:80]}...")
    return results


def main():
    """Exercise workspace isolation end to end against the running server.

    Steps: recreate two workspaces, upload one test file into each, wait for
    indexing, print document counts and search results per workspace, delete
    the first workspace, then check that searching it no longer yields
    results while the second workspace still answers.

    Exits with status 1 if the test directory or files are missing or an
    upload fails.
    """
    # Ensure test files exist
    test_dir = Path("test")
    if not test_dir.exists():
        print("Test directory not found")
        sys.exit(1)

    file1 = test_dir / "test.docx"
    file2 = test_dir / "ocr.pdf"
    if not file1.exists() or not file2.exists():
        print("Test files missing")
        sys.exit(1)

    # Create fresh workspaces
    ws1 = "isolated_ws1"
    ws2 = "isolated_ws2"

    # Delete if they already exist (cleanup)
    for ws in list_workspaces():
        # Entries are expected to be dicts with a 'name' key; tolerate plain
        # strings so an unexpected response shape does not crash the cleanup.
        ws_name = ws.get('name') if isinstance(ws, dict) else ws
        if ws_name in (ws1, ws2):
            delete_workspace(ws_name)

    # Create workspaces
    create_workspace(ws1)
    create_workspace(ws2)

    # Upload file1 to ws1
    track1 = upload_file(file1, workspace=ws1)
    if not track1:
        print("Failed to upload file1")
        sys.exit(1)

    # Upload file2 to ws2
    track2 = upload_file(file2, workspace=ws2)
    if not track2:
        print("Failed to upload file2")
        sys.exit(1)

    # Wait for indexing
    print("Waiting for indexing...")
    if not wait_for_indexing():
        print("Indexing timed out, but continuing")

    # Give extra time for processing
    time.sleep(10)

    # Check documents in each workspace
    _print_document_counts("ws1", ws1)
    _print_document_counts("ws2", ws2)

    # Search for content in each workspace. The contents of the test files
    # are not known here, so a generic term is used.
    query = "test"
    _print_search_results(f"Search for '{query}' in ws1", query, ws1, show_hits=True)
    _print_search_results(f"Search for '{query}' in ws2", query, ws2, show_hits=True)

    # Isolation cannot be guaranteed from the generic query alone, so also
    # search for "OCR", which is expected in ocr.pdf but not in test.docx.
    query2 = "OCR"
    _print_search_results(f"Search for '{query2}' in ws1 (should be none)", query2, ws1)
    _print_search_results(f"Search for '{query2}' in ws2 (should have results)", query2, ws2)

    # Now delete workspace ws1
    print(f"\n=== Deleting workspace {ws1} ===")
    delete_workspace(ws1)

    # Wait a bit for cleanup
    time.sleep(5)

    # Try to search in ws1 (should fail or return zero results)
    print(f"\n=== Search in deleted workspace {ws1} (should fail) ===")
    results_deleted = search(query, workspace=ws1)
    if not results_deleted:
        print("Search failed as expected (workspace not found)")
    elif results_deleted.get('total_results') == 0:
        # A successful response with zero hits is also an acceptable outcome.
        print("Search returned zero results as expected")
    else:
        print(f"Unexpectedly got results: {results_deleted.get('total_results')}")

    # Verify ws2 still works
    print(f"\n=== Search in remaining workspace {ws2} ===")
    results_ws2 = search(query, workspace=ws2)
    if results_ws2:
        print(f"Workspace still functional: {results_ws2.get('total_results')} results")

    print("\n=== Test completed ===")
|
|
|
|
if __name__ == "__main__":
    # Run the workspace-isolation scenario only when executed as a script.
    main()