# railseek6/test_workspace_isolation.py

import requests
import time
import sys
import json
from pathlib import Path

# Base URL of the server under test; every request in this script targets it.
SERVER_URL = "http://localhost:3015"
# API key sent in the "X-API-Key" header built by get_headers().
# NOTE(review): credential is hard-coded; consider reading it from the
# environment if this script is shared outside a local test setup.
API_KEY = "jleu1212"

def get_headers(workspace=None):
    """Build the HTTP headers for a request.

    The ``X-API-Key`` header is always included. The ``X-Workspace`` header
    is added only when ``workspace`` is truthy.
    """
    if not workspace:
        return {"X-API-Key": API_KEY}
    return {"X-API-Key": API_KEY, "X-Workspace": workspace}

def upload_file(file_path, workspace=None):
    """Upload a file to the server's ``/documents/upload`` endpoint.

    Args:
        file_path: Location of the file to upload. A ``str`` or any
            ``os.PathLike`` is accepted; it is normalised to a ``Path``.
        workspace: Optional workspace name, forwarded to ``get_headers``.

    Returns:
        The ``track_id`` field of the JSON response, or ``None`` when the
        file cannot be read, the request fails at the transport level, the
        server answers with a status other than 200, or the response has no
        ``track_id``.
    """
    # Normalise so that plain string paths work with ``.name`` below.
    file_path = Path(file_path)
    headers = get_headers(workspace)
    try:
        with open(file_path, 'rb') as f:
            files = {'file': (file_path.name, f)}
            resp = requests.post(f"{SERVER_URL}/documents/upload", files=files, headers=headers)
    except (OSError, requests.RequestException) as exc:
        # Keep the "print and return None" failure contract instead of
        # surfacing a traceback for an unreadable file or unreachable server.
        print(f"Upload failed: {exc}")
        return None
    if resp.status_code != 200:
        print(f"Upload failed: {resp.status_code} {resp.text}")
        return None
    data = resp.json()
    print(f"Uploaded {file_path.name} to workspace {workspace}: track_id {data.get('track_id')}")
    return data.get('track_id')

def wait_for_indexing(timeout=120, poll_interval=5, request_timeout=10):
    """Poll ``/documents/pipeline_status`` until the pipeline reports idle.

    Args:
        timeout: Maximum total number of seconds to keep polling.
        poll_interval: Seconds to sleep between two polls.
        request_timeout: Per-request timeout (seconds) handed to ``requests``
            so that a single hung request cannot stall the wait indefinitely.

    Returns:
        ``True`` as soon as a 200 response has a missing or falsy ``busy``
        field; ``False`` if the deadline passes first.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(
                f"{SERVER_URL}/documents/pipeline_status",
                headers=get_headers(),
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            # Treat transport errors as transient and retry on the next poll.
            print(f"Failed to get pipeline status: {exc}")
        else:
            if resp.status_code == 200:
                data = resp.json()
                if not data.get('busy', False):
                    print("Pipeline idle, indexing likely complete")
                    return True
                print(f"Pipeline busy: {data.get('job_name')} {data.get('cur_batch')}/{data.get('batchs')}")
            else:
                print(f"Failed to get pipeline status: {resp.status_code}")
        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(poll_interval, remaining))
    print("Timeout waiting for indexing")
    return False

def search(query, workspace=None):
    """POST ``query`` to the ``/search`` endpoint.

    Returns the decoded JSON body on HTTP 200. On any other status, prints
    the status code and response text and returns ``None``.
    """
    response = requests.post(
        f"{SERVER_URL}/search",
        json={"query": query},
        headers=get_headers(workspace),
    )
    if response.status_code == 200:
        return response.json()
    print(f"Search failed: {response.status_code} {response.text}")
    return None

def list_workspaces():
    """Fetch the decoded JSON body of ``GET /workspaces/``.

    On a non-200 status, prints the status code and response text and
    returns an empty list.
    """
    response = requests.get(f"{SERVER_URL}/workspaces/", headers=get_headers())
    if response.status_code != 200:
        print(f"Failed to list workspaces: {response.status_code} {response.text}")
        return []
    return response.json()

def create_workspace(name):
    """Request creation of workspace ``name`` via ``POST /workspaces/``.

    Prints the outcome and returns ``True`` when the server answers with
    HTTP 200, ``False`` otherwise.
    """
    response = requests.post(
        f"{SERVER_URL}/workspaces/",
        json={"name": name},
        headers=get_headers(),
    )
    created = response.status_code == 200
    if created:
        print(f"Created workspace {name}")
    else:
        print(f"Failed to create workspace: {response.status_code} {response.text}")
    return created

def delete_workspace(name):
    """Request deletion of workspace ``name`` via ``DELETE /workspaces/<name>``.

    Prints the outcome and returns ``True`` when the server answers with
    HTTP 200, ``False`` otherwise.
    """
    response = requests.delete(f"{SERVER_URL}/workspaces/{name}", headers=get_headers())
    if response.status_code != 200:
        print(f"Failed to delete workspace: {response.status_code} {response.text}")
        return False
    print(f"Deleted workspace {name}")
    return True

def get_documents(workspace=None):
    """Fetch the decoded JSON body of ``GET /documents``.

    The request uses the headers from ``get_headers(workspace)``. On a
    non-200 status, prints the status code and response text and returns
    ``None``.
    """
    response = requests.get(f"{SERVER_URL}/documents", headers=get_headers(workspace))
    if response.status_code != 200:
        print(f"Failed to get documents: {response.status_code} {response.text}")
        return None
    return response.json()

def _print_document_counts(label, workspace):
    """Print how many documents ``workspace`` holds under each status.

    ``label`` is the short name shown in the section heading. Only the
    heading is printed when ``get_documents`` returns a falsy value.
    """
    print(f"\n=== Documents in {label} ===")
    docs = get_documents(workspace=workspace)
    if not docs:
        return
    for status, doc_list in docs.get('statuses', {}).items():
        print(f"{status}: {len(doc_list)}")


def _report_search(heading, query, workspace, preview=0):
    """Print ``heading``, run ``search`` and print a summary of the response.

    Prints the ``total_results`` field and up to ``preview`` result snippets
    (80 characters each). Returns whatever ``search`` returned.
    """
    print(heading)
    results = search(query, workspace=workspace)
    if results:
        print(f"Total results: {results.get('total_results')}")
        hits = results.get('results') or []
        for i, hit in enumerate(hits[:preview], start=1):
            # ``content`` may be missing or None; never slice None.
            snippet = str(hit.get('content') or '')[:80]
            print(f"  {i}. {hit.get('type')}: {snippet}...")
    return results


def main():
    """Run the workspace isolation scenario against the server.

    Steps: recreate two workspaces, upload one test file into each, wait for
    indexing, print per-workspace document counts and search results, delete
    the first workspace, then check that searching it no longer yields
    results while the second workspace still answers.

    Exits with status 1 when the test files are missing or an upload fails.
    """
    # Ensure test files exist
    test_dir = Path("test")
    if not test_dir.exists():
        print("Test directory not found")
        sys.exit(1)

    file1 = test_dir / "test.docx"
    file2 = test_dir / "ocr.pdf"
    if not file1.exists() or not file2.exists():
        print("Test files missing")
        sys.exit(1)

    ws1 = "isolated_ws1"
    ws2 = "isolated_ws2"

    # Start from a clean slate: drop leftovers from a previous run.
    for ws in list_workspaces():
        if ws['name'] in (ws1, ws2):
            delete_workspace(ws['name'])

    # create_workspace reports failures itself. NOTE(review): the run
    # continues regardless, as before; confirm whether the server
    # auto-creates workspaces on upload or whether this should abort.
    create_workspace(ws1)
    create_workspace(ws2)

    # One distinct file per workspace so search results can be told apart.
    for label, path, ws in (("file1", file1, ws1), ("file2", file2, ws2)):
        if not upload_file(path, workspace=ws):
            print(f"Failed to upload {label}")
            sys.exit(1)

    print("Waiting for indexing...")
    if not wait_for_indexing():
        print("Indexing timed out, but continuing")

    # Give extra time for processing
    time.sleep(10)

    _print_document_counts("ws1", ws1)
    _print_document_counts("ws2", ws2)

    # Generic term; show a short preview of the top hits in each workspace.
    query = "test"
    _report_search(f"\n=== Search for '{query}' in ws1 ===", query, ws1, preview=3)
    _report_search(f"\n=== Search for '{query}' in ws2 ===", query, ws2, preview=3)

    # "OCR" is expected to appear in ocr.pdf (ws2) but not in test.docx (ws1).
    query2 = "OCR"
    _report_search(f"\n=== Search for '{query2}' in ws1 (should be none) ===", query2, ws1)
    _report_search(f"\n=== Search for '{query2}' in ws2 (should have results) ===", query2, ws2)

    print(f"\n=== Deleting workspace {ws1} ===")
    delete_workspace(ws1)

    # Wait a bit for cleanup
    time.sleep(5)

    # A deleted workspace should either reject the search or return nothing.
    print(f"\n=== Search in deleted workspace {ws1} (should fail) ===")
    results_deleted = search(query, workspace=ws1)
    if results_deleted is None:
        print("Search failed as expected (workspace not found)")
    elif results_deleted.get('total_results'):
        print(f"Unexpectedly got results: {results_deleted.get('total_results')}")
    else:
        print("Search returned no results, as expected for a deleted workspace")

    # Deleting ws1 must not affect ws2.
    print(f"\n=== Search in remaining workspace {ws2} ===")
    results_ws2 = search(query, workspace=ws2)
    if results_ws2:
        print(f"Workspace still functional: {results_ws2.get('total_results')} results")

    print("\n=== Test completed ===")

# Run the isolation scenario only when executed as a script, not on import.
if __name__ == "__main__":
    main()