workspace working

This commit is contained in:
2026-01-12 22:31:11 +08:00
parent 2738a822d1
commit 370fe6368a
149 changed files with 4648 additions and 660 deletions

79
test_ocr_retrieval.py Normal file
View File

@@ -0,0 +1,79 @@
import requests
import json
import time
# Connection settings for the service under test.
API_BASE = "http://localhost:3015"  # base URL; endpoint paths are appended to it
# NOTE(review): this credential is committed in plain text; consider loading
# it from the environment instead.
API_KEY = "jleu1212"  # sent in the X-API-Key header of every request
FILE_PATH = "test/ocr.pdf"  # sample PDF used by the (optional) upload step
def api_request(method, endpoint, workspace='', data=None, files=None, timeout=60):
    """Send one HTTP request to the API and return the raw response.

    Args:
        method: HTTP verb, e.g. 'GET' or 'POST'.
        endpoint: Path appended verbatim to API_BASE.
        workspace: Workspace name; sent as the X-Workspace header when
            non-empty, omitted otherwise.
        data: Request payload. When truthy and no files are supplied it is
            JSON-encoded and the Content-Type header is set accordingly;
            otherwise it is handed to requests unchanged.
        files: Multipart file mapping forwarded to requests.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout requests waits indefinitely, so a stalled server would
            hang the script.

    Returns:
        The requests.Response object. The status code is not checked here;
        callers inspect it themselves.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    url = API_BASE + endpoint
    headers = {'X-API-Key': API_KEY}
    if workspace:
        headers['X-Workspace'] = workspace
    if data and not files:
        # Multipart uploads must let requests pick the Content-Type itself,
        # so JSON encoding only happens when no files are attached.
        headers['Content-Type'] = 'application/json'
        data = json.dumps(data)
    return requests.request(
        method,
        url,
        headers=headers,
        data=data,
        files=files,
        timeout=timeout,
    )
def upload_and_wait(file_path, workspace='', timeout=120, poll_interval=2):
    """Upload a PDF and poll its tracking status until indexing settles.

    Args:
        file_path: Path of the PDF to upload.
        workspace: Workspace name forwarded to api_request.
        timeout: Maximum number of seconds to wait for indexing.
        poll_interval: Seconds to sleep between status polls.

    Returns:
        The track ID reported by the upload endpoint, or None when the upload
        is rejected or the response carries no track ID.

    Raises:
        TimeoutError: If indexing has not settled within ``timeout`` seconds.
    """
    import os  # function-scope import: only needed for the filename below

    print(f'Uploading {file_path}...')
    with open(file_path, 'rb') as f:
        # basename handles the platform's path separators, unlike split('/').
        files = {'file': (os.path.basename(file_path), f, 'application/pdf')}
        resp = api_request('POST', '/documents/upload', workspace=workspace, files=files)
    if resp.status_code != 200:
        print(f'Upload failed: {resp.text}')
        return None
    track_id = resp.json().get('track_id')
    print(f'Track ID: {track_id}')
    if not track_id:
        # Without an ID the status URL would end in "None" and never resolve.
        print('Upload response did not include a track_id; cannot poll status.')
        return None

    # Wait for indexing. A monotonic clock keeps the deadline immune to
    # wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = api_request('GET', f'/documents/track_status/{track_id}', workspace=workspace)
        if resp.status_code == 200:
            data = resp.json()
            summary = data.get('status_summary', {})
            total = data.get('total_count', 0)
            processed = summary.get('PROCESSED', 0)
            failed = summary.get('FAILED', 0)
            pending = summary.get('PENDING', 0)
            # NOTE(review): assumes in-flight documents are reported under a
            # 'PROCESSING' key next to the keys above - confirm against the API.
            processing = summary.get('PROCESSING', 0)
            print(f'Status: total={total}, processed={processed}, failed={failed}, pending={pending}')
            # "pending == 0" alone is also true before the upload has been
            # registered (total == 0), so require at least one tracked
            # document and nothing still queued or in flight.
            if total > 0 and pending == 0 and processing == 0:
                print('Indexing completed.')
                return track_id
        time.sleep(poll_interval)
    raise TimeoutError('Indexing timeout')
def search(query, workspace=''):
    """POST a query to the /search endpoint.

    Returns the decoded JSON body on HTTP 200; otherwise prints the response
    text and returns None.
    """
    workspace_label = workspace or "default"
    print(f'Searching for "{query}" in workspace {workspace_label}')
    payload = {'query': query}
    response = api_request('POST', '/search', workspace=workspace, data=payload)
    if response.status_code == 200:
        return response.json()
    print(f'Search failed: {response.text}')
    return None
def main():
    """Run one retrieval query and print a summary of what came back."""
    workspace = ''  # empty string means the default workspace

    # The upload step is disabled because the document may already have been
    # uploaded. To run it again, restore:
    #     track_id = upload_and_wait(FILE_PATH, workspace)
    #     if not track_id:
    #         return

    results = search('what is the minimum safe working distance', workspace)
    if not results:
        print('No results.')
        return

    print('Search results:')
    print(json.dumps(results, indent=2))

    chunk_list = results.get('chunks', [])
    entity_list = results.get('entities', [])
    print(f'Found {len(chunk_list)} chunks, {len(entity_list)} entities')

    if not chunk_list:
        print('No chunks returned.')
        return
    # Only show a short preview of the first chunk.
    first_text = chunk_list[0].get('text', '')
    print('First chunk text:', first_text[:200])
# Run the retrieval check only when executed as a script, not on import.
if __name__ == "__main__":
    main()