Files
railseek6/test_ocr_workflow_complete.py

75 lines
3.1 KiB
Python

import requests
import json
import os
import time
# Server configuration
base_url = 'http://localhost:3015'
pdf_file = 'LightRAG-main/test_documents/ocr.pdf'

# Upper bound (seconds) for any single HTTP call. Without an explicit timeout
# `requests` waits forever if the server accepts the connection but never replies.
REQUEST_TIMEOUT = 60
# Fixed pause that gives the server time to OCR and index the uploaded PDF.
INDEXING_WAIT_SECONDS = 30
# Generic terms used to probe the search endpoint after indexing.
SEARCH_QUERIES = ('document', 'test', 'sample', 'content', 'pdf')


def _login():
    """Log in to the server and return the Authorization headers.

    Returns:
        A dict holding the ``Authorization: Bearer <token>`` header, or
        ``None`` when the login request is rejected or the response carries
        no ``access_token``.
    """
    print('🔐 Logging in...')
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to environment variables or a local, untracked config file.
    login_data = {
        'username': 'jleu3482',
        'password': 'jleu1212',
    }
    login_response = requests.post(
        f'{base_url}/login', data=login_data, timeout=REQUEST_TIMEOUT
    )
    if login_response.status_code != 200:
        print(f'❌ Login failed: {login_response.status_code} - {login_response.text}')
        return None

    token = login_response.json().get('access_token')
    if not token:
        # Avoid sending a meaningless "Bearer None" header on later requests.
        print('❌ Login failed: response did not contain an access_token')
        return None

    print('✅ Login successful')
    return {'Authorization': f'Bearer {token}'}


def _upload_pdf(headers):
    """Upload ``pdf_file`` to the server and report the outcome.

    Args:
        headers: HTTP headers (including authorization) to send with the upload.

    Returns:
        ``True`` when the server answers with HTTP 200, ``False`` otherwise.
    """
    print('📤 Uploading PDF...')
    with open(pdf_file, 'rb') as file:
        files = {'file': ('sample_ocr.pdf', file, 'application/pdf')}
        upload_response = requests.post(
            f'{base_url}/documents/upload',
            files=files,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
    print(f'Upload response: {upload_response.status_code}')
    if upload_response.status_code != 200:
        print(f'❌ Upload failed: {upload_response.status_code} - {upload_response.text}')
        return False

    upload_result = upload_response.json()
    print('✅ Upload successful!')
    print(f'Upload result: {json.dumps(upload_result, indent=2)}')
    return True


def _run_searches(headers):
    """Send each query in ``SEARCH_QUERIES`` to the search endpoint and print the hits.

    A failed query is reported and the remaining queries still run.

    Args:
        headers: HTTP headers (including authorization) to send with each search.
    """
    print('🔍 Testing search with OCR-extracted content...')
    for query in SEARCH_QUERIES:
        print(f'\n🔎 Searching for: "{query}"')
        search_data = {'query': query, 'top_k': 5}
        search_response = requests.post(
            f'{base_url}/search',
            json=search_data,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        if search_response.status_code != 200:
            print(f'❌ Search failed: {search_response.status_code} - {search_response.text}')
            continue

        # Assumes the endpoint returns a JSON list of objects with 'content'
        # and 'score' keys -- TODO confirm against the server's API.
        search_results = search_response.json()
        print(f'✅ Search successful! Found {len(search_results)} results')
        for position, hit in enumerate(search_results, start=1):
            content = hit.get('content', '')
            score = hit.get('score', 0)
            print(f' Result {position} (score: {score:.3f}): {content[:150]}...')


def main():
    """Run the OCR workflow check: login, upload the PDF, wait, then search.

    Progress and failures are reported on stdout. Any exception raised during
    the network workflow (including request timeouts) is caught and printed
    rather than propagated.
    """
    print('🚀 Testing OCR PDF upload, indexing and search workflow...')
    print(f'📁 Using PDF: {pdf_file}')
    if not os.path.exists(pdf_file):
        print('❌ Test file not found')
        return
    print('✅ Test file found')

    try:
        headers = _login()
        if headers is None:
            return
        if not _upload_pdf(headers):
            return

        # Wait for processing and indexing
        print(f'⏳ Waiting for OCR processing and indexing ({INDEXING_WAIT_SECONDS} seconds)...')
        time.sleep(INDEXING_WAIT_SECONDS)

        _run_searches(headers)
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and stop.
        print(f'❌ Error: {e}')


if __name__ == '__main__':
    main()