# Source: railseek6/clear_and_retest_ocr.py
# (file-viewer header: 90 lines, 3.3 KiB, Python)

import requests
import os
import json
import time
# Clear the failed ocr.pdf document and retest.
#
# Script flow (everything after the login runs only when login returns 200):
#   1. POST the credentials to /login and read `access_token` from the JSON.
#   2. POST /documents/clear_cache.
#   3. Upload ocr.pdf from the current working directory to /documents/upload.
#   4. GET /documents and dump the JSON that comes back.
#   5. POST a few sample queries to /query and report whether the answer text
#      mentions "ocr.pdf" or "table".
base_url = 'http://localhost:3015'

# (connect, read) timeout in seconds applied to every HTTP call below.
# Without a timeout `requests` waits forever on a stalled server; the read
# limit is deliberately generous because upload and query may be slow.
request_timeout = (10, 300)

# Login first
print("🔐 Logging in...")
# NOTE(review): credentials are hard-coded in source; consider moving them
# to environment variables or a local, untracked config file.
login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
login_response = requests.post(
    f'{base_url}/login', data=login_data, timeout=request_timeout
)

if login_response.status_code == 200:
    token = login_response.json().get('access_token')
    headers = {'Authorization': f'Bearer {token}'}
    print('✅ Login successful')

    # First, let's clear the failed document
    print("\n🗑️ Clearing failed documents...")
    clear_response = requests.post(
        f'{base_url}/documents/clear_cache',
        headers=headers,
        timeout=request_timeout,
    )
    if clear_response.status_code == 200:
        print("✅ Cache cleared successfully")
    else:
        # A failed clear is reported but does not stop the retest.
        print(f"❌ Cache clear failed: {clear_response.text}")

    # Wait a moment for cache to clear
    time.sleep(2)

    # Now upload ocr.pdf again
    pdf_file = 'ocr.pdf'
    if not os.path.exists(pdf_file):
        print(f"{pdf_file} not found")
        # SystemExit instead of exit(): the latter is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    print(f"\n📤 Uploading {pdf_file}...")
    with open(pdf_file, 'rb') as pdf_handle:
        files = {'file': (pdf_file, pdf_handle, 'application/pdf')}
        upload_response = requests.post(
            f'{base_url}/documents/upload',
            files=files,
            headers=headers,
            timeout=request_timeout,
        )

    print(f" Upload Status: {upload_response.status_code}")
    if upload_response.status_code == 200:
        result = upload_response.json()
        print(f" Response: {json.dumps(result, indent=2)}")
        if result.get('status') == 'success':
            print("✅ Upload successful, waiting for processing...")
            # Wait for processing to complete
            time.sleep(10)
        else:
            print(f"⚠️ Upload status: {result.get('status')}")
    else:
        print(f'❌ Upload failed: {upload_response.text}')

    # Check document status after upload
    print("\n🔍 Checking document status...")
    status_response = requests.get(
        f'{base_url}/documents', headers=headers, timeout=request_timeout
    )
    if status_response.status_code == 200:
        documents = status_response.json()
        print(f" Documents response: {json.dumps(documents, indent=2)}")

    # Test search with content that should be in ocr.pdf
    print("\n🔍 Testing search functionality...")
    test_queries = [
        "table content from ocr.pdf",
        "scanned document",
        "PDF table data",
    ]
    for query in test_queries:
        print(f"\n🔎 Querying: \"{query}\"")
        query_data = {'query': query, 'top_k': 3}
        search_response = requests.post(
            f'{base_url}/query',
            json=query_data,
            headers=headers,
            timeout=request_timeout,
        )
        if search_response.status_code == 200:
            results = search_response.json()
            if isinstance(results, dict):
                # `or ''` also covers a present-but-null "response" field,
                # which would otherwise break the slice below.
                response_text = results.get('response') or ''
                print(f" Response: {response_text[:200]}...")
                if 'ocr.pdf' in response_text:
                    print(" ✅ Found reference to ocr.pdf!")
                if 'table' in response_text.lower():
                    print(" ✅ Found table content!")
            else:
                print(f" Unexpected result format: {results}")
        else:
            print(f'❌ Search failed: {search_response.text}')
else:
    print('❌ Login failed')