# Listing header captured with this file by the file viewer: 115 lines, 4.5 KiB, Python
import json
import os
import sys
import time

import requests
# Test uploading and processing ocr.pdf to verify it contains actual content.
#
# Flow: log in, delete every existing document, upload the PDF, wait for OCR
# processing, print the document status, then run a handful of search queries
# and report simple indicators about each answer.

base_url = 'http://localhost:3015'

# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a local config file that is not committed.
login_data = {'username': 'jleu3482', 'password': 'jleu1212'}

pdf_file = 'ocr.pdf'

test_queries = [
    "table data",
    "document content",
    "scanned text",
    "PDF information",
    "extracted content",
]

# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 120
# Pause after deleting documents so the server can finish its cleanup.
CLEANUP_WAIT_SECONDS = 3
# Pause after a successful upload so OCR processing can run.
OCR_WAIT_SECONDS = 15


def matching_documents(documents, file_path):
    """Return the entries of *documents* whose 'file_path' equals *file_path*.

    Args:
        documents: Iterable of document dicts, or None. Non-dict entries are
            skipped instead of raising.
        file_path: The path value to look for.

    Returns:
        A list (possibly empty) of the matching dicts, in their original order.
    """
    return [
        doc
        for doc in documents or []
        if isinstance(doc, dict) and doc.get('file_path') == file_path
    ]


def response_indicators(response_text, file_name):
    """Return the indicator messages that apply to a query answer.

    Args:
        response_text: The answer text returned by the query endpoint.
        file_name: Name of the uploaded file; matched case-insensitively.

    Returns:
        A list of ready-to-print messages, empty when nothing matched.
    """
    lowered = response_text.lower()
    messages = []
    if file_name.lower() in lowered:
        messages.append(f" ✅ Found reference to {file_name}!")
    if 'table' in lowered:
        messages.append(" ✅ Found table content!")
    if len(response_text) > 50:  # Meaningful response
        messages.append(" ✅ Got meaningful response!")
    return messages


def login():
    """Log in and return the Authorization headers, or None on failure.

    A 200 response that carries no 'access_token' counts as a failure, so the
    rest of the script never sends 'Bearer None'.
    """
    print("🔐 Logging in...")
    login_response = requests.post(
        f'{base_url}/login', data=login_data, timeout=REQUEST_TIMEOUT_SECONDS
    )
    if login_response.status_code != 200:
        return None
    token = login_response.json().get('access_token')
    if not token:
        return None
    return {'Authorization': f'Bearer {token}'}


def delete_all_documents(headers):
    """Delete every document on the server and report the outcome."""
    print("\n🗑️ Deleting all documents...")
    delete_response = requests.delete(
        f'{base_url}/documents', headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
    )
    if delete_response.status_code == 200:
        print("✅ All documents deleted successfully")
    else:
        print(f"❌ Delete failed: {delete_response.text}")


def upload_document(headers):
    """Upload ``pdf_file`` and, on success, wait for OCR processing."""
    print(f"\n📤 Uploading {pdf_file}...")
    with open(pdf_file, 'rb') as file:
        files = {'file': (pdf_file, file, 'application/pdf')}
        upload_response = requests.post(
            f'{base_url}/documents/upload',
            files=files,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

    print(f" Upload Status: {upload_response.status_code}")
    if upload_response.status_code != 200:
        print(f'❌ Upload failed: {upload_response.text}')
        return

    result = upload_response.json()
    print(f" Response: {json.dumps(result, indent=2)}")
    if result.get('status') == 'success':
        print("✅ Upload successful, waiting for processing...")
        # Wait longer for OCR processing
        print("⏳ Waiting 15 seconds for OCR processing...")
        time.sleep(OCR_WAIT_SECONDS)
    else:
        print(f"⚠️ Upload status: {result.get('status')}")


def report_document_status(headers):
    """Print the server's document listing and the status of ``pdf_file``."""
    print("\n🔍 Checking document status...")
    status_response = requests.get(
        f'{base_url}/documents', headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
    )
    if status_response.status_code != 200:
        return

    documents = status_response.json()
    print(f" Documents response: {json.dumps(documents, indent=2)}")

    # Extract content summary if available
    statuses = documents.get('statuses') if isinstance(documents, dict) else None
    if not isinstance(statuses, dict):
        return

    for doc in matching_documents(statuses.get('processed', []), pdf_file):
        print(f"\n✅ OCR.PDF PROCESSED SUCCESSFULLY!")
        print(f" Content Summary: {doc.get('content_summary', 'No summary')}")
        print(f" Content Length: {doc.get('content_length', 0)} characters")
        print(f" Chunks: {doc.get('chunks_count', 0)}")

    for doc in matching_documents(statuses.get('failed', []), pdf_file):
        print(f"\n❌ OCR.PDF FAILED!")
        print(f" Error: {doc.get('error_msg', 'Unknown error')}")


def run_search_queries(headers):
    """Send each entry of ``test_queries`` to the query endpoint and report."""
    # Test search with specific content that should be in ocr.pdf
    print("\n🔍 Testing search functionality...")
    for query in test_queries:
        print(f"\n🔎 Querying: \"{query}\"")
        query_data = {'query': query, 'top_k': 5}
        search_response = requests.post(
            f'{base_url}/query',
            json=query_data,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

        if search_response.status_code != 200:
            print(f'❌ Search failed: {search_response.text}')
            continue

        results = search_response.json()
        if not isinstance(results, dict):
            print(f" Unexpected result format: {type(results)}")
            continue

        # A null or non-string 'response' must not break len()/slicing below.
        response_text = str(results.get('response') or '')
        print(f" Response length: {len(response_text)} characters")
        print(f" Preview: {response_text[:200]}...")

        # Check for specific indicators
        for message in response_indicators(response_text, pdf_file):
            print(message)


def main():
    """Run the whole upload-and-search check.

    Returns:
        0 when the flow ran to the end, 1 when login failed, the PDF was
        missing, or an HTTP request could not be completed.
    """
    try:
        headers = login()
        if headers is None:
            print('❌ Login failed')
            return 1
        print('✅ Login successful')

        # First, delete all documents to start fresh
        delete_all_documents(headers)

        # Wait for cleanup
        time.sleep(CLEANUP_WAIT_SECONDS)

        # Now upload ocr.pdf fresh
        if not os.path.exists(pdf_file):
            print(f"❌ {pdf_file} not found")
            return 1
        upload_document(headers)

        # Check document status after upload
        report_document_status(headers)

        run_search_queries(headers)
    except requests.RequestException as exc:
        # Connection errors and timeouts end the run with a readable message
        # instead of a traceback.
        print(f"❌ Request failed: {exc}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())