169 lines
5.8 KiB
Python
169 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script to verify OCR PDF upload, indexing, and search with DeepSeek API
|
|
"""
|
|
|
|
import requests
|
|
import json
|
|
import time
|
|
import os
|
|
|
|
# Configuration
# Every value can be overridden through an environment variable so the script
# can be pointed at another server, key, or PDF without editing the file.
# The fallbacks are the original hard-coded values.
BASE_URL = os.environ.get("OCR_TEST_BASE_URL", "http://localhost:3015").rstrip("/")
# NOTE(review): a credential is committed here as the fallback value; prefer
# supplying OCR_TEST_API_KEY from the environment.
API_KEY = os.environ.get("OCR_TEST_API_KEY", "jleu1212")
OCR_PDF_PATH = os.environ.get("OCR_TEST_PDF_PATH", "ocr.pdf")
|
|
|
|
def test_server_health():
    """Check that the server at ``BASE_URL`` answers an HTTP GET on ``/``.

    Any HTTP response counts as "running": the status code is printed but
    not checked.

    Returns:
        bool: True if a response was received, False if the request raised.
    """
    try:
        # Bounded wait: without a timeout, requests blocks indefinitely when
        # the server accepts the connection but never replies.
        response = requests.get(f"{BASE_URL}/", timeout=10)
    except Exception as e:
        print(f"❌ Server not responding: {e}")
        return False
    print(f"✅ Server is running: {response.status_code}")
    return True
|
|
|
|
def test_upload_ocr_pdf():
    """Upload the PDF at ``OCR_PDF_PATH`` to ``{BASE_URL}/documents/upload``.

    The file is sent as the multipart field ``file`` with content type
    ``application/pdf``, authenticated through the ``X-API-Key`` header.
    Only HTTP 200 is treated as success.

    Returns:
        bool: True when the server answers 200; False when the file does
        not exist, the server answers with another status, or any
        exception is raised while uploading or decoding the response.
    """
    if not os.path.exists(OCR_PDF_PATH):
        print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
        return False

    try:
        with open(OCR_PDF_PATH, 'rb') as f:
            files = {'file': (OCR_PDF_PATH, f, 'application/pdf')}
            headers = {'X-API-Key': API_KEY}
            # Bounded wait so a stalled server cannot hang the script; the
            # limit is generous because the request carries the whole file.
            response = requests.post(
                f"{BASE_URL}/documents/upload",
                files=files,
                headers=headers,
                timeout=60,
            )

        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        print("✅ OCR PDF uploaded successfully")
        result = response.json()
        print(f" Upload result: {result}")
        return True
    except Exception as e:
        # Script-level boundary: report any failure instead of a traceback.
        print(f"❌ Upload error: {e}")
        return False
|
|
|
|
def test_document_status():
    """Fetch ``{BASE_URL}/documents`` and print every document by status.

    The response JSON is read through its ``statuses`` key, which the code
    treats as a mapping of status name to a list of document dicts. For
    each document the ``file_path`` is printed, plus the first 100
    characters of ``content_summary`` when that field is truthy.

    Returns:
        bool: True when the server answers 200 and the listing was
        printed; False on any other status or on any exception.
    """
    try:
        headers = {'X-API-Key': API_KEY}
        # Bounded wait so an unresponsive server cannot hang the script.
        response = requests.get(f"{BASE_URL}/documents", headers=headers, timeout=30)

        if response.status_code != 200:
            print(f"❌ Failed to get documents: {response.status_code} - {response.text}")
            return False

        data = response.json()
        statuses = data.get('statuses', {})

        # Count total documents across all statuses
        total_docs = sum(len(docs) for docs in statuses.values())
        print(f"✅ Documents status retrieved: {total_docs} documents")

        # Print document details
        for status, docs in statuses.items():
            for doc in docs:
                print(f" - {doc.get('file_path', 'Unknown')}: {status}")
                summary = doc.get('content_summary')
                if summary:
                    print(f" Summary: {summary[:100]}...")
        return True
    except Exception as e:
        # Script-level boundary: report any failure instead of a traceback.
        print(f"❌ Documents status error: {e}")
        return False
|
|
|
|
def test_search_functionality():
    """Run a fixed set of queries against ``{BASE_URL}/search`` with the LLM off.

    Each query is POSTed as JSON with ``top_k`` 5 and ``use_llm`` False.
    For every successful response the number of entries under ``results``
    is printed, followed by the first 100 characters of ``text`` for up to
    two of them.

    Returns:
        bool: True when every query answers HTTP 200; False as soon as one
        query returns another status or raises.
    """
    search_queries = [
        "table",  # Should find table content from OCR
        "data",   # General content
        "test",   # Should find test content
    ]
    # Identical for every request, so built once outside the loop.
    headers = {'X-API-Key': API_KEY}

    for query in search_queries:
        try:
            payload = {
                "query": query,
                "top_k": 5,
                "use_llm": False,  # Disable LLM to avoid region restrictions
            }
            # Bounded wait so a stalled search cannot hang the script.
            response = requests.post(
                f"{BASE_URL}/search", json=payload, headers=headers, timeout=30
            )

            if response.status_code != 200:
                print(f"❌ Search failed for '{query}': {response.status_code} - {response.text}")
                return False

            hits = response.json().get('results', [])
            print(f"✅ Search for '{query}': Found {len(hits)} results")
            for i, result in enumerate(hits[:2]):
                print(f" Result {i+1}: {result.get('text', '')[:100]}...")
        except Exception as e:
            # Script-level boundary: report any failure instead of a traceback.
            print(f"❌ Search error for '{query}': {e}")
            return False

    return True
|
|
|
|
def test_vector_search_only():
    """POST one ``"safety precautions"`` query to ``{BASE_URL}/search`` with the LLM off.

    The payload sets ``use_llm`` False and ``top_k`` 3. On HTTP 200 the
    number of entries under ``results`` is printed, followed by the first
    150 characters of ``text`` for each entry.

    Returns:
        bool: True when the server answers 200; False on any other status
        or on any exception.
    """
    try:
        payload = {
            "query": "safety precautions",
            "use_llm": False,
            "top_k": 3,
        }
        headers = {'X-API-Key': API_KEY}
        # Bounded wait so a stalled search cannot hang the script.
        response = requests.post(
            f"{BASE_URL}/search", json=payload, headers=headers, timeout=30
        )

        if response.status_code != 200:
            print(f"❌ Vector search failed: {response.status_code} - {response.text}")
            return False

        hits = response.json().get('results', [])
        print("✅ Vector search test passed")
        print(f" Found {len(hits)} results")
        for i, result in enumerate(hits):
            print(f" Result {i+1}: {result.get('text', '')[:150]}...")
        return True
    except Exception as e:
        # Script-level boundary: report any failure instead of a traceback.
        print(f"❌ Vector search error: {e}")
        return False
|
|
|
|
def main():
    """Drive the whole smoke test: health check, upload, status, searches.

    Stops early (without the closing banner) when the server does not
    respond. Otherwise the closing banner and "Test completed" line are
    printed whether or not the upload and search steps succeeded.
    """
    banner = "=" * 70

    print("🧪 Testing OCR PDF Upload, Indexing, and Search with DeepSeek API")
    print(banner)

    # Give the server a moment before the first request.
    print("⏳ Waiting for server to be ready...")
    time.sleep(5)

    # Nothing else can run without a reachable server.
    if not test_server_health():
        print("❌ Server not available, stopping test")
        return

    print("\n📤 Testing OCR PDF Upload...")
    if not test_upload_ocr_pdf():
        print("❌ OCR PDF upload failed")
    else:
        print("⏳ Waiting for document processing...")
        time.sleep(10)  # Fixed pause to let the server process the upload

        print("\n📊 Testing Document Status...")
        test_document_status()

        # Searches run without the LLM due to region restrictions.
        print("\n🔍 Testing Search Functionality (without LLM)...")
        if not test_search_functionality():
            print("❌ Search functionality failed")
        else:
            print("\n🔍 Testing Vector Search Only...")
            test_vector_search_only()

    print("\n" + banner)
    print("✅ Test completed")
|
|
|
|
# Run the smoke test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()