Files
railseek6/test_ocr_deepseek_workflow.py

169 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Test script to verify OCR PDF upload, indexing, and search with DeepSeek API
"""
import requests
import json
import time
import os
# Configuration
# Every setting may be overridden through an environment variable so the
# script can target a different server, credential, or input file without
# editing the source. The fallbacks are the values that were previously
# hard-coded here, so running with no environment set behaves as before.
BASE_URL = os.environ.get("RAILSEEK_BASE_URL", "http://localhost:3015")
API_KEY = os.environ.get("RAILSEEK_API_KEY", "jleu1212")
OCR_PDF_PATH = os.environ.get("RAILSEEK_OCR_PDF_PATH", "ocr.pdf")
def test_server_health():
    """Check that the server answers a GET request on the base URL.

    Any HTTP response counts as "responding", whatever its status code;
    only a failure to obtain a response at all is reported as an error.

    Returns:
        bool: True if a response was received, False if the request failed.
    """
    try:
        # Bound the wait: without a timeout, requests can block forever on a
        # server that accepts the connection but never sends a response.
        response = requests.get(f"{BASE_URL}/", timeout=10)
    except requests.RequestException as e:
        print(f"❌ Server not responding: {e}")
        return False
    print(f"✅ Server is running: {response.status_code}")
    return True
def test_upload_ocr_pdf():
    """Upload the PDF at OCR_PDF_PATH to the server's /documents/upload endpoint.

    The file is sent as a multipart form field named ``file`` and the request
    is authenticated with the ``X-API-Key`` header.

    Returns:
        bool: True if the server answered with status 200. False if the file
        does not exist, the server answered with another status, or any
        exception was raised along the way.
    """
    if not os.path.exists(OCR_PDF_PATH):
        print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
        return False
    try:
        headers = {'X-API-Key': API_KEY}
        with open(OCR_PDF_PATH, 'rb') as f:
            files = {'file': (OCR_PDF_PATH, f, 'application/pdf')}
            # Bound the wait so a stalled server cannot hang the script; the
            # limit is generous because the whole file is sent in this call.
            response = requests.post(
                f"{BASE_URL}/documents/upload",
                files=files,
                headers=headers,
                timeout=120,
            )
        # The file handle is released before the response is inspected.
        if response.status_code == 200:
            print("✅ OCR PDF uploaded successfully")
            result = response.json()
            print(f" Upload result: {result}")
            return True
        print(f"❌ Upload failed: {response.status_code} - {response.text}")
        return False
    except Exception as e:
        # Deliberately broad: this is a reporting boundary, so network errors
        # and an unparsable response body are all reported as a failed upload.
        print(f"❌ Upload error: {e}")
        return False
def test_document_status():
    """Fetch /documents and print every listed document with its status.

    The response JSON is read through its ``statuses`` key, treated here as a
    mapping from a status name to a list of document dicts. For each document
    the ``file_path`` is printed and, when present, the first 100 characters
    of ``content_summary``.

    Returns:
        bool: True if the server answered with status 200 and the listing was
        printed; False on any other status or on any exception.
    """
    try:
        headers = {'X-API-Key': API_KEY}
        # Bound the wait so an unresponsive server cannot hang the script.
        response = requests.get(f"{BASE_URL}/documents", headers=headers, timeout=30)
        if response.status_code != 200:
            print(f"❌ Failed to get documents: {response.status_code} - {response.text}")
            return False

        data = response.json()
        # Look the mapping up once; a missing key is treated as "no documents".
        statuses = data.get('statuses', {})
        # Count total documents across all statuses
        total_docs = sum(len(docs) for docs in statuses.values())
        print(f"✅ Documents status retrieved: {total_docs} documents")
        # Print document details
        for status, docs in statuses.items():
            for doc in docs:
                print(f" - {doc.get('file_path', 'Unknown')}: {status}")
                summary = doc.get('content_summary')
                if summary:
                    print(f" Summary: {summary[:100]}...")
        return True
    except Exception as e:
        # Deliberately broad: network errors, an unparsable body, and an
        # unexpected response shape are all reported as a failed check.
        print(f"❌ Documents status error: {e}")
        return False
def test_search_functionality():
    """Run a fixed set of queries against /search with the LLM disabled.

    Each query is posted with ``top_k`` 5 and ``use_llm`` False. For every
    successful query the number of entries under the response's ``results``
    key is printed, followed by the first 100 characters of the ``text`` of
    the first two entries.

    Returns:
        bool: True if every query returned status 200. False as soon as one
        query returns another status or raises; later queries are not run.
    """
    search_queries = [
        "table",  # Should find table content from OCR
        "data",   # General content
        "test",   # Should find test content
    ]
    headers = {'X-API-Key': API_KEY}
    for query in search_queries:
        try:
            payload = {
                "query": query,
                "top_k": 5,
                "use_llm": False,  # Disable LLM to avoid region restrictions
            }
            # Bound the wait so an unresponsive server cannot hang the script.
            response = requests.post(
                f"{BASE_URL}/search", json=payload, headers=headers, timeout=30
            )
            if response.status_code != 200:
                print(f"❌ Search failed for '{query}': {response.status_code} - {response.text}")
                return False

            hits = response.json().get('results', [])
            print(f"✅ Search for '{query}': Found {len(hits)} results")
            # Only a short preview of the first two hits is shown.
            for i, result in enumerate(hits[:2]):
                print(f" Result {i+1}: {result.get('text', '')[:100]}...")
        except Exception as e:
            # Deliberately broad: any failure of a query (network, parsing,
            # unexpected response shape) aborts the run as a failed search.
            print(f"❌ Search error for '{query}': {e}")
            return False
    return True
def test_vector_search_only():
    """Post a single "safety precautions" query to /search with the LLM disabled.

    The query is sent with ``top_k`` 3 and ``use_llm`` False. On success the
    number of entries under the response's ``results`` key is printed,
    followed by the first 150 characters of each entry's ``text``.

    Returns:
        bool: True if the server answered with status 200; False on any other
        status or on any exception.
    """
    try:
        payload = {
            "query": "safety precautions",
            "use_llm": False,
            "top_k": 3,
        }
        headers = {'X-API-Key': API_KEY}
        # Bound the wait so an unresponsive server cannot hang the script.
        response = requests.post(
            f"{BASE_URL}/search", json=payload, headers=headers, timeout=30
        )
        if response.status_code != 200:
            print(f"❌ Vector search failed: {response.status_code} - {response.text}")
            return False

        hits = response.json().get('results', [])
        print("✅ Vector search test passed")
        print(f" Found {len(hits)} results")
        for i, result in enumerate(hits):
            print(f" Result {i+1}: {result.get('text', '')[:150]}...")
        return True
    except Exception as e:
        # Deliberately broad: network errors, an unparsable body, and an
        # unexpected response shape are all reported as a failed search.
        print(f"❌ Vector search error: {e}")
        return False
def main():
    """Drive the whole workflow: health check, upload, status, then searches.

    The run stops early if the server does not respond. The status and
    search steps only run after a successful upload, and the vector-only
    search only runs if the keyword searches succeeded. Progress is reported
    on stdout; nothing is returned.
    """
    separator = "=" * 70

    print("🧪 Testing OCR PDF Upload, Indexing, and Search with DeepSeek API")
    print(separator)

    # Give the server a moment to come up before the first request.
    print("⏳ Waiting for server to be ready...")
    time.sleep(5)

    # Nothing else is meaningful if the server cannot be reached.
    server_up = test_server_health()
    if not server_up:
        print("❌ Server not available, stopping test")
        return

    print("\n📤 Testing OCR PDF Upload...")
    uploaded = test_upload_ocr_pdf()
    if not uploaded:
        print("❌ OCR PDF upload failed")
    else:
        # Pause so the server has time to process the uploaded document.
        print("⏳ Waiting for document processing...")
        time.sleep(10)

        print("\n📊 Testing Document Status...")
        test_document_status()

        # Searches run without the LLM because of region restrictions.
        print("\n🔍 Testing Search Functionality (without LLM)...")
        searches_ok = test_search_functionality()
        if searches_ok:
            print("\n🔍 Testing Vector Search Only...")
            test_vector_search_only()
        else:
            print("❌ Search functionality failed")

    print("\n" + separator)
    print("✅ Test completed")


# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()