#!/usr/bin/env python3
"""
Test script to verify OCR PDF upload, indexing, and search with DeepSeek API

Runs a sequence of HTTP smoke checks against the server at ``BASE_URL``:
health check, PDF upload, document status listing, and search queries with
the LLM disabled. Each check prints a human-readable result and returns a
boolean so ``main`` can decide whether to continue.
"""

import json
import os
import time

import requests

# Configuration
BASE_URL = "http://localhost:3015"
# NOTE(review): credential is hard-coded in the script; confirm this is a
# throwaway development key before sharing the file.
API_KEY = "jleu1212"
OCR_PDF_PATH = "ocr.pdf"

# requests applies no timeout by default, so a stalled server would hang the
# script forever. Uploads get a larger budget than ordinary requests.
REQUEST_TIMEOUT = 30
UPLOAD_TIMEOUT = 120


def _auth_headers():
    """Return the API-key header sent with every authenticated request."""
    return {'X-API-Key': API_KEY}


def test_server_health():
    """Test if server is responding.

    Returns:
        True if a GET on the root URL produced any HTTP response (the status
        code is printed but not checked), False if the request raised.
    """
    try:
        response = requests.get(f"{BASE_URL}/", timeout=REQUEST_TIMEOUT)
        print(f"✅ Server is running: {response.status_code}")
        return True
    except Exception as e:  # deliberate catch-all: report and carry on
        print(f"❌ Server not responding: {e}")
        return False


def test_upload_ocr_pdf():
    """Test uploading OCR PDF file.

    Posts the file at ``OCR_PDF_PATH`` as multipart form data to
    ``/documents/upload``.

    Returns:
        True if the server answered with HTTP 200, False if the file is
        missing, the server returned another status, or an error was raised.
    """
    if not os.path.exists(OCR_PDF_PATH):
        print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
        return False

    try:
        # The file must stay open until the request has been sent.
        with open(OCR_PDF_PATH, 'rb') as f:
            files = {'file': (OCR_PDF_PATH, f, 'application/pdf')}
            response = requests.post(
                f"{BASE_URL}/documents/upload",
                files=files,
                headers=_auth_headers(),
                timeout=UPLOAD_TIMEOUT,
            )

        if response.status_code == 200:
            print("✅ OCR PDF uploaded successfully")
            result = response.json()
            print(f" Upload result: {result}")
            return True

        print(f"❌ Upload failed: {response.status_code} - {response.text}")
        return False
    except Exception as e:  # deliberate catch-all: report and carry on
        print(f"❌ Upload error: {e}")
        return False


def test_document_status():
    """Check document processing status.

    Fetches ``/documents`` and prints every document grouped by status. The
    code reads a ``statuses`` mapping of status name to a list of document
    dicts with optional ``file_path`` and ``content_summary`` keys.

    Returns:
        True if the listing was retrieved with HTTP 200, False otherwise.
    """
    try:
        response = requests.get(
            f"{BASE_URL}/documents",
            headers=_auth_headers(),
            timeout=REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            print(f"❌ Failed to get documents: {response.status_code} - {response.text}")
            return False

        data = response.json()
        statuses = data.get('statuses', {})

        # Count total documents across all statuses
        total_docs = sum(len(docs) for docs in statuses.values())
        print(f"✅ Documents status retrieved: {total_docs} documents")

        # Print document details
        for status, docs in statuses.items():
            for doc in docs:
                print(f" - {doc.get('file_path', 'Unknown')}: {status}")
                summary = doc.get('content_summary')
                if summary:
                    print(f" Summary: {summary[:100]}...")
        return True
    except Exception as e:  # deliberate catch-all: report and carry on
        print(f"❌ Documents status error: {e}")
        return False


def test_search_functionality():
    """Test search functionality with OCR content (without LLM).

    Runs a fixed list of queries against ``/search`` and prints the first two
    results of each.

    Returns:
        True if every query returned HTTP 200; False as soon as one query
        fails or raises (remaining queries are then skipped).
    """
    search_queries = [
        "table",  # Should find table content from OCR
        "data",   # General content
        "test",   # Should find test content
    ]

    for query in search_queries:
        try:
            payload = {
                "query": query,
                "top_k": 5,
                "use_llm": False,  # Disable LLM to avoid region restrictions
            }
            response = requests.post(
                f"{BASE_URL}/search",
                json=payload,
                headers=_auth_headers(),
                timeout=REQUEST_TIMEOUT,
            )

            if response.status_code != 200:
                print(f"❌ Search failed for '{query}': {response.status_code} - {response.text}")
                return False

            results = response.json()
            hits = results.get('results', [])
            print(f"✅ Search for '{query}': Found {len(hits)} results")
            for i, result in enumerate(hits[:2]):
                print(f" Result {i+1}: {result.get('text', '')[:100]}...")
        except Exception as e:  # deliberate catch-all: report and carry on
            print(f"❌ Search error for '{query}': {e}")
            return False

    return True


def test_vector_search_only():
    """Test vector search without LLM integration.

    Sends a single "safety precautions" query to ``/search`` with
    ``use_llm`` disabled and prints every returned result.

    Returns:
        True if the server answered with HTTP 200, False otherwise.
    """
    try:
        payload = {
            "query": "safety precautions",
            "use_llm": False,
            "top_k": 3,
        }
        response = requests.post(
            f"{BASE_URL}/search",
            json=payload,
            headers=_auth_headers(),
            timeout=REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            print(f"❌ Vector search failed: {response.status_code} - {response.text}")
            return False

        results = response.json()
        hits = results.get('results', [])
        print("✅ Vector search test passed")
        print(f" Found {len(hits)} results")
        for i, result in enumerate(hits):
            print(f" Result {i+1}: {result.get('text', '')[:150]}...")
        return True
    except Exception as e:  # deliberate catch-all: report and carry on
        print(f"❌ Vector search error: {e}")
        return False


def main():
    """Run the checks in order: health, upload, status, then searches.

    Stops early if the server is unreachable; skips the status and search
    checks if the upload fails. Fixed sleeps give the server time to start
    and to process the uploaded document.
    """
    print("🧪 Testing OCR PDF Upload, Indexing, and Search with DeepSeek API")
    print("=" * 70)

    # Wait a moment for server to be ready
    print("⏳ Waiting for server to be ready...")
    time.sleep(5)

    # Test server health
    if not test_server_health():
        print("❌ Server not available, stopping test")
        return

    # Test upload
    print("\n📤 Testing OCR PDF Upload...")
    if test_upload_ocr_pdf():
        print("⏳ Waiting for document processing...")
        time.sleep(10)  # Wait for processing

        # Test document status
        print("\n📊 Testing Document Status...")
        test_document_status()

        # Test search functionality (without LLM due to region restrictions)
        print("\n🔍 Testing Search Functionality (without LLM)...")
        if test_search_functionality():
            print("\n🔍 Testing Vector Search Only...")
            test_vector_search_only()
        else:
            print("❌ Search functionality failed")
    else:
        print("❌ OCR PDF upload failed")

    print("\n" + "=" * 70)
    print("✅ Test completed")


if __name__ == "__main__":
    main()