#!/usr/bin/env python3
"""
Test OCR PDF upload, indexing, and vector search without LLM integration.

Focuses on core functionality without DeepSeek API dependencies.

The script talks to a running server at ``BASE_URL`` and exits with status 0
when at least one vector search succeeds, 1 otherwise.
"""

import json
import os
import sys
import time
from pathlib import Path

import requests

# Configuration
BASE_URL = "http://localhost:3015"
API_KEY = os.getenv("LIGHTRAG_API_KEY", "test-key-123")
OCR_PDF_PATH = "ocr.pdf"

# Without an explicit timeout `requests` waits forever on a stalled server.
UPLOAD_TIMEOUT = 120  # seconds; uploads may be slow for large PDFs
STATUS_TIMEOUT = 30  # seconds; used for the /documents listing calls


def _iter_documents(payload):
    """Yield the dict-shaped document records contained in *payload*.

    Accepts both response shapes this script knows about: a bare list of
    records, or a dict holding that list under the ``'documents'`` key.
    Anything that is not a dict record is skipped instead of raising.
    """
    if isinstance(payload, dict):
        payload = payload.get('documents')
    if not isinstance(payload, list):
        return
    for doc in payload:
        if isinstance(doc, dict):
            yield doc


def _belongs_to_upload(doc, track_id):
    """Return True unless *doc* is explicitly tagged with a different track id.

    NOTE(review): assumes document records may carry a ``'track_id'`` field
    equal to the value returned by the upload endpoint - confirm against the
    server API. Records without that field are accepted so the previous
    "any processed document" behaviour is kept for servers that omit it.
    """
    doc_track = doc.get('track_id')
    return track_id is None or doc_track is None or doc_track == track_id


def _preview(text, limit):
    """Return *text* as a string, truncated to *limit* chars plus ``...``."""
    text = str(text)
    return text[:limit] + "..." if len(text) > limit else text


def _mark(ok):
    """Map a boolean step result to the status glyph used in the summary."""
    return "✅" if ok else "⚠️"


def test_server_health():
    """Poll the health endpoint until the server answers with HTTP 200.

    Returns:
        True as soon as a 200 response is received, False after 30 failed
        attempts spaced two seconds apart.
    """
    print("⏳ Waiting for server to be ready...")
    max_retries = 30
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{BASE_URL}/api/health", timeout=10)
            if response.status_code == 200:
                print("✅ Server is running: 200")
                return True
        except requests.exceptions.RequestException:
            # Deliberate best-effort: the server may still be starting up.
            pass
        if attempt < max_retries - 1:
            time.sleep(2)
    print("❌ Server is not responding")
    return False


def test_ocr_pdf_upload():
    """Upload the OCR PDF to the server.

    Returns:
        The ``track_id`` reported by the server, or None when the file is
        missing, the request fails, or the response carries no track id.
    """
    print("\n📤 Testing OCR PDF Upload...")
    if not Path(OCR_PDF_PATH).exists():
        print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
        return None

    headers = {'X-API-Key': API_KEY}
    try:
        with open(OCR_PDF_PATH, 'rb') as f:
            files = {'file': (OCR_PDF_PATH, f, 'application/pdf')}
            response = requests.post(
                f"{BASE_URL}/documents/upload",
                files=files,
                headers=headers,
                timeout=UPLOAD_TIMEOUT,
            )
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"❌ Upload failed: {e}")
        return None

    if response.status_code != 200:
        print(f"❌ Upload failed: {response.status_code} - {response.text}")
        return None

    try:
        result = response.json()
    except ValueError as e:
        # requests raises a ValueError subclass when the body is not JSON.
        print(f"❌ Upload failed: invalid JSON response: {e}")
        return None

    print("✅ OCR PDF uploaded successfully")
    print(f" Upload result: {result}")
    return result.get('track_id') if isinstance(result, dict) else None


def wait_for_document_processing(track_id, timeout=60):
    """Poll ``/documents`` until the uploaded document is processed.

    Args:
        track_id: Identifier returned by the upload; records tagged with a
            different ``track_id`` are ignored.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True once a matching document reports status ``'processed'``,
        False if the timeout elapses first.
    """
    print("⏳ Waiting for document processing...")
    headers = {'X-API-Key': API_KEY}
    # Monotonic clock: immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            response = requests.get(
                f"{BASE_URL}/documents",
                headers=headers,
                timeout=STATUS_TIMEOUT,
            )
            if response.status_code == 200:
                documents = response.json()
                if any(
                    doc.get('status') == 'processed'
                    and _belongs_to_upload(doc, track_id)
                    for doc in _iter_documents(documents)
                ):
                    print("✅ Document processing completed")
                    return True
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"⚠️ Error checking document status: {e}")
        time.sleep(5)

    print("⚠️ Document processing timeout - continuing anyway")
    return False


def test_document_status():
    """Print the filename, status and summary preview of every document.

    Returns:
        True when the document listing was retrieved and decoded,
        False on any HTTP, network or JSON error.
    """
    print("\n📊 Testing Document Status...")
    headers = {'X-API-Key': API_KEY}
    try:
        response = requests.get(
            f"{BASE_URL}/documents",
            headers=headers,
            timeout=STATUS_TIMEOUT,
        )
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to get document status: {e}")
        return False

    if response.status_code != 200:
        print(f"❌ Failed to get document status: {response.status_code} - {response.text}")
        return False

    try:
        documents = response.json()
    except ValueError as e:
        print(f"❌ Failed to get document status: invalid JSON response: {e}")
        return False

    print("✅ Documents status retrieved")
    recognised_shape = isinstance(documents, list) or (
        isinstance(documents, dict) and documents.get('documents')
    )
    if not recognised_shape:
        # Unknown payload layout: dump it so the operator can inspect it.
        print(f" Document data: {documents}")
        return True

    for doc in _iter_documents(documents):
        print(f" - {doc.get('filename', 'Unknown')}: {doc.get('status', 'unknown')}")
        summary = doc.get('summary')
        if summary:
            # Show only the first 100 characters of the summary.
            print(f" Summary: {_preview(summary, 100)}")
    return True


def test_vector_search(query):
    """Run a vector search for *query* with LLM integration disabled.

    Args:
        query: Free-text search string sent to the ``/search`` endpoint.

    Returns:
        True when the server answers with HTTP 200 and decodable JSON
        (even if no results are found), False otherwise.
    """
    print(f"\n🔍 Testing Vector Search for '{query}'...")
    headers = {
        'X-API-Key': API_KEY,
        'Content-Type': 'application/json',
    }
    # Try direct vector search endpoint if available
    search_data = {
        "query": query,
        "top_k": 5,
        "use_llm": False,  # Disable LLM to avoid API restrictions
    }

    try:
        response = requests.post(
            f"{BASE_URL}/search",
            json=search_data,
            headers=headers,
            timeout=30,
        )
    except requests.exceptions.RequestException as e:
        print(f"❌ Search request failed: {e}")
        return False

    if response.status_code != 200:
        print(f"❌ Vector search failed: {response.status_code} - {response.text}")
        return False

    try:
        payload = response.json()
    except ValueError as e:
        print(f"❌ Vector search failed: invalid JSON response: {e}")
        return False

    # Tolerate both {"results": [...]} and a bare list of hits.
    if isinstance(payload, dict):
        hits = payload.get('results', [])
    else:
        hits = payload
    if not isinstance(hits, list):
        hits = []

    print("✅ Vector search successful!")
    print(f" Found {len(hits)} results")
    # Display top results
    for rank, hit in enumerate(hits[:3], start=1):
        content = hit.get('content', '') if isinstance(hit, dict) else hit
        print(f" {rank}. {_preview(content, 150)}")
    return True


def test_direct_chunk_retrieval():
    """Fetch stored chunks to verify that indexing produced data.

    The endpoint is optional on the server side, so every failure is
    reported as a warning rather than an error.

    Returns:
        True when a list of chunks was retrieved, False otherwise.
    """
    print("\n📄 Testing Direct Chunk Retrieval...")
    headers = {'X-API-Key': API_KEY}
    # Try to get stored chunks
    try:
        response = requests.get(f"{BASE_URL}/api/chunks", headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"⚠️ Could not retrieve chunks: {response.status_code}")
            return False
        chunks = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"⚠️ Chunk retrieval not available: {e}")
        return False

    if not isinstance(chunks, list):
        print(f"⚠️ Chunk retrieval not available: unexpected payload type {type(chunks).__name__}")
        return False

    print(f"✅ Retrieved {len(chunks)} chunks from storage")
    if chunks:
        first = chunks[0]
        content = first.get('content', '') if isinstance(first, dict) else first
        print(f" First chunk preview: {str(content)[:100]}...")
    return True


def main():
    """Run the full upload -> processing -> status -> search workflow.

    Returns:
        Process exit status: 0 when at least one vector search succeeded,
        1 when a prerequisite step failed or no search succeeded.
    """
    print("🧪 Testing OCR PDF Upload, Indexing, and Vector Search (No LLM)")
    print("=" * 70)

    # Test server health
    if not test_server_health():
        return 1

    # Test OCR PDF upload
    track_id = test_ocr_pdf_upload()
    if not track_id:
        print("❌ Cannot proceed without successful upload")
        return 1

    # Wait for processing; a timeout is tolerated but reflected in the summary.
    processed = wait_for_document_processing(track_id)

    # Test document status
    if not test_document_status():
        return 1

    # Test direct chunk retrieval (optional endpoint, result is informational)
    test_direct_chunk_retrieval()

    # Test vector searches for OCR content
    test_queries = [
        "safety precautions",
        "minimum safe distance",
        "table",
        "G1.7.1",
        "work near",
    ]
    successful_searches = sum(
        1 for query in test_queries if test_vector_search(query)
    )

    print("\n📊 Search Results Summary:")
    print(f" Successful searches: {successful_searches}/{len(test_queries)}")

    if successful_searches > 0:
        print("✅ OCR workflow is functioning correctly!")
        print(" - PDF upload: ✅")
        # Only claim processing succeeded if the status poll confirmed it.
        print(f" - OCR processing: {_mark(processed)}")
        print(" - Vector indexing: ✅")
        print(" - Search functionality: ✅")
    else:
        print("❌ Search functionality needs investigation")

    print("=" * 70)
    print("✅ Test completed")
    return 0 if successful_searches > 0 else 1


if __name__ == "__main__":
    sys.exit(main())