Files
railseek6/test_ocr_vector_search_only.py

240 lines
8.2 KiB
Python

#!/usr/bin/env python3
"""
Test OCR PDF upload, indexing, and vector search without LLM integration
Focuses on core functionality without DeepSeek API dependencies
"""
import requests
import time
import json
import os
from pathlib import Path
# Configuration
# Root URL of the server under test; every endpoint path used below is appended to it.
BASE_URL = "http://localhost:3015"
# Key sent in the X-API-Key header; taken from LIGHTRAG_API_KEY, with a test fallback.
API_KEY = os.getenv("LIGHTRAG_API_KEY", "test-key-123")
# PDF that gets uploaded; a relative path, so it is resolved against the working directory.
OCR_PDF_PATH = "ocr.pdf"
def test_server_health():
    """Poll the health endpoint until it answers 200 or the attempts run out.

    Up to 30 GET requests are sent to ``{BASE_URL}/api/health`` (10 s timeout
    each), pausing 2 s between attempts. Request errors are treated the same
    as a non-200 answer: the attempt simply counts as failed.

    Returns:
        True as soon as one attempt returns HTTP 200, False if none does.
    """
    print("⏳ Waiting for server to be ready...")
    attempts = 30
    health_url = f"{BASE_URL}/api/health"
    for attempt in range(1, attempts + 1):
        try:
            status = requests.get(health_url, timeout=10).status_code
        except requests.exceptions.RequestException:
            # Server not reachable yet; fall through to the retry pause.
            status = None
        if status == 200:
            print("✅ Server is running: 200")
            return True
        # No point sleeping after the final attempt.
        if attempt != attempts:
            time.sleep(2)
    print("❌ Server is not responding")
    return False
def test_ocr_pdf_upload():
    """Upload the OCR test PDF and return the tracking id reported by the server.

    The file at ``OCR_PDF_PATH`` is posted as multipart form data to
    ``{BASE_URL}/documents/upload`` with the API key in the ``X-API-Key``
    header.

    Returns:
        The ``track_id`` value of the JSON upload response, or ``None`` when
        the file is missing, the request fails or times out, the server
        answers with a non-200 status, or the body is not a JSON object
        containing ``track_id``.
    """
    print("\n📤 Testing OCR PDF Upload...")
    if not Path(OCR_PDF_PATH).exists():
        print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
        return None
    headers = {'X-API-Key': API_KEY}
    try:
        with open(OCR_PDF_PATH, 'rb') as f:
            files = {'file': (OCR_PDF_PATH, f, 'application/pdf')}
            # requests never times out on its own; bound the call so a stalled
            # server cannot hang the whole test run.
            response = requests.post(
                f"{BASE_URL}/documents/upload",
                files=files,
                headers=headers,
                timeout=60,
            )
    except requests.exceptions.RequestException as e:
        # Same reporting style as the search test: fail the step, don't crash.
        print(f"❌ Upload request failed: {e}")
        return None
    if response.status_code != 200:
        print(f"❌ Upload failed: {response.status_code} - {response.text}")
        return None
    try:
        result = response.json()
    except ValueError as e:
        # A 200 with a non-JSON body gives us nothing to track.
        print(f"❌ Upload response was not valid JSON: {e}")
        return None
    print("✅ OCR PDF uploaded successfully")
    print(f" Upload result: {result}")
    # Only a JSON object can carry a track_id; anything else means "no id".
    return result.get('track_id') if isinstance(result, dict) else None
def _listed_document_entries(payload):
    """Return the dict entries of a ``/documents`` response body.

    Accepts either a bare JSON list or an object with a ``documents`` list;
    any other shape, and any non-dict entry, yields nothing.
    """
    if isinstance(payload, list):
        entries = payload
    elif isinstance(payload, dict):
        entries = payload.get('documents')
    else:
        entries = None
    if not isinstance(entries, list):
        return []
    return [entry for entry in entries if isinstance(entry, dict)]


def wait_for_document_processing(track_id, timeout=60):
    """Poll the document list until the uploaded document is processed.

    ``{BASE_URL}/documents`` is queried every 5 seconds until a document with
    status ``'processed'`` shows up or ``timeout`` seconds have elapsed.

    Args:
        track_id: Tracking id returned by the upload. Entries that expose a
            different ``track_id`` are ignored so that a document processed
            in an earlier run does not end the wait prematurely.
        timeout: Maximum number of seconds to keep starting new polls. The
            total wait can exceed this by up to one poll, because a poll
            that has already started is allowed to finish.

    Returns:
        True if a matching processed document was seen, False on timeout.
    """
    print("⏳ Waiting for document processing...")
    headers = {'X-API-Key': API_KEY}
    start_time = time.time()
    while time.time() - start_time < timeout:
        try:
            # Per-request timeout so a stalled server cannot defeat the
            # overall ``timeout`` budget.
            response = requests.get(f"{BASE_URL}/documents", headers=headers, timeout=10)
            if response.status_code == 200:
                for doc in _listed_document_entries(response.json()):
                    if doc.get('status') != 'processed':
                        continue
                    # NOTE(review): whether list entries carry a 'track_id'
                    # field is not visible from this file. When they do, only
                    # the matching upload counts; when they don't, fall back
                    # to accepting any processed document.
                    doc_track_id = doc.get('track_id')
                    if (
                        track_id is not None
                        and doc_track_id is not None
                        and doc_track_id != track_id
                    ):
                        continue
                    print("✅ Document processing completed")
                    return True
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers an unparsable JSON body on older requests versions.
            print(f"⚠️ Error checking document status: {e}")
        time.sleep(5)
    print("⚠️ Document processing timeout - continuing anyway")
    return False
def _print_document_line(doc):
    """Print one document's filename and status, plus a summary preview if present."""
    print(f" - {doc.get('filename', 'Unknown')}: {doc.get('status', 'unknown')}")
    summary = doc.get('summary')
    if summary:
        # Show first 100 chars of summary
        preview = summary[:100] + "..." if len(summary) > 100 else summary
        print(f" Summary: {preview}")


def test_document_status():
    """Fetch ``{BASE_URL}/documents`` and print the status of each document.

    The response may be a bare JSON list of documents or an object with a
    non-empty ``documents`` list; any other shape is printed verbatim.

    Returns:
        True when the endpoint answered 200 with a JSON body, False when the
        request failed, timed out, returned another status, or the body
        could not be parsed as JSON.
    """
    print("\n📊 Testing Document Status...")
    headers = {'X-API-Key': API_KEY}
    try:
        # Bounded wait: requests has no default timeout.
        response = requests.get(f"{BASE_URL}/documents", headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        # Report the failure instead of letting the script die with a traceback.
        print(f"❌ Failed to get document status: {e}")
        return False
    if response.status_code != 200:
        print(f"❌ Failed to get document status: {response.status_code} - {response.text}")
        return False
    try:
        documents = response.json()
    except ValueError as e:
        print(f"❌ Failed to get document status: invalid JSON response ({e})")
        return False
    print("✅ Documents status retrieved")
    nested = documents.get('documents') if isinstance(documents, dict) else None
    if isinstance(documents, list):
        entries = documents
    elif isinstance(nested, list) and nested:
        entries = nested
    else:
        # Unknown shape: show the raw payload so it can be inspected.
        print(f" Document data: {documents}")
        return True
    for doc in entries:
        # Skip anything that is not a JSON object; it has no fields to show.
        if isinstance(doc, dict):
            _print_document_line(doc)
    return True
def test_vector_search(query):
    """Run one vector search for ``query`` with LLM post-processing disabled.

    Posts ``{"query", "top_k": 5, "use_llm": False}`` to ``{BASE_URL}/search``
    (30 s timeout), prints the number of hits and a preview (at most 150
    characters) of the first three.

    Args:
        query: Search text sent to the server.

    Returns:
        True when the server answered 200, False on any other status or when
        the request raised a ``requests`` exception.
    """
    print(f"\n🔍 Testing Vector Search for '{query}'...")
    request_headers = {
        'X-API-Key': API_KEY,
        'Content-Type': 'application/json',
    }
    # Try direct vector search endpoint if available; LLM is switched off to
    # avoid API restrictions.
    payload = {"query": query, "top_k": 5, "use_llm": False}
    try:
        response = requests.post(
            f"{BASE_URL}/search",
            json=payload,
            headers=request_headers,
            timeout=30,
        )
        if response.status_code != 200:
            print(f"❌ Vector search failed: {response.status_code} - {response.text}")
            return False
        body = response.json()
        print("✅ Vector search successful!")
        print(f" Found {len(body.get('results', []))} results")
        # Display top results
        for rank, hit in enumerate(body.get('results', [])[:3], start=1):
            text = hit.get('content', '')
            preview = text if len(text) <= 150 else text[:150] + "..."
            print(f" {rank}. {preview}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"❌ Search request failed: {e}")
        return False
def test_direct_chunk_retrieval():
    """Probe ``{BASE_URL}/api/chunks`` to see whether indexed chunks exist.

    This is a best-effort check: any exception (network error, unexpected
    payload shape, ...) is reported as "not available" rather than raised.

    Returns:
        True when the endpoint answered 200 and the payload could be
        inspected, False otherwise.
    """
    print("\n📄 Testing Direct Chunk Retrieval...")
    auth = {'X-API-Key': API_KEY}
    # Try to get stored chunks
    try:
        reply = requests.get(f"{BASE_URL}/api/chunks", headers=auth, timeout=10)
        if reply.status_code != 200:
            print(f"⚠️ Could not retrieve chunks: {reply.status_code}")
            return False
        stored = reply.json()
        print(f"✅ Retrieved {len(stored)} chunks from storage")
        if len(stored) > 0:
            first_text = stored[0].get('content', '')
            print(f" First chunk preview: {first_text[:100]}...")
        return True
    except Exception as e:
        # Deliberately broad: the endpoint is optional for this test run.
        print(f"⚠️ Chunk retrieval not available: {e}")
        return False
def main():
    """Run the whole OCR workflow check: health, upload, status, chunks, searches.

    The run stops early (returning None) when the server is unreachable, the
    upload yields no tracking id, or the document status cannot be fetched.
    A processing timeout and a failed chunk probe do not stop the run. The
    workflow is reported as functioning when at least one search succeeds.
    """
    rule = "=" * 70
    print("🧪 Testing OCR PDF Upload, Indexing, and Vector Search (No LLM)")
    print(rule)

    # Test server health
    if not test_server_health():
        return

    # Test OCR PDF upload
    track_id = test_ocr_pdf_upload()
    if not track_id:
        print("❌ Cannot proceed without successful upload")
        return

    # Wait for processing (result intentionally ignored: continue on timeout)
    wait_for_document_processing(track_id)

    # Test document status
    if not test_document_status():
        return

    # Test direct chunk retrieval
    test_direct_chunk_retrieval()

    # Test vector searches for OCR content
    test_queries = [
        "safety precautions",
        "minimum safe distance",
        "table",
        "G1.7.1",
        "work near",
    ]
    successful_searches = sum(1 for query in test_queries if test_vector_search(query))

    print("\n📊 Search Results Summary:")
    print(f" Successful searches: {successful_searches}/{len(test_queries)}")
    if successful_searches == 0:
        print("❌ Search functionality needs investigation")
    else:
        for line in (
            "✅ OCR workflow is functioning correctly!",
            " - PDF upload: ✅",
            " - OCR processing: ✅",
            " - Vector indexing: ✅",
            " - Search functionality: ✅",
        ):
            print(line)
    print(rule)
    print("✅ Test completed")
# Run the test sequence only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()