240 lines
8.2 KiB
Python
240 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test OCR PDF upload, indexing, and vector search without LLM integration
|
|
Focuses on core functionality without DeepSeek API dependencies
|
|
"""
|
|
|
|
import requests
|
|
import time
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Configuration
# Root URL of the server under test; every endpoint path is appended to it.
BASE_URL = "http://localhost:3015"
# Key sent in the X-API-Key header; the LIGHTRAG_API_KEY environment variable
# overrides the built-in test value.
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "test-key-123")
# PDF fixture to upload; a relative path, so it is looked up from the
# current working directory.
OCR_PDF_PATH = "ocr.pdf"
|
|
|
|
def test_server_health():
    """Poll the health endpoint until the server answers with HTTP 200.

    Makes up to 30 attempts against ``{BASE_URL}/api/health`` (10 second
    request timeout each), pausing 2 seconds between attempts but not after
    the final one. A request exception and a non-200 status are both treated
    as "not ready yet".

    Returns:
        True as soon as one attempt gets a 200 response, False when every
        attempt has failed.
    """
    print("⏳ Waiting for server to be ready...")
    total_attempts = 30
    health_url = f"{BASE_URL}/api/health"

    # Count down so the "is there another attempt?" check is a plain truth test.
    for attempts_left in range(total_attempts - 1, -1, -1):
        try:
            ready = requests.get(health_url, timeout=10).status_code == 200
        except requests.exceptions.RequestException:
            # Server not reachable yet; fall through to the retry pause.
            ready = False

        if ready:
            print("✅ Server is running: 200")
            return True

        if attempts_left:
            time.sleep(2)

    print("❌ Server is not responding")
    return False
|
|
|
|
def test_ocr_pdf_upload():
    """Upload the OCR test PDF and return the tracking id from the response.

    Posts the file at ``OCR_PDF_PATH`` as a multipart ``file`` field to
    ``{BASE_URL}/documents/upload`` with the API key in the ``X-API-Key``
    header.

    Returns:
        The ``track_id`` value of the JSON response on HTTP 200. None when
        the file is missing or unreadable, the request fails or times out,
        the server answers with another status, or the body is not a JSON
        object.
    """
    print("\n📤 Testing OCR PDF Upload...")

    pdf_path = Path(OCR_PDF_PATH)
    if not pdf_path.exists():
        print(f"❌ OCR PDF file not found: {OCR_PDF_PATH}")
        return None

    headers = {'X-API-Key': API_KEY}

    try:
        # The request must be sent while the file handle is still open.
        with pdf_path.open('rb') as f:
            files = {'file': (OCR_PDF_PATH, f, 'application/pdf')}
            response = requests.post(
                f"{BASE_URL}/documents/upload",
                files=files,
                headers=headers,
                # Without a timeout requests waits forever on a stalled server.
                timeout=60,
            )
    except (OSError, requests.exceptions.RequestException) as e:
        # Report transport/file errors the same way the search test does
        # instead of aborting the whole script with a traceback.
        print(f"❌ Upload request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"❌ Upload failed: {response.status_code} - {response.text}")
        return None

    try:
        result = response.json()
    except ValueError as e:
        # Every JSON decode error raised by requests is a ValueError subclass.
        print(f"❌ Upload response was not valid JSON: {e}")
        return None

    print("✅ OCR PDF uploaded successfully")
    print(f" Upload result: {result}")

    # Only a JSON object can carry a track_id; anything else means "no id".
    if not isinstance(result, dict):
        return None
    return result.get('track_id')
|
|
|
|
def _has_processed_document(payload, track_id):
    """Return True if *payload* lists a processed document matching *track_id*."""
    # The listing is accepted either as a bare list or wrapped under a
    # 'documents' key.
    if isinstance(payload, dict):
        payload = payload.get('documents')
    if not isinstance(payload, list):
        return False

    for doc in payload:
        if not isinstance(doc, dict) or doc.get('status') != 'processed':
            continue
        # NOTE(review): assumes listed documents expose their upload id under
        # 'track_id' (the key the upload response uses) — confirm against the
        # server schema. Entries without that field are accepted as before.
        doc_track_id = doc.get('track_id')
        if track_id and doc_track_id and doc_track_id != track_id:
            continue
        return True
    return False


def wait_for_document_processing(track_id, timeout=60):
    """Poll the document listing until the uploaded document is processed.

    Requests ``{BASE_URL}/documents`` about every 5 seconds until a document
    with status ``'processed'`` is listed or *timeout* seconds have elapsed.

    Args:
        track_id: Identifier returned by the upload. Processed documents that
            carry a different ``track_id`` are ignored, so an older document
            does not end the wait early.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True when a matching processed document was seen, False on timeout.
    """
    print("⏳ Waiting for document processing...")

    headers = {'X-API-Key': API_KEY}
    poll_interval = 5
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            response = requests.get(
                f"{BASE_URL}/documents",
                headers=headers,
                timeout=10,  # a hung request must not outlive the polling loop
            )
            if (response.status_code == 200
                    and _has_processed_document(response.json(), track_id)):
                print("✅ Document processing completed")
                return True
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on older requests versions.
            print(f"⚠️ Error checking document status: {e}")

        # Never sleep past the deadline the caller asked for.
        remaining = deadline - time.monotonic()
        if remaining > 0:
            time.sleep(min(poll_interval, remaining))

    print("⚠️ Document processing timeout - continuing anyway")
    return False
|
|
|
|
def _print_document_entry(doc):
    """Print one document's filename, status and a 100-character summary preview."""
    print(f" - {doc.get('filename', 'Unknown')}: {doc.get('status', 'unknown')}")
    summary = doc.get('summary')
    if summary:
        # Show first 100 chars of summary
        preview = summary[:100] + "..." if len(summary) > 100 else summary
        print(f" Summary: {preview}")


def test_document_status():
    """Fetch the document listing and print each document's status.

    Requests ``{BASE_URL}/documents`` and accepts either a bare JSON list of
    documents or an object with a non-empty ``documents`` list. Any other
    payload is printed as-is.

    Returns:
        True when the listing was retrieved with HTTP 200 and decoded, False
        when the request failed, returned another status, or the body was not
        valid JSON.
    """
    print("\n📊 Testing Document Status...")

    headers = {'X-API-Key': API_KEY}
    try:
        response = requests.get(f"{BASE_URL}/documents", headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        # Report instead of crashing, consistent with the search test.
        print(f"❌ Failed to get document status: {e}")
        return False

    if response.status_code != 200:
        print(f"❌ Failed to get document status: {response.status_code} - {response.text}")
        return False

    try:
        documents = response.json()
    except ValueError as e:
        print(f"❌ Document status response was not valid JSON: {e}")
        return False

    print("✅ Documents status retrieved")

    # Normalise both supported payload shapes to one list of entries.
    if isinstance(documents, list):
        entries = documents
    else:
        nested = documents.get('documents') if isinstance(documents, dict) else None
        entries = nested if isinstance(nested, list) and nested else None

    if entries is None:
        print(f" Document data: {documents}")
    else:
        for doc in entries:
            # Skip malformed entries rather than failing on `.get`.
            if isinstance(doc, dict):
                _print_document_entry(doc)

    return True
|
|
|
|
def test_vector_search(query):
    """Run one vector search with the LLM disabled and print the top hits.

    Posts *query* to ``{BASE_URL}/search`` asking for 5 results with
    ``use_llm`` set to False, then prints the result count and a preview
    (at most 150 characters) of the first three results.

    Args:
        query: Search text to send to the server.

    Returns:
        True when the server answered with HTTP 200, False on any other
        status or when the request raised a ``RequestException``.
    """
    print(f"\n🔍 Testing Vector Search for '{query}'...")

    request_headers = {
        'X-API-Key': API_KEY,
        'Content-Type': 'application/json',
    }

    # LLM use is switched off so the search does not hit API restrictions.
    payload = {
        "query": query,
        "top_k": 5,
        "use_llm": False,
    }

    try:
        reply = requests.post(
            f"{BASE_URL}/search",
            json=payload,
            headers=request_headers,
            timeout=30,
        )

        if reply.status_code != 200:
            print(f"❌ Vector search failed: {reply.status_code} - {reply.text}")
            return False

        body = reply.json()
        print("✅ Vector search successful!")
        hits = body.get('results', [])
        print(f" Found {len(hits)} results")

        # Only the three best hits are previewed.
        for rank, hit in enumerate(hits[:3], start=1):
            text = hit.get('content', '')
            if len(text) > 150:
                snippet = text[:150] + "..."
            else:
                snippet = text
            print(f" {rank}. {snippet}")

        return True

    except requests.exceptions.RequestException as err:
        print(f"❌ Search request failed: {err}")
        return False
|
|
|
|
def test_direct_chunk_retrieval():
    """Fetch stored chunks directly as a check that indexing produced data.

    Requests ``{BASE_URL}/api/chunks`` and prints how many chunks came back
    plus the first 100 characters of the first chunk's ``content``. This is
    a best-effort probe: any exception is reported and turned into False.

    Returns:
        True when the server answered with HTTP 200 and the payload could be
        inspected, False otherwise.
    """
    print("\n📄 Testing Direct Chunk Retrieval...")

    auth_headers = {'X-API-Key': API_KEY}

    try:
        reply = requests.get(f"{BASE_URL}/api/chunks", headers=auth_headers, timeout=10)

        if reply.status_code != 200:
            print(f"⚠️ Could not retrieve chunks: {reply.status_code}")
            return False

        chunks = reply.json()
        total = len(chunks)
        print(f"✅ Retrieved {total} chunks from storage")

        if total > 0:
            first_text = chunks[0].get('content', '')
            print(f" First chunk preview: {first_text[:100]}...")

        return True

    except Exception as err:
        # Deliberately broad: an unexpected payload shape is reported the
        # same way as a transport failure.
        print(f"⚠️ Chunk retrieval not available: {err}")
        return False
|
|
|
|
def main():
    """Run the OCR workflow check end to end and print a summary.

    Steps, in order: wait for the server, upload the PDF, wait for document
    processing, print document status, probe chunk retrieval, then run a
    fixed list of vector-search queries. The run stops early when the server
    is unreachable, the upload yields no track id, or the status listing
    cannot be fetched. A processing timeout does not stop the run, but it is
    reflected in the final summary.
    """
    print("🧪 Testing OCR PDF Upload, Indexing, and Vector Search (No LLM)")
    print("=" * 70)

    # Test server health
    if not test_server_health():
        return

    # Test OCR PDF upload
    track_id = test_ocr_pdf_upload()
    if not track_id:
        print("❌ Cannot proceed without successful upload")
        return

    # Wait for processing; keep the outcome so the summary can report it
    # truthfully instead of always claiming success.
    processed = wait_for_document_processing(track_id)

    # Test document status
    if not test_document_status():
        return

    # Test direct chunk retrieval (informational only; prints its own result)
    test_direct_chunk_retrieval()

    # Test vector searches for OCR content
    test_queries = [
        "safety precautions",
        "minimum safe distance",
        "table",
        "G1.7.1",
        "work near",
    ]

    # Every query is executed; the generator only counts the successes.
    successful_searches = sum(1 for query in test_queries if test_vector_search(query))

    print("\n📊 Search Results Summary:")
    print(f" Successful searches: {successful_searches}/{len(test_queries)}")

    if successful_searches > 0:
        print("✅ OCR workflow is functioning correctly!")
        print(" - PDF upload: ✅")
        if processed:
            print(" - OCR processing: ✅")
        else:
            # The wait timed out, so completion was never observed.
            print(" - OCR processing: ⚠️ not confirmed (timed out)")
        print(" - Vector indexing: ✅")
        print(" - Search functionality: ✅")
    else:
        print("❌ Search functionality needs investigation")

    print("=" * 70)
    print("✅ Test completed")


if __name__ == "__main__":
    main()