# Source: railseek6/test_simple_search.py (Python, 220 lines, 7.1 KiB)

"""
Simple Search Test - Bypasses entity extraction issues
Tests the core document processing and search functionality
"""
import os
import sys
import time
import requests
import json
from pathlib import Path
# Configuration
# The server address and API key may be overridden through the LIGHTRAG_URL and
# LIGHTRAG_API_KEY environment variables; the literals below remain the
# defaults, so running the script with no environment set behaves as before.
# A trailing slash is stripped because every request appends "/<endpoint>".
LIGHTRAG_URL = os.environ.get("LIGHTRAG_URL", "http://localhost:3016").rstrip("/")
API_KEY = os.environ.get("LIGHTRAG_API_KEY", "jleu1212")
# Name of the document the content check looks for in the index.
TEST_FILE = "test.docx"
# Sent with every request made by this script.
HEADERS = {"X-API-Key": API_KEY}
def test_direct_search():
    """Run a single local-mode query for "test" against /search and print the hits.

    Returns:
        bool: True when the server answers with HTTP 200; False on any other
        status code or when any exception is raised along the way.
    """
    print("🔍 Testing direct search functionality...")

    # A plain query that should not depend on entity extraction.
    payload = {"query": "test", "top_k": 5, "mode": "local"}

    try:
        reply = requests.post(
            f"{LIGHTRAG_URL}/search",
            json=payload,
            headers=HEADERS,
            timeout=30,
        )

        if reply.status_code != 200:
            print(f"❌ Search failed: {reply.status_code} - {reply.text}")
            return False

        body = reply.json()
        print("✅ Search completed successfully")
        print(f"📊 Found {len(body.get('results', []))} results")

        # Dump each hit (score plus the first 200 characters) for debugging.
        for position, hit in enumerate(body.get('results', []), start=1):
            print(f" {position}. Score: {hit.get('score', 0):.4f}")
            snippet = hit.get('content', '')[:200]
            print(f" Content: {snippet}...")
        return True
    except Exception as e:
        print(f"❌ Search error: {e}")
        return False
def test_documents_endpoint():
    """Print what the /documents endpoint reports as present in the system.

    Each entry is shown as "filename: status", followed by its metadata when
    the entry carries a 'metadata' key.

    Returns:
        bool: True when the endpoint answers with HTTP 200; False on any other
        status code or when any exception is raised.
    """
    print("📄 Checking documents endpoint...")
    try:
        reply = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)

        if reply.status_code != 200:
            print(f"❌ Documents endpoint failed: {reply.status_code} - {reply.text}")
            return False

        listing = reply.json()
        print(f"✅ Found {len(listing)} documents in system")
        for entry in listing:
            name = entry.get('filename', 'Unknown')
            state = entry.get('status', 'Unknown')
            print(f" - {name}: {state}")
            if 'metadata' in entry:
                print(f" Metadata: {entry.get('metadata', {})}")
        return True
    except Exception as e:
        print(f"❌ Documents endpoint error: {e}")
        return False
def test_health_endpoint():
    """Probe the server root URL and report whether it responds with HTTP 200.

    Returns:
        bool: True on a 200 response; False on any other status code or when
        the request raises.
    """
    print("🏥 Testing server health...")
    try:
        status = requests.get(
            f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10
        ).status_code
        healthy = status == 200
        if healthy:
            print("✅ Server is healthy")
        else:
            print(f"❌ Server health check failed: {status}")
        return healthy
    except Exception as e:
        print(f"❌ Server health error: {e}")
        return False
def check_document_content():
    """Locate the test document in the index and report whether it mentions "bee".

    Fetches the document list, takes the first entry whose filename contains
    TEST_FILE and prints it. When that entry has an id, its details are
    fetched and printed, together with a note on whether the word "bee"
    occurs anywhere in them.

    Returns:
        bool: True when an entry for TEST_FILE is listed, regardless of the
        outcome of the detail lookup or the "bee" scan; False when no such
        entry exists, the listing request does not return HTTP 200, or any
        exception is raised.
    """
    print("🔎 Checking document content for bee classification...")
    try:
        # First get all documents
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Could not get documents: {response.status_code}")
            return False

        for doc in response.json():
            # `or ''` also covers an entry whose filename is an explicit null.
            if TEST_FILE not in (doc.get('filename') or ''):
                continue

            print(f"📄 Found {TEST_FILE}: {doc}")

            # Try to get document details
            doc_id = doc.get('id')
            if doc_id:
                detail_response = requests.get(
                    f"{LIGHTRAG_URL}/documents/{doc_id}",
                    headers=HEADERS,
                    timeout=10,
                )
                if detail_response.status_code == 200:
                    doc_detail = detail_response.json()
                    print(f"📋 Document details: {doc_detail}")
                    # Same case-insensitive substring test the search check uses.
                    if 'bee' in str(doc_detail).lower():
                        print("🎉 Document details mention 'bee'")
                    else:
                        print("⚠️ Document details do not mention 'bee'")
                else:
                    # Surface the failure instead of silently skipping it.
                    print(f"❌ Could not get document details: {detail_response.status_code}")
            return True

        print(f"❌ {TEST_FILE} not found in documents")
        return False
    except Exception as e:
        # Broad on purpose: this is a diagnostic script and a failing check
        # should be reported, not abort the remaining checks.
        print(f"❌ Document content check error: {e}")
        return False
def test_local_search():
    """Query /search in local mode with several fixed terms and print the outcome.

    For every term the number of hits is reported, and any hit whose content
    contains "bee" (case-insensitive) is highlighted. Failures and exceptions
    are printed per term and do not stop the remaining terms. Returns nothing.
    """
    print("🔍 Testing local search mode...")

    for term in ("test", "document", "image", "classification"):
        try:
            reply = requests.post(
                f"{LIGHTRAG_URL}/search",
                json={"query": term, "top_k": 3, "mode": "local"},
                headers=HEADERS,
                timeout=30,
            )

            if reply.status_code != 200:
                print(f"❌ Search for '{term}' failed: {reply.status_code}")
                continue

            body = reply.json()
            if not body.get('results'):
                print(f"❌ No results for '{term}'")
                continue

            hits = body['results']
            print(f"✅ Found {len(hits)} results for '{term}'")
            for hit in hits:
                text = hit.get('content', '')
                if 'bee' in text.lower():
                    print(f"🎉 FOUND BEE IN SEARCH: {text[:200]}...")
        except Exception as e:
            print(f"❌ Search for '{term}' error: {e}")
def main():
    """Run every diagnostic check in order and report overall success.

    The health check gates the run: when it fails, nothing else is attempted.
    The documents, document-content and direct-search checks each return a
    boolean, and any that fail are named in the summary and make the run
    unsuccessful. The multi-term local search returns no result and is
    treated as informational only.

    Returns:
        bool: True when the server is healthy and every boolean check passed;
        False otherwise. The caller maps this to the process exit status.
    """
    print("=" * 50)
    print("🔧 SIMPLE SEARCH TEST")
    print("=" * 50)
    print(f"📡 Server: {LIGHTRAG_URL}")
    print()

    # Test 1: Server health
    print("1. Testing server health...")
    if not test_health_endpoint():
        print("❌ Cannot proceed - server not healthy")
        return False

    # Names of the checks that reported failure, in execution order.
    failed = []

    # Test 2: Check documents
    print("\n2. Checking documents...")
    if not test_documents_endpoint():
        failed.append("documents endpoint")

    # Test 3: Check document content
    print("\n3. Checking document content...")
    if not check_document_content():
        failed.append("document content")

    # Test 4: Simple search
    print("\n4. Testing simple search...")
    if not test_direct_search():
        failed.append("direct search")

    # Test 5: Local search with various terms (prints only, no verdict)
    print("\n5. Testing local search with various terms...")
    test_local_search()

    print("\n" + "=" * 50)
    print("📊 SIMPLE TEST COMPLETE")
    print("=" * 50)
    if failed:
        print(f"❌ Failed checks: {', '.join(failed)}")
    print("💡 Next steps:")
    print(" - Check the server logs for document processing details")
    print(" - Verify test.docx was processed with image extraction")
    print(" - Look for 'bee' classification in the processed content")
    return not failed
if __name__ == "__main__":
    # Exit status is 0 when main() reports success and 1 otherwise.
    success = main()
    sys.exit(1 if not success else 0)