220 lines
7.1 KiB
Python
220 lines
7.1 KiB
Python
"""
|
|
Simple Search Test - Bypasses entity extraction issues
Tests the core document processing and search functionality
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import requests
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Configuration
#
# The server URL and API key can be overridden through the environment so the
# script can target another deployment without editing the source. When the
# variables are unset (or empty) the original hard-coded defaults are used.
# A trailing slash is stripped from the URL because every request below builds
# its endpoint as f"{LIGHTRAG_URL}/<path>".
LIGHTRAG_URL = (os.environ.get("LIGHTRAG_URL") or "http://localhost:3016").rstrip("/")
# NOTE(review): the default key is committed in source; prefer supplying
# LIGHTRAG_API_KEY from the environment for anything but local testing.
API_KEY = os.environ.get("LIGHTRAG_API_KEY") or "jleu1212"
# Name of the uploaded document the checks look for.
TEST_FILE = "test.docx"
# Sent with every request.
HEADERS = {"X-API-Key": API_KEY}
|
|
|
|
def test_direct_search():
    """Run one "local"-mode search for the word "test" and print the hits.

    POSTs a fixed payload to ``{LIGHTRAG_URL}/search`` and, on HTTP 200,
    prints the number of results followed by each result's score and the
    first 200 characters of its content.

    Returns:
        bool: True when the server answered 200 and the results were printed;
        False on any other status code or on any exception.
    """
    print("🔍 Testing direct search functionality...")

    try:
        # A plain query that should not depend on entity extraction.
        body = {
            "query": "test",
            "top_k": 5,
            "mode": "local"
        }
        endpoint = f"{LIGHTRAG_URL}/search"
        reply = requests.post(endpoint, json=body, headers=HEADERS, timeout=30)

        # Guard clause: bail out early on anything but success.
        if reply.status_code != 200:
            print(f"❌ Search failed: {reply.status_code} - {reply.text}")
            return False

        payload = reply.json()
        print("✅ Search completed successfully")
        print(f"📊 Found {len(payload.get('results', []))} results")

        # Dump every hit to make debugging easier.
        for rank, hit in enumerate(payload.get('results', []), start=1):
            print(f" {rank}. Score: {hit.get('score', 0):.4f}")
            text = hit.get('content', '')
            print(f" Content: {text[:200]}...")

        return True

    except Exception as exc:
        print(f"❌ Search error: {exc}")
        return False
|
|
|
|
def test_documents_endpoint():
    """List what the server reports from its ``/documents`` endpoint.

    Prints the number of returned entries and, for each one, its
    ``filename`` and ``status`` (falling back to ``'Unknown'``), plus its
    ``metadata`` when that key is present.

    Returns:
        bool: True when the endpoint answered 200 and every entry was
        printed; False on any other status code or on any exception.
    """
    print("📄 Checking documents endpoint...")

    try:
        reply = requests.get(
            f"{LIGHTRAG_URL}/documents",
            headers=HEADERS,
            timeout=10,
        )

        if reply.status_code != 200:
            print(f"❌ Documents endpoint failed: {reply.status_code} - {reply.text}")
            return False

        listing = reply.json()
        print(f"✅ Found {len(listing)} documents in system")

        for entry in listing:
            name = entry.get('filename', 'Unknown')
            state = entry.get('status', 'Unknown')
            print(f" - {name}: {state}")
            if 'metadata' in entry:
                print(f" Metadata: {entry.get('metadata', {})}")

        return True

    except Exception as exc:
        print(f"❌ Documents endpoint error: {exc}")
        return False
|
|
|
|
def test_health_endpoint():
    """Probe the server's root URL and report whether it answers HTTP 200.

    Returns:
        bool: True on a 200 response; False on any other status code or on
        any exception raised while making the request.
    """
    print("🏥 Testing server health...")

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10)

        # Handle the failure case first so the happy path reads straight through.
        if reply.status_code != 200:
            print(f"❌ Server health check failed: {reply.status_code}")
            return False

        print("✅ Server is healthy")
        return True
    except Exception as exc:
        print(f"❌ Server health error: {exc}")
        return False
|
|
|
|
def check_document_content():
    """Find the test document on the server and inspect its details for "bee".

    Fetches ``{LIGHTRAG_URL}/documents`` and looks for the first entry whose
    ``filename`` contains ``TEST_FILE``. When that entry has an ``id``, its
    details are fetched from ``/documents/{id}``, printed, and scanned for
    the substring "bee" (case-insensitive).

    Returns:
        bool: True as soon as a matching document entry is found, whether or
        not its details could be fetched or contain "bee"; False when the
        listing request fails, no entry matches, or any exception occurs.
    """
    print("🔎 Checking document content for bee classification...")

    try:
        # First get all documents
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Could not get documents: {response.status_code}")
            return False

        for doc in response.json():
            # `or ''` keeps a null filename from raising TypeError on `in`.
            if TEST_FILE not in (doc.get('filename') or ''):
                continue

            print(f"📄 Found {TEST_FILE}: {doc}")

            # Try to get document details
            doc_id = doc.get('id')
            if not doc_id:
                print("⚠️ Document entry has no 'id'; skipping detail lookup")
                return True

            detail_response = requests.get(
                f"{LIGHTRAG_URL}/documents/{doc_id}",
                headers=HEADERS,
                timeout=10
            )
            if detail_response.status_code != 200:
                # Previously this failure was silent; surface it.
                print(f"⚠️ Could not get document details: {detail_response.status_code}")
                return True

            doc_detail = detail_response.json()
            print(f"📋 Document details: {doc_detail}")

            # Plain substring scan over the printed form of the details, so
            # it also matches words that merely contain "bee".
            if 'bee' in str(doc_detail).lower():
                print("🐝 'bee' found in document details")
            else:
                print("⚠️ 'bee' not found in document details")

            return True

        print(f"❌ {TEST_FILE} not found in documents")
        return False

    except Exception as e:
        print(f"❌ Document content check error: {e}")
        return False
|
|
|
|
def test_local_search():
    """Run a "local"-mode search for several fixed terms and look for "bee".

    Each term is POSTed to ``{LIGHTRAG_URL}/search`` independently; a failure
    or exception for one term is printed and does not stop the remaining
    terms. Any hit whose content contains "bee" (case-insensitive) is
    highlighted. The function returns nothing.
    """
    print("🔍 Testing local search mode...")

    for term in ("test", "document", "image", "classification"):
        try:
            reply = requests.post(
                f"{LIGHTRAG_URL}/search",
                json={
                    "query": term,
                    "top_k": 3,
                    "mode": "local"
                },
                headers=HEADERS,
                timeout=30,
            )

            if reply.status_code != 200:
                print(f"❌ Search for '{term}' failed: {reply.status_code}")
                continue

            payload = reply.json()
            if not payload.get('results'):
                print(f"❌ No results for '{term}'")
                continue

            print(f"✅ Found {len(payload['results'])} results for '{term}'")
            for hit in payload['results']:
                text = hit.get('content', '')
                if 'bee' in text.lower():
                    print(f"🎉 FOUND BEE IN SEARCH: {text[:200]}...")

        except Exception as exc:
            print(f"❌ Search for '{term}' error: {exc}")
|
|
|
|
def main():
    """Run every check in order and report whether they all passed.

    The health check gates everything else: if it fails, the function
    returns False immediately. Otherwise the documents, document-content and
    direct-search checks are all run (a failure in one does not skip the
    others) and their outcomes are combined. The multi-term local search is
    run for its printed output only, since it returns no pass/fail value.

    Returns:
        bool: True only when the health check and every pass/fail check
        succeeded; False otherwise.
    """
    print("=" * 50)
    print("🔧 SIMPLE SEARCH TEST")
    print("=" * 50)
    print(f"📡 Server: {LIGHTRAG_URL}")
    print()

    # Test 1: Server health
    print("1. Testing server health...")
    if not test_health_endpoint():
        print("❌ Cannot proceed - server not healthy")
        return False

    # Outcomes are recorded instead of discarded so the return value (and the
    # process exit status derived from it) reflects real failures.
    outcomes = {}

    # Test 2: Check documents
    print("\n2. Checking documents...")
    outcomes["documents endpoint"] = test_documents_endpoint()

    # Test 3: Check document content
    print("\n3. Checking document content...")
    outcomes["document content"] = check_document_content()

    # Test 4: Simple search
    print("\n4. Testing simple search...")
    outcomes["direct search"] = test_direct_search()

    # Test 5: Local search with various terms (informational only)
    print("\n5. Testing local search with various terms...")
    test_local_search()

    print("\n" + "=" * 50)
    print("📊 SIMPLE TEST COMPLETE")
    print("=" * 50)

    failed = [name for name, passed in outcomes.items() if not passed]
    if failed:
        print(f"❌ Failed checks: {', '.join(failed)}")
    else:
        print("✅ All checks passed")

    print("💡 Next steps:")
    print(" - Check the server logs for document processing details")
    print(" - Verify test.docx was processed with image extraction")
    print(" - Look for 'bee' classification in the processed content")

    return not failed
|
|
|
|
if __name__ == "__main__":
    # Translate main()'s truthy/falsy result into the process exit status:
    # 0 on success, 1 otherwise.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)