# railseek6/test_simple_search.py

"""
Simple Search Test - Bypasses entity extraction issues
Tests the core document processing and search functionality
"""

import os
import sys
import time
import requests
import json
from pathlib import Path

# Configuration
# The server URL and API key can be overridden through environment variables
# so the script can target another deployment without editing the source.
# The fallbacks are the original hard-coded values.  A trailing slash is
# stripped because every request builds its URL as f"{LIGHTRAG_URL}/<path>".
LIGHTRAG_URL = (os.environ.get("LIGHTRAG_URL") or "http://localhost:3016").rstrip("/")
# NOTE(review): a real credential should not be committed to the repository;
# prefer supplying LIGHTRAG_API_KEY via the environment and rotating this
# default.
API_KEY = os.environ.get("LIGHTRAG_API_KEY") or "jleu1212"
# File name of the document the content check looks for on the server.
TEST_FILE = "test.docx"
# Header sent with every request to authenticate against the server.
HEADERS = {"X-API-Key": API_KEY}

def test_direct_search():
    """Run one local-mode search and report whether the endpoint answered.

    POSTs the query "test" (top_k=5, mode="local") to
    ``{LIGHTRAG_URL}/search`` and prints the score and the first 200
    characters of every returned hit.

    Returns:
        bool: True when the server replies with HTTP 200; False for any
        other status code or when the request or response parsing raises.
    """
    print("🔍 Testing direct search functionality...")

    try:
        # Try a simple search that doesn't require entity extraction
        search_payload = {
            "query": "test",
            "top_k": 5,
            "mode": "local"
        }

        response = requests.post(
            f"{LIGHTRAG_URL}/search",
            json=search_payload,
            headers=HEADERS,
            timeout=30
        )

        if response.status_code != 200:
            print(f"❌ Search failed: {response.status_code} - {response.text}")
            return False

        results = response.json()
        # `or []` also covers a body that carries "results": null.
        hits = results.get('results') or []
        print("✅ Search completed successfully")
        print(f"📊 Found {len(hits)} results")

        # Print results for debugging
        for position, result in enumerate(hits, start=1):
            # A null or non-numeric score must not abort the listing and turn
            # a successful search into a reported error; fall back to 0.
            try:
                score = float(result.get('score') or 0)
            except (TypeError, ValueError):
                score = 0.0
            print(f"   {position}. Score: {score:.4f}")

            # Null content is shown as empty text instead of failing on slicing.
            content = result.get('content') or ''
            print(f"      Content: {content[:200]}...")

        return True

    except Exception as e:
        # Broad on purpose: this diagnostic script reports any failure
        # (connection, timeout, invalid JSON, unexpected payload shape)
        # instead of crashing.
        print(f"❌ Search error: {e}")
        return False

def test_documents_endpoint():
    """List the documents known to the server and print one line per entry.

    Issues ``GET {LIGHTRAG_URL}/documents`` and prints each entry's
    ``filename`` and ``status`` (plus ``metadata`` when that key is present).

    Returns:
        bool: True when the server answers with HTTP 200; False for any
        other status code or when an exception is raised.
    """
    print("📄 Checking documents endpoint...")

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)

        # Guard clause: anything other than 200 is reported as a failure.
        if reply.status_code != 200:
            print(f"❌ Documents endpoint failed: {reply.status_code} - {reply.text}")
            return False

        listing = reply.json()
        print(f"✅ Found {len(listing)} documents in system")

        # NOTE(review): the loop calls .get() on every entry, so it presumably
        # expects a JSON array of objects — confirm against the server schema.
        for entry in listing:
            name = entry.get('filename', 'Unknown')
            state = entry.get('status', 'Unknown')
            print(f"   - {name}: {state}")
            if 'metadata' in entry:
                print(f"     Metadata: {entry.get('metadata', {})}")

        return True

    except Exception as err:
        print(f"❌ Documents endpoint error: {err}")
        return False

def test_health_endpoint():
    """Probe the server root URL and report whether it answers with HTTP 200.

    Returns:
        bool: True for a 200 response; False for any other status code or
        when the request raises.
    """
    print("🏥 Testing server health...")

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10)
        healthy = reply.status_code == 200
        if not healthy:
            print(f"❌ Server health check failed: {reply.status_code}")
        else:
            print("✅ Server is healthy")
        return healthy
    except Exception as err:
        print(f"❌ Server health error: {err}")
        return False

def check_document_content():
    """Locate the test document on the server and look for "bee" in its record.

    Lists ``{LIGHTRAG_URL}/documents`` and picks the first entry whose
    ``filename`` contains ``TEST_FILE``. When that entry exposes an ``id``,
    its detail record is fetched from ``/documents/{id}``. The function then
    reports whether the text "bee" occurs anywhere in the most detailed
    record it obtained.

    Returns:
        bool: True when the test document is listed, whether or not "bee"
        was found (the same contract as before). False when the document is
        missing, the listing request does not return HTTP 200, or an
        exception is raised.
    """
    print("🔎 Checking document content for bee classification...")

    try:
        # First get all documents
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Could not get documents: {response.status_code}")
            return False

        for doc in response.json():
            # `or ''` keeps a null filename from raising on the `in` test.
            if TEST_FILE not in (doc.get('filename') or ''):
                continue

            print(f"📄 Found {TEST_FILE}: {doc}")

            # Fall back to the list entry when no detail record is available.
            record = doc

            # Try to get document details
            doc_id = doc.get('id')
            if doc_id:
                detail_response = requests.get(
                    f"{LIGHTRAG_URL}/documents/{doc_id}",
                    headers=HEADERS,
                    timeout=10
                )
                if detail_response.status_code == 200:
                    record = detail_response.json()
                    print(f"📋 Document details: {record}")
                else:
                    # Previously ignored silently; surface it for debugging.
                    print(f"❌ Could not get document details: {detail_response.status_code}")

            # Perform the check the banner announces. This is a plain
            # case-insensitive substring match over the whole record, the
            # same heuristic the search test uses, so words such as "been"
            # match too.
            if 'bee' in str(record).lower():
                print("🎉 FOUND BEE IN DOCUMENT RECORD")
            else:
                print("❌ No 'bee' text found in document record")

            return True

        print(f"❌ {TEST_FILE} not found in documents")
        return False

    except Exception as e:
        # Broad on purpose: report any failure instead of crashing the script.
        print(f"❌ Document content check error: {e}")
        return False

def test_local_search():
    """Search several terms in local mode and flag any hit mentioning "bee".

    For each term a POST with ``top_k=3`` and ``mode="local"`` is sent to
    ``{LIGHTRAG_URL}/search``. Hits whose content contains "bee"
    (case-insensitive substring match) are printed.

    Returns:
        bool: True when every request came back with HTTP 200, even if a
        term produced no hits; False when any request returned another
        status code or raised.
    """
    print("🔍 Testing local search mode...")

    search_terms = [
        "test",
        "document",
        "image",
        "classification"
    ]

    all_ok = True

    for term in search_terms:
        try:
            search_payload = {
                "query": term,
                "top_k": 3,
                "mode": "local"
            }

            response = requests.post(
                f"{LIGHTRAG_URL}/search",
                json=search_payload,
                headers=HEADERS,
                timeout=30
            )

            if response.status_code != 200:
                print(f"❌ Search for '{term}' failed: {response.status_code}")
                all_ok = False
                continue

            hits = response.json().get('results') or []
            if not hits:
                # An empty result set is reported but is not a request failure.
                print(f"❌ No results for '{term}'")
                continue

            print(f"✅ Found {len(hits)} results for '{term}'")
            for hit in hits:
                # `or ''` keeps null content from raising on .lower() and
                # aborting the remaining hits for this term.
                content = hit.get('content') or ''
                if 'bee' in content.lower():
                    print(f"🎉 FOUND BEE IN SEARCH: {content[:200]}...")

        except Exception as e:
            # Broad on purpose: one failing term must not stop the others.
            print(f"❌ Search for '{term}' error: {e}")
            all_ok = False

    return all_ok

def main():
    """Run every check in order and report an overall verdict.

    The health check is a hard prerequisite: when it fails, nothing else
    runs. The remaining steps always run, and their outcomes are collected
    so the return value reflects them.

    Returns:
        bool: False when the server is not healthy or any later step
        reported failure; True otherwise.
    """
    print("=" * 50)
    print("🔧 SIMPLE SEARCH TEST")
    print("=" * 50)
    print(f"📡 Server: {LIGHTRAG_URL}")
    print()

    # Test 1: Server health
    print("1. Testing server health...")
    if not test_health_endpoint():
        print("❌ Cannot proceed - server not healthy")
        return False

    outcomes = {}

    # Test 2: Check documents
    print("\n2. Checking documents...")
    outcomes["documents endpoint"] = bool(test_documents_endpoint())

    # Test 3: Check document content
    print("\n3. Checking document content...")
    outcomes["document content"] = bool(check_document_content())

    # Test 4: Simple search
    print("\n4. Testing simple search...")
    outcomes["simple search"] = bool(test_direct_search())

    # Test 5: Local search with various terms
    print("\n5. Testing local search with various terms...")
    # Only an explicit False counts as a failure here, so a purely
    # informational implementation that returns None is not penalised.
    outcomes["local search"] = test_local_search() is not False

    failed_steps = [step for step, passed in outcomes.items() if not passed]

    print("\n" + "=" * 50)
    print("📊 SIMPLE TEST COMPLETE")
    print("=" * 50)
    if failed_steps:
        print(f"❌ Failed steps: {', '.join(failed_steps)}")
    else:
        print("✅ All steps passed")
    print("💡 Next steps:")
    print("   - Check the server logs for document processing details")
    print("   - Verify test.docx was processed with image extraction")
    print("   - Look for 'bee' classification in the processed content")

    # Previously this always returned True, so the process exit status could
    # never signal a failed step.
    return not failed_steps

if __name__ == "__main__":
    # Translate main()'s truthy/falsy verdict into the process exit status.
    if main():
        sys.exit(0)
    sys.exit(1)