railseek6/check_chunks.py

#!/usr/bin/env python3
"""
Check document chunks to see what's actually stored in the vector database
"""

import json
import os

import requests

# Endpoint and API key of the LightRAG server this script inspects.
# Both may be overridden through environment variables so the key does not
# have to live in source control; the fallbacks reproduce the previous
# hard-coded values, so behavior is unchanged when the variables are unset.
LIGHTRAG_URL = os.environ.get('LIGHTRAG_URL', 'http://localhost:3015')
API_KEY = os.environ.get('LIGHTRAG_API_KEY', 'jleu1212')

def check_document_chunks():
    """Print the chunks stored for the first processed document.

    Requests ``{LIGHTRAG_URL}/documents``, takes the first entry listed under
    ``statuses['processed']`` in the JSON reply and then requests
    ``/documents/{id}/chunks`` for it. Every outcome (success or failure) is
    reported with ``print``; the function returns ``None`` and does not let
    exceptions propagate.
    """
    print("📄 Checking Document Chunks")
    print("=" * 40)

    headers = {'X-API-Key': API_KEY}
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    timeout = 10

    try:
        # First, get the document ID
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=headers, timeout=timeout)
        if response.status_code != 200:
            # Report the failure instead of exiting silently.
            print(f"❌ Failed to list documents: {response.status_code}")
            print(f"Response: {response.text}")
            return

        statuses = response.json().get('statuses', {})
        processed_docs = statuses.get('processed', [])
        if not processed_docs:
            print("❌ No processed documents found")
            return

        doc_id = processed_docs[0].get('id')
        if not doc_id:
            # Avoid querying the meaningless URL /documents/None/chunks.
            print("❌ First processed document has no 'id' field")
            return
        print(f"📋 Document ID: {doc_id}")

        # Try to get chunks for this document
        print(f"\n🔍 Getting chunks for document {doc_id}...")
        chunks_response = requests.get(
            f"{LIGHTRAG_URL}/documents/{doc_id}/chunks",
            headers=headers,
            timeout=timeout,
        )
        if chunks_response.status_code == 200:
            print(f"Chunks response: {chunks_response.json()}")
        else:
            print(f"❌ Failed to get chunks: {chunks_response.status_code}")
            print(f"Response: {chunks_response.text}")

    except Exception as e:
        # Diagnostic script: report any network/JSON/shape problem and carry on
        # so the caller can still run the remaining checks.
        print(f"❌ Error: {e}")

def test_direct_chunk_search():
    """POST a fixed query to ``{LIGHTRAG_URL}/search`` and print the results.

    Sends a JSON payload (``query``, ``top_k``, ``mode``, ``include_metadata``)
    with a 10-second timeout. On HTTP 200 it prints the number of entries in
    the reply's ``results`` list followed by each entry's ``content`` and
    ``metadata``; on any other status it prints the status code and the
    response body. Returns ``None``; exceptions are caught and printed.
    """
    print("\n🔍 Testing Direct Chunk Search")
    print("=" * 40)

    headers = {'Content-Type': 'application/json', 'X-API-Key': API_KEY}

    # Try a more specific search that might trigger chunk retrieval
    payload = {
        'query': 'bee classification image photo',
        'top_k': 10,
        'mode': 'local',
        'include_metadata': True,
    }

    try:
        response = requests.post(f'{LIGHTRAG_URL}/search', json=payload, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"❌ Search failed: {response.status_code}")
            # The body usually carries the server's error detail; without it
            # a failing status code alone is hard to diagnose.
            print(f"Response: {response.text}")
            return

        results = response.json().get('results', [])
        print(f"Found {len(results)} results")

        for number, result in enumerate(results, start=1):
            content = result.get('content', '')
            metadata = result.get('metadata', {})
            print(f"\n--- Result {number} ---")
            print(f"Content: {content}")
            print(f"Metadata: {metadata}")

    except Exception as e:
        # Diagnostic script: report any network/JSON/shape problem rather
        # than aborting with a traceback.
        print(f"❌ Error: {e}")

# Script entry point: run each diagnostic in order when executed directly.
if __name__ == "__main__":
    for run_check in (check_document_chunks, test_direct_chunk_search):
        run_check()