# railseek6/fix_webui_search.py

import requests
import time
import os

def fix_webui_search(base_url='http://localhost:3015', api_key='jleu1212',
                     test_file='test.docx', index_wait_seconds=20):
    """Re-index a document and verify its bee classification is searchable.

    Runs five steps against the server at *base_url*:

    1. Delete every document the server lists as processed (best effort;
       a failure here is reported but does not abort the run).
    2. Run ``OptimizedDocumentProcessor`` on *test_file* and check that its
       text output contains an ``Image Classifications:`` section.
    3. Upload *test_file* to ``/documents/upload``.
    4. Sleep *index_wait_seconds* to give the server time to index.
    5. POST a ``bee`` query to ``/api/search`` and look for a returned chunk
       containing ``Image Classifications:``.

    Args:
        base_url: Root URL of the server (a trailing slash is ignored).
        api_key: Value sent in the ``X-API-Key`` header.
            NOTE(review): the default is a credential committed to source;
            prefer passing the key explicitly.
        test_file: Path of the document to process and upload.
        index_wait_seconds: Seconds to wait between upload and search.

    Returns:
        True if a search result chunk contains the classification section,
        False if any of steps 2-5 fails. Progress is printed to stdout;
        exceptions raised inside a step are caught and reported there.
    """
    base_url = base_url.rstrip('/')
    headers = {'X-API-Key': api_key, 'Content-Type': 'application/json'}

    print('🔧 Fixing Web UI search for bee classification...')

    # Step 1: Delete all existing documents to start fresh
    print('🗑️  Step 1: Deleting all existing documents...')
    _delete_processed_documents(base_url, headers)

    # Step 2: Process the document with the updated processor to include classification
    print('\n📄 Step 2: Processing document with updated processor...')
    if not _process_with_classification(test_file):
        return False

    # Step 3: Upload the processed document to LightRAG
    print('\n📤 Step 3: Uploading document to LightRAG...')
    if not _upload_document(base_url, api_key, test_file):
        return False

    # Step 4: Wait for indexing
    print(f'\n⏳ Step 4: Waiting for indexing ({index_wait_seconds} seconds)...')
    time.sleep(index_wait_seconds)

    # Step 5: Verify search for bee classification
    print('\n🔍 Step 5: Verifying bee classification search...')
    return _verify_bee_search(base_url, headers)


def _print_classification_lines(text, prefix):
    """Print each line of *text* naming the classification header or 'bee'."""
    for line in text.split('\n'):
        if 'Image Classifications:' in line or 'bee' in line.lower():
            print(f'{prefix}{line}')


def _delete_processed_documents(base_url, headers):
    """Best-effort delete of every processed document; never raises."""
    try:
        response = requests.get(f'{base_url}/documents', headers=headers, timeout=10)
        if response.status_code != 200:
            print(f'Could not list documents: {response.status_code}')
            return

        data = response.json()
        processed_docs = data.get('statuses', {}).get('processed', [])
        print(f'Found {len(processed_docs)} processed documents')

        delete_requested = False
        for doc in processed_docs:
            doc_id = doc.get('id')
            if not doc_id:
                continue
            print(f'Deleting document: {doc_id}')
            delete_response = requests.delete(
                f'{base_url}/documents/{doc_id}', headers=headers, timeout=10)
            print(f'Delete status: {delete_response.status_code}')
            delete_requested = True

        # Only pause when a delete was actually sent; otherwise there is
        # nothing to wait for.
        if delete_requested:
            time.sleep(3)
    except Exception as e:
        # Deliberately broad: cleanup is optional and must not stop the run.
        print(f'Delete operation failed: {e}')


def _process_with_classification(test_file):
    """Return True if the processor's text for *test_file* has classifications."""
    try:
        # Imported here so a missing or broken processor module is reported
        # as a failed step instead of breaking import of this script.
        from optimized_document_processor import OptimizedDocumentProcessor
        import asyncio

        processor = OptimizedDocumentProcessor()

        if not os.path.exists(test_file):
            print(f'❌ Test file not found: {test_file}')
            return False

        print(f'Processing {test_file} with classification metadata...')
        result = asyncio.run(processor.process_document(test_file))

        if not result["success"]:
            print(f'❌ Processing failed: {result["metadata"].get("error", "Unknown error")}')
            return False

        text_content = result["text_content"]
        print('✅ Document processed successfully with classification metadata')
        print(f'Text content length: {len(text_content)} chars')

        if 'Image Classifications:' not in text_content:
            print('❌ Classification metadata NOT included in text content')
            return False

        print('✅ Classification metadata IS included in text content')
        _print_classification_lines(text_content, prefix='   ')
        return True
    except Exception as e:
        print(f'❌ Document processing failed: {e}')
        return False


def _upload_document(base_url, api_key, test_file):
    """Upload *test_file* as multipart form data; return True on HTTP 200."""
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    try:
        # The context manager closes the file even if the request raises.
        with open(test_file, 'rb') as handle:
            files = {'file': (os.path.basename(test_file), handle, docx_mime)}
            # No explicit Content-Type here: requests must generate the
            # multipart header (with its boundary) itself.
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files,
                headers={'X-API-Key': api_key}, timeout=30)

        if upload_response.status_code != 200:
            print(f'❌ Upload failed: {upload_response.status_code} - {upload_response.text}')
            return False

        print('✅ Document uploaded successfully')
        upload_data = upload_response.json()
        print(f'Upload response: {upload_data}')
        return True
    except Exception as e:
        print(f'❌ Upload failed: {e}')
        return False


def _verify_bee_search(base_url, headers):
    """Search for 'bee'; return True if any chunk has the classification section."""
    try:
        response = requests.post(
            f'{base_url}/api/search', headers=headers,
            json={'query': 'bee', 'top_k': 10}, timeout=15)
        if response.status_code != 200:
            print(f'❌ Search failed: {response.status_code}')
            return False

        chunks = response.json().get('chunks', [])
        print(f'Found {len(chunks)} chunks for "bee" search')

        # Inspect every chunk: the first hit is not guaranteed to be the one
        # carrying the classification section.
        for position, chunk in enumerate(chunks, start=1):
            content = chunk.get('content', '')
            file_path = chunk.get('file_path', '')
            print(f'\n📄 Chunk {position} from {file_path}:')

            if 'Image Classifications:' not in content:
                print('   (no classification metadata in this chunk)')
                continue

            print('✅ SUCCESS: Classification metadata IS present and searchable!')
            print('The bee classification is now available through Web UI search.')
            _print_classification_lines(content, prefix='   🐝 ')
            return True

        # Reached for an empty result list or when no chunk matched.
        print('❌ Still no classification metadata in indexed content')
        return False
    except Exception as e:
        print(f'❌ Search verification failed: {e}')
        return False

if __name__ == '__main__':
    # Script entry point: run the repair workflow once and print a single
    # closing line that reflects whether it reported success.
    success = fix_webui_search()
    print(
        '\n🎉 SUCCESS: Bee classification is now searchable in Web UI!'
        if success
        else '\n⚠️ Some issues remain. The classification metadata may not be properly indexed.'
    )