131 lines
5.7 KiB
Python
131 lines
5.7 KiB
Python
import requests
|
|
import time
|
|
import os
|
|
|
|
def _classification_lines(text):
    """Return the lines of *text* that mention the classification header or 'bee'."""
    return [
        line
        for line in text.split('\n')
        if 'Image Classifications:' in line or 'bee' in line.lower()
    ]


def fix_webui_search():
    """Re-index the test document so its bee classification is searchable.

    The workflow runs five steps against the server at ``base_url``:

    1. Delete every document listed under ``statuses.processed``
       (best-effort: failures are reported and the workflow continues).
    2. Process ``test.docx`` locally with ``OptimizedDocumentProcessor`` and
       confirm the extracted text contains ``'Image Classifications:'``.
    3. Upload the document via ``/documents/upload``.
    4. Sleep 20 seconds to give the server time to index.
    5. Search for ``'bee'`` via ``/api/search`` and look for the
       classification marker in the returned chunks.

    Returns:
        bool: ``True`` when at least one returned chunk contains the
        classification marker; ``False`` when any step from 2 onward fails
        or no chunk contains the marker.
    """
    base_url = 'http://localhost:3015'
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or the environment instead.
    api_key = 'jleu1212'
    headers = {'X-API-Key': api_key, 'Content-Type': 'application/json'}
    test_file = "test.docx"

    print('🔧 Fixing Web UI search for bee classification...')

    # Step 1: Delete all existing documents to start fresh.
    # This step is best-effort: errors are printed and the workflow goes on.
    print('🗑️ Step 1: Deleting all existing documents...')
    try:
        response = requests.get(f'{base_url}/documents', headers=headers, timeout=10)
        if response.status_code == 200:
            data = response.json()
            processed_docs = data.get('statuses', {}).get('processed', [])
            print(f'Found {len(processed_docs)} processed documents')

            for doc in processed_docs:
                doc_id = doc.get('id')
                if doc_id:
                    print(f'Deleting document: {doc_id}')
                    delete_response = requests.delete(
                        f'{base_url}/documents/{doc_id}', headers=headers, timeout=10
                    )
                    print(f'Delete status: {delete_response.status_code}')

            # Wait for deletion to complete
            time.sleep(3)
        else:
            # Previously ignored silently; surface it so a stale index is explainable.
            print(f'Could not list documents: {response.status_code}')
    except Exception as e:
        print(f'Delete operation failed: {e}')

    # Step 2: Process the document with the updated processor to include classification
    print('\n📄 Step 2: Processing document with updated processor...')
    try:
        # Imported inside the try so a missing processor module is reported
        # as a processing failure instead of crashing the script.
        from optimized_document_processor import OptimizedDocumentProcessor
        import asyncio

        processor = OptimizedDocumentProcessor()

        if not os.path.exists(test_file):
            print(f'❌ Test file not found: {test_file}')
            return False

        print(f'Processing {test_file} with classification metadata...')
        result = asyncio.run(processor.process_document(test_file))

        if not result["success"]:
            print(f'❌ Processing failed: {result["metadata"].get("error", "Unknown error")}')
            return False

        print('✅ Document processed successfully with classification metadata')
        text_content = result["text_content"]
        print(f'Text content length: {len(text_content)} chars')

        # Check if classification is included
        if 'Image Classifications:' not in text_content:
            print('❌ Classification metadata NOT included in text content')
            return False

        print('✅ Classification metadata IS included in text content')
        # Show the classification section
        for line in _classification_lines(text_content):
            print(f' {line}')

    except Exception as e:
        print(f'❌ Document processing failed: {e}')
        return False

    # Step 3: Upload the processed document to LightRAG
    print('\n📤 Step 3: Uploading document to LightRAG...')
    try:
        # The context manager guarantees the file handle is closed even if
        # the request raises.
        with open(test_file, 'rb') as doc_file:
            files = {
                'file': (
                    os.path.basename(test_file),
                    doc_file,
                    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                )
            }
            # No JSON Content-Type here: only the API key header is sent so
            # the multipart content type is not overridden.
            upload_response = requests.post(
                f'{base_url}/documents/upload',
                files=files,
                headers={'X-API-Key': api_key},
                timeout=30,
            )

        if upload_response.status_code == 200:
            print('✅ Document uploaded successfully')
            upload_data = upload_response.json()
            print(f'Upload response: {upload_data}')
        else:
            print(f'❌ Upload failed: {upload_response.status_code} - {upload_response.text}')
            return False
    except Exception as e:
        print(f'❌ Upload failed: {e}')
        return False

    # Step 4: Wait for indexing and verify search works
    print('\n⏳ Step 4: Waiting for indexing (20 seconds)...')
    time.sleep(20)

    # Step 5: Verify search for bee classification
    print('\n🔍 Step 5: Verifying bee classification search...')
    try:
        response = requests.post(
            f'{base_url}/api/search',
            headers=headers,
            json={'query': 'bee', 'top_k': 10},
            timeout=15,
        )
        if response.status_code != 200:
            print(f'❌ Search failed: {response.status_code}')
            return False

        results = response.json()
        chunks = results.get('chunks', [])
        print(f'Found {len(chunks)} chunks for "bee" search')

        # Inspect every chunk: the classification may not be in the first hit.
        for index, chunk in enumerate(chunks, start=1):
            content = chunk.get('content', '')
            file_path = chunk.get('file_path', '')
            print(f'\n📄 Chunk {index} from {file_path}:')

            if 'Image Classifications:' in content:
                print('✅ SUCCESS: Classification metadata IS present and searchable!')
                print('The bee classification is now available through Web UI search.')
                for line in _classification_lines(content):
                    print(f' 🐝 {line}')
                return True

        # Reached when there are no chunks or none carries the marker.
        print('❌ Still no classification metadata in indexed content')
        return False
    except Exception as e:
        print(f'❌ Search verification failed: {e}')
        return False
|
|
|
|
if __name__ == '__main__':
    # Run the whole repair workflow, then report the overall outcome once.
    success = fix_webui_search()
    print(
        '\n🎉 SUCCESS: Bee classification is now searchable in Web UI!'
        if success
        else '\n⚠️ Some issues remain. The classification metadata may not be properly indexed.'
    )