"""
FINAL COMPREHENSIVE FIX FOR BEE CLASSIFICATION IN WEB UI

This script addresses all identified issues and ensures bee classification is searchable
"""

import os
import re
import shutil
import subprocess
import sys
import time

import requests

# Configuration
# Base URL of the LightRAG API server used by the request helpers in this script.
LIGHTRAG_URL = "http://localhost:3015"
# NOTE(review): the API key is hard-coded in source; consider loading it from
# the environment before sharing or committing this script.
API_KEY = "jleu1212"
# Header dictionary sent with authenticated API requests.
HEADERS = {"X-API-Key": API_KEY}
def ensure_dependencies(
    source_file="fast_image_classifier.py",
    target_dir="LightRAG-main",
    openclip_env="openclip_gpu_env",
):
    """Copy the image classifier into the LightRAG directory and check the OpenCLIP env.

    Args:
        source_file: Path of the classifier module to copy.
        target_dir: Directory the module is copied into (under its base name).
        openclip_env: Path that must exist for the OpenCLIP environment check.

    Returns:
        True when the copy succeeded and ``openclip_env`` exists, False otherwise.
        Failures are reported on stdout; no exception is raised for a missing
        or unwritable ``target_dir``.
    """
    print("🔧 ENSURING DEPENDENCIES...")

    # Copy fast_image_classifier to LightRAG directory
    if not os.path.exists(source_file):
        print(f"❌ {source_file} not found")
        return False

    destination = os.path.join(target_dir, os.path.basename(source_file))
    try:
        shutil.copy(source_file, destination)
    except OSError as e:
        # Covers a missing/unwritable target directory, which previously
        # escaped as an unhandled exception.
        print(f"❌ Could not copy {source_file} to {target_dir}: {e}")
        return False
    print(f"✅ Copied {source_file} to {target_dir}")

    # Check if OpenCLIP environment exists
    if not os.path.exists(openclip_env):
        print(f"❌ OpenCLIP environment not found: {openclip_env}")
        return False
    print(f"✅ OpenCLIP environment found: {openclip_env}")

    return True
def stop_server(port=3015):
    """Stop the process(es) listening on ``port`` using netstat and taskkill.

    Parses ``netstat -ano`` output in the Windows column layout
    (Proto, Local Address, Foreign Address, State, PID) and force-kills every
    distinct PID whose local address ends in ``:<port>`` and whose state is
    ``LISTENING``.

    Args:
        port: TCP port the server listens on.

    Returns:
        True if at least one listening process was killed successfully,
        False if none was found, a kill failed for all of them, or the
        external commands could not be run.
    """
    print("🛑 STOPPING CURRENT SERVER...")

    port_suffix = f":{port}"
    try:
        # Find and kill processes using the port
        result = subprocess.run(["netstat", "-ano"], capture_output=True, text=True)

        pids = []
        for line in result.stdout.splitlines():
            parts = line.split()
            if len(parts) < 5 or parts[3] != "LISTENING":
                continue
            # Compare the local-address column exactly so that e.g. port 30150
            # or a matching foreign address is not mistaken for the server.
            if not parts[1].endswith(port_suffix):
                continue
            if parts[-1] not in pids:
                pids.append(parts[-1])

        if not pids:
            print(f"❌ No server found on port {port}")
            return False

        stopped = False
        for pid in pids:
            print(f"Found server process with PID: {pid}")
            kill = subprocess.run(["taskkill", "/F", "/PID", pid], capture_output=True)
            if kill.returncode == 0:
                stopped = True
            else:
                print(f"❌ taskkill failed for PID {pid} (exit code {kill.returncode})")

        if not stopped:
            return False

        print("✅ Server stopped")
        time.sleep(3)  # give the OS a moment to release the port
        return True
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        print(f"❌ Error stopping server: {e}")
        return False
def start_server_with_fixed_config():
    """Launch the LightRAG API server as a child process and wait until it answers.

    The child runs ``python -m lightrag.api.lightrag_server`` with
    ``LightRAG-main`` as its working directory. Its stdout and stderr are
    written to ``lightrag_server.log`` (in the current directory) instead of
    unread pipes, so a chatty server cannot block on a full pipe buffer.

    Returns:
        The running ``subprocess.Popen`` object once ``GET {LIGHTRAG_URL}/``
        answers with HTTP 200, or None if the process exits early, cannot be
        started, or does not respond within the polling window (in which case
        the child is terminated rather than left orphaned).
    """
    print("🚀 STARTING SERVER WITH FIXED CONFIGURATION...")

    server_dir = os.path.abspath("LightRAG-main")
    log_path = os.path.abspath("lightrag_server.log")

    # Set environment to ensure our processor is used and fix encoding
    env = os.environ.copy()
    env.update({
        # Absolute path: a relative entry would be resolved against the
        # child's cwd (LightRAG-main) rather than this script's directory.
        # Any PYTHONPATH already set by the caller is kept after it.
        "PYTHONPATH": os.pathsep.join(
            filter(None, [server_dir, env.get("PYTHONPATH")])
        ),
        "CUSTOM_DOCUMENT_PROCESSOR": "true",
        "PYTHONIOENCODING": "utf-8",  # Fix Unicode encoding issue
        "PYTHONUTF8": "1",  # Enable UTF-8 mode
        # NOTE(review): relative value; how the consumer resolves it is not
        # visible here - confirm it is not interpreted relative to the child's cwd.
        "OPENCLIP_ENV_PATH": "openclip_gpu_env",  # Specify OpenCLIP environment
    })

    # Use the production script with proper configuration
    # NOTE(review): --working-dir/--input-dir are relative and the child runs
    # with cwd=LightRAG-main, while clear_and_prepare_storage() clears
    # "rag_storage"/"inputs" relative to this script's cwd - confirm intent.
    command = [
        sys.executable, "-m", "lightrag.api.lightrag_server",
        "--port", "3015",  # must match the port in LIGHTRAG_URL
        "--working-dir", "rag_storage",
        "--input-dir", "inputs",
        "--key", API_KEY,
        "--auto-scan-at-startup",
        "--llm-binding", "openai",
        "--embedding-binding", "ollama",
        "--rerank-binding", "jina",
        "--summary-max-tokens", "1200",
        "--disable-entity-extraction",  # Disable problematic entity extraction
    ]

    def _log_tail(limit=500):
        """Return the last ``limit`` characters of the server log ('' if unreadable)."""
        try:
            with open(log_path, encoding="utf-8", errors="replace") as fh:
                return fh.read()[-limit:]
        except OSError:
            return ""

    try:
        log_file = open(log_path, "w", encoding="utf-8")
    except OSError as e:
        print(f"❌ Error starting server: {e}")
        return None

    try:
        process = subprocess.Popen(
            command,
            env=env,
            cwd=server_dir,
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )
    except OSError as e:
        print(f"❌ Error starting server: {e}")
        return None
    finally:
        # The child has its own copy of the handle; ours is no longer needed.
        log_file.close()

    print("⏳ Waiting for server to start...")

    # Wait and check for successful startup
    for _ in range(15):
        time.sleep(2)

        # Check if process is still running
        if process.poll() is not None:
            print(f"❌ Server process exited:")
            tail = _log_tail()
            if tail:
                print(f"STDOUT/STDERR (last 500 chars of {log_path}): {tail}")
            return None

        # Check if server is responding
        try:
            response = requests.get(f"{LIGHTRAG_URL}/", timeout=2)
        except requests.RequestException:
            continue  # Server not ready yet
        if response.status_code == 200:
            print("✅ Server started successfully and responding")
            return process

    print("❌ Server not responding after 30 seconds")
    # Do not leave an unresponsive child holding the port.
    process.terminate()
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
    return None
def clear_and_prepare_storage(rag_storage="rag_storage", inputs_dir="inputs"):
    """Reset the RAG storage directory and delete all files under the inputs directory.

    ``rag_storage`` is removed recursively and recreated empty. Under
    ``inputs_dir`` only files are deleted; the directory tree is kept. Every
    failure is printed and the cleanup continues with the next item.

    Args:
        rag_storage: Directory to wipe and recreate.
        inputs_dir: Directory whose files (at any depth) are removed.

    Returns:
        None.
    """
    print("🗑️ CLEARING AND PREPARING STORAGE...")

    # Clear rag_storage
    if os.path.exists(rag_storage):
        try:
            shutil.rmtree(rag_storage)
            print(f"✅ Cleared {rag_storage}")
        except OSError as e:
            print(f"❌ Error clearing {rag_storage}: {e}")

    # Recreate rag_storage
    try:
        os.makedirs(rag_storage, exist_ok=True)
        print(f"✅ Created {rag_storage}")
    except OSError as e:
        print(f"❌ Error creating {rag_storage}: {e}")

    # Clear inputs directory: remove only the queued files, keep the
    # directory structure. os.walk yields nothing for a missing directory.
    for root, _dirs, files in os.walk(inputs_dir):
        for name in files:
            file_path = os.path.join(root, name)
            try:
                os.remove(file_path)
            except OSError as e:
                # Handled per file so one locked file does not stop the sweep.
                print(f"❌ Error clearing {inputs_dir}: {e}")
                continue
            print(f"✅ Removed {file_path}")

    print("✅ Storage prepared for fresh processing")
def upload_and_process_test_document():
    """Upload ``test.docx`` to the server and wait for it to be processed.

    Posts the file as multipart form data to ``{LIGHTRAG_URL}/documents/upload``
    and, on HTTP 200, hands over to ``wait_for_processing()``.

    Returns:
        False if the file is missing, the request fails, or the server answers
        with a non-200 status; otherwise the result of ``wait_for_processing()``.
    """
    print("📤 UPLOADING AND PROCESSING TEST DOCUMENT...")

    test_file = "test.docx"
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    try:
        with open(test_file, 'rb') as f:
            response = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files={'file': (test_file, f, docx_mime)},
                headers=HEADERS,
                timeout=60,
            )
    except (OSError, requests.RequestException) as e:
        print(f"❌ Upload error: {e}")
        return False

    if response.status_code != 200:
        print(f"❌ Upload failed: {response.status_code} - {response.text}")
        return False

    print("✅ Document uploaded successfully")
    try:
        result = response.json()
    except ValueError:
        # The body is only echoed for diagnostics; a non-JSON 200 response
        # must not be reported as a failed upload.
        result = response.text
    print(f" Upload result: {result}")

    # Wait for processing to complete
    return wait_for_processing()
def wait_for_processing():
    """Poll ``{LIGHTRAG_URL}/documents`` until ``test.docx`` is processed or fails.

    Polls up to 30 times with a 6 second pause between attempts
    (about 3 minutes in total).

    Returns:
        True when the document reports status ``processed``; False when it
        reports ``failed`` or the polling window elapses.
    """
    print("⏳ WAITING FOR DOCUMENT PROCESSING...")

    for _ in range(30):  # Wait up to 3 minutes
        try:
            response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
            payload = response.json() if response.status_code == 200 else None
        except (requests.RequestException, ValueError) as e:
            print(f"⚠️ Status check error: {e}")
            time.sleep(6)
            continue

        # NOTE(review): some LightRAG versions appear to return
        # {"statuses": {<status>: [docs...]}} instead of a flat list - confirm
        # against the server in use. Both shapes are accepted here.
        if isinstance(payload, dict) and isinstance(payload.get("statuses"), dict):
            payload = [
                doc
                for group in payload["statuses"].values()
                if isinstance(group, list)
                for doc in group
            ]
        documents = payload if isinstance(payload, list) else []

        for doc in documents:
            if not isinstance(doc, dict):
                continue
            # 'file_path' is a presumed alternative key (TODO confirm);
            # 'or' also guards against explicit null values.
            name = doc.get('filename') or doc.get('file_path') or ''
            if 'test.docx' not in str(name).lower():
                continue

            status = doc.get('status', 'unknown')
            print(f"📄 Document status: {status}")
            if status == 'processed':
                print("✅ Document processing completed")
                return True
            if status == 'failed':
                print("❌ Document processing failed")
                return False

        time.sleep(6)

    print("❌ Timeout waiting for processing")
    return False
def test_bee_classification_search():
    """Run a set of queries against ``{LIGHTRAG_URL}/search`` and look for bee content.

    Each query is sent once per search mode ("standard" and "hybrid"). A result
    counts as a bee hit when its content contains the whole word "bee"/"bees"
    or the word "classification" (case-insensitive).

    Returns:
        A ``(bee_found, results_found)`` tuple of booleans: whether any result
        matched the bee criteria, and whether any query returned results at all.
    """
    print("🔍 TESTING BEE CLASSIFICATION SEARCH...")

    search_queries = [
        "bee",
        "Bee",
        "classification",
        "photo of a bee",
        "Entity: Bee",
        "insect",
        "animal",
        "clipart",
    ]
    # Whole-word match: a plain substring test also fires on "been", "beer", ...
    bee_pattern = re.compile(r"\bbees?\b")

    bee_found = False
    results_found = False

    for query in search_queries:
        # Try different search modes
        for mode in ("standard", "hybrid"):
            search_payload = {
                "query": query,
                "top_k": 10,
                "mode": mode,
            }
            # Errors are handled per mode so a failure in one mode does not
            # skip the other.
            try:
                response = requests.post(
                    f"{LIGHTRAG_URL}/search",
                    json=search_payload,
                    headers=HEADERS,
                    timeout=15,
                )
                if response.status_code != 200:
                    print(f"❌ '{query}' ({mode}) search failed: {response.status_code}")
                    continue
                results = response.json()
            except (requests.RequestException, ValueError) as e:
                print(f"❌ '{query}' search error: {e}")
                continue

            hits = results.get('results') if isinstance(results, dict) else None
            if not hits:
                print(f"❌ '{query}' ({mode}): No results")
                continue

            print(f"✅ '{query}' ({mode}): Found {len(hits)} results")
            results_found = True

            # Check if any result contains bee-related content
            for result in hits:
                if not isinstance(result, dict):
                    continue
                content = str(result.get('content') or '').lower()
                try:
                    score = float(result.get('score') or 0)
                except (TypeError, ValueError):
                    score = 0.0  # keep the report line printable

                if bee_pattern.search(content) or 'classification' in content:
                    print(f"🎯 BEE FOUND: Score {score:.4f}")
                    print(f" Content: {content[:200]}...")
                    bee_found = True

    return bee_found, results_found
def verify_document_content():
    """Check whether any stored chunk of ``test.docx`` mentions the bee classification.

    Looks up the document in ``{LIGHTRAG_URL}/documents``, fetches
    ``/documents/{id}`` and ``/documents/{id}/chunks``, and scans chunk content
    for the whole word "bee"/"bees" or the word "classification"
    (case-insensitive).

    Returns:
        True as soon as a matching chunk is found; False if none matches or
        any request fails.
    """
    print("📝 VERIFYING DOCUMENT CONTENT...")

    # Whole-word match: a plain substring test also fires on "been", "beer", ...
    bee_pattern = re.compile(r"\bbees?\b")

    try:
        # Get documents list
        response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return False
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error verifying document content: {e}")
        return False

    # NOTE(review): some LightRAG versions appear to return
    # {"statuses": {<status>: [docs...]}} instead of a flat list - confirm
    # against the server in use. Both shapes are accepted here.
    if isinstance(payload, dict) and isinstance(payload.get("statuses"), dict):
        payload = [
            doc
            for group in payload["statuses"].values()
            if isinstance(group, list)
            for doc in group
        ]
    documents = payload if isinstance(payload, list) else []

    for doc in documents:
        if not isinstance(doc, dict):
            continue
        # 'file_path' is a presumed alternative key (TODO confirm).
        name = doc.get('filename') or doc.get('file_path') or ''
        if 'test.docx' not in str(name).lower():
            continue

        doc_id = doc.get('id')
        print(f"📄 Found test.docx with ID: {doc_id}")

        # Try to get document chunks or content
        try:
            # Get document details
            doc_response = requests.get(
                f"{LIGHTRAG_URL}/documents/{doc_id}",
                headers=HEADERS,
                timeout=10,
            )
            if doc_response.status_code != 200:
                print(f"❌ Could not get document details: {doc_response.status_code}")
                continue
            print(f"✅ Document details retrieved")

            # Check if we can get chunks
            chunks_response = requests.get(
                f"{LIGHTRAG_URL}/documents/{doc_id}/chunks",
                headers=HEADERS,
                timeout=10,
            )
            if chunks_response.status_code != 200:
                print(f"❌ Could not get chunks: {chunks_response.status_code}")
                continue

            chunks = chunks_response.json()
            print(f"✅ Found {len(chunks)} chunks")

            # Search for bee content in chunks
            for chunk in chunks:
                if not isinstance(chunk, dict):
                    continue
                content = str(chunk.get('content') or '').lower()
                if bee_pattern.search(content) or 'classification' in content:
                    print(f"🎯 BEE CLASSIFICATION FOUND IN CHUNK:")
                    print(f" Content: {content[:300]}...")
                    return True
        except (requests.RequestException, ValueError, TypeError) as e:
            print(f"❌ Error getting document content: {e}")

    return False
def test_webui_access():
    """Check that ``{LIGHTRAG_URL}/webui`` answers with HTTP 200.

    Returns:
        True on a 200 response; False on any other status or when the
        request itself fails.
    """
    print("🌐 TESTING WEB UI ACCESS...")

    try:
        response = requests.get(f"{LIGHTRAG_URL}/webui", timeout=10)
    except requests.RequestException as e:
        print(f"❌ Web UI test error: {e}")
        return False

    if response.status_code == 200:
        print("✅ Web UI is accessible")
        return True

    print(f"❌ Web UI not accessible: {response.status_code}")
    return False
def main():
    """Run the full fix-and-verify sequence and print a summary.

    Steps: check dependencies, stop any running server, reset storage, start
    the server, upload and process ``test.docx``, run the bee search test,
    verify chunk content, and probe the Web UI.

    Returns:
        True only when the search test found bee-related content; False when
        a prerequisite step fails or no bee content was found.
    """
    print("🔧 FINAL COMPREHENSIVE FIX FOR BEE CLASSIFICATION")
    print("=" * 70)

    # Step 1: Ensure dependencies
    if not ensure_dependencies():
        print("❌ Cannot proceed - dependencies missing")
        return False

    # Step 2: Stop current server (result ignored: there may be none running)
    stop_server()

    # Step 3: Clear and prepare storage
    clear_and_prepare_storage()

    # Step 4: Start server with fixed configuration
    server_process = start_server_with_fixed_config()
    if not server_process:
        print("❌ Cannot proceed - server not started")
        return False

    # Step 5: Upload and process test document
    if not upload_and_process_test_document():
        print("❌ Document processing failed")
        return False

    # Step 6: Test bee classification search
    bee_found, results_found = test_bee_classification_search()

    # Step 7: Verify document content
    content_verified = verify_document_content()

    # Step 8: Test Web UI access
    webui_accessible = test_webui_access()

    print("\n" + "=" * 70)
    print("📊 COMPREHENSIVE FIX RESULTS")
    print("=" * 70)

    if bee_found:
        print("🎉 SUCCESS: Bee classification is searchable!")
        print(" The enhanced document processor is working correctly.")
        print(" The Web UI should now detect bee classification.")
    elif results_found:
        print("⚠️ PARTIAL SUCCESS: Search is working but bee classification not found")
        print(" The document was processed but bee classification may not have been added.")
    else:
        print("❌ ISSUE: Search not working or bee classification not found")
        print(" There may be an issue with the enhanced processor or search functionality.")

    # The icon follows the outcome; a fixed check mark next to "No" was misleading.
    def _icon(ok):
        return "✅" if ok else "❌"

    print(f"{_icon(content_verified)} Document content verified: {'Yes' if content_verified else 'No'}")
    print(f"{_icon(webui_accessible)} Web UI Accessible: {'Yes' if webui_accessible else 'No'}")

    print("\n💡 Final verification steps:")
    print(" 1. Open the Web UI at http://localhost:3015/webui")
    print(" 2. Search for 'bee' to verify classification appears")
    print(" 3. Check server logs for any processing details")

    if bee_found:
        print("\n🎉 FIX COMPLETED: Bee classification should now be detectable in Web UI")
        print(" The complete document processing pipeline is working correctly.")
        return True

    print("\n⚠️ FIX INCOMPLETE: Some issues remain")
    print(" Please check server logs and verify OpenCLIP classifier availability.")
    return False
if __name__ == "__main__":
    # Script entry point: run the fix, print the closing summary, and report
    # the outcome through the process exit code (0 = success, 1 = failure).
    success = main()
    if success:
        print("\n🎉 FINAL SOLUTION IMPLEMENTED SUCCESSFULLY!")
        print(" The document processing pipeline now supports:")
        print(" - Text-first extraction for all file types")
        print(" - Image classification with OpenCLIP")
        print(" - Complete dependency isolation")
        print(" - Bee classification detection in Web UI")
    else:
        print("\n❌ FINAL SOLUTION NEEDS ADJUSTMENT")
        print(" Please review the logs and check OpenCLIP environment.")
    sys.exit(0 if success else 1)