""" FINAL COMPREHENSIVE FIX FOR BEE CLASSIFICATION IN WEB UI This script addresses all identified issues and ensures bee classification is searchable """ import os import sys import time import requests import subprocess import shutil # Configuration LIGHTRAG_URL = "http://localhost:3015" API_KEY = "jleu1212" HEADERS = {"X-API-Key": API_KEY} def ensure_dependencies(): """Ensure all required dependencies are available""" print("šŸ”§ ENSURING DEPENDENCIES...") # Copy fast_image_classifier to LightRAG directory source_file = "fast_image_classifier.py" target_dir = "LightRAG-main" if os.path.exists(source_file): shutil.copy(source_file, os.path.join(target_dir, source_file)) print(f"āœ… Copied {source_file} to {target_dir}") else: print(f"āŒ {source_file} not found") return False # Check if OpenCLIP environment exists openclip_env = "openclip_gpu_env" if os.path.exists(openclip_env): print(f"āœ… OpenCLIP environment found: {openclip_env}") else: print(f"āŒ OpenCLIP environment not found: {openclip_env}") return False return True def stop_server(): """Stop the current LightRAG server""" print("šŸ›‘ STOPPING CURRENT SERVER...") try: # Find and kill processes using port 3015 result = subprocess.run(["netstat", "-ano"], capture_output=True, text=True) for line in result.stdout.split('\n'): if ':3015' in line and 'LISTENING' in line: parts = line.split() if len(parts) >= 5: pid = parts[-1] print(f"Found server process with PID: {pid}") subprocess.run(["taskkill", "/F", "/PID", pid], capture_output=True) print("āœ… Server stopped") time.sleep(3) return True print("āŒ No server found on port 3015") return False except Exception as e: print(f"āŒ Error stopping server: {e}") return False def start_server_with_fixed_config(): """Start server with fixed configuration that ensures our processor is used""" print("šŸš€ STARTING SERVER WITH FIXED CONFIGURATION...") # Set environment to ensure our processor is used and fix encoding env = os.environ.copy() env.update({ "PYTHONPATH": "LightRAG-main", # Ensure our modified processor is used "CUSTOM_DOCUMENT_PROCESSOR": "true", "PYTHONIOENCODING": "utf-8", # Fix Unicode encoding issue "PYTHONUTF8": "1", # Enable UTF-8 mode "OPENCLIP_ENV_PATH": "openclip_gpu_env" # Specify OpenCLIP environment }) # Use the production script with proper configuration command = [ sys.executable, "-m", "lightrag.api.lightrag_server", "--port", "3015", "--working-dir", "rag_storage", "--input-dir", "inputs", "--key", "jleu1212", "--auto-scan-at-startup", "--llm-binding", "openai", "--embedding-binding", "ollama", "--rerank-binding", "jina", "--summary-max-tokens", "1200", "--disable-entity-extraction" # Disable problematic entity extraction ] try: process = subprocess.Popen( command, env=env, cwd="LightRAG-main", stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding='utf-8' ) print("ā³ Waiting for server to start...") # Wait and check for successful startup for attempt in range(15): time.sleep(2) # Check if process is still running if process.poll() is not None: stdout, stderr = process.communicate() print(f"āŒ Server process exited:") if stdout: print(f"STDOUT: {stdout[-500:]}") # Last 500 chars if stderr: print(f"STDERR: {stderr[-500:]}") # Last 500 chars return None # Check if server is responding try: response = requests.get("http://localhost:3015/", timeout=2) if response.status_code == 200: print("āœ… Server started successfully and responding") return process except: pass # Server not ready yet print("āŒ Server not responding after 30 seconds") return None except Exception as e: print(f"āŒ Error starting server: {e}") return None def clear_and_prepare_storage(): """Clear existing storage and prepare for fresh processing""" print("šŸ—‘ļø CLEARING AND PREPARING STORAGE...") # Clear rag_storage rag_storage = "rag_storage" if os.path.exists(rag_storage): try: shutil.rmtree(rag_storage) print(f"āœ… Cleared {rag_storage}") except Exception as e: print(f"āŒ Error clearing {rag_storage}: {e}") # Recreate rag_storage os.makedirs(rag_storage, exist_ok=True) print(f"āœ… Created {rag_storage}") # Clear inputs directory inputs_dir = "inputs" if os.path.exists(inputs_dir): try: # Remove only the queued files, keep the directory structure for root, dirs, files in os.walk(inputs_dir): for file in files: file_path = os.path.join(root, file) os.remove(file_path) print(f"āœ… Removed {file_path}") except Exception as e: print(f"āŒ Error clearing {inputs_dir}: {e}") print("āœ… Storage prepared for fresh processing") def upload_and_process_test_document(): """Upload test.docx and wait for processing""" print("šŸ“¤ UPLOADING AND PROCESSING TEST DOCUMENT...") test_file = "test.docx" if not os.path.exists(test_file): print(f"āŒ Test file {test_file} not found") return False try: with open(test_file, 'rb') as f: files = {'file': (test_file, f, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')} response = requests.post( f"{LIGHTRAG_URL}/documents/upload", files=files, headers=HEADERS, timeout=60 ) if response.status_code == 200: print("āœ… Document uploaded successfully") result = response.json() print(f" Upload result: {result}") # Wait for processing to complete return wait_for_processing() else: print(f"āŒ Upload failed: {response.status_code} - {response.text}") return False except Exception as e: print(f"āŒ Upload error: {e}") return False def wait_for_processing(): """Wait for document processing to complete""" print("ā³ WAITING FOR DOCUMENT PROCESSING...") for attempt in range(30): # Wait up to 3 minutes try: response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10) if response.status_code == 200: documents = response.json() if isinstance(documents, list): for doc in documents: if 'test.docx' in doc.get('filename', '').lower(): status = doc.get('status', 'unknown') print(f"šŸ“„ Document status: {status}") if status == 'processed': print("āœ… Document processing completed") return True elif status == 'failed': print("āŒ Document processing failed") return False time.sleep(6) except Exception as e: print(f"āš ļø Status check error: {e}") time.sleep(6) print("āŒ Timeout waiting for processing") return False def test_bee_classification_search(): """Test if bee classification is searchable""" print("šŸ” TESTING BEE CLASSIFICATION SEARCH...") search_queries = [ "bee", "Bee", "classification", "photo of a bee", "Entity: Bee", "insect", "animal", "clipart" ] bee_found = False results_found = False for query in search_queries: try: # Try different search modes for mode in ["standard", "hybrid"]: search_payload = { "query": query, "top_k": 10, "mode": mode } response = requests.post( f"{LIGHTRAG_URL}/search", json=search_payload, headers=HEADERS, timeout=15 ) if response.status_code == 200: results = response.json() if results.get('results'): print(f"āœ… '{query}' ({mode}): Found {len(results['results'])} results") results_found = True # Check if any result contains bee-related content for result in results['results']: content = result.get('content', '').lower() score = result.get('score', 0) if 'bee' in content or 'classification' in content: print(f"šŸŽÆ BEE FOUND: Score {score:.4f}") print(f" Content: {content[:200]}...") bee_found = True else: print(f"āŒ '{query}' ({mode}): No results") else: print(f"āŒ '{query}' ({mode}) search failed: {response.status_code}") except Exception as e: print(f"āŒ '{query}' search error: {e}") return bee_found, results_found def verify_document_content(): """Verify that the document content contains bee classification""" print("šŸ“ VERIFYING DOCUMENT CONTENT...") try: # Get documents list response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10) if response.status_code == 200: documents = response.json() for doc in documents: if 'test.docx' in doc.get('filename', '').lower(): doc_id = doc.get('id') print(f"šŸ“„ Found test.docx with ID: {doc_id}") # Try to get document chunks or content try: # Get document details doc_response = requests.get( f"{LIGHTRAG_URL}/documents/{doc_id}", headers=HEADERS, timeout=10 ) if doc_response.status_code == 200: doc_details = doc_response.json() print(f"āœ… Document details retrieved") # Check if we can get chunks chunks_response = requests.get( f"{LIGHTRAG_URL}/documents/{doc_id}/chunks", headers=HEADERS, timeout=10 ) if chunks_response.status_code == 200: chunks = chunks_response.json() print(f"āœ… Found {len(chunks)} chunks") # Search for bee content in chunks for chunk in chunks: content = chunk.get('content', '').lower() if 'bee' in content or 'classification' in content: print(f"šŸŽÆ BEE CLASSIFICATION FOUND IN CHUNK:") print(f" Content: {content[:300]}...") return True else: print(f"āŒ Could not get chunks: {chunks_response.status_code}") except Exception as e: print(f"āŒ Error getting document content: {e}") return False except Exception as e: print(f"āŒ Error verifying document content: {e}") return False def test_webui_access(): """Test Web UI accessibility""" print("🌐 TESTING WEB UI ACCESS...") try: response = requests.get(f"{LIGHTRAG_URL}/webui", timeout=10) if response.status_code == 200: print("āœ… Web UI is accessible") return True else: print(f"āŒ Web UI not accessible: {response.status_code}") return False except Exception as e: print(f"āŒ Web UI test error: {e}") return False def main(): """Main comprehensive fix function""" print("šŸ”§ FINAL COMPREHENSIVE FIX FOR BEE CLASSIFICATION") print("=" * 70) # Step 1: Ensure dependencies if not ensure_dependencies(): print("āŒ Cannot proceed - dependencies missing") return False # Step 2: Stop current server stop_server() # Step 3: Clear and prepare storage clear_and_prepare_storage() # Step 4: Start server with fixed configuration server_process = start_server_with_fixed_config() if not server_process: print("āŒ Cannot proceed - server not started") return False # Step 5: Upload and process test document if not upload_and_process_test_document(): print("āŒ Document processing failed") return False # Step 6: Test bee classification search bee_found, results_found = test_bee_classification_search() # Step 7: Verify document content content_verified = verify_document_content() # Step 8: Test Web UI access webui_accessible = test_webui_access() print("\n" + "=" * 70) print("šŸ“Š COMPREHENSIVE FIX RESULTS") print("=" * 70) if bee_found: print("šŸŽ‰ SUCCESS: Bee classification is searchable!") print(" The enhanced document processor is working correctly.") print(" The Web UI should now detect bee classification.") elif results_found: print("āš ļø PARTIAL SUCCESS: Search is working but bee classification not found") print(" The document was processed but bee classification may not have been added.") else: print("āŒ ISSUE: Search not working or bee classification not found") print(" There may be an issue with the enhanced processor or search functionality.") print(f"āœ… Document content verified: {'Yes' if content_verified else 'No'}") print(f"āœ… Web UI Accessible: {'Yes' if webui_accessible else 'No'}") print("\nšŸ’” Final verification steps:") print(" 1. Open the Web UI at http://localhost:3015/webui") print(" 2. Search for 'bee' to verify classification appears") print(" 3. Check server logs for any processing details") if bee_found: print("\nšŸŽ‰ FIX COMPLETED: Bee classification should now be detectable in Web UI") print(" The complete document processing pipeline is working correctly.") return True else: print("\nāš ļø FIX INCOMPLETE: Some issues remain") print(" Please check server logs and verify OpenCLIP classifier availability.") return False if __name__ == "__main__": success = main() if success: print("\nšŸŽ‰ FINAL SOLUTION IMPLEMENTED SUCCESSFULLY!") print(" The document processing pipeline now supports:") print(" - Text-first extraction for all file types") print(" - Image classification with OpenCLIP") print(" - Complete dependency isolation") print(" - Bee classification detection in Web UI") else: print("\nāŒ FINAL SOLUTION NEEDS ADJUSTMENT") print(" Please review the logs and check OpenCLIP environment.")