# railseek6/final_comprehensive_fix.py

"""
FINAL COMPREHENSIVE FIX FOR BEE CLASSIFICATION IN WEB UI
This script addresses all identified issues and ensures bee classification is searchable
"""

import os
import sys
import time
import requests
import subprocess
import shutil

# Configuration
# Base URL of the LightRAG server that the HTTP helpers below talk to.
LIGHTRAG_URL = "http://localhost:3015"
# API key placed in the X-API-Key header.
# NOTE(review): the credential is hard-coded in source; consider reading it
# from the environment instead — confirm with the deployment owner.
API_KEY = "jleu1212"
# Header dict passed as `headers=` to the authenticated `requests` calls.
HEADERS = {"X-API-Key": API_KEY}

def ensure_dependencies():
    """Stage the image classifier inside LightRAG and check the OpenCLIP env.

    Copies ``fast_image_classifier.py`` from the current working directory
    into ``LightRAG-main`` and then checks that ``openclip_gpu_env`` exists.
    Both paths are resolved relative to the current working directory.

    Returns:
        bool: True when the classifier was copied and the OpenCLIP
        environment path exists; False otherwise. Every failure is reported
        on stdout instead of raising.
    """
    print("🔧 ENSURING DEPENDENCIES...")

    # Copy fast_image_classifier to LightRAG directory
    source_file = "fast_image_classifier.py"
    target_dir = "LightRAG-main"

    if not os.path.exists(source_file):
        print(f"❌ {source_file} not found")
        return False

    # Without this guard shutil.copy would raise FileNotFoundError and abort
    # the whole script instead of reporting the missing directory.
    if not os.path.isdir(target_dir):
        print(f"❌ {target_dir} directory not found")
        return False

    try:
        shutil.copy(source_file, os.path.join(target_dir, source_file))
    except OSError as e:
        # Covers permission problems, a locked destination file, etc.
        print(f"❌ Could not copy {source_file} to {target_dir}: {e}")
        return False
    print(f"✅ Copied {source_file} to {target_dir}")

    # Check if OpenCLIP environment exists
    openclip_env = "openclip_gpu_env"
    if not os.path.exists(openclip_env):
        print(f"❌ OpenCLIP environment not found: {openclip_env}")
        return False

    print(f"✅ OpenCLIP environment found: {openclip_env}")
    return True

def _listening_pids(netstat_output, port):
    """Return the unique PIDs of sockets LISTENING on local *port*."""
    suffix = f":{port}"
    pids = []
    for line in netstat_output.splitlines():
        parts = line.split()
        # Windows `netstat -ano` TCP rows look like:
        #   Proto  Local-Address  Foreign-Address  State  PID
        if len(parts) < 5 or 'LISTENING' not in parts:
            continue
        # Compare against the end of the *local* address only, so ports such
        # as 30150 or a matching foreign address are not mistaken for ours.
        if not parts[1].endswith(suffix):
            continue
        pid = parts[-1]
        # IPv4 and IPv6 listeners of one process share a PID; kill it once.
        if pid not in pids:
            pids.append(pid)
    return pids


def stop_server():
    """Stop the current LightRAG server by killing whatever listens on port 3015.

    Relies on the Windows tools ``netstat -ano`` and ``taskkill``. The
    ``taskkill`` exit code is checked, so success is only reported when the
    process was actually terminated.

    Returns:
        bool: True when at least one listener was found and every listener
        was terminated; False when none was found, a kill failed, or the
        system tools could not be run.
    """
    print("🛑 STOPPING CURRENT SERVER...")

    try:
        # Find and kill processes using port 3015
        result = subprocess.run(["netstat", "-ano"], capture_output=True, text=True)
        pids = _listening_pids(result.stdout, 3015)
        if not pids:
            print("❌ No server found on port 3015")
            return False

        all_stopped = True
        for pid in pids:
            print(f"Found server process with PID: {pid}")
            kill = subprocess.run(
                ["taskkill", "/F", "/PID", pid], capture_output=True, text=True
            )
            if kill.returncode == 0:
                print("✅ Server stopped")
            else:
                all_stopped = False
                detail = (kill.stderr or kill.stdout or "").strip()
                print(f"❌ Failed to stop PID {pid}: {detail}")

        if all_stopped:
            # Give the OS a moment to release the port before it is reused.
            time.sleep(3)
        return all_stopped
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        # OSError: netstat/taskkill missing; UnicodeError: undecodable output.
        print(f"❌ Error stopping server: {e}")
        return False

def _tail_of_file(path, limit=500):
    """Return the last *limit* characters of the text file at *path* ('' if unreadable)."""
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            return handle.read()[-limit:]
    except OSError:
        return ""


def start_server_with_fixed_config():
    """Launch the LightRAG API server on port 3015 and wait until it answers.

    The server is started as ``python -m lightrag.api.lightrag_server`` with
    ``LightRAG-main`` as its working directory. Its stdout and stderr are
    written to ``lightrag_server_stdout.log`` and
    ``lightrag_server_stderr.log`` in the caller's working directory. Nothing
    reads the child's output after this function returns, so pipes would
    eventually fill up and stall the server.

    Returns:
        subprocess.Popen | None: the running server process once
        ``GET {LIGHTRAG_URL}/`` returns HTTP 200. None when the process could
        not be launched, exited during startup, or did not answer within the
        polling window (in which case it is terminated so it is not orphaned).
    """
    print("🚀 STARTING SERVER WITH FIXED CONFIGURATION...")

    server_dir = "LightRAG-main"

    # Set environment to ensure our processor is used and fix encoding.
    # The child runs with cwd=server_dir, so relative paths in its environment
    # would be resolved against that directory; make them absolute here.
    env = os.environ.copy()
    env.update({
        "PYTHONPATH": os.path.abspath(server_dir),  # Ensure our modified processor is used
        "CUSTOM_DOCUMENT_PROCESSOR": "true",
        "PYTHONIOENCODING": "utf-8",  # Fix Unicode encoding issue
        "PYTHONUTF8": "1",  # Enable UTF-8 mode
        "OPENCLIP_ENV_PATH": os.path.abspath("openclip_gpu_env")  # Specify OpenCLIP environment
    })

    # Use the production script with proper configuration.
    # NOTE(review): "rag_storage" and "inputs" are resolved by the child
    # relative to server_dir, while clear_and_prepare_storage() clears the
    # same names relative to this script's working directory — confirm which
    # location is intended.
    command = [
        sys.executable, "-m", "lightrag.api.lightrag_server",
        "--port", "3015",
        "--working-dir", "rag_storage",
        "--input-dir", "inputs",
        "--key", API_KEY,
        "--auto-scan-at-startup",
        "--llm-binding", "openai",
        "--embedding-binding", "ollama",
        "--rerank-binding", "jina",
        "--summary-max-tokens", "1200",
        "--disable-entity-extraction"  # Disable problematic entity extraction
    ]

    stdout_log = os.path.abspath("lightrag_server_stdout.log")
    stderr_log = os.path.abspath("lightrag_server_stderr.log")

    try:
        # The child inherits the file handles, so the parent's copies can be
        # closed as soon as the process has been spawned.
        with open(stdout_log, "w", encoding="utf-8") as out_handle, \
                open(stderr_log, "w", encoding="utf-8") as err_handle:
            process = subprocess.Popen(
                command,
                env=env,
                cwd=server_dir,
                stdout=out_handle,
                stderr=err_handle,
            )
    except (OSError, subprocess.SubprocessError) as e:
        print(f"❌ Error starting server: {e}")
        return None

    print("⏳ Waiting for server to start...")

    # Wait and check for successful startup (15 polls, 2 s apart)
    for _ in range(15):
        time.sleep(2)

        # Check if process is still running
        if process.poll() is not None:
            print("❌ Server process exited:")
            stdout_tail = _tail_of_file(stdout_log)  # Last 500 chars
            stderr_tail = _tail_of_file(stderr_log)  # Last 500 chars
            if stdout_tail:
                print(f"STDOUT: {stdout_tail}")
            if stderr_tail:
                print(f"STDERR: {stderr_tail}")
            return None

        # Check if server is responding
        try:
            response = requests.get(f"{LIGHTRAG_URL}/", timeout=2)
        except requests.RequestException:
            continue  # Server not ready yet
        if response.status_code == 200:
            print("✅ Server started successfully and responding")
            return process

    print("❌ Server not responding after 30 seconds")
    # The caller only gets None back and cannot manage the child, so stop it
    # here rather than leaving a half-started server behind.
    process.terminate()
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
    return None

def clear_and_prepare_storage():
    """Reset the on-disk state so the next run starts from scratch.

    Deletes and recreates the ``rag_storage`` directory, then removes every
    file found under ``inputs`` while leaving its sub-directories in place.
    Both paths are relative to the current working directory. Problems are
    printed rather than raised, and nothing is returned.
    """
    print("🗑️  CLEARING AND PREPARING STORAGE...")

    storage_path = "rag_storage"
    queue_path = "inputs"

    # Wipe the storage directory, then make sure an empty one exists.
    if os.path.exists(storage_path):
        try:
            shutil.rmtree(storage_path)
            print(f"✅ Cleared {storage_path}")
        except Exception as err:
            print(f"❌ Error clearing {storage_path}: {err}")
    os.makedirs(storage_path, exist_ok=True)
    print(f"✅ Created {storage_path}")

    # Empty the input queue: files go, folders stay.
    if os.path.exists(queue_path):
        try:
            for folder, _subdirs, names in os.walk(queue_path):
                for name in names:
                    queued = os.path.join(folder, name)
                    os.remove(queued)
                    print(f"✅ Removed {queued}")
        except Exception as err:
            print(f"❌ Error clearing {queue_path}: {err}")

    print("✅ Storage prepared for fresh processing")

def upload_and_process_test_document():
    """Send ``test.docx`` to the upload endpoint and wait for it to be processed.

    Returns:
        bool: False when ``test.docx`` is missing from the working directory,
        the upload does not return HTTP 200, or any exception occurs;
        otherwise whatever ``wait_for_processing()`` returns.
    """
    print("📤 UPLOADING AND PROCESSING TEST DOCUMENT...")

    docx_name = "test.docx"
    if not os.path.exists(docx_name):
        print(f"❌ Test file {docx_name} not found")
        return False

    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    try:
        with open(docx_name, 'rb') as handle:
            response = requests.post(
                f"{LIGHTRAG_URL}/documents/upload",
                files={'file': (docx_name, handle, docx_mime)},
                headers=HEADERS,
                timeout=60,
            )

        if response.status_code != 200:
            print(f"❌ Upload failed: {response.status_code} - {response.text}")
            return False

        print("✅ Document uploaded successfully")
        print(f"   Upload result: {response.json()}")

        # Hand over to the poller; its verdict is this function's verdict.
        return wait_for_processing()

    except Exception as err:
        print(f"❌ Upload error: {err}")
        return False

def wait_for_processing():
    """Poll the documents endpoint until ``test.docx`` is processed or fails.

    Makes up to 30 polls with a 6-second pause after each one. Only a
    response whose JSON body is a list is inspected; entries are matched on
    a ``filename`` containing ``test.docx`` (case-insensitive).

    Returns:
        bool: True when a matching entry reports status ``processed``; False
        when one reports ``failed`` or the polls run out.
    """
    print("⏳ WAITING FOR DOCUMENT PROCESSING...")

    polls_left = 30  # Wait up to 3 minutes
    while polls_left > 0:
        polls_left -= 1
        try:
            reply = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
            listing = reply.json() if reply.status_code == 200 else None
            if isinstance(listing, list):
                for entry in listing:
                    if 'test.docx' not in entry.get('filename', '').lower():
                        continue
                    state = entry.get('status', 'unknown')
                    print(f"📄 Document status: {state}")
                    if state == 'processed':
                        print("✅ Document processing completed")
                        return True
                    if state == 'failed':
                        print("❌ Document processing failed")
                        return False
            time.sleep(6)
        except Exception as err:
            print(f"⚠️  Status check error: {err}")
            time.sleep(6)

    print("❌ Timeout waiting for processing")
    return False

def test_bee_classification_search():
    """Run a fixed set of queries against the search endpoint and look for bee content.

    Each query is posted in ``standard`` and ``hybrid`` mode. A hit counts as
    bee-related when its lower-cased ``content`` contains ``bee`` or
    ``classification``. An exception while handling a query is printed and
    the loop moves on to the next query.

    Returns:
        tuple[bool, bool]: ``(bee_found, results_found)`` — whether any hit
        looked bee-related, and whether any search returned results at all.
    """
    print("🔍 TESTING BEE CLASSIFICATION SEARCH...")

    search_queries = [
        "bee",
        "Bee",
        "classification",
        "photo of a bee",
        "Entity: Bee",
        "insect",
        "animal",
        "clipart"
    ]
    bee_markers = ('bee', 'classification')

    bee_found = False
    results_found = False

    for query in search_queries:
        try:
            # Try different search modes
            for mode in ("standard", "hybrid"):
                response = requests.post(
                    f"{LIGHTRAG_URL}/search",
                    json={"query": query, "top_k": 10, "mode": mode},
                    headers=HEADERS,
                    timeout=15,
                )

                if response.status_code != 200:
                    print(f"❌ '{query}' ({mode}) search failed: {response.status_code}")
                    continue

                hits = response.json().get('results')
                if not hits:
                    print(f"❌ '{query}' ({mode}): No results")
                    continue

                print(f"✅ '{query}' ({mode}): Found {len(hits)} results")
                results_found = True

                # Check if any result contains bee-related content
                for hit in hits:
                    content = hit.get('content', '').lower()
                    score = hit.get('score', 0)
                    if any(marker in content for marker in bee_markers):
                        print(f"🎯 BEE FOUND: Score {score:.4f}")
                        print(f"   Content: {content[:200]}...")
                        bee_found = True

        except Exception as err:
            print(f"❌ '{query}' search error: {err}")

    return bee_found, results_found

def verify_document_content():
    """Look through the chunks of ``test.docx`` for bee classification text.

    Lists the documents, and for every entry whose ``filename`` contains
    ``test.docx`` fetches the document details and then its chunks. A chunk
    matches when its lower-cased ``content`` contains ``bee`` or
    ``classification``.

    Returns:
        bool: True as soon as a matching chunk is found; False when none is
        found, a request does not return HTTP 200, or an error occurs.
    """
    print("📝 VERIFYING DOCUMENT CONTENT...")

    try:
        # Get documents list
        listing = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10)
        if listing.status_code != 200:
            return False

        for entry in listing.json():
            if 'test.docx' not in entry.get('filename', '').lower():
                continue

            doc_id = entry.get('id')
            print(f"📄 Found test.docx with ID: {doc_id}")

            # Try to get document chunks or content
            try:
                details = requests.get(
                    f"{LIGHTRAG_URL}/documents/{doc_id}",
                    headers=HEADERS,
                    timeout=10,
                )
                if details.status_code != 200:
                    continue
                # Parsed only to confirm the body is valid JSON; the value
                # itself is not used.
                details.json()
                print("✅ Document details retrieved")

                # Check if we can get chunks
                chunk_reply = requests.get(
                    f"{LIGHTRAG_URL}/documents/{doc_id}/chunks",
                    headers=HEADERS,
                    timeout=10,
                )
                if chunk_reply.status_code != 200:
                    print(f"❌ Could not get chunks: {chunk_reply.status_code}")
                    continue

                pieces = chunk_reply.json()
                print(f"✅ Found {len(pieces)} chunks")

                # Search for bee content in chunks
                for piece in pieces:
                    text = piece.get('content', '').lower()
                    if 'bee' in text or 'classification' in text:
                        print("🎯 BEE CLASSIFICATION FOUND IN CHUNK:")
                        print(f"   Content: {text[:300]}...")
                        return True
            except Exception as err:
                print(f"❌ Error getting document content: {err}")

        return False
    except Exception as err:
        print(f"❌ Error verifying document content: {err}")
        return False

def test_webui_access():
    """Check that the ``/webui`` page of the server answers with HTTP 200.

    Returns:
        bool: True on a 200 response; False on any other status code or when
        the request raises.
    """
    print("🌐 TESTING WEB UI ACCESS...")

    try:
        reply = requests.get(f"{LIGHTRAG_URL}/webui", timeout=10)
        reachable = reply.status_code == 200
        if reachable:
            print("✅ Web UI is accessible")
        else:
            print(f"❌ Web UI not accessible: {reply.status_code}")
        return reachable
    except Exception as err:
        print(f"❌ Web UI test error: {err}")
        return False

def main():
    """Run the whole fix-and-verify sequence and print a summary report.

    Order of operations: dependency check, stop the running server, reset
    storage, start the server, upload and process ``test.docx``, run the
    search checks, inspect the stored chunks, probe the Web UI.

    Returns:
        bool: True only when the search checks found bee-related content.
        False when an earlier step aborts the run or no such content was
        found.
    """
    banner = "=" * 70

    print("🔧 FINAL COMPREHENSIVE FIX FOR BEE CLASSIFICATION")
    print(banner)

    # Step 1: Ensure dependencies
    if not ensure_dependencies():
        print("❌ Cannot proceed - dependencies missing")
        return False

    # Step 2: Stop current server (result intentionally not checked)
    stop_server()

    # Step 3: Clear and prepare storage
    clear_and_prepare_storage()

    # Step 4: Start server with fixed configuration
    if not start_server_with_fixed_config():
        print("❌ Cannot proceed - server not started")
        return False

    # Step 5: Upload and process test document
    if not upload_and_process_test_document():
        print("❌ Document processing failed")
        return False

    # Steps 6-8: search check, stored-content check, Web UI probe
    bee_found, results_found = test_bee_classification_search()
    content_verified = verify_document_content()
    webui_accessible = test_webui_access()

    print("\n" + banner)
    print("📊 COMPREHENSIVE FIX RESULTS")
    print(banner)

    if bee_found:
        verdict = (
            "🎉 SUCCESS: Bee classification is searchable!",
            "   The enhanced document processor is working correctly.",
            "   The Web UI should now detect bee classification.",
        )
    elif results_found:
        verdict = (
            "⚠️  PARTIAL SUCCESS: Search is working but bee classification not found",
            "   The document was processed but bee classification may not have been added.",
        )
    else:
        verdict = (
            "❌ ISSUE: Search not working or bee classification not found",
            "   There may be an issue with the enhanced processor or search functionality.",
        )
    for line in verdict:
        print(line)

    print(f"✅ Document content verified: {'Yes' if content_verified else 'No'}")
    print(f"✅ Web UI Accessible: {'Yes' if webui_accessible else 'No'}")

    print("\n💡 Final verification steps:")
    print("   1. Open the Web UI at http://localhost:3015/webui")
    print("   2. Search for 'bee' to verify classification appears")
    print("   3. Check server logs for any processing details")

    if not bee_found:
        print("\n⚠️  FIX INCOMPLETE: Some issues remain")
        print("   Please check server logs and verify OpenCLIP classifier availability.")
        return False

    print("\n🎉 FIX COMPLETED: Bee classification should now be detectable in Web UI")
    print("   The complete document processing pipeline is working correctly.")
    return True

if __name__ == "__main__":
    # Script entry point: run the fix, print a closing summary, and report
    # the outcome through the process exit status (0 = success, 1 = failure)
    # so calling shells and batch jobs can react to a failed run.
    success = main()
    if success:
        print("\n🎉 FINAL SOLUTION IMPLEMENTED SUCCESSFULLY!")
        print("   The document processing pipeline now supports:")
        print("   - Text-first extraction for all file types")
        print("   - Image classification with OpenCLIP")
        print("   - Complete dependency isolation")
        print("   - Bee classification detection in Web UI")
    else:
        print("\n❌ FINAL SOLUTION NEEDS ADJUSTMENT")
        print("   Please review the logs and check OpenCLIP environment.")
    sys.exit(0 if success else 1)