# railseek6/test.py

#!/usr/bin/env python3
"""
Comprehensive Test for Document Processing Pipeline with OCR and Image Classification
Tests the complete workflow: upload, indexing, and searching with bee detection
"""

import sys
import os
import asyncio
import json
import requests
from pathlib import Path

# Make the LightRAG checkout importable. The 'LightRAG-main' directory is
# resolved relative to the current working directory, not this file's location.
workspace_dir = os.getcwd()
lightrag_path = os.path.join(workspace_dir, 'LightRAG-main')
if sys.path.count(lightrag_path) == 0:
    # Prepend (rather than append) so this directory is searched first.
    sys.path.insert(0, lightrag_path)

def test_document_processor():
    """Process ``test.docx`` locally and check that a bee is mentioned.

    Obtains the document processor and image classifier through their
    project factory functions, prints their availability flags, runs
    ``process_document('test.docx')`` (path relative to the current working
    directory) and searches the extracted content for a bee mention.

    A bee mention is "bee"/"bees" (optionally "honeybee"/"bumblebee"),
    matched case-insensitively and only when it is not part of a longer
    alphabetic word, so text such as "been" or "beef" is not a detection.

    Returns:
        bool: True when processing reported success and the content mentions
        a bee; False otherwise, including when any exception is raised
        (missing module, missing file, processing error).
    """
    import re

    print("🧪 TESTING DOCUMENT PROCESSOR")
    print("=" * 50)

    # Letter-only lookarounds (instead of \b) still accept "bee_1" or "bee:"
    # while rejecting "been", "beef" and "frisbee".
    bee_pattern = re.compile(
        r"(?<![a-z])(?:honey|bumble)?bees?(?![a-z])", re.IGNORECASE
    )

    try:
        from lightrag.document_processor import get_document_processor
        from fast_image_classifier import get_image_classifier

        # Initialize processors
        print("1. Initializing processors...")
        processor = get_document_processor()
        classifier = get_image_classifier()

        print(f"   ✅ OCR processor: {processor.ocr_processor.ocr_available}")
        print(f"   ✅ Image classifier: {classifier.available}")

        # Process test document
        print("2. Processing test.docx...")
        result = asyncio.run(processor.process_document('test.docx'))

        # A failed processing run must fail the test instead of being
        # reported with a success marker.
        if not result.success:
            print(f"   ❌ Processing successful: {result.success}")
            print(f"   📋 Metadata: {result.metadata}")
            return False

        # Treat missing content as empty text rather than crashing on len().
        content = result.content or ""

        print(f"   ✅ Processing successful: {result.success}")
        print(f"   📊 Content length: {len(content)}")
        print(f"   📋 Metadata: {result.metadata}")

        # Check for bee detection
        bee_detected = bee_pattern.search(content) is not None
        print(f"   🐝 Bee detection: {bee_detected}")

        if not bee_detected:
            print("   ❌ FAILED: Bee image not detected")
            return False

        print("   ✅ SUCCESS: Bee image successfully detected and indexed!")

        # Echo the lines that carry the bee classification details.
        for line in content.split('\n'):
            if bee_pattern.search(line) and 'classification' in line.lower():
                print(f"   📝 {line.strip()}")

        return True

    except Exception as e:
        print(f"❌ Document processor test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def test_upload_and_indexing():
    """Upload ``test.docx`` to the server on localhost:8000 and report its status.

    Steps:
        1. GET ``/health``. A non-200 reply only produces a warning; an
           exception during the check aborts the test.
        2. POST ``test.docx`` (opened from the current working directory) to
           ``/upload`` as the multipart field ``file``.
        3. If the upload reply JSON carries ``document_id``, GET
           ``/documents/{id}/status`` and print the outcome.

    Returns:
        bool: True when the upload answered HTTP 200, regardless of whether
        the status lookup worked; False when the server is unreachable, the
        upload is rejected, or any other exception occurs.
    """
    print("\n📤 TESTING UPLOAD AND INDEXING")
    print("=" * 50)

    try:
        # Step 1: make sure something is listening before uploading.
        print("1. Checking server status...")
        try:
            health = requests.get("http://localhost:8000/health", timeout=10)
            health_msg = (
                "   ✅ Server is running"
                if health.status_code == 200
                else "   ⚠️  Server responded with non-200 status"
            )
            print(health_msg)
        except Exception as e:
            print(f"   ❌ Server not accessible: {e}")
            print("   ⚠️  Please start the server first: python LightRAG-main/start_gpu_server.py")
            return False

        # Step 2: send the document as a multipart upload.
        print("2. Uploading test.docx...")
        docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        with open('test.docx', 'rb') as docx_file:
            upload = requests.post(
                "http://localhost:8000/upload",
                files={'file': ('test.docx', docx_file, docx_mime)},
                timeout=30,
            )

        if upload.status_code != 200:
            print(f"   ❌ Upload failed: {upload.status_code} - {upload.text}")
            return False

        upload_result = upload.json()
        print(f"   ✅ Upload successful: {upload_result}")

        # Step 3: the status lookup is informational; it never fails the test.
        print("3. Checking document status...")
        doc_id = upload_result.get('document_id')
        if not doc_id:
            print("   ⚠️  No document ID returned from upload")
            return True

        status_response = requests.get(
            f"http://localhost:8000/documents/{doc_id}/status", timeout=10
        )
        if status_response.status_code == 200:
            print(f"   📊 Document status: {status_response.json()}")
        else:
            print(f"   ⚠️  Could not get document status: {status_response.text}")

        return True

    except Exception as e:
        print(f"❌ Upload test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def test_search_for_bee():
    """Query the server's search endpoint and check that a hit mentions a bee.

    POSTs a fixed query to ``http://localhost:8000/search`` and scans each
    entry of the reply's ``results`` list. A hit counts when its ``content``
    or the string form of its ``metadata`` mentions "bee"/"bees" (optionally
    "honeybee"/"bumblebee"), matched case-insensitively and only when not
    embedded in a longer alphabetic word, so "been" or "beef" do not count.

    Hits with missing or ``None`` content, and scores that are not numeric,
    are tolerated rather than aborting the scan.

    Returns:
        bool: True when at least one hit mentions a bee; False when none
        does, when the server answers with a non-200 status, or when any
        exception occurs.
    """
    import re

    print("\n🔍 TESTING SEARCH FOR BEE CONTENT")
    print("=" * 50)

    # Letter-only lookarounds (instead of \b) still accept "bee_score" or
    # "'bee'" inside a metadata repr while rejecting "been" and "frisbee".
    bee_pattern = re.compile(
        r"(?<![a-z])(?:honey|bumble)?bees?(?![a-z])", re.IGNORECASE
    )

    try:
        # Search for bee-related content
        print("1. Searching for 'bee'...")
        search_payload = {
            "query": "bee insect animal",
            "top_k": 10,
            "include_metadata": True
        }

        response = requests.post("http://localhost:8000/search", json=search_payload, timeout=10)

        if response.status_code != 200:
            print(f"   ❌ Search failed: {response.status_code} - {response.text}")
            return False

        search_results = response.json()
        # A missing or null 'results' entry is treated as "no hits".
        hits = search_results.get('results') or []
        print(f"   ✅ Search successful, found {len(hits)} results")

        # Check if bee content is found
        bee_found = False
        for hit in hits:
            content = str(hit.get('content') or '')
            metadata = hit.get('metadata', {})

            if bee_pattern.search(content) or bee_pattern.search(str(metadata)):
                bee_found = True
                print(f"   🐝 Found bee content: {content[:100]}...")

                # A null or non-numeric score must not turn a real hit into
                # a failure, so fall back to printing the raw value.
                score = hit.get('score', 0)
                try:
                    score_text = f"{score:.4f}"
                except (TypeError, ValueError):
                    score_text = str(score)
                print(f"   📊 Score: {score_text}")
                break

        if bee_found:
            print("   ✅ SUCCESS: Bee content found in search results!")
        else:
            print("   ❌ FAILED: Bee content not found in search results")
            # Show what was found for debugging
            print("   🔍 Available search results:")
            for i, hit in enumerate(hits[:3]):
                print(f"      {i+1}. {str(hit.get('content') or '')[:80]}...")

        return bee_found

    except Exception as e:
        print(f"❌ Search test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

def test_complete_workflow():
    """Test the complete workflow from processing to search"""
    print("\n🚀 COMPREHENSIVE WORKFLOW TEST")
    print("=" * 50)

    results = {
        "document_processing": False,
        "upload_indexing": False,
        "search": False
    }

    # Test document processing
    results["document_processing"] = test_document_processor()

    # Test upload and indexing (if server is available)
    results["upload_indexing"] = test_upload_and_indexing()

    # Test search (if upload was successful)
    if results["upload_indexing"]:
        results["search"] = test_search_for_bee()

    # Summary
    print("\n📋 TEST SUMMARY")
    print("=" * 50)
    for test_name, passed in results.items():
        status = "✅ PASSED" if passed else "❌ FAILED"
        print(f"   {test_name.replace('_', ' ').title()}: {status}")

    all_passed = all(results.values())
    if all_passed:
        print("\n🎉 ALL TESTS PASSED! The document processing pipeline is working correctly.")
        print("   - ✅ PaddleOCR and OpenCLIP are running in complete isolation")
        print("   - ✅ Bee image detection is working")
        print("   - ✅ Document upload and indexing are functional")
        print("   - ✅ Search with bee detection is operational")
    else:
        print("\n⚠️  Some tests failed. Please check the output above for details.")

    return all_passed

if __name__ == "__main__":
    # Banner describing what is about to run; the trailing empty string
    # produces the blank separator line.
    print(
        "🐝 BEE DETECTION WORKFLOW TEST",
        "Testing: Document Processing → Upload → Indexing → Search",
        "File: test.docx (should contain a bee image)",
        "",
        sep="\n",
    )

    success = test_complete_workflow()

    # Exit code 1 signals failure to the calling shell / CI job.
    if not success:
        print("\n💥 TEST FAILED!")
        sys.exit(1)

    print(
        "\n✨ TEST COMPLETED SUCCESSFULLY!",
        "The modified document processing pipeline is working with:",
        "1. Text-first extraction for all file types",
        "2. Isolated PaddleOCR for image text extraction",
        "3. Isolated OpenCLIP for image classification",
        "4. Successful bee detection and indexing",
        sep="\n",
    )
    sys.exit(0)