Files
railseek6/test.py

231 lines
8.6 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive Test for Document Processing Pipeline with OCR and Image Classification
Tests the complete workflow: upload, indexing, and searching with bee detection
"""
import sys
import os
import asyncio
import json
import requests
from pathlib import Path
# Make the bundled LightRAG checkout importable. The workspace root is taken
# to be the current working directory, so the script is expected to be
# launched from there (test.docx is resolved relative to the same directory).
workspace_dir = os.getcwd()
lightrag_path = os.path.join(workspace_dir, 'LightRAG-main')
if sys.path.count(lightrag_path) == 0:
    # Prepend so this checkout is found before any other importable copy.
    sys.path.insert(0, lightrag_path)
def test_document_processor():
    """Run test.docx through the document processor and check for a bee.

    The processor and image classifier are imported inside the ``try`` block,
    so an import failure is reported as a failed test (with traceback)
    rather than aborting the script.

    Returns:
        bool: True when processing reported success and the extracted
        content mentions a bee; False when processing was unsuccessful, no
        bee was mentioned, or any exception was raised along the way.
    """
    print("🧪 TESTING DOCUMENT PROCESSOR")
    print("=" * 50)
    try:
        import re

        from lightrag.document_processor import get_document_processor
        from fast_image_classifier import get_image_classifier

        # Match "bee"/"bees" only when not followed by another letter. A raw
        # substring test would also accept "been", "beef", "beetle", ...,
        # while this still accepts "honeybee", "bee_1.png" or "bee (0.93)".
        bee_pattern = re.compile(r"bees?(?![a-z])")

        # Initialize processors
        print("1. Initializing processors...")
        processor = get_document_processor()
        classifier = get_image_classifier()
        print(f" ✅ OCR processor: {processor.ocr_processor.ocr_available}")
        print(f" ✅ Image classifier: {classifier.available}")

        # Process test document
        print("2. Processing test.docx...")
        result = asyncio.run(processor.process_document('test.docx'))
        # Content may be missing on failure; treat that as empty text so the
        # diagnostics below can still be printed.
        content = result.content or ""
        outcome_icon = "✅" if result.success else "❌"
        print(f" {outcome_icon} Processing successful: {result.success}")
        print(f" 📊 Content length: {len(content)}")
        print(f" 📋 Metadata: {result.metadata}")
        if not result.success:
            # A failed run must not pass just because partial content
            # happens to mention a bee.
            return False

        # Check for bee detection
        bee_detected = bee_pattern.search(content.lower()) is not None
        print(f" 🐝 Bee detection: {bee_detected}")
        if not bee_detected:
            print(" ❌ FAILED: Bee image not detected")
            return False

        print(" ✅ SUCCESS: Bee image successfully detected and indexed!")
        # Extract bee classification details
        for line in content.split('\n'):
            lowered = line.lower()
            if bee_pattern.search(lowered) and 'classification' in lowered:
                print(f" 📝 {line.strip()}")
        return True
    except Exception as e:
        # Top-level boundary of this test: report the failure and return
        # False instead of propagating.
        print(f"❌ Document processor test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_upload_and_indexing():
    """Upload test.docx to the local server and print its document status.

    Steps: probe ``/health``, POST the file to ``/upload``, then, if the
    upload response contains a ``document_id``, fetch and print
    ``/documents/<id>/status``.

    Returns:
        bool: True when the upload request returned HTTP 200 (regardless of
        whether a document id or status could be obtained); False when the
        server was unreachable, the upload returned another status code, or
        an exception was raised.
    """
    server = "http://localhost:8000"
    docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    print("\n📤 TESTING UPLOAD AND INDEXING")
    print("=" * 50)
    try:
        # Step 1: the server must at least answer the health probe.
        print("1. Checking server status...")
        try:
            health = requests.get(f"{server}/health", timeout=10)
            if health.status_code != 200:
                # A non-200 answer is only a warning; the upload is still tried.
                print(" ⚠️ Server responded with non-200 status")
            else:
                print(" ✅ Server is running")
        except Exception as probe_error:
            print(f" ❌ Server not accessible: {probe_error}")
            print(" ⚠️ Please start the server first: python LightRAG-main/start_gpu_server.py")
            return False

        # Step 2: send the document as a multipart form upload.
        print("2. Uploading test.docx...")
        with open('test.docx', 'rb') as docx_file:
            upload = requests.post(
                f"{server}/upload",
                files={'file': ('test.docx', docx_file, docx_mime)},
                timeout=30,
            )
        if upload.status_code != 200:
            print(f" ❌ Upload failed: {upload.status_code} - {upload.text}")
            return False

        upload_result = upload.json()
        print(f" ✅ Upload successful: {upload_result}")

        # Step 3: status lookup is informational and never fails the test.
        print("3. Checking document status...")
        doc_id = upload_result.get('document_id')
        if not doc_id:
            print(" ⚠️ No document ID returned from upload")
            return True

        status_response = requests.get(f"{server}/documents/{doc_id}/status", timeout=10)
        if status_response.status_code == 200:
            print(f" 📊 Document status: {status_response.json()}")
        else:
            print(f" ⚠️ Could not get document status: {status_response.text}")
        return True
    except Exception as e:
        print(f"❌ Upload test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_search_for_bee():
    """Query the local server's ``/search`` endpoint and look for bee content.

    A result counts as a bee hit when its ``content`` or its stringified
    ``metadata`` contains "bee"/"bees" not followed by another letter.

    Returns:
        bool: True when at least one result mentions a bee; False when no
        result does, the server answered with a non-200 status, or an
        exception was raised.
    """
    print("\n🔍 TESTING SEARCH FOR BEE CONTENT")
    print("=" * 50)
    try:
        import re

        # A raw substring test would also accept "been", "beef", "beetle",
        # ...; this still accepts "honeybee", "bee.jpg" or "bee (0.93)".
        bee_pattern = re.compile(r"bees?(?![a-z])")

        # Search for bee-related content
        print("1. Searching for 'bee'...")
        search_payload = {
            "query": "bee insect animal",
            "top_k": 10,
            "include_metadata": True,
        }
        response = requests.post("http://localhost:8000/search", json=search_payload, timeout=10)
        if response.status_code != 200:
            print(f" ❌ Search failed: {response.status_code} - {response.text}")
            return False

        search_results = response.json()
        # `or []` also covers an explicit JSON null, not just a missing key.
        hits = search_results.get('results') or []
        print(f" ✅ Search successful, found {len(hits)} results")

        # Check if bee content is found
        bee_found = False
        for hit in hits:
            # Fields may be present but null; normalise before string ops.
            content = str(hit.get('content') or '')
            metadata = hit.get('metadata') or {}
            if bee_pattern.search(content.lower()) or bee_pattern.search(str(metadata).lower()):
                bee_found = True
                score = hit.get('score') or 0
                print(f" 🐝 Found bee content: {content[:100]}...")
                print(f" 📊 Score: {score:.4f}")
                break

        if bee_found:
            print(" ✅ SUCCESS: Bee content found in search results!")
        else:
            print(" ❌ FAILED: Bee content not found in search results")
            # Show what was found for debugging
            print(" 🔍 Available search results:")
            for position, hit in enumerate(hits[:3], start=1):
                preview = str(hit.get('content') or '')[:80]
                print(f" {position}. {preview}...")
        return bee_found
    except Exception as e:
        # Top-level boundary of this test: report the failure and return
        # False instead of propagating.
        print(f"❌ Search test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_complete_workflow():
    """Run the three stage tests in order and print a pass/fail summary.

    Document processing and upload always run. The search stage runs only
    when the upload stage passed; otherwise it is recorded as failed.

    Returns:
        bool: True only when every stage passed.
    """
    print("\n🚀 COMPREHENSIVE WORKFLOW TEST")
    print("=" * 50)

    # Stages run in pipeline order; search depends on a successful upload.
    processing_ok = test_document_processor()
    upload_ok = test_upload_and_indexing()
    search_ok = test_search_for_bee() if upload_ok else False

    results = {
        "document_processing": processing_ok,
        "upload_indexing": upload_ok,
        "search": search_ok,
    }

    # Summary
    print("\n📋 TEST SUMMARY")
    print("=" * 50)
    for stage, outcome in results.items():
        label = stage.replace('_', ' ').title()
        if outcome:
            status = "✅ PASSED"
        else:
            status = "❌ FAILED"
        print(f" {label}: {status}")

    all_passed = all(results.values())
    if not all_passed:
        print("\n⚠️ Some tests failed. Please check the output above for details.")
        return all_passed

    print("\n🎉 ALL TESTS PASSED! The document processing pipeline is working correctly.")
    print(" - ✅ PaddleOCR and OpenCLIP are running in complete isolation")
    print(" - ✅ Bee image detection is working")
    print(" - ✅ Document upload and indexing are functional")
    print(" - ✅ Search with bee detection is operational")
    return all_passed
if __name__ == "__main__":
    # Script entry point: run the whole workflow and turn the outcome into
    # the process exit code (0 = all stages passed, 1 = something failed).
    print("🐝 BEE DETECTION WORKFLOW TEST")
    print("Testing: Document Processing → Upload → Indexing → Search")
    print("File: test.docx (should contain a bee image)")
    print()

    success = test_complete_workflow()
    if not success:
        print("\n💥 TEST FAILED!")
        sys.exit(1)

    print("\n✨ TEST COMPLETED SUCCESSFULLY!")
    print("The modified document processing pipeline is working with:")
    print("1. Text-first extraction for all file types")
    print("2. Isolated PaddleOCR for image text extraction")
    print("3. Isolated OpenCLIP for image classification")
    print("4. Successful bee detection and indexing")
    sys.exit(0)