231 lines
8.6 KiB
Python
231 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive test for the document processing pipeline with OCR and image classification.

Exercises the complete workflow: local document processing, upload, indexing, and search, using bee detection as the pass criterion.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import asyncio
|
|
import json
|
|
import requests
|
|
from pathlib import Path
|
|
|
|
# Put the LightRAG-main directory on sys.path so that `lightrag` imports
# resolve. The directory is looked up relative to the current working
# directory, so the script must be started from the folder containing it.
workspace_dir = os.getcwd()
lightrag_path = os.path.join(workspace_dir, 'LightRAG-main')

# Only prepend when no existing entry matches, to avoid duplicates.
if not any(entry == lightrag_path for entry in sys.path):
    sys.path.insert(0, lightrag_path)
|
|
|
|
def test_document_processor():
    """Run the local document processor on test.docx and check for bee content.

    Obtains the project's document processor and image classifier, processes
    ``test.docx`` (resolved against the current working directory) and looks
    for the case-insensitive substring "bee" in the extracted content.

    Returns:
        bool: True only when processing reports success and the extracted
        content contains "bee". False otherwise, including when any step
        raises; the exception and traceback are printed, not propagated.
    """
    print("🧪 TESTING DOCUMENT PROCESSOR")
    print("=" * 50)

    try:
        # Imported inside the try block: an import failure is caught below
        # and reported as a failed test.
        from lightrag.document_processor import get_document_processor
        from fast_image_classifier import get_image_classifier

        # Initialize processors
        print("1. Initializing processors...")
        processor = get_document_processor()
        classifier = get_image_classifier()

        print(f" ✅ OCR processor: {processor.ocr_processor.ocr_available}")
        print(f" ✅ Image classifier: {classifier.available}")

        # Process test document
        print("2. Processing test.docx...")
        result = asyncio.run(processor.process_document('test.docx'))

        # A run that reports failure must fail the test, even if its content
        # happens to contain the substring "bee".
        if not result.success:
            print(f" ❌ Processing successful: {result.success}")
            print(f" 📋 Metadata: {result.metadata}")
            print(" ❌ FAILED: Document processing did not succeed")
            return False

        # Treat missing content as empty text so the checks below stay simple.
        content = result.content or ""

        print(f" ✅ Processing successful: {result.success}")
        print(f" 📊 Content length: {len(content)}")
        print(f" 📋 Metadata: {result.metadata}")

        # Check for bee detection
        bee_detected = 'bee' in content.lower()
        print(f" 🐝 Bee detection: {bee_detected}")

        if not bee_detected:
            print(" ❌ FAILED: Bee image not detected")
            return False

        print(" ✅ SUCCESS: Bee image successfully detected and indexed!")

        # Echo every line that mentions both "bee" and "classification".
        for line in content.split('\n'):
            lowered = line.lower()
            if 'bee' in lowered and 'classification' in lowered:
                print(f" 📝 {line.strip()}")

        return True

    except Exception as e:
        # Top-level boundary of this check: report and convert to a failure.
        print(f"❌ Document processor test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def test_upload_and_indexing():
    """Upload test.docx to the server at http://localhost:8000 and report its status.

    Steps: probe ``/health``, POST the file to ``/upload``, then, if the
    upload response carries a ``document_id``, fetch
    ``/documents/{id}/status`` and print it.

    Returns:
        bool: True when the upload request answers with HTTP 200 and the
        follow-up status lookup does not raise. False when the server is
        unreachable, the upload answers with another status code, or any
        step raises (the traceback is printed).
    """
    print("\n📤 TESTING UPLOAD AND INDEXING")
    print("=" * 50)

    try:
        # Step 1: health probe. A non-200 answer is only a warning; an
        # exception means the server cannot be reached at all.
        print("1. Checking server status...")
        try:
            health = requests.get("http://localhost:8000/health", timeout=10)
            print(
                " ✅ Server is running"
                if health.status_code == 200
                else " ⚠️ Server responded with non-200 status"
            )
        except Exception as e:
            print(f" ❌ Server not accessible: {e}")
            print(" ⚠️ Please start the server first: python LightRAG-main/start_gpu_server.py")
            return False

        # Step 2: multipart upload of the test document.
        print("2. Uploading test.docx...")
        docx_mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        with open('test.docx', 'rb') as docx_file:
            upload_response = requests.post(
                "http://localhost:8000/upload",
                files={'file': ('test.docx', docx_file, docx_mime)},
                timeout=30,
            )

        if upload_response.status_code != 200:
            print(f" ❌ Upload failed: {upload_response.status_code} - {upload_response.text}")
            return False

        upload_result = upload_response.json()
        print(f" ✅ Upload successful: {upload_result}")

        # Step 3: status lookup. Its outcome is printed but does not change
        # the return value.
        print("3. Checking document status...")
        doc_id = upload_result.get('document_id')
        if not doc_id:
            print(" ⚠️ No document ID returned from upload")
            return True

        status_response = requests.get(f"http://localhost:8000/documents/{doc_id}/status", timeout=10)
        if status_response.status_code == 200:
            print(f" 📊 Document status: {status_response.json()}")
        else:
            print(f" ⚠️ Could not get document status: {status_response.text}")

        return True

    except Exception as e:
        print(f"❌ Upload test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def test_search_for_bee():
    """Query the server's search endpoint and look for bee-related results.

    POSTs the query "bee insect animal" to ``http://localhost:8000/search``
    and scans the returned ``results`` list for the case-insensitive
    substring "bee" in each result's ``content`` or stringified ``metadata``.

    Returns:
        bool: True when at least one result mentions "bee". False when the
        server answers with a non-200 status, nothing matches, or any step
        raises (the traceback is printed).
    """
    print("\n🔍 TESTING SEARCH FOR BEE CONTENT")
    print("=" * 50)

    try:
        # Search for bee-related content
        print("1. Searching for 'bee'...")
        search_payload = {
            "query": "bee insect animal",
            "top_k": 10,
            "include_metadata": True,
        }

        response = requests.post("http://localhost:8000/search", json=search_payload, timeout=10)

        if response.status_code != 200:
            print(f" ❌ Search failed: {response.status_code} - {response.text}")
            return False

        search_results = response.json()
        # `or []` also covers a payload whose "results" key is present but null.
        hits = search_results.get('results') or []
        print(f" ✅ Search successful, found {len(hits)} results")

        # Check if bee content is found
        bee_found = False
        for hit in hits:
            # Null or non-string content must not abort the scan.
            content = str(hit.get('content') or '')
            metadata = hit.get('metadata', {})

            if 'bee' in content.lower() or 'bee' in str(metadata).lower():
                bee_found = True
                print(f" 🐝 Found bee content: {content[:100]}...")
                score = hit.get('score', 0)
                # A null or non-numeric score cannot take the ".4f" format.
                # Print it raw so a real hit is not reported as a failure.
                if isinstance(score, (int, float)):
                    print(f" 📊 Score: {score:.4f}")
                else:
                    print(f" 📊 Score: {score}")
                break

        if bee_found:
            print(" ✅ SUCCESS: Bee content found in search results!")
        else:
            print(" ❌ FAILED: Bee content not found in search results")
            # Show what was found for debugging
            print(" 🔍 Available search results:")
            for i, hit in enumerate(hits[:3]):
                preview = str(hit.get('content') or '')[:80]
                print(f" {i+1}. {preview}...")

        return bee_found

    except Exception as e:
        # Top-level boundary of this check: report and convert to a failure.
        print(f"❌ Search test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def test_complete_workflow():
    """Run all three checks in order and print a pass/fail summary.

    Document processing and upload always run. The search check runs only
    when the upload check passed; otherwise it is recorded as failed.

    Returns:
        bool: True when every check passed, False otherwise.
    """
    print("\n🚀 COMPREHENSIVE WORKFLOW TEST")
    print("=" * 50)

    # Test document processing
    processing_ok = test_document_processor()

    # Test upload and indexing (if server is available)
    upload_ok = test_upload_and_indexing()

    # Test search (if upload was successful)
    search_ok = test_search_for_bee() if upload_ok else False

    results = {
        "document_processing": processing_ok,
        "upload_indexing": upload_ok,
        "search": search_ok,
    }

    # Summary
    print("\n📋 TEST SUMMARY")
    print("=" * 50)
    for test_name in results:
        label = test_name.replace('_', ' ').title()
        status = "❌ FAILED" if not results[test_name] else "✅ PASSED"
        print(f" {label}: {status}")

    all_passed = all(results.values())
    if not all_passed:
        print("\n⚠️ Some tests failed. Please check the output above for details.")
    else:
        success_lines = (
            "\n🎉 ALL TESTS PASSED! The document processing pipeline is working correctly.",
            " - ✅ PaddleOCR and OpenCLIP are running in complete isolation",
            " - ✅ Bee image detection is working",
            " - ✅ Document upload and indexing are functional",
            " - ✅ Search with bee detection is operational",
        )
        for message in success_lines:
            print(message)

    return all_passed
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print a banner, run the workflow, and exit with
    # status 0 on success or 1 on failure.
    for banner_line in (
        "🐝 BEE DETECTION WORKFLOW TEST",
        "Testing: Document Processing → Upload → Indexing → Search",
        "File: test.docx (should contain a bee image)",
    ):
        print(banner_line)
    print()

    if not test_complete_workflow():
        print("\n💥 TEST FAILED!")
        sys.exit(1)

    for closing_line in (
        "\n✨ TEST COMPLETED SUCCESSFULLY!",
        "The modified document processing pipeline is working with:",
        "1. Text-first extraction for all file types",
        "2. Isolated PaddleOCR for image text extraction",
        "3. Isolated OpenCLIP for image classification",
        "4. Successful bee detection and indexing",
    ):
        print(closing_line)
    sys.exit(0)