#!/usr/bin/env python3
"""
Comprehensive Test for Document Processing Pipeline with OCR and Image Classification
Tests the complete workflow: upload, indexing, and searching with bee detection
"""

import asyncio
import json
import os
import re
import sys
import traceback
from pathlib import Path

import requests

# Add LightRAG to path
workspace_dir = os.getcwd()
lightrag_path = os.path.join(workspace_dir, 'LightRAG-main')
if lightrag_path not in sys.path:
    sys.path.insert(0, lightrag_path)

# Base URL of the LightRAG server under test. Defaults to the local server the
# script has always targeted; LIGHTRAG_SERVER_URL overrides it.
SERVER_URL = os.environ.get("LIGHTRAG_SERVER_URL", "http://localhost:8000").rstrip("/")

# Document that is processed and uploaded by the tests (expected in the CWD).
TEST_DOCUMENT = "test.docx"
DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# "bee"/"bees" not followed by another letter: accepts "bee", "Bees",
# "honeybee" and "bee_insect" but rejects "been", "beer", "beef", which a
# plain substring check would wrongly count as a detection.
_BEE_PATTERN = re.compile(r"bees?(?![a-z])", re.IGNORECASE)


def _mentions_bee(text):
    """Return True when *text* contains the word "bee" (or "bees")."""
    return bool(_BEE_PATTERN.search(text or ""))


def test_document_processor():
    """Test the document processor with bee detection.

    Processes TEST_DOCUMENT through the project's document processor and
    checks that the extracted content mentions a bee.

    Returns:
        bool: True when processing succeeded and a bee mention was found,
        False otherwise (including when any exception is raised).
    """
    print("🧪 TESTING DOCUMENT PROCESSOR")
    print("=" * 50)

    try:
        # Imported lazily so a missing project dependency is reported as a
        # failed test instead of breaking the whole script at import time.
        from lightrag.document_processor import get_document_processor
        from fast_image_classifier import get_image_classifier

        # Initialize processors
        print("1. Initializing processors...")
        processor = get_document_processor()
        classifier = get_image_classifier()
        print(f" ✅ OCR processor: {processor.ocr_processor.ocr_available}")
        print(f" ✅ Image classifier: {classifier.available}")

        # Process test document
        print(f"2. Processing {TEST_DOCUMENT}...")
        result = asyncio.run(processor.process_document(TEST_DOCUMENT))
        content = result.content or ""

        marker = "✅" if result.success else "❌"
        print(f" {marker} Processing successful: {result.success}")
        print(f" 📊 Content length: {len(content)}")
        print(f" 📋 Metadata: {result.metadata}")

        if not result.success:
            # Content of a failed run cannot be trusted; do not report on it.
            print(" ❌ FAILED: Document processing did not succeed")
            return False

        # Check for bee detection
        bee_detected = _mentions_bee(content)
        print(f" 🐝 Bee detection: {bee_detected}")

        if not bee_detected:
            print(" ❌ FAILED: Bee image not detected")
            return False

        print(" ✅ SUCCESS: Bee image successfully detected and indexed!")
        # Extract bee classification details
        for line in content.split('\n'):
            if _mentions_bee(line) and 'classification' in line.lower():
                print(f" 📝 {line.strip()}")
        return True

    except Exception as e:  # test boundary: report any failure, never raise
        print(f"❌ Document processor test failed: {e}")
        traceback.print_exc()
        return False


def test_upload_and_indexing():
    """Test document upload and indexing through LightRAG server.

    Checks the server's /health endpoint, uploads TEST_DOCUMENT to /upload
    and, when the response carries a ``document_id``, queries its status.

    Returns:
        bool: True when the upload returned HTTP 200, False when the server
        is unreachable, the upload is rejected, or an exception is raised.
    """
    print("\n📤 TESTING UPLOAD AND INDEXING")
    print("=" * 50)

    try:
        # Check if server is running
        print("1. Checking server status...")
        try:
            response = requests.get(f"{SERVER_URL}/health", timeout=10)
        except requests.RequestException as e:
            print(f" ❌ Server not accessible: {e}")
            print(" ⚠️ Please start the server first: python LightRAG-main/start_gpu_server.py")
            return False

        if response.status_code == 200:
            print(" ✅ Server is running")
        else:
            # A non-200 health response is only a warning; the upload is
            # still attempted.
            print(" ⚠️ Server responded with non-200 status")

        # Upload test document
        print(f"2. Uploading {TEST_DOCUMENT}...")
        with open(TEST_DOCUMENT, 'rb') as f:
            files = {'file': (TEST_DOCUMENT, f, DOCX_MIME_TYPE)}
            response = requests.post(f"{SERVER_URL}/upload", files=files, timeout=30)

        if response.status_code != 200:
            print(f" ❌ Upload failed: {response.status_code} - {response.text}")
            return False

        upload_result = response.json()
        print(f" ✅ Upload successful: {upload_result}")

        # Check document status
        print("3. Checking document status...")
        doc_id = upload_result.get('document_id')
        if doc_id:
            status_response = requests.get(
                f"{SERVER_URL}/documents/{doc_id}/status", timeout=10
            )
            if status_response.status_code == 200:
                print(f" 📊 Document status: {status_response.json()}")
            else:
                print(f" ⚠️ Could not get document status: {status_response.text}")
        else:
            print(" ⚠️ No document ID returned from upload")

        return True

    except Exception as e:  # test boundary: report any failure, never raise
        print(f"❌ Upload test failed: {e}")
        traceback.print_exc()
        return False


def test_search_for_bee():
    """Test searching for bee-related content.

    Posts a bee-related query to the server's /search endpoint and looks for
    a bee mention in the content or metadata of the returned results.

    Returns:
        bool: True when at least one result mentions a bee, False when none
        does, the search request fails, or an exception is raised.
    """
    print("\n🔍 TESTING SEARCH FOR BEE CONTENT")
    print("=" * 50)

    try:
        # Search for bee-related content
        print("1. Searching for 'bee'...")
        search_payload = {
            "query": "bee insect animal",
            "top_k": 10,
            "include_metadata": True,
        }

        response = requests.post(f"{SERVER_URL}/search", json=search_payload, timeout=10)

        if response.status_code != 200:
            print(f" ❌ Search failed: {response.status_code} - {response.text}")
            return False

        hits = response.json().get('results', [])
        print(f" ✅ Search successful, found {len(hits)} results")

        # Check if bee content is found; only the first matching hit is shown.
        bee_found = False
        for hit in hits:
            # A null "content" field is treated as empty text.
            content = hit.get('content') or ''
            metadata = hit.get('metadata', {})
            if _mentions_bee(content) or _mentions_bee(str(metadata)):
                bee_found = True
                print(f" 🐝 Found bee content: {content[:100]}...")
                print(f" 📊 Score: {hit.get('score', 0):.4f}")
                break

        if bee_found:
            print(" ✅ SUCCESS: Bee content found in search results!")
        else:
            print(" ❌ FAILED: Bee content not found in search results")
            # Show what was found for debugging
            print(" 🔍 Available search results:")
            for i, hit in enumerate(hits[:3]):
                print(f" {i+1}. {(hit.get('content') or '')[:80]}...")

        return bee_found

    except Exception as e:  # test boundary: report any failure, never raise
        print(f"❌ Search test failed: {e}")
        traceback.print_exc()
        return False


def test_complete_workflow():
    """Test the complete workflow from processing to search.

    Runs the document-processor test and the upload test, runs the search
    test only when the upload succeeded, then prints a per-test summary.

    Returns:
        bool: True only when all three tests passed.
    """
    print("\n🚀 COMPREHENSIVE WORKFLOW TEST")
    print("=" * 50)

    results = {
        "document_processing": False,
        "upload_indexing": False,
        "search": False,
    }

    # Test document processing
    results["document_processing"] = test_document_processor()

    # Test upload and indexing (if server is available)
    results["upload_indexing"] = test_upload_and_indexing()

    # Test search (if upload was successful); a skipped search counts as failed.
    if results["upload_indexing"]:
        results["search"] = test_search_for_bee()

    # Summary
    print("\n📋 TEST SUMMARY")
    print("=" * 50)

    for test_name, passed in results.items():
        status = "✅ PASSED" if passed else "❌ FAILED"
        print(f" {test_name.replace('_', ' ').title()}: {status}")

    all_passed = all(results.values())

    if all_passed:
        print("\n🎉 ALL TESTS PASSED! The document processing pipeline is working correctly.")
        print(" - ✅ PaddleOCR and OpenCLIP are running in complete isolation")
        print(" - ✅ Bee image detection is working")
        print(" - ✅ Document upload and indexing are functional")
        print(" - ✅ Search with bee detection is operational")
    else:
        print("\n⚠️ Some tests failed. Please check the output above for details.")

    return all_passed


if __name__ == "__main__":
    # The report uses emoji; on a console whose encoding cannot represent them
    # print() would raise UnicodeEncodeError, so degrade to '?' instead.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(errors="replace")

    print("🐝 BEE DETECTION WORKFLOW TEST")
    print("Testing: Document Processing → Upload → Indexing → Search")
    print("File: test.docx (should contain a bee image)")
    print()

    success = test_complete_workflow()

    if success:
        print("\n✨ TEST COMPLETED SUCCESSFULLY!")
        print("The modified document processing pipeline is working with:")
        print("1. Text-first extraction for all file types")
        print("2. Isolated PaddleOCR for image text extraction")
        print("3. Isolated OpenCLIP for image classification")
        print("4. Successful bee detection and indexing")
        sys.exit(0)
    else:
        print("\n💥 TEST FAILED!")
        sys.exit(1)