""" Standalone Test for Enhanced Document Processor with Bee Classification Tests the document processing pipeline directly without server dependencies """ import os import sys import asyncio from pathlib import Path # Add the LightRAG directory to path to import our enhanced processor sys.path.insert(0, 'LightRAG-main') def test_document_processor(): """Test the enhanced document processor directly""" print("๐Ÿงช TESTING ENHANCED DOCUMENT PROCESSOR") print("=" * 50) try: # Import the enhanced document processor from lightrag.document_processor import get_document_processor, DocumentProcessor print("โœ… Successfully imported enhanced document processor") # Test with test.docx test_file = "test.docx" if not os.path.exists(test_file): print(f"โŒ Test file {test_file} not found") return False print(f"๐Ÿ“„ Testing with file: {test_file}") print("โณ Processing document...") # Create processor instance processor = DocumentProcessor() # Process the document async def process_doc(): result = await processor.process_document(test_file) return result # Run the async function result = asyncio.run(process_doc()) print(f"โœ… Document processing completed: {result.success}") if result.success: print(f"๐Ÿ“Š Processing results:") print(f" - Content length: {len(result.content)} characters") print(f" - Metadata: {result.metadata}") print(f" - Images processed: {len(result.images) if result.images else 0}") # Check for bee classification in content content_lower = result.content.lower() bee_keywords = ['bee', 'insect', 'animal', 'classification', 'photo of a bee'] print("\n๐Ÿ” Searching for bee classification in content...") bee_found = False for keyword in bee_keywords: if keyword in content_lower: bee_found = True print(f"โœ… Found keyword: '{keyword}'") # Extract specific classification lines lines = result.content.split('\n') classification_lines = [line for line in lines if 'classification' in line.lower()] if classification_lines: print("\n๐Ÿ“‹ Classification results found:") for line in classification_lines: print(f" - {line}") if bee_found: print("\n๐ŸŽ‰ SUCCESS: Bee classification detected in processed content!") print(" The enhanced document processor is working correctly.") print(" Bee entities should now be searchable in LightRAG.") else: print("\nโŒ No bee classification found in processed content") print(" This may indicate that the image classification didn't run") print(" or the bee image wasn't properly classified.") # Check if we have image metadata if result.images: print(f"\n๐Ÿ–ผ๏ธ Image processing details:") for i, image in enumerate(result.images): print(f" Image {i+1}:") if 'primary_classification' in image: print(f" Classification: {image['primary_classification']}") if 'classification' in image: print(f" Full classification: {image['classification']}") if 'ocr_text' in image: print(f" OCR Text: {image['ocr_text'][:100]}...") return bee_found else: print(f"โŒ Document processing failed: {result.error}") return False except Exception as e: print(f"โŒ Error testing document processor: {e}") import traceback traceback.print_exc() return False def test_image_classifier(): """Test the image classifier directly""" print("\n" + "=" * 50) print("๐Ÿ–ผ๏ธ TESTING IMAGE CLASSIFIER") print("=" * 50) try: # Import the image classifier from fast_image_classifier import get_image_classifier classifier = get_image_classifier() if classifier.available: print("โœ… Image classifier is available") # Test with a known image if available test_images = [] for ext in ['.jpg', '.jpeg', '.png']: test_img = f"test_image{ext}" if os.path.exists(test_img): test_images.append(test_img) if test_images: for test_img in test_images: print(f"๐Ÿ” Testing classification on {test_img}...") results = classifier.classify_image(test_img, top_k=3) print(f"๐Ÿ“Š Classification results for {test_img}:") for result in results: print(f" - {result['label']}: {result['confidence']:.2f}") else: print("โ„น๏ธ No test images found for direct classification test") return True else: print("โŒ Image classifier is not available") return False except Exception as e: print(f"โŒ Error testing image classifier: {e}") return False def check_dependencies(): """Check if all required dependencies are available""" print("๐Ÿ” CHECKING DEPENDENCIES") print("=" * 50) dependencies = { 'PaddleOCR': False, 'OpenCLIP': False, 'PyMuPDF (fitz)': False, 'python-docx': False, 'BeautifulSoup': False } try: import paddleocr dependencies['PaddleOCR'] = True print("โœ… PaddleOCR: Available") except ImportError: print("โŒ PaddleOCR: Not available") try: import open_clip dependencies['OpenCLIP'] = True print("โœ… OpenCLIP: Available") except ImportError: print("โŒ OpenCLIP: Not available") try: import fitz dependencies['PyMuPDF (fitz)'] = True print("โœ… PyMuPDF (fitz): Available") except ImportError: print("โŒ PyMuPDF (fitz): Not available") try: import docx dependencies['python-docx'] = True print("โœ… python-docx: Available") except ImportError: print("โŒ python-docx: Not available") try: from bs4 import BeautifulSoup dependencies['BeautifulSoup'] = True print("โœ… BeautifulSoup: Available") except ImportError: print("โŒ BeautifulSoup: Not available") return all(dependencies.values()) def main(): """Main test function""" print("๐Ÿš€ ENHANCED DOCUMENT PROCESSOR TEST SUITE") print("=" * 60) print("This test verifies the complete document processing pipeline") print("with enhanced entity extraction for bee classification.") print() # Check dependencies if not check_dependencies(): print("\nโš ๏ธ Some dependencies are missing, but continuing with tests...") # Test image classifier classifier_ok = test_image_classifier() # Test document processor processor_ok = test_document_processor() print("\n" + "=" * 60) print("๐Ÿ“Š TEST RESULTS SUMMARY") print("=" * 60) print(f"โœ… Image Classifier: {'WORKING' if classifier_ok else 'ISSUES'}") print(f"โœ… Document Processor: {'WORKING' if processor_ok else 'ISSUES'}") if processor_ok: print("\n๐ŸŽ‰ SUCCESS: Enhanced document processing pipeline is working!") print(" The bee classification should now be searchable in LightRAG.") print(" The enhanced entity extraction inserts bee classification") print(" as explicit entities for spaCy to extract.") else: print("\nโŒ ISSUES: There are problems with the document processing pipeline") print(" Check the error messages above for details.") print("\n๐Ÿ’ก Next steps:") print(" 1. Ensure LightRAG server is configured to use the enhanced processor") print(" 2. Upload test.docx to verify bee classification appears in search") print(" 3. Check server logs for document processing details") if __name__ == "__main__": main()