""" Complete Test for Document Processing Pipeline with Dependency Isolation Tests OCR, Image Classification, and Bee Detection """ import asyncio import sys import os import tempfile import zipfile # Add paths sys.path.insert(0, "LightRAG-main") async def test_complete_solution(): """Test the complete document processing pipeline""" print("๐Ÿงช COMPLETE SOLUTION TEST") print("=" * 50) try: from lightrag.document_processor import get_document_processor processor = get_document_processor() print("๐ŸŽฏ COMPONENT STATUS:") print(f" OCR: {'โœ… Available' if processor.ocr_processor.ocr_available else 'โŒ Not Available'}") print(f" Image Classifier: {'โœ… Available' if processor.image_classifier and processor.image_classifier.available else 'โŒ Not Available'}") # Test 1: Process test.docx with bee image test_file = "test.docx" if not os.path.exists(test_file): print(f"โŒ Test file not found: {test_file}") return print(f"\n๐Ÿ“„ PROCESSING: {test_file}") result = await processor.process_document(test_file) if not result.success: print(f"โŒ Processing failed: {result.error}") return print(f"โœ… Processing successful") print(f"๐Ÿ“Š Metadata: {result.metadata}") # Check OCR results print(f"\n๐Ÿ”ค OCR PERFORMANCE:") ocr_success = False for i, img in enumerate(result.images): if 'ocr_text' in img and img['ocr_text'].strip(): ocr_success = True text_len = len(img['ocr_text']) confidence = img.get('ocr_confidence', 0) print(f" โœ… Image {i+1}: {text_len} chars, confidence: {confidence:.3f}") if img['ocr_text'].strip(): print(f" Text: {img['ocr_text'][:50]}...") elif 'ocr_error' in img: print(f" โŒ Image {i+1}: {img['ocr_error']}") else: print(f" โš ๏ธ Image {i+1}: No OCR text") # Check classification print(f"\n๐Ÿ–ผ๏ธ CLASSIFICATION PERFORMANCE:") classification_success = False bee_found = False for i, img in enumerate(result.images): if 'classification' in img and img['classification']: classification_success = True top_result = img['classification'][0] label = top_result.get('label', 'unknown') score = top_result.get('confidence', 0) print(f" โœ… Image {i+1}: {label} (score: {score:.3f})") if 'bee' in label.lower(): bee_found = True print(f" ๐ŸŽฏ BEE DETECTED!") elif 'classification_error' in img: print(f" โŒ Image {i+1}: {img['classification_error']}") else: print(f" โš ๏ธ Image {i+1}: No classification") print(f"\n๐ŸŽฏ FINAL RESULTS:") print(f" OCR: {'โœ… WORKING' if ocr_success else 'โŒ FAILED'}") print(f" Classification: {'โœ… WORKING' if classification_success else 'โŒ FAILED'}") print(f" Bee Detection: {'โœ… SUCCESS' if bee_found else 'โŒ NOT FOUND'}") print(f" Dependency Isolation: {'โœ… ACHIEVED' if ocr_success and classification_success else 'โŒ FAILED'}") # Test 2: Test OCR with a simple image print(f"\n๐Ÿงช ADDITIONAL OCR TEST:") test_simple_ocr() # Test 3: Test image classification with virtual environment print(f"\n๐Ÿงช ADDITIONAL CLASSIFICATION TEST:") await test_simple_classification(processor) except Exception as e: print(f"โŒ Test failed: {e}") import traceback traceback.print_exc() def test_simple_ocr(): """Test OCR with simple processor""" try: from simple_ocr_processor import get_simple_ocr_processor processor = get_simple_ocr_processor() if not processor.available: print(" โŒ Simple OCR processor not available") return # Create a simple test image with text with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f: test_image_path = f.name # For now, just test if processor works result = processor.extract_text_from_image("test.docx") # This will fail but test the process print(f" โœ… OCR subprocess execution: {'Working' if 'text' in result else 'Failed'}") # Clean up if os.path.exists(test_image_path): os.unlink(test_image_path) except Exception as e: print(f" โŒ OCR test failed: {e}") async def test_simple_classification(processor): """Test image classification""" if not processor.image_classifier or not processor.image_classifier.available: print(" โŒ Image classifier not available") return try: # Extract first image from test.docx for classification test with tempfile.TemporaryDirectory() as temp_dir: # Extract images from docx using zipfile with zipfile.ZipFile("test.docx", 'r') as zip_ref: image_files = [] for file_info in zip_ref.filelist: if file_info.filename.startswith('word/media/'): # Extract the image image_filename = os.path.basename(file_info.filename) image_path = os.path.join(temp_dir, image_filename) # Extract and save with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target: target.write(source.read()) image_files.append(image_path) break # Just test with first image if image_files: test_image = image_files[0] print(f" Testing classification on: {os.path.basename(test_image)}") results = processor.image_classifier.classify_image(test_image, top_k=3) if results and 'error' not in results[0]: print(f" โœ… Classification working") for result in results: print(f" {result['label']}: {result['confidence']:.4f}") if 'bee' in result['label'].lower(): print(f" ๐ŸŽฏ BEE CLASSIFICATION SUCCESS!") else: print(f" โŒ Classification failed: {results}") else: print(" โš ๏ธ No images found in test.docx for classification test") except Exception as e: print(f" โŒ Classification test failed: {e}") if __name__ == "__main__": asyncio.run(test_complete_solution())