""" Standalone test for document processing without dependency conflicts Tests the enhanced pipeline with isolated modules """ import os import sys import asyncio import logging from pathlib import Path # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Add paths current_dir = Path(__file__).parent lightrag_dir = current_dir / "LightRAG-main" sys.path.insert(0, str(current_dir)) sys.path.insert(0, str(lightrag_dir)) async def test_document_with_images(): """Test document processing with a document that contains images""" print("๐Ÿงช Testing Document Processing with Images") print("=" * 50) # Test file test_file = "test.docx" if not os.path.exists(test_file): print(f"โŒ Test file {test_file} not found") return False try: # Import the document processor from lightrag.document_processor import get_document_processor # Initialize processor processor = get_document_processor() print(f"๐Ÿ“„ Processing: {test_file}") print(f"๐Ÿ”ง OCR Available: {processor.ocr_processor.ocr_available}") print(f"๐Ÿ–ผ๏ธ Image Classifier Available: {processor.image_classifier and processor.image_classifier.available}") # Process the document result = await processor.process_document(test_file) if result.success: print("โœ… Document processed successfully") print(f"๐Ÿ“Š Metadata: {result.metadata}") # Check for images if result.images: print(f"๐Ÿ–ผ๏ธ Found {len(result.images)} images in document") for i, img in enumerate(result.images): print(f" Image {i+1}: {img}") else: print("โŒ No images found in document") # Check content print(f"๐Ÿ“ Content length: {len(result.content)} characters") print(f"๐Ÿ“‹ Content preview: {result.content[:200]}...") else: print(f"โŒ Processing failed: {result.error}") return result.success except Exception as e: print(f"โŒ Test failed: {e}") import traceback traceback.print_exc() return False async def test_image_extraction(): """Test image extraction from Word documents specifically""" print("\n๐Ÿ” Testing Image Extraction from Word Documents") print("=" * 50) try: import docx test_file = "test.docx" doc = docx.Document(test_file) # Count inline shapes (images) inline_shapes = list(doc.inline_shapes) print(f"๐Ÿ“Š Found {len(inline_shapes)} inline shapes in document") # Check if any are images image_count = 0 for i, shape in enumerate(inline_shapes): if hasattr(shape, 'image'): image_count += 1 print(f" โœ… Shape {i+1} is an image") else: print(f" โŒ Shape {i+1} is not an image") print(f"๐Ÿ–ผ๏ธ Total images found: {image_count}") return image_count > 0 except Exception as e: print(f"โŒ Image extraction test failed: {e}") return False async def test_ocr_functionality(): """Test OCR functionality separately""" print("\n๐Ÿ”ค Testing OCR Functionality") print("=" * 50) try: from lightrag.document_processor import OCRProcessor # Initialize OCR processor ocr_processor = OCRProcessor(use_gpu=True) if ocr_processor.ocr_available: print("โœ… OCR processor is available") # Test with a simple image if available test_images = ["ocr_high_res.png", "ocr_page1_preview.png"] for test_img in test_images: if os.path.exists(test_img): print(f"๐Ÿงช Testing OCR on: {test_img}") result = ocr_processor.extract_text_from_image(test_img) print(f" Text extracted: {len(result['text'])} characters") print(f" Confidence: {result['confidence']:.4f}") if result['text'].strip(): print(f" Preview: {result['text'][:100]}...") break else: print("โš ๏ธ No test images found for OCR testing") else: print("โŒ OCR processor not available") return ocr_processor.ocr_available except Exception as e: print(f"โŒ OCR test failed: {e}") return False async def test_dependency_isolation(): """Test that PaddleOCR and OpenCLIP can coexist""" print("\n๐Ÿ›ก๏ธ Testing Dependency Isolation") print("=" * 50) try: # Test importing both modules print("๐Ÿ”ง Importing PaddleOCR...") import paddleocr from paddleocr import PaddleOCR print("โœ… PaddleOCR imported successfully") print("๐Ÿ”ง Importing OpenCLIP...") try: import open_clip import torch print("โœ… OpenCLIP imported successfully") # Try to initialize OpenCLIP print("๐Ÿ”„ Initializing OpenCLIP model...") model, _, processor = open_clip.create_model_and_transforms( model_name="ViT-B-32", pretrained="laion2b_s34b_b79k" ) print("โœ… OpenCLIP model initialized successfully") return True except ImportError: print("โš ๏ธ OpenCLIP not available - this is expected if not installed") return True except Exception as e: print(f"โš ๏ธ OpenCLIP initialization failed: {e}") print("This might be due to CUDA conflicts with PaddleOCR") return False except Exception as e: print(f"โŒ Dependency isolation test failed: {e}") return False async def main(): """Run all tests""" print("๐Ÿš€ Starting Standalone Document Processing Tests") print("=" * 60) tests_passed = 0 total_tests = 4 # Test 1: Dependency Isolation if await test_dependency_isolation(): tests_passed += 1 # Test 2: OCR Functionality if await test_ocr_functionality(): tests_passed += 1 # Test 3: Image Extraction if await test_image_extraction(): tests_passed += 1 # Test 4: Document Processing if await test_document_with_images(): tests_passed += 1 # Summary print(f"\n๐Ÿ“Š Test Summary: {tests_passed}/{total_tests} tests passed") if tests_passed == total_tests: print("๐ŸŽ‰ All tests passed! The enhanced pipeline is working correctly.") else: print("โš ๏ธ Some tests failed. Check the output above for details.") return tests_passed == total_tests if __name__ == "__main__": success = asyncio.run(main()) sys.exit(0 if success else 1)