# Files
# railseek6/test_standalone.py
#
# 224 lines
# 7.1 KiB
# Python

"""
Standalone test for document processing without dependency conflicts
Tests the enhanced pipeline with isolated modules
"""
import os
import sys
import asyncio
import logging
from pathlib import Path
# Configure root logging at INFO level with a timestamp / logger name /
# level / message layout, then create this module's logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Make this script's directory and its "LightRAG-main" subdirectory importable.
# Prepending both entries in one slice assignment leaves the LightRAG-main
# directory first on sys.path, immediately followed by this script's directory.
current_dir = Path(__file__).parent
lightrag_dir = current_dir / "LightRAG-main"
sys.path[:0] = [str(lightrag_dir), str(current_dir)]
async def test_document_with_images():
    """Run ``test.docx`` through the document processor and report the outcome.

    Prints the processor's OCR / image-classifier availability, then the
    metadata, any images, and a content preview of the processed document.

    Returns:
        The ``success`` attribute of the processing result, or ``False`` when
        the test file is missing or any step raises an exception.
    """
    print("🧪 Testing Document Processing with Images")
    print("=" * 50)

    # The document is looked up relative to the current working directory.
    docx_path = "test.docx"
    if not os.path.exists(docx_path):
        print(f"❌ Test file {docx_path} not found")
        return False

    try:
        # Imported lazily so a broken pipeline import is reported as a test
        # failure instead of aborting the whole script at import time.
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()
        print(f"📄 Processing: {docx_path}")
        print(f"🔧 OCR Available: {processor.ocr_processor.ocr_available}")
        print(f"🖼️ Image Classifier Available: {processor.image_classifier and processor.image_classifier.available}")

        outcome = await processor.process_document(docx_path)

        # Failure path first: report the error and hand back the falsy flag.
        if not outcome.success:
            print(f"❌ Processing failed: {outcome.error}")
            return outcome.success

        print("✅ Document processed successfully")
        print(f"📊 Metadata: {outcome.metadata}")

        # Missing images are reported but do not change the returned result.
        if outcome.images:
            print(f"🖼️ Found {len(outcome.images)} images in document")
            for position, image in enumerate(outcome.images, start=1):
                print(f" Image {position}: {image}")
        else:
            print("❌ No images found in document")

        print(f"📝 Content length: {len(outcome.content)} characters")
        print(f"📋 Content preview: {outcome.content[:200]}...")
        return outcome.success
    except Exception as exc:
        print(f"❌ Test failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
async def test_image_extraction():
    """Count the pictures embedded inline in ``test.docx`` using python-docx.

    Each inline shape is classified by its ``type`` property. The previous
    ``hasattr(shape, 'image')`` probe could never succeed because python-docx
    ``InlineShape`` objects expose no ``image`` attribute, so every shape was
    reported as "not an image" and the check always failed.

    Returns:
        bool: ``True`` when at least one inline picture (embedded or linked)
        is found; ``False`` when the file is missing, holds no inline
        pictures, or any step raises an exception.
    """
    print("\n🔍 Testing Image Extraction from Word Documents")
    print("=" * 50)

    test_file = "test.docx"
    # Same missing-file guard (and message) as test_document_with_images, so a
    # missing fixture is reported plainly instead of as a python-docx error.
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        # Imported lazily so a missing python-docx install is reported as a
        # failed check rather than crashing the script at import time.
        import docx
        from docx.enum.shape import WD_INLINE_SHAPE

        doc = docx.Document(test_file)

        # Count inline shapes (pictures, charts, SmartArt, ...)
        inline_shapes = list(doc.inline_shapes)
        print(f"📊 Found {len(inline_shapes)} inline shapes in document")

        # Only embedded and linked pictures count as images.
        picture_types = (WD_INLINE_SHAPE.PICTURE, WD_INLINE_SHAPE.LINKED_PICTURE)
        image_count = 0
        for number, shape in enumerate(inline_shapes, start=1):
            if shape.type in picture_types:
                image_count += 1
                print(f" ✅ Shape {number} is an image")
            else:
                print(f" ❌ Shape {number} is not an image")

        print(f"🖼️ Total images found: {image_count}")
        return image_count > 0
    except Exception as e:
        print(f"❌ Image extraction test failed: {e}")
        return False
async def test_ocr_functionality():
    """Check that the OCR processor initialises and try it on one sample image.

    The first of the known sample images that exists in the current working
    directory is run through OCR and its text length, confidence and a short
    preview are printed. Having no sample image only produces a warning.

    Returns:
        The processor's ``ocr_available`` flag, or ``False`` when importing,
        constructing or running the OCR processor raises an exception.
    """
    print("\n🔤 Testing OCR Functionality")
    print("=" * 50)
    try:
        from lightrag.document_processor import OCRProcessor

        ocr_processor = OCRProcessor(use_gpu=True)

        # Nothing more to exercise when the OCR backend did not come up.
        if not ocr_processor.ocr_available:
            print("❌ OCR processor not available")
            return ocr_processor.ocr_available

        print("✅ OCR processor is available")

        # Pick the first sample image present on disk; only that one is tested.
        candidates = ("ocr_high_res.png", "ocr_page1_preview.png")
        sample = next((path for path in candidates if os.path.exists(path)), None)

        if sample is None:
            print("⚠️ No test images found for OCR testing")
        else:
            print(f"🧪 Testing OCR on: {sample}")
            ocr_result = ocr_processor.extract_text_from_image(sample)
            print(f" Text extracted: {len(ocr_result['text'])} characters")
            print(f" Confidence: {ocr_result['confidence']:.4f}")
            # Skip the preview when the text is empty or whitespace only.
            if ocr_result['text'].strip():
                print(f" Preview: {ocr_result['text'][:100]}...")

        return ocr_processor.ocr_available
    except Exception as exc:
        print(f"❌ OCR test failed: {exc}")
        return False
async def test_dependency_isolation():
    """Check that PaddleOCR and OpenCLIP can be loaded in the same process.

    PaddleOCR must import successfully. OpenCLIP is optional: an ImportError
    while loading it counts as a pass, while any other error during its import
    or model creation counts as a failure.

    Returns:
        bool: ``True`` when PaddleOCR imports and OpenCLIP either initialises
        or is not installed; ``False`` otherwise.
    """
    print("\n🛡️ Testing Dependency Isolation")
    print("=" * 50)
    try:
        # The imports themselves are the test; the names are not used afterwards.
        print("🔧 Importing PaddleOCR...")
        import paddleocr  # noqa: F401
        from paddleocr import PaddleOCR  # noqa: F401
        print("✅ PaddleOCR imported successfully")

        print("🔧 Importing OpenCLIP...")
        try:
            import open_clip
            import torch  # noqa: F401
            print("✅ OpenCLIP imported successfully")

            # Building the model is what would surface a runtime conflict;
            # the returned objects are discarded.
            print("🔄 Initializing OpenCLIP model...")
            clip_model, _, preprocess = open_clip.create_model_and_transforms(
                model_name="ViT-B-32",
                pretrained="laion2b_s34b_b79k",
            )
            print("✅ OpenCLIP model initialized successfully")
            isolated = True
        except ImportError:
            # A missing optional dependency is not treated as a conflict.
            print("⚠️ OpenCLIP not available - this is expected if not installed")
            isolated = True
        except Exception as init_error:
            print(f"⚠️ OpenCLIP initialization failed: {init_error}")
            print("This might be due to CUDA conflicts with PaddleOCR")
            isolated = False
        return isolated
    except Exception as exc:
        print(f"❌ Dependency isolation test failed: {exc}")
        return False
async def main():
    """Run every standalone check in order and print a pass/fail summary.

    Returns:
        bool: ``True`` only when all checks returned a truthy result.
    """
    print("🚀 Starting Standalone Document Processing Tests")
    print("=" * 60)

    # Execution order: dependency isolation, OCR, image extraction, and
    # finally the full document-processing run.
    checks = (
        test_dependency_isolation,
        test_ocr_functionality,
        test_image_extraction,
        test_document_with_images,
    )
    total_tests = len(checks)

    tests_passed = 0
    for check in checks:
        if await check():
            tests_passed += 1

    # Summary
    print(f"\n📊 Test Summary: {tests_passed}/{total_tests} tests passed")
    all_passed = tests_passed == total_tests
    if all_passed:
        print("🎉 All tests passed! The enhanced pipeline is working correctly.")
    else:
        print("⚠️ Some tests failed. Check the output above for details.")
    return all_passed
if __name__ == "__main__":
    # Exit status 0 only when every check passed, 1 otherwise.
    exit_status = 0 if asyncio.run(main()) else 1
    sys.exit(exit_status)