# Listing metadata: 224 lines, 7.1 KiB, Python
"""
Standalone test for document processing without dependency conflicts.

Tests the enhanced pipeline with isolated modules.
"""
|
|
|
|
import os
|
|
import sys
|
|
import asyncio
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Logging setup: INFO level, each record tagged with timestamp, logger name
# and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Import search path: put the "LightRAG-main" directory that sits next to this
# script first, followed by the script's own directory, ahead of every
# pre-existing entry.
current_dir = Path(__file__).parent
lightrag_dir = current_dir / "LightRAG-main"
sys.path[:0] = [str(lightrag_dir), str(current_dir)]
|
|
|
|
async def test_document_with_images(test_file="test.docx"):
    """Process a document through the LightRAG document processor and report the outcome.

    Prints the processor capabilities, then the metadata, extracted images and
    a content preview of the processed document.

    Args:
        test_file: Path of the document to process. Defaults to ``"test.docx"``
            (looked up relative to the current working directory).

    Returns:
        ``result.success`` as reported by the processor, or ``False`` when the
        file does not exist or any exception is raised along the way.
    """
    print("🧪 Testing Document Processing with Images")
    print("=" * 50)

    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        # Imported lazily so that an import failure is reported as a failed
        # test instead of aborting the whole script.
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        print(f"📄 Processing: {test_file}")
        print(f"🔧 OCR Available: {processor.ocr_processor.ocr_available}")

        # Coerce to bool: a missing classifier would otherwise be printed as
        # "None" instead of "False".
        classifier = processor.image_classifier
        classifier_available = bool(classifier and classifier.available)
        print(f"🖼️ Image Classifier Available: {classifier_available}")

        result = await processor.process_document(test_file)

        if result.success:
            print("✅ Document processed successfully")
            print(f"📊 Metadata: {result.metadata}")

            if result.images:
                print(f"🖼️ Found {len(result.images)} images in document")
                for position, img in enumerate(result.images, start=1):
                    print(f" Image {position}: {img}")
            else:
                # Reported, but not treated as a failure: the return value
                # only reflects result.success.
                print("❌ No images found in document")

            print(f"📝 Content length: {len(result.content)} characters")
            print(f"📋 Content preview: {result.content[:200]}...")
        else:
            print(f"❌ Processing failed: {result.error}")

        return result.success

    except Exception as e:
        # Top-level boundary of this check: report, dump the traceback and
        # signal failure to the caller.
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
async def test_image_extraction(test_file="test.docx"):
    """Count the pictures embedded as inline shapes in a Word document.

    Args:
        test_file: Path of the ``.docx`` file to inspect. Defaults to
            ``"test.docx"`` (looked up relative to the current working
            directory).

    Returns:
        ``True`` when at least one inline shape is a picture; ``False`` when
        there is none, the file is missing, or any exception is raised
        (including python-docx not being importable).
    """
    print("\n🔍 Testing Image Extraction from Word Documents")
    print("=" * 50)

    # Same guard as test_document_with_images, so a missing file gets a clear
    # message instead of a library-specific exception text.
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        import docx
        from docx.enum.shape import WD_INLINE_SHAPE

        doc = docx.Document(test_file)

        inline_shapes = list(doc.inline_shapes)
        print(f"📊 Found {len(inline_shapes)} inline shapes in document")

        # python-docx InlineShape objects have no `image` attribute; the shape
        # kind is exposed through `.type`. Embedded and linked pictures both
        # count as images.
        picture_types = (WD_INLINE_SHAPE.PICTURE, WD_INLINE_SHAPE.LINKED_PICTURE)

        image_count = 0
        for position, shape in enumerate(inline_shapes, start=1):
            if shape.type in picture_types:
                image_count += 1
                print(f" ✅ Shape {position} is an image")
            else:
                print(f" ❌ Shape {position} is not an image")

        print(f"🖼️ Total images found: {image_count}")

        return image_count > 0

    except Exception as e:
        print(f"❌ Image extraction test failed: {e}")
        return False
|
|
|
|
async def test_ocr_functionality(
    use_gpu=True,
    test_images=("ocr_high_res.png", "ocr_page1_preview.png"),
):
    """Check that the OCR processor is available and try it on one sample image.

    Args:
        use_gpu: Forwarded to ``OCRProcessor(use_gpu=...)``. Defaults to
            ``True``; pass ``False`` to run the check on CPU-only hosts.
        test_images: Candidate image paths. OCR is run on the first one that
            exists; if none exists a warning is printed and OCR is not
            exercised.

    Returns:
        ``ocr_processor.ocr_available``, or ``False`` when any exception is
        raised (including the processor module not being importable).
    """
    print("\n🔤 Testing OCR Functionality")
    print("=" * 50)

    try:
        from lightrag.document_processor import OCRProcessor

        ocr_processor = OCRProcessor(use_gpu=use_gpu)

        if ocr_processor.ocr_available:
            print("✅ OCR processor is available")

            # Only the first sample image present on disk is used.
            sample = next(
                (path for path in test_images if os.path.exists(path)),
                None,
            )
            if sample is None:
                print("⚠️ No test images found for OCR testing")
            else:
                print(f"🧪 Testing OCR on: {sample}")
                # The result is indexed by 'text' and 'confidence' below.
                result = ocr_processor.extract_text_from_image(sample)
                print(f" Text extracted: {len(result['text'])} characters")
                print(f" Confidence: {result['confidence']:.4f}")
                if result['text'].strip():
                    print(f" Preview: {result['text'][:100]}...")
        else:
            print("❌ OCR processor not available")

        # A missing sample image does not fail the check; only availability
        # of the processor is reported to the caller.
        return ocr_processor.ocr_available

    except Exception as e:
        print(f"❌ OCR test failed: {e}")
        return False
|
|
|
|
async def test_dependency_isolation(
    model_name="ViT-B-32",
    pretrained="laion2b_s34b_b79k",
):
    """Check that PaddleOCR and OpenCLIP can be loaded in the same process.

    PaddleOCR is imported first; OpenCLIP (and torch) are imported afterwards
    and an OpenCLIP model is created.

    Args:
        model_name: Model architecture passed to
            ``open_clip.create_model_and_transforms``.
        pretrained: Pretrained-weights tag passed to
            ``open_clip.create_model_and_transforms``.

    Returns:
        ``True`` when PaddleOCR imports and OpenCLIP either initializes or is
        not importable (treated as "not installed"); ``False`` when PaddleOCR
        cannot be imported or OpenCLIP fails for any other reason.
    """
    print("\n🛡️ Testing Dependency Isolation")
    print("=" * 50)

    print("🔧 Importing PaddleOCR...")
    try:
        # Imported only to prove the package and its main class load.
        import paddleocr  # noqa: F401
        from paddleocr import PaddleOCR  # noqa: F401
    except Exception as e:
        print(f"❌ Dependency isolation test failed: {e}")
        return False
    print("✅ PaddleOCR imported successfully")

    print("🔧 Importing OpenCLIP...")
    try:
        import open_clip
        import torch  # noqa: F401  (import check only)
        print("✅ OpenCLIP imported successfully")

        print("🔄 Initializing OpenCLIP model...")
        # The returned model/transforms are deliberately not bound: only
        # successful construction matters here.
        open_clip.create_model_and_transforms(
            model_name=model_name,
            pretrained=pretrained,
        )
        print("✅ OpenCLIP model initialized successfully")

        return True

    except ImportError:
        print("⚠️ OpenCLIP not available - this is expected if not installed")
        return True
    except Exception as e:
        print(f"⚠️ OpenCLIP initialization failed: {e}")
        print("This might be due to CUDA conflicts with PaddleOCR")
        return False
|
|
|
|
async def main():
    """Run every check in sequence and print a pass/fail summary.

    Returns:
        ``True`` when all checks returned a truthy value, ``False`` otherwise.
    """
    print("🚀 Starting Standalone Document Processing Tests")
    print("=" * 60)

    # Checks run one after another, in this order:
    # dependency isolation, OCR, image extraction, document processing.
    suite = (
        test_dependency_isolation,
        test_ocr_functionality,
        test_image_extraction,
        test_document_with_images,
    )

    passed = 0
    for run_check in suite:
        if await run_check():
            passed += 1
    expected = len(suite)

    print(f"\n📊 Test Summary: {passed}/{expected} tests passed")

    all_green = passed == expected
    if all_green:
        print("🎉 All tests passed! The enhanced pipeline is working correctly.")
    else:
        print("⚠️ Some tests failed. Check the output above for details.")

    return all_green
|
|
|
|
if __name__ == "__main__":
    # Process exit status: 0 when every check passed, 1 otherwise.
    success = asyncio.run(main())
    if success:
        sys.exit(0)
    sys.exit(1)