"""
Fix OCR issues by completely isolating PaddleOCR from OpenCLIP dependencies
"""
|
||
|
||
import os
|
||
import sys
|
||
import subprocess
|
||
import tempfile
|
||
import json
|
||
from pathlib import Path
|
||
|
||
def test_paddleocr_alone():
    """Run a PaddleOCR smoke test in a separate Python process.

    A small probe script is written to a temporary file and executed with
    ``sys.executable`` so that nothing already imported by this process can
    interfere with PaddleOCR.  The child's stdout (and stderr, when it is
    non-empty) is echoed to this process's stdout, and the temporary script
    is always removed afterwards.

    Returns:
        None. All results are reported through ``print``.
    """
    print("🔍 Testing PaddleOCR in Isolation")
    print("=" * 50)

    # Test PaddleOCR directly without importing OpenCLIP
    test_code = '''
import sys
import os

# Remove any OpenCLIP paths from sys.path to ensure isolation
original_path = sys.path.copy()
filtered_path = [p for p in sys.path if 'openclip' not in p.lower()]
sys.path = filtered_path

try:
    print("🧪 Testing PaddleOCR without OpenCLIP interference...")

    # Test basic imports
    import torch
    print(f"✅ PyTorch: {torch.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")

    # Test PaddleOCR
    from paddleocr import PaddleOCR
    print("✅ PaddleOCR imported successfully")

    # Initialize OCR
    ocr = PaddleOCR(use_gpu=True)
    print("✅ PaddleOCR GPU initialization successful")

    # Test OCR on an image
    test_image = "extracted_images/image1.png"
    if os.path.exists(test_image):
        print(f"📸 Testing OCR on: {test_image}")
        result = ocr.ocr(test_image, cls=True)

        if result and result[0]:
            print(f"✅ OCR successful - found {len(result[0])} text lines")
            for i, line in enumerate(result[0][:3]):  # Show first 3 lines
                text = line[1][0] if len(line) > 1 and len(line[1]) > 0 else "No text"
                confidence = line[1][1] if len(line) > 1 and len(line[1]) > 1 else 0.0
                print(f" Line {i+1}: '{text}' (confidence: {confidence:.3f})")
        else:
            print("❌ OCR returned no results")
    else:
        print(f"❌ Test image not found: {test_image}")

except Exception as e:
    print(f"❌ PaddleOCR test failed: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Restore original path
    sys.path = original_path
'''

    # The script contains emoji, so it must be written as UTF-8 explicitly:
    # the platform default encoding (e.g. cp1252) cannot encode them, and
    # Python reads source files as UTF-8 by default.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.py', delete=False, encoding='utf-8'
    ) as f:
        f.write(test_code)
        script_path = f.name

    # Force the child to emit UTF-8 on its pipes so its emoji output neither
    # fails to encode in the child nor fails to decode here.
    child_env = dict(os.environ, PYTHONIOENCODING="utf-8")

    # Run the test in a separate process to ensure complete isolation
    try:
        result = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=60,
            env=child_env,
        )
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
    except subprocess.TimeoutExpired:
        # A slow probe should be reported, not abort the remaining steps.
        print("❌ PaddleOCR isolation test timed out after 60 seconds")
    finally:
        os.unlink(script_path)
|
||
|
||
def create_isolated_ocr_processor():
    """Generate ``isolated_ocr_processor.py`` in the current directory.

    The generated module defines ``IsolatedOCRProcessor`` (a PaddleOCR
    wrapper that filters ``sys.path`` entries containing "openclip" while it
    initializes and runs) plus the ``get_isolated_ocr_processor`` singleton
    accessor.  Any existing file with that name is overwritten.

    Returns:
        None. Progress is reported through ``print``.
    """
    print("\n🔧 Creating Isolated OCR Processor")
    print("=" * 50)

    # Full source of the module to generate, written out verbatim below.
    module_source = '''
"""
Completely isolated OCR processor that avoids any OpenCLIP dependencies
"""

import os
import sys
import json
import tempfile
from pathlib import Path

class IsolatedOCRProcessor:
    """OCR processor that runs in complete isolation from OpenCLIP"""

    def __init__(self):
        self.ocr_engine = None
        self.available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize PaddleOCR without any OpenCLIP interference"""
        try:
            # Clean up sys.path to remove OpenCLIP paths
            original_path = sys.path.copy()
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]

            print("🚀 Initializing PaddleOCR in isolated environment...")
            from paddleocr import PaddleOCR

            self.ocr_engine = PaddleOCR(use_gpu=True)
            self.available = True
            print("✅ PaddleOCR initialized successfully with GPU")

            # Restore original path
            sys.path = original_path

        except Exception as e:
            print(f"❌ PaddleOCR initialization failed: {e}")
            self.available = False
            # Restore original path even on failure
            sys.path = original_path
            raise

    def extract_text_from_image(self, image_path):
        """Extract text from image using isolated OCR"""
        if not self.available or not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Clean up sys.path again for OCR execution
            original_path = sys.path.copy()
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]

            result = self.ocr_engine.ocr(image_path, cls=True)

            # Restore path
            sys.path = original_path

            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0

            for line in result[0]:
                try:
                    if len(line) == 2:
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue

                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                confidence_float = 0.0
                    else:
                        confidence_float = 0.0

                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1

                except Exception:
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1

            avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            full_text = "\\n".join(extracted_text)

            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }

        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            # Restore path on error
            sys.path = original_path
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

# Singleton instance
_ocr_instance = None

def get_isolated_ocr_processor():
    """Get singleton isolated OCR processor instance"""
    global _ocr_instance
    if _ocr_instance is None:
        _ocr_instance = IsolatedOCRProcessor()
    return _ocr_instance

if __name__ == "__main__":
    # Test the isolated OCR processor
    processor = get_isolated_ocr_processor()
    if processor.available:
        print("✅ Isolated OCR processor is available")
        # Test with an image
        test_image = "extracted_images/image1.png"
        if os.path.exists(test_image):
            result = processor.extract_text_from_image(test_image)
            print(f"OCR Result: {len(result['text'])} characters, confidence: {result['confidence']:.3f}")
            if result['text']:
                print(f"Text preview: {result['text'][:100]}...")
        else:
            print("❌ Test image not found")
    else:
        print("❌ Isolated OCR processor is not available")
'''

    # Emit the module next to wherever the script is being run from.
    Path("isolated_ocr_processor.py").write_text(module_source, encoding="utf-8")

    print("✅ Created isolated OCR processor")
|
||
|
||
def update_document_processor_for_isolation():
    """Patch ``LightRAG-main/lightrag/document_processor.py`` in place.

    Three exact-text substitutions are attempted, in this order:

    1. the ``OCRProcessor`` class header and ``_initialize_ocr`` are swapped
       for a version that obtains its engine from
       ``isolated_ocr_processor.get_isolated_ocr_processor``;
    2. the start of ``extract_text_from_image`` is swapped for a version
       that delegates to the isolated processor;
    3. the inline PaddleOCR result-parsing block is removed.

    The delegating method *returns* the isolated processor's result
    directly.  Step 3 deletes the only ``return`` of the success path, so
    merely assigning the result would leave the patched method returning
    ``None``.

    Every substitution whose search text is absent is reported, and the file
    is rewritten only when at least one substitution changed it.

    Returns:
        None. Outcome is reported through ``print``.

    Raises:
        OSError: if the target file cannot be read or written.
    """
    print("\n🔄 Updating Document Processor for Complete Isolation")
    print("=" * 50)

    target_path = "LightRAG-main/lightrag/document_processor.py"

    # Read current document processor
    with open(target_path, "r", encoding="utf-8") as f:
        original_content = f.read()

    # Replace the OCRProcessor class with a version that uses the isolated processor
    old_ocr_class = '''class OCRProcessor:
    """GPU-accelerated OCR processing using PaddleOCR with graceful fallback"""

    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize PaddleOCR engine with GPU only - no fallbacks"""
        try:
            logger.info("Initializing PaddleOCR with GPU mode only")
            self.ocr_engine = PaddleOCR(use_gpu=True)
            logger.info("PaddleOCR engine initialized successfully with GPU")
            self.ocr_available = True

        except Exception as e:
            logger.error(f"PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"PaddleOCR GPU initialization failed: {e}")'''

    # NOTE(review): the injected code adds the directory two levels above
    # document_processor.py to sys.path, while isolated_ocr_processor.py is
    # written to the current working directory -- confirm these coincide in
    # the intended layout.
    new_ocr_class = '''class OCRProcessor:
    """GPU-accelerated OCR processing using isolated PaddleOCR"""

    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize isolated PaddleOCR engine with GPU only"""
        try:
            logger.info("Initializing isolated PaddleOCR with GPU mode only")
            # Import the isolated OCR processor
            import sys
            import os
            parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            if parent_dir not in sys.path:
                sys.path.insert(0, parent_dir)
            from isolated_ocr_processor import get_isolated_ocr_processor

            self.ocr_engine = get_isolated_ocr_processor()
            if self.ocr_engine.available:
                logger.info("Isolated PaddleOCR engine initialized successfully with GPU")
                self.ocr_available = True
            else:
                raise RuntimeError("Isolated OCR processor not available")

        except Exception as e:
            logger.error(f"Isolated PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"Isolated PaddleOCR GPU initialization failed: {e}")'''

    # Also update the extract_text_from_image method
    old_extract_method = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Perform OCR
            result = self.ocr_engine.ocr(image_path, cls=True)'''

    # The isolated processor already returns the final result dict, so hand
    # it straight back to the caller.
    new_extract_method = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using isolated OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Perform OCR using isolated processor
            return self.ocr_engine.extract_text_from_image(image_path)'''

    # Remove the rest of the old OCR processing code since it's handled by the isolated processor
    old_ocr_processing = '''
            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

            # Process OCR results - handle different PaddleOCR result structures
            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0

            for line in result[0]:
                try:
                    # Handle different PaddleOCR result structures
                    if len(line) == 2:
                        # Standard structure: [[bbox], (text, confidence)]
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        # Handle alternative structures
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue

                    # Ensure text is string and confidence is float
                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                logger.warning(f"Could not convert confidence string to float: {confidence}")
                                confidence_float = 0.0
                        else:
                            logger.warning(f"Unexpected confidence type: {type(confidence)}, value: {confidence}")
                            confidence_float = 0.0
                    else:
                        confidence_float = 0.0

                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1

                except (TypeError, ValueError, IndexError) as e:
                    logger.warning(f"Type conversion error in OCR line processing: {e}")
                    # Add empty text and continue
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1

            try:
                avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            except (TypeError, ZeroDivisionError):
                avg_confidence = 0.0

            full_text = "\\n".join(extracted_text)

            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }'''

    # Applied sequentially, in the same order as before.
    substitutions = (
        ("OCRProcessor class", old_ocr_class, new_ocr_class),
        ("extract_text_from_image OCR call", old_extract_method, new_extract_method),
        ("inline OCR result processing", old_ocr_processing, ""),
    )

    content = original_content
    missing = []
    for label, old_text, new_text in substitutions:
        if old_text in content:
            content = content.replace(old_text, new_text)
        else:
            # str.replace would silently do nothing here; make the miss visible.
            missing.append(label)
            print(f"⚠️ Expected code not found, skipped: {label}")

    if content == original_content:
        print("❌ Document processor left unchanged: none of the expected code was found")
        return

    # Write the updated content
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(content)

    if missing:
        print(f"⚠️ Document processor only partially updated; review manually: {', '.join(missing)}")
    else:
        print("✅ Document processor updated for complete isolation")
|
||
|
||
def create_final_test():
|
||
"""Create a final test to verify the complete isolation"""
|
||
print("\n🧪 Creating Final Isolation Test")
|
||
print("=" * 50)
|
||
|
||
test_code = '''
|
||
"""
|
||
Final test to verify complete dependency isolation between PaddleOCR and OpenCLIP
|
||
"""
|
||
|
||
import asyncio
|
||
import sys
|
||
import os
|
||
from pathlib import Path
|
||
|
||
# Add paths
|
||
sys.path.insert(0, "LightRAG-main")
|
||
|
||
async def test_complete_isolation():
|
||
"""Test that PaddleOCR and OpenCLIP are completely isolated"""
|
||
print("🔍 TESTING COMPLETE DEPENDENCY ISOLATION")
|
||
print("=" * 60)
|
||
|
||
try:
|
||
from lightrag.document_processor import get_document_processor
|
||
|
||
processor = get_document_processor()
|
||
|
||
print("🎯 SYSTEM STATUS:")
|
||
print(f" OCR Processor: {'✅ Available' if processor.ocr_processor.ocr_available else '❌ Not Available'}")
|
||
print(f" Image Classifier: {'✅ Available' if processor.image_classifier and processor.image_classifier.available else '❌ Not Available'}")
|
||
|
||
# Process test document
|
||
test_file = "test.docx"
|
||
if not os.path.exists(test_file):
|
||
print(f"❌ Test file not found: {test_file}")
|
||
return
|
||
|
||
print(f"\\n📄 PROCESSING DOCUMENT: {test_file}")
|
||
result = await processor.process_document(test_file)
|
||
|
||
print(f"✅ Processing Success: {result.success}")
|
||
print(f"📊 Metadata: {result.metadata}")
|
||
|
||
# Check OCR results
|
||
print(f"\\n🔤 OCR RESULTS:")
|
||
ocr_working = False
|
||
for i, img in enumerate(result.images):
|
||
if 'ocr_text' in img and img['ocr_text'].strip():
|
||
ocr_working = True
|
||
print(f" ✅ Image {i+1}: OCR extracted {len(img['ocr_text'])} characters")
|
||
if img['ocr_text'].strip():
|
||
print(f" Text: {img['ocr_text'][:100]}...")
|
||
elif 'ocr_error' in img:
|
||
print(f" ❌ Image {i+1}: OCR failed - {img['ocr_error']}")
|
||
else:
|
||
print(f" ⚠️ Image {i+1}: No OCR text extracted")
|
||
|
||
# Check classification results
|
||
print(f"\\n🖼️ CLASSIFICATION RESULTS:")
|
||
classification_working = False
|
||
bee_detected = False
|
||
for i, img in enumerate(result.images):
|
||
if 'classification' in img and img['classification']:
|
||
classification_working = True
|
||
top_label = img['classification'][0]['label'] if img['classification'] else 'unknown'
|
||
print(f" ✅ Image {i+1}: Classified as '{top_label}'")
|
||
if 'bee' in top_label.lower():
|
||
bee_detected = True
|
||
print(f" 🎯 BEE DETECTED in image {i+1}!")
|
||
|
||
print(f"\\n🎯 FINAL VERIFICATION:")
|
||
if ocr_working:
|
||
print(" ✅ OCR is working with complete dependency isolation")
|
||
else:
|
||
print(" ❌ OCR is not working properly")
|
||
|
||
if classification_working:
|
||
print(" ✅ Image classification is working with complete dependency isolation")
|
||
else:
|
||
print(" ❌ Image classification is not working properly")
|
||
|
||
if bee_detected:
|
||
print(" ✅ Bee image successfully detected and classified")
|
||
else:
|
||
print(" ❌ Bee image not detected in classifications")
|
||
|
||
print(f"\\n🚀 DEPENDENCY ISOLATION STATUS: {'✅ SUCCESS' if ocr_working and classification_working else '❌ FAILED'}")
|
||
|
||
except Exception as e:
|
||
print(f"❌ Test failed: {e}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(test_complete_isolation())
|
||
'''
|
||
|
||
with open("final_isolation_test.py", "w", encoding="utf-8") as f:
|
||
f.write(test_code)
|
||
|
||
print("✅ Created final isolation test")
|
||
|
||
def main():
    """Run every isolation step in order and print a closing summary.

    The steps are: the standalone PaddleOCR smoke test, generation of the
    isolated OCR processor module, the in-place patch of the document
    processor, and generation of the final verification script.

    Returns:
        None.
    """
    print("🎯 FIXING OCR ISSUES WITH COMPLETE DEPENDENCY ISOLATION")
    print("=" * 70)

    # Test PaddleOCR alone first
    test_paddleocr_alone()

    # Create isolated OCR processor
    create_isolated_ocr_processor()

    # Update document processor
    update_document_processor_for_isolation()

    # Create final test
    create_final_test()

    # These are ordinary string literals (not code templates), so a single
    # backslash is needed for a real newline; "\\n" printed a literal "\n".
    print("\n✅ COMPLETE ISOLATION SOLUTION IMPLEMENTED:")
    print(" - Created isolated OCR processor that avoids OpenCLIP paths")
    print(" - Updated document processor to use isolated components")
    print(" - Ensured complete dependency separation")
    print("\n🚀 Run the final test: python final_isolation_test.py")
|
||
|
||
# Script entry point: run all isolation steps only when executed directly.
if __name__ == "__main__":
    main()