Files
railseek6/fix_ocr_issues.py

546 lines
21 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Fix OCR issues by completely isolating PaddleOCR from OpenCLIP dependencies
"""
import os
import sys
import subprocess
import tempfile
import json
from pathlib import Path
def test_paddleocr_alone():
    """Smoke-test PaddleOCR in a separate interpreter, away from OpenCLIP.

    A small script is written to a temporary ``.py`` file and executed with
    ``sys.executable``. The script filters every ``sys.path`` entry containing
    ``openclip``, imports torch and PaddleOCR, initialises PaddleOCR with
    ``use_gpu=True`` and, if ``extracted_images/image1.png`` exists, prints the
    first three recognised lines. The child's stdout/stderr are echoed here.

    Returns:
        None. All results are reported through ``print``.
    """
    print("🔍 Testing PaddleOCR in Isolation")
    print("=" * 50)

    # Test PaddleOCR directly without importing OpenCLIP
    test_code = '''
import sys
import os

# Remove any OpenCLIP paths from sys.path to ensure isolation
original_path = sys.path.copy()
filtered_path = [p for p in sys.path if 'openclip' not in p.lower()]
sys.path = filtered_path

try:
    print("🧪 Testing PaddleOCR without OpenCLIP interference...")

    # Test basic imports
    import torch
    print(f"✅ PyTorch: {torch.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")

    # Test PaddleOCR
    from paddleocr import PaddleOCR
    print("✅ PaddleOCR imported successfully")

    # Initialize OCR
    ocr = PaddleOCR(use_gpu=True)
    print("✅ PaddleOCR GPU initialization successful")

    # Test OCR on an image
    test_image = "extracted_images/image1.png"
    if os.path.exists(test_image):
        print(f"📸 Testing OCR on: {test_image}")
        result = ocr.ocr(test_image, cls=True)
        if result and result[0]:
            print(f"✅ OCR successful - found {len(result[0])} text lines")
            for i, line in enumerate(result[0][:3]):  # Show first 3 lines
                text = line[1][0] if len(line) > 1 and len(line[1]) > 0 else "No text"
                confidence = line[1][1] if len(line) > 1 and len(line[1]) > 1 else 0.0
                print(f" Line {i+1}: '{text}' (confidence: {confidence:.3f})")
        else:
            print("❌ OCR returned no results")
    else:
        print(f"❌ Test image not found: {test_image}")
except Exception as e:
    print(f"❌ PaddleOCR test failed: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Restore original path
    sys.path = original_path
'''

    # The script contains emoji, so it must be written as UTF-8 explicitly;
    # the platform default codec (e.g. cp1252) cannot encode it.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(test_code)
        script_path = f.name

    # Make the child emit UTF-8 so its emoji output neither crashes the child
    # nor gets mis-decoded by the parent on non-UTF-8 consoles.
    child_env = dict(os.environ, PYTHONIOENCODING="utf-8")

    # Run the test in a separate process to ensure complete isolation
    try:
        result = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=60,
            env=child_env,
        )
    except subprocess.TimeoutExpired:
        # A slow model load must not abort the remaining setup steps.
        print("❌ PaddleOCR isolation test timed out after 60 seconds")
    else:
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
    finally:
        os.unlink(script_path)
def create_isolated_ocr_processor():
    """Write ``isolated_ocr_processor.py`` into the current working directory.

    The generated module defines ``IsolatedOCRProcessor`` (a PaddleOCR wrapper
    that temporarily drops every ``sys.path`` entry containing ``openclip``
    while importing and running PaddleOCR) and the singleton accessor
    ``get_isolated_ocr_processor()``. ``extract_text_from_image`` in the
    generated module returns a dict with the keys ``text``, ``confidence``,
    ``bboxes`` and ``line_count``.

    Returns:
        None. An existing file of the same name is overwritten.
    """
    print("\n🔧 Creating Isolated OCR Processor")
    print("=" * 50)

    # NOTE: this is a non-raw string, so "\\n" below is emitted as "\n" in the
    # generated file.
    ocr_processor_code = '''
"""
Completely isolated OCR processor that avoids any OpenCLIP dependencies
"""
import os
import sys
import json
import tempfile
from pathlib import Path


class IsolatedOCRProcessor:
    """OCR processor that runs in complete isolation from OpenCLIP"""

    def __init__(self):
        self.ocr_engine = None
        self.available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize PaddleOCR without any OpenCLIP interference"""
        # Clean up sys.path to remove OpenCLIP paths
        original_path = sys.path.copy()
        sys.path = [p for p in sys.path if 'openclip' not in p.lower()]
        try:
            print("🚀 Initializing PaddleOCR in isolated environment...")
            from paddleocr import PaddleOCR
            self.ocr_engine = PaddleOCR(use_gpu=True)
            self.available = True
            print("✅ PaddleOCR initialized successfully with GPU")
        except Exception as e:
            print(f"❌ PaddleOCR initialization failed: {e}")
            self.available = False
            raise
        finally:
            # Restore original path on success and on failure alike
            sys.path = original_path

    def extract_text_from_image(self, image_path):
        """Extract text from image using isolated OCR"""
        if not self.available or not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Clean up sys.path again for OCR execution
            original_path = sys.path.copy()
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]
            try:
                result = self.ocr_engine.ocr(image_path, cls=True)
            finally:
                # Restore path
                sys.path = original_path

            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0

            for line in result[0]:
                try:
                    if len(line) == 2:
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue

                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                confidence_float = 0.0
                        else:
                            confidence_float = 0.0

                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1
                except Exception:
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1

            avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            full_text = "\\n".join(extracted_text)

            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }
        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}


# Singleton instance
_ocr_instance = None


def get_isolated_ocr_processor():
    """Get singleton isolated OCR processor instance"""
    global _ocr_instance
    if _ocr_instance is None:
        _ocr_instance = IsolatedOCRProcessor()
    return _ocr_instance


if __name__ == "__main__":
    # Test the isolated OCR processor
    processor = get_isolated_ocr_processor()
    if processor.available:
        print("✅ Isolated OCR processor is available")
        # Test with an image
        test_image = "extracted_images/image1.png"
        if os.path.exists(test_image):
            result = processor.extract_text_from_image(test_image)
            print(f"OCR Result: {len(result['text'])} characters, confidence: {result['confidence']:.3f}")
            if result['text']:
                print(f"Text preview: {result['text'][:100]}...")
        else:
            print("❌ Test image not found")
    else:
        print("❌ Isolated OCR processor is not available")
'''

    with open("isolated_ocr_processor.py", "w", encoding="utf-8") as f:
        f.write(ocr_processor_code)

    print("✅ Created isolated OCR processor")
def _whitespace_tolerant_pattern(snippet):
    """Compile ``snippet`` into a regex that ignores indentation and blank lines.

    Every non-blank line of ``snippet`` must appear in order, matched
    literally after stripping; between consecutive lines any amount of
    whitespace (trailing spaces, blank lines, indentation) is accepted.
    The match covers whole lines, including the first line's indentation.
    """
    import re  # local import: the module header does not import ``re``

    significant = [line.strip() for line in snippet.splitlines() if line.strip()]
    body = r"[ \t]*\n\s*".join(re.escape(line) for line in significant)
    return re.compile(r"^[ \t]*" + body + r"[ \t]*$", re.MULTILINE)


def update_document_processor_for_isolation():
    """Patch ``LightRAG-main/lightrag/document_processor.py`` in place.

    Three source-level patches are attempted:

    1. replace the ``OCRProcessor`` class header/initialiser with a version
       that obtains its engine from ``isolated_ocr_processor``;
    2. replace the start of ``extract_text_from_image`` so that it returns the
       isolated processor's result dict directly;
    3. delete the old inline post-processing of raw PaddleOCR results.

    Each patch is matched with whitespace tolerance and reported individually.
    The file is rewritten only if at least one patch changed it.

    Returns:
        None.

    Raises:
        OSError: if the target file cannot be read or written.
    """
    print("\n🔄 Updating Document Processor for Complete Isolation")
    print("=" * 50)

    target_path = "LightRAG-main/lightrag/document_processor.py"

    # Read current document processor
    with open(target_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Replace the OCRProcessor class with a version that uses the isolated processor
    old_ocr_class = '''class OCRProcessor:
    """GPU-accelerated OCR processing using PaddleOCR with graceful fallback"""
    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()
    def _initialize_ocr(self):
        """Initialize PaddleOCR engine with GPU only - no fallbacks"""
        try:
            logger.info("Initializing PaddleOCR with GPU mode only")
            self.ocr_engine = PaddleOCR(use_gpu=True)
            logger.info("PaddleOCR engine initialized successfully with GPU")
            self.ocr_available = True
        except Exception as e:
            logger.error(f"PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"PaddleOCR GPU initialization failed: {e}")'''

    new_ocr_class = '''class OCRProcessor:
    """GPU-accelerated OCR processing using isolated PaddleOCR"""

    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize isolated PaddleOCR engine with GPU only"""
        try:
            logger.info("Initializing isolated PaddleOCR with GPU mode only")
            # Import the isolated OCR processor
            import sys
            import os
            parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            if parent_dir not in sys.path:
                sys.path.insert(0, parent_dir)
            from isolated_ocr_processor import get_isolated_ocr_processor
            self.ocr_engine = get_isolated_ocr_processor()
            if self.ocr_engine.available:
                logger.info("Isolated PaddleOCR engine initialized successfully with GPU")
                self.ocr_available = True
            else:
                raise RuntimeError("Isolated OCR processor not available")
        except Exception as e:
            logger.error(f"Isolated PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"Isolated PaddleOCR GPU initialization failed: {e}")'''

    # Also update the extract_text_from_image method
    old_extract_method = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
        try:
            # Perform OCR
            result = self.ocr_engine.ocr(image_path, cls=True)'''

    # The isolated processor already returns the final result dict. Return it
    # directly: the code that used to build and return the dict is removed by
    # the third patch, so merely assigning ``result`` would leave the success
    # path without a return value.
    new_extract_method = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using isolated OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
        try:
            # Perform OCR using isolated processor
            return self.ocr_engine.extract_text_from_image(image_path)'''

    # Remove the rest of the old OCR processing code since it's handled by the isolated processor
    old_ocr_processing = '''
            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
            # Process OCR results - handle different PaddleOCR result structures
            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0
            for line in result[0]:
                try:
                    # Handle different PaddleOCR result structures
                    if len(line) == 2:
                        # Standard structure: [[bbox], (text, confidence)]
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        # Handle alternative structures
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue
                    # Ensure text is string and confidence is float
                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                logger.warning(f"Could not convert confidence string to float: {confidence}")
                                confidence_float = 0.0
                        else:
                            logger.warning(f"Unexpected confidence type: {type(confidence)}, value: {confidence}")
                            confidence_float = 0.0
                    else:
                        confidence_float = 0.0
                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1
                except (TypeError, ValueError, IndexError) as e:
                    logger.warning(f"Type conversion error in OCR line processing: {e}")
                    # Add empty text and continue
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1
            try:
                avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            except (TypeError, ZeroDivisionError):
                avg_confidence = 0.0
            full_text = "\\n".join(extracted_text)
            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }'''

    patches = [
        ("OCRProcessor class", old_ocr_class, new_ocr_class),
        ("extract_text_from_image entry", old_extract_method, new_extract_method),
        # Just remove this block since the isolated processor handles the processing
        ("inline OCR result processing", old_ocr_processing, ""),
    ]

    updated = content
    for label, old_snippet, new_snippet in patches:
        pattern = _whitespace_tolerant_pattern(old_snippet)
        # A function replacement keeps backslashes in the new text literal.
        updated, hits = pattern.subn(
            lambda _match, replacement=new_snippet: replacement, updated
        )
        if hits:
            print(f"✅ Patched {label}")
        else:
            # Unlike str.replace, make a missing anchor visible to the operator.
            print(f"⚠️ Pattern for {label} not found - left unchanged")

    if updated == content:
        print("⚠️ Document processor not modified: no patch matched")
        return

    # Write the updated content
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(updated)

    print("✅ Document processor updated for complete isolation")
def create_final_test():
"""Create a final test to verify the complete isolation"""
print("\n🧪 Creating Final Isolation Test")
print("=" * 50)
test_code = '''
"""
Final test to verify complete dependency isolation between PaddleOCR and OpenCLIP
"""
import asyncio
import sys
import os
from pathlib import Path
# Add paths
sys.path.insert(0, "LightRAG-main")
async def test_complete_isolation():
"""Test that PaddleOCR and OpenCLIP are completely isolated"""
print("🔍 TESTING COMPLETE DEPENDENCY ISOLATION")
print("=" * 60)
try:
from lightrag.document_processor import get_document_processor
processor = get_document_processor()
print("🎯 SYSTEM STATUS:")
print(f" OCR Processor: {'✅ Available' if processor.ocr_processor.ocr_available else '❌ Not Available'}")
print(f" Image Classifier: {'✅ Available' if processor.image_classifier and processor.image_classifier.available else '❌ Not Available'}")
# Process test document
test_file = "test.docx"
if not os.path.exists(test_file):
print(f"❌ Test file not found: {test_file}")
return
print(f"\\n📄 PROCESSING DOCUMENT: {test_file}")
result = await processor.process_document(test_file)
print(f"✅ Processing Success: {result.success}")
print(f"📊 Metadata: {result.metadata}")
# Check OCR results
print(f"\\n🔤 OCR RESULTS:")
ocr_working = False
for i, img in enumerate(result.images):
if 'ocr_text' in img and img['ocr_text'].strip():
ocr_working = True
print(f" ✅ Image {i+1}: OCR extracted {len(img['ocr_text'])} characters")
if img['ocr_text'].strip():
print(f" Text: {img['ocr_text'][:100]}...")
elif 'ocr_error' in img:
print(f" ❌ Image {i+1}: OCR failed - {img['ocr_error']}")
else:
print(f" ⚠️ Image {i+1}: No OCR text extracted")
# Check classification results
print(f"\\n🖼 CLASSIFICATION RESULTS:")
classification_working = False
bee_detected = False
for i, img in enumerate(result.images):
if 'classification' in img and img['classification']:
classification_working = True
top_label = img['classification'][0]['label'] if img['classification'] else 'unknown'
print(f" ✅ Image {i+1}: Classified as '{top_label}'")
if 'bee' in top_label.lower():
bee_detected = True
print(f" 🎯 BEE DETECTED in image {i+1}!")
print(f"\\n🎯 FINAL VERIFICATION:")
if ocr_working:
print(" ✅ OCR is working with complete dependency isolation")
else:
print(" ❌ OCR is not working properly")
if classification_working:
print(" ✅ Image classification is working with complete dependency isolation")
else:
print(" ❌ Image classification is not working properly")
if bee_detected:
print(" ✅ Bee image successfully detected and classified")
else:
print(" ❌ Bee image not detected in classifications")
print(f"\\n🚀 DEPENDENCY ISOLATION STATUS: {'✅ SUCCESS' if ocr_working and classification_working else '❌ FAILED'}")
except Exception as e:
print(f"❌ Test failed: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_complete_isolation())
'''
with open("final_isolation_test.py", "w", encoding="utf-8") as f:
f.write(test_code)
print("✅ Created final isolation test")
def main():
    """Run every isolation step in order and print a summary.

    Steps: smoke-test PaddleOCR in a child process, generate
    ``isolated_ocr_processor.py``, patch the LightRAG document processor, and
    generate ``final_isolation_test.py``.
    """
    print("🎯 FIXING OCR ISSUES WITH COMPLETE DEPENDENCY ISOLATION")
    print("=" * 70)

    # Test PaddleOCR alone first
    test_paddleocr_alone()

    # Create isolated OCR processor
    create_isolated_ocr_processor()

    # Update document processor
    update_document_processor_for_isolation()

    # Create final test
    create_final_test()

    # These are real print calls (not template text), so the newline escape
    # must be a single backslash; "\\n" would print a literal backslash-n.
    print("\n✅ COMPLETE ISOLATION SOLUTION IMPLEMENTED:")
    print(" - Created isolated OCR processor that avoids OpenCLIP paths")
    print(" - Updated document processor to use isolated components")
    print(" - Ensured complete dependency separation")
    print("\n🚀 Run the final test: python final_isolation_test.py")


if __name__ == "__main__":
    main()