Files
railseek6/direct_ocr_fix.py

393 lines
14 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Direct fix for OCR issues - complete isolation between PaddleOCR and OpenCLIP
"""
import os
import sys
import subprocess
def create_simple_isolated_ocr(output_path="simple_ocr_processor.py"):
    """Generate a standalone PaddleOCR wrapper module.

    The generated module defines ``SimpleOCRProcessor`` plus the singleton
    accessor ``get_simple_ocr_processor()``.  While PaddleOCR is imported or
    executed, every ``sys.path`` entry containing ``openclip`` is temporarily
    removed; the original path is always restored afterwards.

    Args:
        output_path: Where the generated module is written.  Defaults to
            ``simple_ocr_processor.py`` in the current working directory.
    """
    print("🔧 Creating Simple Isolated OCR Processor")
    print("=" * 50)
    # NOTE: the template is a plain (non-f, non-raw) string, so "\\n" below
    # becomes the two characters backslash-n in the generated source.
    code = '''
import numbers
import os
import sys


def _empty_result():
    """Return a fresh 'nothing recognised' result dict."""
    return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}


class SimpleOCRProcessor:
    """Simple OCR processor that avoids OpenCLIP conflicts"""

    def __init__(self):
        self.available = False
        self.ocr_engine = None
        self._initialize()

    def _initialize(self):
        """Initialize PaddleOCR with clean environment"""
        # Save original sys.path
        original_path = sys.path.copy()
        try:
            # Filter out OpenCLIP paths
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]
            from paddleocr import PaddleOCR
            self.ocr_engine = PaddleOCR(use_gpu=True)
            self.available = True
            print("✅ PaddleOCR initialized successfully")
        except Exception as e:
            print(f"❌ PaddleOCR initialization failed: {e}")
            self.available = False
        finally:
            # Restore the path on success and on error alike
            sys.path = original_path

    def extract_text_from_image(self, image_path):
        """Extract text from image"""
        if not self.available or not self.ocr_engine:
            return _empty_result()
        try:
            # Clean environment for OCR execution
            original_path = sys.path.copy()
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]
            try:
                result = self.ocr_engine.ocr(image_path, cls=True)
            finally:
                # Restore path even when the engine raises
                sys.path = original_path
            if not result or not result[0]:
                return _empty_result()
            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0
            for line in result[0]:
                try:
                    if len(line) == 2:
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        bbox = line[0]
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]), 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue
                    text_str = str(text) if text is not None else ""
                    # numbers.Real also accepts non-float numeric scalars
                    if isinstance(confidence, numbers.Real):
                        confidence_float = float(confidence)
                    else:
                        confidence_float = 0.0
                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1
                except Exception:
                    # Keep a placeholder so indexes stay aligned with the OCR output
                    extracted_text.append("")
                    bboxes.append([])
                    line_count += 1
            avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            full_text = "\\n".join(extracted_text)
            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }
        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            return _empty_result()


# Singleton
_ocr_instance = None


def get_simple_ocr_processor():
    global _ocr_instance
    if _ocr_instance is None:
        _ocr_instance = SimpleOCRProcessor()
    return _ocr_instance


if __name__ == "__main__":
    processor = get_simple_ocr_processor()
    if processor.available:
        print("✅ Simple OCR processor is working")
        # Test with an image
        test_image = "extracted_images/image1.png"
        if os.path.exists(test_image):
            result = processor.extract_text_from_image(test_image)
            print(f"OCR Result: {len(result['text'])} chars, confidence: {result['confidence']:.3f}")
            if result['text']:
                print(f"Text: {result['text'][:100]}...")
    else:
        print("❌ Simple OCR processor failed")
'''
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(code)
    print("✅ Created simple OCR processor")
def test_ocr_directly(timeout=30):
    """Smoke-test PaddleOCR in a fresh interpreter, outside any document processor.

    A small script is run with ``sys.executable -c``; it imports PaddleOCR,
    initialises it with ``use_gpu=True`` and, if
    ``extracted_images/image1.png`` exists, prints up to three recognised
    lines.  The child's stdout/stderr are echoed by this function.

    Args:
        timeout: Seconds to wait for the child process before giving up.

    Returns:
        None.  All outcomes (success, failure, timeout) are reported by
        printing; no exception is propagated.
    """
    print("\n🔍 Testing OCR Directly")
    print("=" * 50)
    test_code = '''
import sys
import os
# Test PaddleOCR in complete isolation
try:
    print("Testing PaddleOCR directly...")
    # Import PaddleOCR directly
    from paddleocr import PaddleOCR
    print("✅ PaddleOCR imported successfully")
    # Initialize OCR
    ocr = PaddleOCR(use_gpu=True)
    print("✅ PaddleOCR initialized with GPU")
    # Test on an image
    test_image = "extracted_images/image1.png"
    if os.path.exists(test_image):
        print(f"Testing OCR on: {test_image}")
        result = ocr.ocr(test_image, cls=True)
        if result and result[0]:
            print(f"✅ OCR successful - found {len(result[0])} text lines")
            for i, line in enumerate(result[0][:3]):
                if len(line) >= 2:
                    text = line[1][0] if len(line[1]) > 0 else "No text"
                    confidence = line[1][1] if len(line[1]) > 1 else 0.0
                    print(f" Line {i+1}: '{text}' (confidence: {confidence:.3f})")
        else:
            print("❌ OCR returned no results")
    else:
        print(f"❌ Test image not found: {test_image}")
except Exception as e:
    print(f"❌ OCR test failed: {e}")
    import traceback
    traceback.print_exc()
'''
    # The child prints emoji through a pipe; force UTF-8 on both ends so a
    # non-UTF-8 locale cannot turn the status output into an encoding error.
    child_env = dict(os.environ)
    child_env["PYTHONIOENCODING"] = "utf-8"
    # Run the test directly
    try:
        result = subprocess.run(
            [sys.executable, "-c", test_code],
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
            env=child_env,
        )
    except subprocess.TimeoutExpired:
        print(f"❌ Test execution failed: OCR test timed out after {timeout}s")
        return
    except Exception as e:
        print(f"❌ Test execution failed: {e}")
        return
    print(result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)
    if result.returncode != 0:
        # The child catches its own exceptions, so a non-zero code means it
        # died abnormally (e.g. killed or crashed in native code).
        print(f"❌ OCR test process exited with code {result.returncode}")
def update_document_processor_simple(
    processor_path="LightRAG-main/lightrag/document_processor.py",
):
    """Patch the document processor so it delegates OCR to ``simple_ocr_processor``.

    Two exact-text replacements are applied to the file at ``processor_path``:

    * the start of ``_initialize_ocr`` is rewritten to obtain the engine from
      ``get_simple_ocr_processor()`` instead of constructing ``PaddleOCR``;
    * the start of ``extract_text_from_image`` is rewritten to return the
      wrapper's result dict directly.

    Matching is whitespace-sensitive: the patterns assume the target methods
    are indented with 4 spaces inside their class.  A pattern that is not
    found is reported and skipped; the file is only rewritten when at least
    one replacement was applied.

    Args:
        processor_path: Path of the document processor source to patch.

    Raises:
        OSError: If the file cannot be read or written.
    """
    print("\n🔄 Updating Document Processor")
    print("=" * 50)
    # Read current document processor
    with open(processor_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Replace the OCRProcessor initialization
    old_init = '''    def _initialize_ocr(self):
        """Initialize PaddleOCR engine with GPU only - no fallbacks"""
        try:
            logger.info("Initializing PaddleOCR with GPU mode only")
            self.ocr_engine = PaddleOCR(use_gpu=True)
            logger.info("PaddleOCR engine initialized successfully with GPU")
            self.ocr_available = True'''
    new_init = '''    def _initialize_ocr(self):
        """Initialize PaddleOCR engine with GPU only - no fallbacks"""
        try:
            logger.info("Initializing PaddleOCR with GPU mode only")
            # Use simple isolated OCR processor
            import sys
            import os
            # simple_ocr_processor.py is generated next to the directory that
            # contains this package's parent, so make both levels importable.
            parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            for candidate in (os.path.dirname(parent_dir), parent_dir):
                if candidate not in sys.path:
                    sys.path.insert(0, candidate)
            from simple_ocr_processor import get_simple_ocr_processor
            self.ocr_engine = get_simple_ocr_processor()
            if self.ocr_engine.available:
                logger.info("PaddleOCR engine initialized successfully with GPU")
                self.ocr_available = True
            else:
                raise RuntimeError("Simple OCR processor not available")'''

    # Update the extract method
    old_extract = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
        try:
            # Perform OCR
            result = self.ocr_engine.ocr(image_path, cls=True)'''
    # The simple processor already returns the final result dict.  Returning it
    # here keeps the legacy parsing code that follows in the target file (which
    # indexes result[0] and would fail on a dict) from ever running.
    new_extract = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
        try:
            # Perform OCR using simple processor; it returns the final result
            # dict, so the legacy parsing code below is intentionally unreachable
            return self.ocr_engine.extract_text_from_image(image_path)'''

    replacements = (
        ("OCR initialization", old_init, new_init),
        ("OCR extraction", old_extract, new_extract),
    )
    updated = content
    applied = 0
    for label, old_text, new_text in replacements:
        if old_text in updated:
            updated = updated.replace(old_text, new_text)
            applied += 1
        else:
            # str.replace would silently do nothing; make the miss visible
            print(f"⚠️ {label} code not found - left unchanged (already patched?)")

    if applied == 0:
        print("❌ Document processor not updated - no matching code found")
        return

    # Write updated content
    with open(processor_path, "w", encoding="utf-8") as f:
        f.write(updated)
    print("✅ Document processor updated")
def create_final_verification(output_path="verify_fix.py"):
    """Write the end-to-end verification script.

    The generated script adds ``LightRAG-main`` to ``sys.path``, obtains the
    document processor via ``get_document_processor()``, processes
    ``test.docx`` and prints whether OCR text and image classification
    results (including a label containing "bee") were produced.

    Args:
        output_path: Where the script is written.  Defaults to
            ``verify_fix.py`` in the current working directory.
    """
    print("\n🧪 Creating Final Verification")
    print("=" * 50)
    # NOTE: plain (non-raw) template, so "\\n" becomes backslash-n in the
    # generated source, i.e. a newline inside the generated f-strings.
    test_code = '''
import asyncio
import sys
import os

# Add paths
sys.path.insert(0, "LightRAG-main")


async def verify_fix():
    """Verify that OCR and OpenCLIP are working independently"""
    print("🔍 VERIFYING COMPLETE FIX")
    print("=" * 50)
    try:
        from lightrag.document_processor import get_document_processor
        processor = get_document_processor()
        print("🎯 COMPONENT STATUS:")
        print(f" OCR: {'✅ Available' if processor.ocr_processor.ocr_available else '❌ Not Available'}")
        print(f" Image Classifier: {'✅ Available' if processor.image_classifier and processor.image_classifier.available else '❌ Not Available'}")
        # Process test document
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return
        print(f"\\n📄 PROCESSING: {test_file}")
        result = await processor.process_document(test_file)
        if not result.success:
            print(f"❌ Processing failed: {result.error}")
            return
        print(f"✅ Processing successful")
        print(f"📊 Metadata: {result.metadata}")
        # Check OCR results
        print(f"\\n🔤 OCR PERFORMANCE:")
        ocr_success = False
        for i, img in enumerate(result.images):
            if 'ocr_text' in img and img['ocr_text'].strip():
                ocr_success = True
                text_len = len(img['ocr_text'])
                confidence = img.get('ocr_confidence', 0)
                print(f" ✅ Image {i+1}: {text_len} chars, confidence: {confidence:.3f}")
                print(f" Text: {img['ocr_text'][:50]}...")
            elif 'ocr_error' in img:
                print(f" ❌ Image {i+1}: {img['ocr_error']}")
            else:
                print(f" ⚠️ Image {i+1}: No OCR text")
        # Check classification
        print(f"\\n🖼 CLASSIFICATION PERFORMANCE:")
        classification_success = False
        bee_found = False
        for i, img in enumerate(result.images):
            if 'classification' in img and img['classification']:
                classification_success = True
                top_result = img['classification'][0]
                label = top_result.get('label', 'unknown')
                score = top_result.get('confidence', 0)
                print(f" ✅ Image {i+1}: {label} (score: {score:.3f})")
                if 'bee' in label.lower():
                    bee_found = True
                    print(f" 🎯 BEE DETECTED!")
        print(f"\\n🎯 FINAL RESULTS:")
        print(f" OCR: {'✅ WORKING' if ocr_success else '❌ FAILED'}")
        print(f" Classification: {'✅ WORKING' if classification_success else '❌ FAILED'}")
        print(f" Bee Detection: {'✅ SUCCESS' if bee_found else '❌ NOT FOUND'}")
        print(f" Dependency Isolation: {'✅ ACHIEVED' if ocr_success and classification_success else '❌ FAILED'}")
    except Exception as e:
        print(f"❌ Verification failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(verify_fix())
'''
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(test_code)
    print("✅ Created verification test")
def main():
    """Run the complete fix.

    Steps, in order: smoke-test PaddleOCR in a subprocess, generate the
    isolated OCR wrapper module, patch the document processor to use it, and
    write the verification script.  Progress is reported by printing.
    """
    print("🎯 DIRECT OCR FIX - COMPLETE ISOLATION")
    print("=" * 60)
    # Test OCR directly first
    test_ocr_directly()
    # Create simple isolated OCR
    create_simple_isolated_ocr()
    # Update document processor
    update_document_processor_simple()
    # Create verification
    create_final_verification()
    # These strings are printed directly (not written into a generated file),
    # so the newline must be a real "\n" rather than an escaped backslash.
    print("\n✅ FIXES COMPLETED:")
    print(" - Created simple isolated OCR processor")
    print(" - Updated document processor to use isolated components")
    print(" - Ensured complete dependency separation")
    print("\n🚀 Run verification: python verify_fix.py")
# Run the fix pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()