"""
Fix OCR issues by completely isolating PaddleOCR from OpenCLIP dependencies

Running this script:

1. smoke-tests PaddleOCR in a separate Python process,
2. writes ``isolated_ocr_processor.py`` (a PaddleOCR wrapper that filters
   OpenCLIP entries out of ``sys.path`` while PaddleOCR is imported/used),
3. patches ``LightRAG-main/lightrag/document_processor.py`` so that its
   ``OCRProcessor`` delegates to the isolated processor, and
4. writes ``final_isolation_test.py`` to verify the result end to end.

All default paths are relative to the current working directory.
"""

import os
import sys
import subprocess
import tempfile
import json
import re
from pathlib import Path

# Default locations, relative to the current working directory.
DOCUMENT_PROCESSOR_PATH = "LightRAG-main/lightrag/document_processor.py"
ISOLATED_OCR_PROCESSOR_PATH = "isolated_ocr_processor.py"
FINAL_TEST_PATH = "final_isolation_test.py"

# ---------------------------------------------------------------------------
# Script executed in a child process by test_paddleocr_alone().
# ---------------------------------------------------------------------------
_PADDLEOCR_SMOKE_TEST = '''
import sys
import os

# Remove any OpenCLIP paths from sys.path to ensure isolation
original_path = sys.path.copy()
filtered_path = [p for p in sys.path if 'openclip' not in p.lower()]
sys.path = filtered_path

try:
    print("🧪 Testing PaddleOCR without OpenCLIP interference...")

    # Test basic imports
    import torch
    print(f"✅ PyTorch: {torch.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")

    # Test PaddleOCR
    from paddleocr import PaddleOCR
    print("✅ PaddleOCR imported successfully")

    # Initialize OCR
    ocr = PaddleOCR(use_gpu=True)
    print("✅ PaddleOCR GPU initialization successful")

    # Test OCR on an image
    test_image = "extracted_images/image1.png"
    if os.path.exists(test_image):
        print(f"📸 Testing OCR on: {test_image}")
        result = ocr.ocr(test_image, cls=True)

        if result and result[0]:
            print(f"✅ OCR successful - found {len(result[0])} text lines")
            for i, line in enumerate(result[0][:3]):  # Show first 3 lines
                text = line[1][0] if len(line) > 1 and len(line[1]) > 0 else "No text"
                confidence = line[1][1] if len(line) > 1 and len(line[1]) > 1 else 0.0
                print(f"   Line {i+1}: '{text}' (confidence: {confidence:.3f})")
        else:
            print("❌ OCR returned no results")
    else:
        print(f"❌ Test image not found: {test_image}")

except Exception as e:
    print(f"❌ PaddleOCR test failed: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Restore original path
    sys.path = original_path
'''

# ---------------------------------------------------------------------------
# Source of the generated isolated_ocr_processor.py module.
# ---------------------------------------------------------------------------
_ISOLATED_OCR_PROCESSOR_SOURCE = '''
"""
Completely isolated OCR processor that avoids any OpenCLIP dependencies
"""

import os
import sys
import json
import tempfile
from pathlib import Path


class IsolatedOCRProcessor:
    """OCR processor that runs in complete isolation from OpenCLIP"""

    def __init__(self):
        self.ocr_engine = None
        self.available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize PaddleOCR without any OpenCLIP interference"""
        # Captured before the try block so `finally` can always restore it
        original_path = sys.path.copy()
        try:
            # Clean up sys.path to remove OpenCLIP paths
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]

            print("🚀 Initializing PaddleOCR in isolated environment...")
            from paddleocr import PaddleOCR
            self.ocr_engine = PaddleOCR(use_gpu=True)
            self.available = True
            print("✅ PaddleOCR initialized successfully with GPU")

        except Exception as e:
            print(f"❌ PaddleOCR initialization failed: {e}")
            self.available = False
            raise
        finally:
            # Restore original path on success and on failure
            sys.path = original_path

    def extract_text_from_image(self, image_path):
        """Extract text from image using isolated OCR"""
        if not self.available or not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        # Captured before the try block so `finally` can always restore it
        original_path = sys.path.copy()
        try:
            # Clean up sys.path again for OCR execution
            sys.path = [p for p in sys.path if 'openclip' not in p.lower()]

            result = self.ocr_engine.ocr(image_path, cls=True)

            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0

            for line in result[0]:
                try:
                    if len(line) == 2:
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue

                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                confidence_float = 0.0
                        else:
                            confidence_float = 0.0

                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1

                except Exception:
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1

            avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            full_text = "\\n".join(extracted_text)

            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }

        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}
        finally:
            # Restore path on success and on error
            sys.path = original_path


# Singleton instance
_ocr_instance = None


def get_isolated_ocr_processor():
    """Get singleton isolated OCR processor instance"""
    global _ocr_instance
    if _ocr_instance is None:
        _ocr_instance = IsolatedOCRProcessor()
    return _ocr_instance


if __name__ == "__main__":
    # Test the isolated OCR processor
    processor = get_isolated_ocr_processor()

    if processor.available:
        print("✅ Isolated OCR processor is available")

        # Test with an image
        test_image = "extracted_images/image1.png"
        if os.path.exists(test_image):
            result = processor.extract_text_from_image(test_image)
            print(f"OCR Result: {len(result['text'])} characters, confidence: {result['confidence']:.3f}")
            if result['text']:
                print(f"Text preview: {result['text'][:100]}...")
        else:
            print("❌ Test image not found")
    else:
        print("❌ Isolated OCR processor is not available")
'''

# ---------------------------------------------------------------------------
# Snippets searched for / inserted in the document processor.
# The _OLD_* snippets must correspond to code in the target file; see
# _replace_snippet() for how whitespace differences are tolerated.
# ---------------------------------------------------------------------------
_OLD_OCR_CLASS = '''class OCRProcessor:
    """GPU-accelerated OCR processing using PaddleOCR with graceful fallback"""

    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize PaddleOCR engine with GPU only - no fallbacks"""
        try:
            logger.info("Initializing PaddleOCR with GPU mode only")
            self.ocr_engine = PaddleOCR(use_gpu=True)
            logger.info("PaddleOCR engine initialized successfully with GPU")
            self.ocr_available = True
        except Exception as e:
            logger.error(f"PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"PaddleOCR GPU initialization failed: {e}")'''

_NEW_OCR_CLASS = '''class OCRProcessor:
    """GPU-accelerated OCR processing using isolated PaddleOCR"""

    def __init__(self, use_gpu: bool = True, languages: List[str] = None):
        self.use_gpu = use_gpu
        self.languages = languages or ['en', 'ch']
        self.ocr_engine = None
        self.ocr_available = False
        self._initialize_ocr()

    def _initialize_ocr(self):
        """Initialize isolated PaddleOCR engine with GPU only"""
        try:
            logger.info("Initializing isolated PaddleOCR with GPU mode only")

            # Import the isolated OCR processor
            import sys
            import os
            package_parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            # isolated_ocr_processor.py is generated next to the directory that
            # contains this package, i.e. one level above package_parent, so
            # that directory has to be importable as well.
            for search_dir in (os.path.dirname(package_parent), package_parent):
                if search_dir not in sys.path:
                    sys.path.insert(0, search_dir)

            from isolated_ocr_processor import get_isolated_ocr_processor

            self.ocr_engine = get_isolated_ocr_processor()

            if self.ocr_engine.available:
                logger.info("Isolated PaddleOCR engine initialized successfully with GPU")
                self.ocr_available = True
            else:
                raise RuntimeError("Isolated OCR processor not available")

        except Exception as e:
            logger.error(f"Isolated PaddleOCR GPU initialization failed: {e}")
            self.ocr_engine = None
            self.ocr_available = False
            raise RuntimeError(f"Isolated PaddleOCR GPU initialization failed: {e}")'''

_OLD_EXTRACT_METHOD = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Perform OCR
            result = self.ocr_engine.ocr(image_path, cls=True)'''

# The isolated processor already returns the final result dict. The old
# post-processing block (which contained the method's only success-path
# `return`) is removed below, so the result must be returned here.
_NEW_EXTRACT_METHOD = '''    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using isolated OCR"""
        if not self.ocr_engine:
            return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

        try:
            # Perform OCR using isolated processor
            result = self.ocr_engine.extract_text_from_image(image_path)
            return result'''

_OLD_OCR_PROCESSING = '''

            if not result or not result[0]:
                return {"text": "", "confidence": 0.0, "bboxes": [], "line_count": 0}

            # Process OCR results - handle different PaddleOCR result structures
            extracted_text = []
            bboxes = []
            total_confidence = 0.0
            line_count = 0

            for line in result[0]:
                try:
                    # Handle different PaddleOCR result structures
                    if len(line) == 2:
                        # Standard structure: [[bbox], (text, confidence)]
                        bbox, (text, confidence) = line
                    elif len(line) >= 1:
                        # Handle alternative structures
                        bbox = line[0] if len(line) > 0 else []
                        if len(line) > 1:
                            if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                                text, confidence = line[1][0], line[1][1]
                            else:
                                text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                        else:
                            text, confidence = "", 0.0
                    else:
                        continue

                    # Ensure text is string and confidence is float
                    text_str = str(text) if text is not None else ""
                    confidence_float = 0.0
                    if confidence is not None:
                        if isinstance(confidence, (int, float)):
                            confidence_float = float(confidence)
                        elif isinstance(confidence, str):
                            try:
                                confidence_float = float(confidence)
                            except ValueError:
                                logger.warning(f"Could not convert confidence string to float: {confidence}")
                                confidence_float = 0.0
                        else:
                            logger.warning(f"Unexpected confidence type: {type(confidence)}, value: {confidence}")
                            confidence_float = 0.0
                    else:
                        confidence_float = 0.0

                    extracted_text.append(text_str)
                    bboxes.append(bbox)
                    total_confidence += confidence_float
                    line_count += 1

                except (TypeError, ValueError, IndexError) as e:
                    logger.warning(f"Type conversion error in OCR line processing: {e}")
                    # Add empty text and continue
                    extracted_text.append("")
                    bboxes.append([])
                    total_confidence += 0.0
                    line_count += 1

            try:
                avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
            except (TypeError, ZeroDivisionError):
                avg_confidence = 0.0

            full_text = "\\n".join(extracted_text)

            return {
                "text": full_text,
                "confidence": avg_confidence,
                "bboxes": bboxes,
                "line_count": line_count
            }'''

# ---------------------------------------------------------------------------
# Source of the generated final_isolation_test.py script.
# ---------------------------------------------------------------------------
_FINAL_TEST_SOURCE = '''
"""
Final test to verify complete dependency isolation between PaddleOCR and OpenCLIP
"""

import asyncio
import sys
import os
from pathlib import Path

# Add paths
sys.path.insert(0, "LightRAG-main")


async def test_complete_isolation():
    """Test that PaddleOCR and OpenCLIP are completely isolated"""
    print("🔍 TESTING COMPLETE DEPENDENCY ISOLATION")
    print("=" * 60)

    try:
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        print("🎯 SYSTEM STATUS:")
        print(f"   OCR Processor: {'✅ Available' if processor.ocr_processor.ocr_available else '❌ Not Available'}")
        print(f"   Image Classifier: {'✅ Available' if processor.image_classifier and processor.image_classifier.available else '❌ Not Available'}")

        # Process test document
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return

        print(f"\\n📄 PROCESSING DOCUMENT: {test_file}")
        result = await processor.process_document(test_file)

        print(f"✅ Processing Success: {result.success}")
        print(f"📊 Metadata: {result.metadata}")

        # Check OCR results
        print(f"\\n🔤 OCR RESULTS:")
        ocr_working = False
        for i, img in enumerate(result.images):
            if 'ocr_text' in img and img['ocr_text'].strip():
                ocr_working = True
                print(f"   ✅ Image {i+1}: OCR extracted {len(img['ocr_text'])} characters")
                print(f"      Text: {img['ocr_text'][:100]}...")
            elif 'ocr_error' in img:
                print(f"   ❌ Image {i+1}: OCR failed - {img['ocr_error']}")
            else:
                print(f"   ⚠️ Image {i+1}: No OCR text extracted")

        # Check classification results
        print(f"\\n🖼️ CLASSIFICATION RESULTS:")
        classification_working = False
        bee_detected = False
        for i, img in enumerate(result.images):
            if 'classification' in img and img['classification']:
                classification_working = True
                top_label = img['classification'][0]['label'] if img['classification'] else 'unknown'
                print(f"   ✅ Image {i+1}: Classified as '{top_label}'")
                if 'bee' in top_label.lower():
                    bee_detected = True
                    print(f"      🎯 BEE DETECTED in image {i+1}!")

        print(f"\\n🎯 FINAL VERIFICATION:")
        if ocr_working:
            print("   ✅ OCR is working with complete dependency isolation")
        else:
            print("   ❌ OCR is not working properly")

        if classification_working:
            print("   ✅ Image classification is working with complete dependency isolation")
        else:
            print("   ❌ Image classification is not working properly")

        if bee_detected:
            print("   ✅ Bee image successfully detected and classified")
        else:
            print("   ❌ Bee image not detected in classifications")

        print(f"\\n🚀 DEPENDENCY ISOLATION STATUS: {'✅ SUCCESS' if ocr_working and classification_working else '❌ FAILED'}")

    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(test_complete_isolation())
'''


def test_paddleocr_alone(timeout=60):
    """Test PaddleOCR without any OpenCLIP interference.

    The smoke-test script is written to a temporary file and executed with
    the current interpreter in a child process, so nothing imported in this
    process can interfere. The child's output is echoed to stdout.

    Args:
        timeout: Seconds to wait for the child process. A timeout is
            reported on stdout instead of being raised, so the remaining
            steps of the script still run.
    """
    print("🔍 Testing PaddleOCR in Isolation")
    print("=" * 50)

    # The script contains non-ASCII characters: write it as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(_PADDLEOCR_SMOKE_TEST)
        script_path = f.name

    # Make the child emit UTF-8 on its pipes regardless of the locale, and
    # decode its output the same way.
    child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}

    try:
        result = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            env=child_env,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f"❌ PaddleOCR isolation test timed out after {timeout} seconds")
    else:
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
    finally:
        os.unlink(script_path)


# This is a diagnostic step of the script, not a unit test: keep pytest from
# collecting it just because its name starts with "test_".
test_paddleocr_alone.__test__ = False


def create_isolated_ocr_processor(output_path=ISOLATED_OCR_PROCESSOR_PATH):
    """Create a completely isolated OCR processor that doesn't import OpenCLIP.

    Args:
        output_path: Where to write the generated module. The patched
            document processor imports it as ``isolated_ocr_processor`` from
            the directory containing ``LightRAG-main`` (or from
            ``LightRAG-main`` itself), so a custom location must be made
            importable by the caller.
    """
    print("\n🔧 Creating Isolated OCR Processor")
    print("=" * 50)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(_ISOLATED_OCR_PROCESSOR_SOURCE)

    print("✅ Created isolated OCR processor")


def _replace_snippet(content, old, new, label):
    """Replace ``old`` with ``new`` in ``content`` and report the outcome.

    An exact match is tried first. If that fails, the snippet is matched
    line by line while ignoring indentation, trailing whitespace and blank
    lines, so purely cosmetic differences in the target file do not turn the
    replacement into a silent no-op.

    Args:
        content: Text to patch.
        old: Snippet to look for.
        new: Replacement text (inserted verbatim).
        label: Human-readable snippet name used in the warning message.

    Returns:
        A ``(updated_content, replaced)`` tuple; ``replaced`` is False (and
        a warning is printed) when the snippet was not found.
    """
    if old in content:
        return content.replace(old, new), True

    significant_lines = [line.strip() for line in old.splitlines() if line.strip()]
    if significant_lines:
        # Between two significant lines: optional trailing whitespace, one or
        # more line breaks (blank lines allowed), then any indentation.
        line_gap = r"[ \t]*(?:\r?\n[ \t]*)+"
        pattern = re.compile(
            r"(?m)^[ \t]*" + line_gap.join(re.escape(line) for line in significant_lines)
        )
        # A callable replacement keeps backslashes in `new` literal.
        updated, count = pattern.subn(lambda _match: new, content)
        if count:
            return updated, True

    print(f"⚠️ Could not find {label} in document processor - left unchanged")
    return content, False


def update_document_processor_for_isolation(processor_path=DOCUMENT_PROCESSOR_PATH):
    """Update document processor to use the isolated OCR processor.

    Three edits are applied to the file: the ``OCRProcessor`` class is
    swapped for a version that loads the isolated processor, the start of
    ``extract_text_from_image`` is rewritten to delegate to (and return the
    result of) the isolated processor, and the old result post-processing
    block is removed because the isolated processor already does that work.

    Args:
        processor_path: Path of the document processor source file.

    Raises:
        OSError: If the file cannot be read or written.
    """
    print("\n🔄 Updating Document Processor for Complete Isolation")
    print("=" * 50)

    # Read current document processor
    with open(processor_path, "r", encoding="utf-8") as f:
        content = f.read()

    replacements = (
        (_OLD_OCR_CLASS, _NEW_OCR_CLASS, "OCRProcessor class"),
        (_OLD_EXTRACT_METHOD, _NEW_EXTRACT_METHOD, "extract_text_from_image method"),
        # Just remove this block since the isolated processor handles the processing
        (_OLD_OCR_PROCESSING, "", "legacy OCR result processing"),
    )

    updated = content
    applied = 0
    for old, new, label in replacements:
        updated, replaced = _replace_snippet(updated, old, new, label)
        applied += int(replaced)

    # Write the updated content (skip the write when nothing changed)
    if updated != content:
        with open(processor_path, "w", encoding="utf-8") as f:
            f.write(updated)

    if applied == len(replacements):
        print("✅ Document processor updated for complete isolation")
    else:
        print(
            f"⚠️ Document processor only partially updated "
            f"({applied}/{len(replacements)} replacements applied)"
        )


def create_final_test(output_path=FINAL_TEST_PATH):
    """Create a final test to verify the complete isolation.

    Args:
        output_path: Where to write the generated test script.
    """
    print("\n🧪 Creating Final Isolation Test")
    print("=" * 50)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(_FINAL_TEST_SOURCE)

    print("✅ Created final isolation test")


def main():
    """Run all fixes for complete dependency isolation"""
    print("🎯 FIXING OCR ISSUES WITH COMPLETE DEPENDENCY ISOLATION")
    print("=" * 70)

    # Test PaddleOCR alone first
    test_paddleocr_alone()

    # Create isolated OCR processor
    create_isolated_ocr_processor()

    # Update document processor
    update_document_processor_for_isolation()

    # Create final test
    create_final_test()

    # These strings are printed directly (not written into a generated
    # file), so the newline escape must be a real "\n".
    print("\n✅ COMPLETE ISOLATION SOLUTION IMPLEMENTED:")
    print("   - Created isolated OCR processor that avoids OpenCLIP paths")
    print("   - Updated document processor to use isolated components")
    print("   - Ensured complete dependency separation")
    print("\n🚀 Run the final test: python final_isolation_test.py")


if __name__ == "__main__":
    main()