""" Final Integrated Solution for Document Processing Pipeline Complete dependency isolation between PaddleOCR and OpenCLIP """ import os import sys import subprocess import tempfile import asyncio import zipfile from pathlib import Path class IsolatedOCRProcessor: """Isolated OCR processor using subprocess communication""" def __init__(self): self.available = False self._initialize() def _initialize(self): """Initialize OCR processor""" try: # Test if PaddleOCR works test_script = """ import sys try: from paddleocr import PaddleOCR print("OCR_READY") except Exception as e: print(f"OCR_ERROR:{e}") """ result = subprocess.run([sys.executable, "-c", test_script], capture_output=True, text=True, timeout=10) if "OCR_READY" in result.stdout: self.available = True print("✅ OCR processor initialized successfully") else: print(f"❌ OCR initialization failed: {result.stderr}") except Exception as e: print(f"❌ OCR initialization failed: {e}") def extract_text_from_image(self, image_path): """Extract text from image using direct subprocess""" if not self.available or not os.path.exists(image_path): return {"text": "", "confidence": 0.0, "line_count": 0} try: ocr_script = f""" import sys from paddleocr import PaddleOCR try: ocr = PaddleOCR(use_gpu=True, cls=True) result = ocr.ocr(r"{image_path}") if not result or not result[0]: print("OCR_RESULT:EMPTY") sys.exit(0) extracted_text = [] total_confidence = 0.0 line_count = 0 for line in result[0]: try: if len(line) == 2: bbox, (text, confidence) = line elif len(line) >= 1: bbox = line[0] if len(line) > 0 else [] if len(line) > 1: if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2: text, confidence = line[1][0], line[1][1] else: text, confidence = str(line[1]) if len(line) > 1 else "", 0.0 else: text, confidence = "", 0.0 else: continue text_str = str(text) if text is not None else "" confidence_float = float(confidence) if isinstance(confidence, (int, float)) else 0.0 extracted_text.append(text_str) total_confidence += confidence_float line_count += 1 except Exception: extracted_text.append("") total_confidence += 0.0 line_count += 1 avg_confidence = total_confidence / line_count if line_count > 0 else 0.0 full_text = " ".join(extracted_text) print(f"OCR_RESULT:TEXT={{full_text}}") print(f"OCR_RESULT:CONFIDENCE={{avg_confidence}}") print(f"OCR_RESULT:LINES={{line_count}}") except Exception as e: print(f"OCR_ERROR:{{e}}") """ result = subprocess.run([sys.executable, "-c", ocr_script], capture_output=True, text=True, timeout=60) # Parse results from stdout text = "" confidence = 0.0 line_count = 0 for line in result.stdout.split('\n'): if line.startswith("OCR_RESULT:TEXT="): text = line.replace("OCR_RESULT:TEXT=", "").strip() elif line.startswith("OCR_RESULT:CONFIDENCE="): try: confidence = float(line.replace("OCR_RESULT:CONFIDENCE=", "").strip()) except: confidence = 0.0 elif line.startswith("OCR_RESULT:LINES="): try: line_count = int(line.replace("OCR_RESULT:LINES=", "").strip()) except: line_count = 0 return { "text": text, "confidence": confidence, "line_count": line_count } except Exception as e: print(f"❌ OCR processing failed: {e}") return {"text": "", "confidence": 0.0, "line_count": 0} class IsolatedImageClassifier: """Isolated image classifier using virtual environment""" def __init__(self): self.available = False self._initialize() def _initialize(self): """Initialize image classifier""" try: # Check if virtual environment exists and works venv_python = "openclip_env\\Scripts\\python.exe" if not os.path.exists(venv_python): print("❌ OpenCLIP virtual environment not found") return test_script = """ try: import open_clip print("CLASSIFIER_READY") except Exception as e: print(f"CLASSIFIER_ERROR:{e}") """ result = subprocess.run([venv_python, "-c", test_script], capture_output=True, text=True, timeout=30) if "CLASSIFIER_READY" in result.stdout: self.available = True print("✅ Image classifier initialized successfully") else: print(f"❌ Classifier initialization failed: {result.stderr}") except Exception as e: print(f"❌ Classifier initialization failed: {e}") def classify_image(self, image_path, top_k=3): """Classify image using isolated virtual environment""" if not self.available or not os.path.exists(image_path): return [{"label": "classification_unavailable", "confidence": 0.0}] try: venv_python = "openclip_env\\Scripts\\python.exe" classification_script = f""" import open_clip import torch from PIL import Image try: # Load model model, _, processor = open_clip.create_model_and_transforms( model_name="ViT-B-32", pretrained="laion2b_s34b_b79k" ) # Load and process image image = Image.open(r"{image_path}").convert("RGB") image_tensor = processor(image).unsqueeze(0) # Move to GPU if available if torch.cuda.is_available(): model = model.cuda() image_tensor = image_tensor.cuda() # Get predictions with torch.no_grad(): image_features = model.encode_image(image_tensor) image_features /= image_features.norm(dim=-1, keepdim=True) # Common labels for document processing text_labels = [ "a photo of a bee", "a photo of a flower", "a photo of a person", "a photo of a document", "a photo of a chart", "a photo of a diagram", "a photo of a table", "a photo of a graph", "a photo of a logo", "a photo of a signature", "a photo of a stamp", "a photo of a barcode", "a photo of a QR code", "a photo of a screenshot", "a photo of a landscape", "a photo of an animal", "a photo of a building", "a photo of a vehicle", "a photo of text", "a photo of numbers", "a photo of symbols" ] # Encode text labels text_tokens = open_clip.tokenize(text_labels) if torch.cuda.is_available(): text_tokens = text_tokens.cuda() text_features = model.encode_text(text_tokens) text_features /= text_features.norm(dim=-1, keepdim=True) # Calculate similarity similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) values, indices = similarity[0].topk({top_k}) for value, index in zip(values, indices): label = text_labels[index] confidence = float(value) print(f"CLASSIFICATION_RESULT:{{label}}|{{confidence}}") except Exception as e: print(f"CLASSIFICATION_ERROR:{{e}}") """ result = subprocess.run([venv_python, "-c", classification_script], capture_output=True, text=True, timeout=30) results = [] for line in result.stdout.split('\n'): if line.startswith("CLASSIFICATION_RESULT:"): parts = line.replace("CLASSIFICATION_RESULT:", "").split("|") if len(parts) == 2: try: label = parts[0] confidence = float(parts[1]) results.append({"label": label, "confidence": confidence}) except: continue if results: return results else: return [{"label": "classification_failed", "confidence": 0.0}] except Exception as e: print(f"❌ Classification failed: {e}") return [{"label": "classification_error", "confidence": 0.0}] class DocumentProcessor: """Main document processor with complete dependency isolation""" def __init__(self): self.ocr_processor = IsolatedOCRProcessor() self.image_classifier = IsolatedImageClassifier() print("🎯 Document Processor Initialized") print(f" OCR: {'✅ Available' if self.ocr_processor.available else '❌ Not Available'}") print(f" Classifier: {'✅ Available' if self.image_classifier.available else '❌ Not Available'}") async def process_document(self, file_path): """Process document with text-first extraction and image processing""" try: # Extract text content first text_content = await self._extract_text_content(file_path) # Extract and process images images_metadata, image_content = await self._extract_and_process_images(file_path) # Combine all content full_content = text_content + "\n\n" + image_content if image_content else text_content return { "success": True, "content": full_content, "metadata": { "file_type": "word", "images_count": len(images_metadata), "processed_with_ocr": self.ocr_processor.available, "processed_with_classification": self.image_classifier.available }, "images": images_metadata } except Exception as e: print(f"❌ Document processing failed: {e}") return { "success": False, "content": "", "metadata": {"error": str(e)}, "images": [] } async def _extract_text_content(self, file_path): """Extract text content from Word document""" try: import docx doc = docx.Document(file_path) content_parts = [] # Extract paragraphs for para in doc.paragraphs: if para.text.strip(): content_parts.append(para.text) # Extract tables for table in doc.tables: table_data = [] for row in table.rows: row_data = [cell.text for cell in row.cells] table_data.append(row_data) if table_data: content_parts.append(f"[Table]: {len(table_data)} rows") return "\n".join(content_parts) except Exception as e: print(f"⚠️ Text extraction failed: {e}") return "Text content extraction failed" async def _extract_and_process_images(self, file_path): """Extract and process images from document""" images_metadata = [] image_content_parts = [] try: with tempfile.TemporaryDirectory() as temp_dir: # Extract images from docx using zipfile with zipfile.ZipFile(file_path, 'r') as zip_ref: image_files = [] for file_info in zip_ref.filelist: if file_info.filename.startswith('word/media/'): image_filename = os.path.basename(file_info.filename) image_path = os.path.join(temp_dir, image_filename) with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target: target.write(source.read()) image_files.append(image_path) print(f"📸 Extracted image: {image_path}") print(f"Found {len(image_files)} images in document") # Process each image for i, image_path in enumerate(image_files): image_metadata = {"index": i, "path": image_path} # OCR processing if self.ocr_processor.available: ocr_result = self.ocr_processor.extract_text_from_image(image_path) if ocr_result["text"].strip(): image_metadata["ocr_text"] = ocr_result["text"] image_metadata["ocr_confidence"] = ocr_result["confidence"] image_content_parts.append(f"[Image {i+1} OCR]: {ocr_result['text']}") print(f"✅ Image {i+1} OCR: {len(ocr_result['text'])} chars") # Image classification if self.image_classifier.available: classification_results = self.image_classifier.classify_image(image_path) image_metadata["classification"] = classification_results if classification_results and classification_results[0]["confidence"] > 0: top_label = classification_results[0]["label"] top_confidence = classification_results[0]["confidence"] image_content_parts.append(f"[Image {i+1} Classification]: {top_label} ({top_confidence:.3f})") print(f"✅ Image {i+1} Classification: {top_label} ({top_confidence:.3f})") # Check for bee if "bee" in top_label.lower(): print(f"🎯 BEE DETECTED in image {i+1}!") images_metadata.append(image_metadata) return images_metadata, "\n".join(image_content_parts) except Exception as e: print(f"❌ Image processing failed: {e}") return [], "" async def test_final_solution(): """Test the final integrated solution""" print("🧪 FINAL INTEGRATED SOLUTION TEST") print("=" * 50) processor = DocumentProcessor() # Test with test.docx test_file = "test.docx" if not os.path.exists(test_file): print(f"❌ Test file not found: {test_file}") return print(f"\n📄 PROCESSING: {test_file}") result = await processor.process_document(test_file) if not result["success"]: print(f"❌ Processing failed: {result['metadata'].get('error', 'Unknown error')}") return print(f"✅ Processing successful") print(f"📊 Metadata: {result['metadata']}") # Analyze results ocr_working = False classification_working = False bee_found = False for img in result["images"]: if "ocr_text" in img and img["ocr_text"].strip(): ocr_working = True if "classification" in img and img["classification"] and img["classification"][0]["confidence"] > 0: classification_working = True if "bee" in img["classification"][0]["label"].lower(): bee_found = True print(f"\n🎯 FINAL RESULTS:") print(f" OCR: {'✅ WORKING' if ocr_working else '❌ FAILED'}") print(f" Classification: {'✅ WORKING' if classification_working else '❌ FAILED'}") print(f" Bee Detection: {'✅ SUCCESS' if bee_found else '❌ NOT FOUND'}") print(f" Dependency Isolation: {'✅ ACHIEVED' if ocr_working and classification_working else '❌ FAILED'}") # Show sample content if result["content"]: print(f"\n📝 SAMPLE CONTENT (first 500 chars):") print(result["content"][:500] + "...") return result if __name__ == "__main__": asyncio.run(test_final_solution())