# Files
# railseek6/final_integrated_solution.py
# 442 lines, 17 KiB, Python
"""
Final Integrated Solution for Document Processing Pipeline
Complete dependency isolation between PaddleOCR and OpenCLIP
"""
import os
import sys
import subprocess
import tempfile
import asyncio
import zipfile
from pathlib import Path
class IsolatedOCRProcessor:
    """Isolated OCR processor using subprocess communication.

    PaddleOCR is only ever imported inside a child interpreter, so its
    heavy, conflict-prone dependencies never load into this process.
    """

    # Script executed by the child interpreter.  The image path arrives in
    # sys.argv[1] (with ``python -c`` argv[0] is "-c"), so Windows paths
    # containing backslashes or quotes never need escaping into the
    # generated source (the old f-string r"{image_path}" broke on such paths).
    _OCR_SCRIPT = """
import sys
from paddleocr import PaddleOCR
try:
    ocr = PaddleOCR(use_gpu=True, cls=True)
    result = ocr.ocr(sys.argv[1])
    if not result or not result[0]:
        print("OCR_RESULT:EMPTY")
        sys.exit(0)
    extracted_text = []
    total_confidence = 0.0
    line_count = 0
    for line in result[0]:
        try:
            if len(line) == 2:
                bbox, (text, confidence) = line
            elif len(line) >= 1:
                if len(line) > 1 and isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                    text, confidence = line[1][0], line[1][1]
                elif len(line) > 1:
                    text, confidence = str(line[1]), 0.0
                else:
                    text, confidence = "", 0.0
            else:
                continue
            extracted_text.append(str(text) if text is not None else "")
            total_confidence += float(confidence) if isinstance(confidence, (int, float)) else 0.0
            line_count += 1
        except Exception:
            extracted_text.append("")
            line_count += 1
    avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
    print("OCR_RESULT:TEXT=" + " ".join(extracted_text))
    print("OCR_RESULT:CONFIDENCE=" + str(avg_confidence))
    print("OCR_RESULT:LINES=" + str(line_count))
except Exception as e:
    print("OCR_ERROR:" + str(e))
"""

    def __init__(self):
        # Flipped to True by _initialize() once the child interpreter
        # confirms PaddleOCR is importable.
        self.available = False
        self._initialize()

    def _initialize(self):
        """Probe a child interpreter to see whether PaddleOCR is usable."""
        test_script = (
            "try:\n"
            "    from paddleocr import PaddleOCR\n"
            "    print('OCR_READY')\n"
            "except Exception as e:\n"
            "    print(f'OCR_ERROR:{e}')\n"
        )
        try:
            result = subprocess.run(
                [sys.executable, "-c", test_script],
                capture_output=True, text=True, timeout=10,
            )
            if "OCR_READY" in result.stdout:
                self.available = True
                print("✅ OCR processor initialized successfully")
            else:
                print(f"❌ OCR initialization failed: {result.stderr}")
        except Exception as e:
            # Covers subprocess.TimeoutExpired and OS-level launch failures.
            print(f"❌ OCR initialization failed: {e}")

    def extract_text_from_image(self, image_path):
        """Run PaddleOCR on *image_path* in a child interpreter.

        Returns a dict with ``text`` (space-joined recognized lines),
        ``confidence`` (average per-line confidence, float) and
        ``line_count`` (int).  A zeroed result is returned when OCR is
        unavailable, the file is missing, or the child process fails or
        times out.
        """
        empty = {"text": "", "confidence": 0.0, "line_count": 0}
        if not self.available or not os.path.exists(image_path):
            return empty
        try:
            result = subprocess.run(
                [sys.executable, "-c", self._OCR_SCRIPT, image_path],
                capture_output=True, text=True, timeout=60,
            )
            text = ""
            confidence = 0.0
            line_count = 0
            # Slice the marker off instead of str.replace(), which would
            # also delete the marker if it recurred mid-line.
            for line in result.stdout.split("\n"):
                if line.startswith("OCR_RESULT:TEXT="):
                    text = line[len("OCR_RESULT:TEXT="):].strip()
                elif line.startswith("OCR_RESULT:CONFIDENCE="):
                    try:
                        confidence = float(line.split("=", 1)[1].strip())
                    except ValueError:
                        confidence = 0.0
                elif line.startswith("OCR_RESULT:LINES="):
                    try:
                        line_count = int(line.split("=", 1)[1].strip())
                    except ValueError:
                        line_count = 0
            return {"text": text, "confidence": confidence, "line_count": line_count}
        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            return empty
class IsolatedImageClassifier:
    """Isolated image classifier using a dedicated virtual environment.

    OpenCLIP runs inside the ``openclip_env`` venv's interpreter so its
    torch stack never conflicts with this process's dependencies.
    """

    # Single definition of the venv interpreter path (previously duplicated
    # in _initialize() and classify_image()).  Windows venv layout.
    VENV_PYTHON = "openclip_env\\Scripts\\python.exe"

    # Script executed by the venv interpreter.  Image path and top_k arrive
    # via sys.argv (argv[1], argv[2]) so they never need escaping into the
    # generated source.
    _CLASSIFY_SCRIPT = """
import sys
import open_clip
import torch
from PIL import Image
try:
    # Load model
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    # Load and process image
    image = Image.open(sys.argv[1]).convert("RGB")
    image_tensor = processor(image).unsqueeze(0)
    # Move to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()
        image_tensor = image_tensor.cuda()
    # Common labels for document processing
    text_labels = [
        "a photo of a bee", "a photo of a flower", "a photo of a person",
        "a photo of a document", "a photo of a chart", "a photo of a diagram",
        "a photo of a table", "a photo of a graph", "a photo of a logo",
        "a photo of a signature", "a photo of a stamp", "a photo of a barcode",
        "a photo of a QR code", "a photo of a screenshot", "a photo of a landscape",
        "a photo of an animal", "a photo of a building", "a photo of a vehicle",
        "a photo of text", "a photo of numbers", "a photo of symbols"
    ]
    # Get predictions
    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_tokens = open_clip.tokenize(text_labels)
        if torch.cuda.is_available():
            text_tokens = text_tokens.cuda()
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(int(sys.argv[2]))
    for value, index in zip(values, indices):
        print("CLASSIFICATION_RESULT:" + text_labels[index] + "|" + str(float(value)))
except Exception as e:
    print("CLASSIFICATION_ERROR:" + str(e))
"""

    def __init__(self):
        # Flipped to True once the venv interpreter confirms open_clip imports.
        self.available = False
        self._initialize()

    def _initialize(self):
        """Check the venv exists and that open_clip imports inside it."""
        try:
            if not os.path.exists(self.VENV_PYTHON):
                print("❌ OpenCLIP virtual environment not found")
                return
            test_script = (
                "try:\n"
                "    import open_clip\n"
                "    print('CLASSIFIER_READY')\n"
                "except Exception as e:\n"
                "    print(f'CLASSIFIER_ERROR:{e}')\n"
            )
            result = subprocess.run(
                [self.VENV_PYTHON, "-c", test_script],
                capture_output=True, text=True, timeout=30,
            )
            if "CLASSIFIER_READY" in result.stdout:
                self.available = True
                print("✅ Image classifier initialized successfully")
            else:
                print(f"❌ Classifier initialization failed: {result.stderr}")
        except Exception as e:
            print(f"❌ Classifier initialization failed: {e}")

    def classify_image(self, image_path, top_k=3):
        """Classify *image_path* with OpenCLIP in the isolated venv.

        Returns a list of up to *top_k* ``{"label", "confidence"}`` dicts
        ordered by descending confidence, or a single sentinel entry when
        classification is unavailable or fails.
        """
        if not self.available or not os.path.exists(image_path):
            return [{"label": "classification_unavailable", "confidence": 0.0}]
        try:
            result = subprocess.run(
                [self.VENV_PYTHON, "-c", self._CLASSIFY_SCRIPT, image_path, str(top_k)],
                capture_output=True, text=True, timeout=30,
            )
            prefix = "CLASSIFICATION_RESULT:"
            results = []
            for line in result.stdout.split("\n"):
                if not line.startswith(prefix):
                    continue
                parts = line[len(prefix):].split("|")
                if len(parts) == 2:
                    try:
                        results.append({"label": parts[0], "confidence": float(parts[1])})
                    except ValueError:
                        # Malformed confidence field; skip this line.
                        continue
            if results:
                return results
            return [{"label": "classification_failed", "confidence": 0.0}]
        except Exception as e:
            print(f"❌ Classification failed: {e}")
            return [{"label": "classification_error", "confidence": 0.0}]
class DocumentProcessor:
    """Main document processor with complete dependency isolation.

    Extracts the document's text first (python-docx), then pulls embedded
    images out of the .docx zip archive and routes them through the
    subprocess-isolated OCR and classification workers.
    """

    def __init__(self):
        self.ocr_processor = IsolatedOCRProcessor()
        self.image_classifier = IsolatedImageClassifier()
        print("🎯 Document Processor Initialized")
        print(f" OCR: {'✅ Available' if self.ocr_processor.available else '❌ Not Available'}")
        print(f" Classifier: {'✅ Available' if self.image_classifier.available else '❌ Not Available'}")

    async def process_document(self, file_path):
        """Process a .docx document: text first, then embedded images.

        Returns a dict with keys ``success``, ``content``, ``metadata`` and
        ``images``.  Never raises: failures are reported via
        ``success=False`` and ``metadata["error"]``.
        """
        try:
            text_content = await self._extract_text_content(file_path)
            images_metadata, image_content = await self._extract_and_process_images(file_path)
            # Parenthesized for readability; the conditional already bound
            # around the whole concatenation.
            full_content = (text_content + "\n\n" + image_content) if image_content else text_content
            return {
                "success": True,
                "content": full_content,
                "metadata": {
                    "file_type": "word",
                    "images_count": len(images_metadata),
                    "processed_with_ocr": self.ocr_processor.available,
                    "processed_with_classification": self.image_classifier.available
                },
                "images": images_metadata
            }
        except Exception as e:
            print(f"❌ Document processing failed: {e}")
            return {
                "success": False,
                "content": "",
                "metadata": {"error": str(e)},
                "images": []
            }

    async def _extract_text_content(self, file_path):
        """Extract paragraph and table text from the Word document.

        Returns the joined text, or a fixed failure message when python-docx
        is missing or the file cannot be parsed.
        """
        try:
            # Third-party import kept lazy so its absence degrades gracefully.
            import docx
            doc = docx.Document(file_path)
            content_parts = []
            for para in doc.paragraphs:
                if para.text.strip():
                    content_parts.append(para.text)
            for table in doc.tables:
                table_data = [[cell.text for cell in row.cells] for row in table.rows]
                if table_data:
                    # Only the row count is recorded, not cell contents.
                    content_parts.append(f"[Table]: {len(table_data)} rows")
            return "\n".join(content_parts)
        except Exception as e:
            print(f"⚠️ Text extraction failed: {e}")
            return "Text content extraction failed"

    async def _extract_and_process_images(self, file_path):
        """Extract images from the .docx archive and OCR/classify each one.

        Returns ``(images_metadata, joined_image_content)``; ``([], "")``
        when the archive cannot be read or processing fails.
        """
        images_metadata = []
        image_content_parts = []
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    image_files = []
                    for file_info in zip_ref.filelist:
                        if not file_info.filename.startswith('word/media/'):
                            continue
                        image_filename = os.path.basename(file_info.filename)
                        # Skip directory entries ('word/media/' itself) whose
                        # basename is empty -- writing one would raise and
                        # abort the entire extraction.
                        if not image_filename:
                            continue
                        image_path = os.path.join(temp_dir, image_filename)
                        with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target:
                            target.write(source.read())
                        image_files.append(image_path)
                        print(f"📸 Extracted image: {image_path}")
                print(f"Found {len(image_files)} images in document")
                # Process while temp_dir is still alive (files live there).
                for i, image_path in enumerate(image_files):
                    image_metadata = {"index": i, "path": image_path}
                    if self.ocr_processor.available:
                        ocr_result = self.ocr_processor.extract_text_from_image(image_path)
                        if ocr_result["text"].strip():
                            image_metadata["ocr_text"] = ocr_result["text"]
                            image_metadata["ocr_confidence"] = ocr_result["confidence"]
                            image_content_parts.append(f"[Image {i+1} OCR]: {ocr_result['text']}")
                            print(f"✅ Image {i+1} OCR: {len(ocr_result['text'])} chars")
                    if self.image_classifier.available:
                        classification_results = self.image_classifier.classify_image(image_path)
                        image_metadata["classification"] = classification_results
                        if classification_results and classification_results[0]["confidence"] > 0:
                            top_label = classification_results[0]["label"]
                            top_confidence = classification_results[0]["confidence"]
                            image_content_parts.append(f"[Image {i+1} Classification]: {top_label} ({top_confidence:.3f})")
                            print(f"✅ Image {i+1} Classification: {top_label} ({top_confidence:.3f})")
                            # Domain-specific success marker used by the test driver.
                            if "bee" in top_label.lower():
                                print(f"🎯 BEE DETECTED in image {i+1}!")
                    images_metadata.append(image_metadata)
            return images_metadata, "\n".join(image_content_parts)
        except Exception as e:
            print(f"❌ Image processing failed: {e}")
            return [], ""
async def test_final_solution():
    """Run the end-to-end smoke test against ``test.docx``.

    Prints a pass/fail summary for OCR, classification, bee detection and
    dependency isolation; returns the processing result, or None when the
    test file is missing or processing fails.
    """
    print("🧪 FINAL INTEGRATED SOLUTION TEST")
    print("=" * 50)
    processor = DocumentProcessor()
    test_file = "test.docx"
    if not os.path.exists(test_file):
        print(f"❌ Test file not found: {test_file}")
        return
    print(f"\n📄 PROCESSING: {test_file}")
    result = await processor.process_document(test_file)
    if not result["success"]:
        print(f"❌ Processing failed: {result['metadata'].get('error', 'Unknown error')}")
        return
    print(f"✅ Processing successful")
    print(f"📊 Metadata: {result['metadata']}")
    # Summarize per-image outcomes.
    images = result["images"]
    ocr_working = any(img.get("ocr_text", "").strip() for img in images)
    top_hits = [
        img["classification"][0]
        for img in images
        if img.get("classification") and img["classification"][0]["confidence"] > 0
    ]
    classification_working = bool(top_hits)
    bee_found = any("bee" in hit["label"].lower() for hit in top_hits)
    print(f"\n🎯 FINAL RESULTS:")
    print(f" OCR: {'✅ WORKING' if ocr_working else '❌ FAILED'}")
    print(f" Classification: {'✅ WORKING' if classification_working else '❌ FAILED'}")
    print(f" Bee Detection: {'✅ SUCCESS' if bee_found else '❌ NOT FOUND'}")
    print(f" Dependency Isolation: {'✅ ACHIEVED' if ocr_working and classification_working else '❌ FAILED'}")
    if result["content"]:
        print(f"\n📝 SAMPLE CONTENT (first 500 chars):")
        print(result["content"][:500] + "...")
    return result
# Script entry point: run the end-to-end smoke test when executed directly.
if __name__ == "__main__":
    asyncio.run(test_final_solution())