# Files
# railseek6/fixed_ocr_classifier.py
#
# 397 lines
# 15 KiB
# Python

"""
Fixed OCR and Image Classification with Complete Dependency Isolation
Direct subprocess communication without file-based JSON parsing
"""
import os
import sys
import subprocess
import tempfile
import asyncio
from pathlib import Path
class FixedOCRProcessor:
    """Run PaddleOCR in a child Python process and parse its stdout.

    PaddleOCR is never imported into this process. Each call launches
    ``sys.executable -c <script>`` and reads a small line protocol back
    from the child's stdout (``OCR_RESULT:TEXT=...``,
    ``OCR_RESULT:CONFIDENCE=...``, ``OCR_RESULT:LINES=...``).

    Attributes:
        available: True once the probe child reported that ``paddleocr``
            could be imported; False otherwise.
    """

    # Marker prefixes of the line protocol printed by the OCR child script.
    _TEXT_PREFIX = "OCR_RESULT:TEXT="
    _CONFIDENCE_PREFIX = "OCR_RESULT:CONFIDENCE="
    _LINES_PREFIX = "OCR_RESULT:LINES="

    # Probe: only checks that paddleocr can be imported by this interpreter.
    _PROBE_SCRIPT = """
import sys
try:
    from paddleocr import PaddleOCR
    print("OCR_READY")
except Exception as e:
    print(f"OCR_ERROR:{e}")
"""

    # Worker: the image path is read from sys.argv[1] instead of being
    # interpolated into the source, so quotes or backslashes in the path can
    # neither break the script nor inject code into it.
    _OCR_SCRIPT = """
import sys
from paddleocr import PaddleOCR
try:
    ocr = PaddleOCR(use_gpu=True, cls=True)
    result = ocr.ocr(sys.argv[1])
    if not result or not result[0]:
        print("OCR_RESULT:EMPTY")
        sys.exit(0)
    extracted_text = []
    total_confidence = 0.0
    line_count = 0
    for line in result[0]:
        try:
            if len(line) == 2:
                bbox, (text, confidence) = line
            elif len(line) >= 1:
                bbox = line[0] if len(line) > 0 else []
                if len(line) > 1:
                    if isinstance(line[1], (list, tuple)) and len(line[1]) >= 2:
                        text, confidence = line[1][0], line[1][1]
                    else:
                        text, confidence = str(line[1]) if len(line) > 1 else "", 0.0
                else:
                    text, confidence = "", 0.0
            else:
                continue
            text_str = str(text) if text is not None else ""
            confidence_float = float(confidence) if isinstance(confidence, (int, float)) else 0.0
            extracted_text.append(text_str)
            total_confidence += confidence_float
            line_count += 1
        except Exception:
            extracted_text.append("")
            total_confidence += 0.0
            line_count += 1
    avg_confidence = total_confidence / line_count if line_count > 0 else 0.0
    # Keep the text on ONE output line: the parent parses stdout line by line.
    full_text = " ".join(" ".join(extracted_text).splitlines())
    print(f"OCR_RESULT:TEXT={full_text}")
    print(f"OCR_RESULT:CONFIDENCE={avg_confidence}")
    print(f"OCR_RESULT:LINES={line_count}")
except Exception as e:
    print(f"OCR_ERROR:{e}")
"""

    def __init__(self):
        """Create the processor and immediately probe for PaddleOCR."""
        self.available = False
        self._initialize()

    def _initialize(self):
        """Set ``self.available`` by probing ``paddleocr`` in a child process.

        Any failure (import error in the child, timeout, launch error) is
        printed and leaves ``available`` False; nothing is raised.
        """
        try:
            result = subprocess.run(
                [sys.executable, "-c", self._PROBE_SCRIPT],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except Exception as e:
            print(f"❌ OCR initialization failed: {e}")
            return

        if "OCR_READY" in result.stdout:
            self.available = True
            print("✅ OCR processor initialized successfully")
            return

        # The probe reports import errors on stdout ("OCR_ERROR:..."), so
        # show both streams; stderr alone is usually empty here.
        details = "\n".join(
            part
            for part in (result.stdout.strip(), result.stderr.strip())
            if part
        )
        print(f"❌ OCR initialization failed: {details}")

    @classmethod
    def _parse_ocr_output(cls, stdout):
        """Turn the child's stdout into the public result dictionary.

        Lines that do not start with a known marker are ignored. Only the
        leading marker is removed from a line, so recognised text that
        happens to contain a marker string is preserved. Unparsable numbers
        fall back to 0.0 / 0.

        Args:
            stdout: Complete captured stdout of the OCR child process.

        Returns:
            ``{"text": str, "confidence": float, "line_count": int}``.
        """
        parsed = {"text": "", "confidence": 0.0, "line_count": 0}
        for line in stdout.split("\n"):
            if line.startswith(cls._TEXT_PREFIX):
                parsed["text"] = line[len(cls._TEXT_PREFIX):].strip()
            elif line.startswith(cls._CONFIDENCE_PREFIX):
                try:
                    parsed["confidence"] = float(
                        line[len(cls._CONFIDENCE_PREFIX):].strip()
                    )
                except ValueError:
                    parsed["confidence"] = 0.0
            elif line.startswith(cls._LINES_PREFIX):
                try:
                    parsed["line_count"] = int(
                        line[len(cls._LINES_PREFIX):].strip()
                    )
                except ValueError:
                    parsed["line_count"] = 0
        return parsed

    def extract_text_from_image(self, image_path):
        """Extract text from an image by running PaddleOCR in a child process.

        Args:
            image_path: Path of the image file to read.

        Returns:
            ``{"text": str, "confidence": float, "line_count": int}``. The
            empty result (``""``, ``0.0``, ``0``) is returned when OCR is
            unavailable, the file does not exist, the child printed no
            result lines, or the child could not be run (timeout, launch
            error). This method does not raise for those cases.
        """
        if not self.available or not os.path.exists(image_path):
            return {"text": "", "confidence": 0.0, "line_count": 0}
        try:
            # Force UTF-8 on the pipe in both directions so non-ASCII text is
            # not lost to the platform's default pipe encoding.
            child_env = dict(os.environ, PYTHONIOENCODING="utf-8")
            result = subprocess.run(
                [sys.executable, "-c", self._OCR_SCRIPT, str(image_path)],
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                env=child_env,
                timeout=60,
            )
            return self._parse_ocr_output(result.stdout)
        except Exception as e:
            print(f"❌ OCR processing failed: {e}")
            return {"text": "", "confidence": 0.0, "line_count": 0}
class FixedImageClassifier:
    """Classify images with OpenCLIP running in a separate interpreter.

    ``open_clip`` and ``torch`` are never imported into this process. Each
    call launches ``<venv_python> -c <script>`` and reads lines of the form
    ``CLASSIFICATION_RESULT:<label>|<confidence>`` from the child's stdout.

    Attributes:
        venv_python: Interpreter used to run the OpenCLIP scripts.
        available: True once the probe child reported that ``open_clip``
            could be imported by ``venv_python``; False otherwise.
    """

    # Default interpreter path, relative to the current working directory.
    DEFAULT_VENV_PYTHON = "openclip_env\\Scripts\\python.exe"

    # Marker prefix of the result lines printed by the classification script.
    _RESULT_PREFIX = "CLASSIFICATION_RESULT:"

    # Probe: only checks that open_clip can be imported by the interpreter.
    _PROBE_SCRIPT = """
try:
    import open_clip
    print("CLASSIFIER_READY")
except Exception as e:
    print(f"CLASSIFIER_ERROR:{e}")
"""

    # Worker: image path and top_k arrive as sys.argv[1] / sys.argv[2]
    # instead of being interpolated into the source, so unusual characters in
    # the path can neither break the script nor inject code into it.
    _CLASSIFY_SCRIPT = """
import sys
import open_clip
import torch
from PIL import Image
try:
    image_path = sys.argv[1]
    top_k = int(sys.argv[2])
    # Load model
    model, _, processor = open_clip.create_model_and_transforms(
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k"
    )
    # Load and process image
    image = Image.open(image_path).convert("RGB")
    image_tensor = processor(image).unsqueeze(0)
    # Move to GPU if available
    if torch.cuda.is_available():
        model = model.cuda()
        image_tensor = image_tensor.cuda()
    # Common labels
    text_labels = [
        "a photo of a bee", "a photo of a flower", "a photo of a person",
        "a photo of a document", "a photo of a chart", "a photo of a diagram",
        "a photo of a table", "a photo of a graph", "a photo of a logo",
        "a photo of a signature", "a photo of a stamp", "a photo of a barcode"
    ]
    # Encode text labels
    text_tokens = open_clip.tokenize(text_labels)
    if torch.cuda.is_available():
        text_tokens = text_tokens.cuda()
    # Inference only: no gradients are needed for either encoder.
    with torch.no_grad():
        image_features = model.encode_image(image_tensor)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        # Calculate similarity
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    # topk() raises when asked for more entries than there are labels.
    values, indices = similarity[0].topk(min(top_k, len(text_labels)))
    for value, index in zip(values, indices):
        label = text_labels[int(index)]
        confidence = float(value)
        print(f"CLASSIFICATION_RESULT:{label}|{confidence}")
except Exception as e:
    print(f"CLASSIFICATION_ERROR:{e}")
"""

    def __init__(self, venv_python=None):
        """Create the classifier and immediately probe for OpenCLIP.

        Args:
            venv_python: Optional path of the interpreter that has
                ``open_clip`` installed. Defaults to ``DEFAULT_VENV_PYTHON``.
        """
        self.venv_python = venv_python or self.DEFAULT_VENV_PYTHON
        self.available = False
        self._initialize()

    def _initialize(self):
        """Set ``self.available`` by probing ``open_clip`` in ``venv_python``.

        A missing interpreter, an import error in the child, a timeout or a
        launch error is printed and leaves ``available`` False; nothing is
        raised.
        """
        try:
            # Check if virtual environment exists and works
            if not os.path.exists(self.venv_python):
                print("❌ OpenCLIP virtual environment not found")
                return
            result = subprocess.run(
                [self.venv_python, "-c", self._PROBE_SCRIPT],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except Exception as e:
            print(f"❌ Classifier initialization failed: {e}")
            return

        if "CLASSIFIER_READY" in result.stdout:
            self.available = True
            print("✅ Image classifier initialized successfully")
            return

        # The probe reports import errors on stdout ("CLASSIFIER_ERROR:..."),
        # so show both streams; stderr alone is usually empty here.
        details = "\n".join(
            part
            for part in (result.stdout.strip(), result.stderr.strip())
            if part
        )
        print(f"❌ Classifier initialization failed: {details}")

    @classmethod
    def _parse_classification_output(cls, stdout):
        """Collect ``{"label", "confidence"}`` dicts from the child's stdout.

        Lines without the result marker, without a ``|`` separator, or with
        a confidence that is not a float are skipped.

        Args:
            stdout: Complete captured stdout of the classification child.

        Returns:
            List of result dictionaries in the order the child printed them.
        """
        results = []
        for line in stdout.split("\n"):
            if not line.startswith(cls._RESULT_PREFIX):
                continue
            label, separator, score = line[len(cls._RESULT_PREFIX):].rpartition("|")
            if not separator:
                continue
            try:
                confidence = float(score)
            except ValueError:
                continue
            results.append({"label": label, "confidence": confidence})
        return results

    def classify_image(self, image_path, top_k=3):
        """Classify an image against the script's fixed label list.

        Args:
            image_path: Path of the image file to classify.
            top_k: Number of best-scoring labels requested. The child caps
                it at the number of labels it knows.

        Returns:
            A list of ``{"label": str, "confidence": float}`` dictionaries in
            the order printed by the child. A single placeholder entry with
            confidence 0.0 is returned instead when classification is
            unavailable or the file is missing
            (``classification_unavailable``), when the child printed no
            result lines (``classification_failed``), or when the child
            could not be run (``classification_error``).
        """
        if not self.available or not os.path.exists(image_path):
            return [{"label": "classification_unavailable", "confidence": 0.0}]
        try:
            # Force UTF-8 on the pipe in both directions so output is not
            # garbled by the platform's default pipe encoding.
            child_env = dict(os.environ, PYTHONIOENCODING="utf-8")
            result = subprocess.run(
                [
                    self.venv_python,
                    "-c",
                    self._CLASSIFY_SCRIPT,
                    str(image_path),
                    str(top_k),
                ],
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                env=child_env,
                timeout=30,
            )
            results = self._parse_classification_output(result.stdout)
            if results:
                return results
            return [{"label": "classification_failed", "confidence": 0.0}]
        except Exception as e:
            print(f"❌ Classification failed: {e}")
            return [{"label": "classification_error", "confidence": 0.0}]
class FixedDocumentProcessor:
    """Extract the images embedded in a .docx file and run OCR and
    classification on each of them.

    Attributes:
        ocr_processor: ``FixedOCRProcessor`` used for text extraction.
        image_classifier: ``FixedImageClassifier`` used for labelling.
    """

    def __init__(self):
        """Create both processors and print their availability."""
        self.ocr_processor = FixedOCRProcessor()
        self.image_classifier = FixedImageClassifier()
        print("🎯 Fixed Document Processor Initialized")
        print(f" OCR: {'✅ Available' if self.ocr_processor.available else '❌ Not Available'}")
        print(f" Classifier: {'✅ Available' if self.image_classifier.available else '❌ Not Available'}")

    @staticmethod
    def _extract_media_images(file_path, temp_dir):
        """Copy every file stored under ``word/media/`` into ``temp_dir``.

        Directory entries (for example a bare ``word/media/`` record) are
        skipped: they have no file name, and trying to open them for writing
        would abort the whole document.

        Returns:
            Paths of the extracted files, in archive order.
        """
        import zipfile

        extracted = []
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            for file_info in zip_ref.filelist:
                if not file_info.filename.startswith('word/media/'):
                    continue
                image_filename = os.path.basename(file_info.filename)
                if file_info.is_dir() or not image_filename:
                    continue
                # Only the base name is used, so an archive member cannot
                # write outside temp_dir.
                image_path = os.path.join(temp_dir, image_filename)
                with zip_ref.open(file_info.filename) as source, open(image_path, 'wb') as target:
                    target.write(source.read())
                extracted.append(image_path)
                print(f"📸 Extracted image: {image_path}")
        return extracted

    async def process_document(self, file_path):
        """Process a Word document's embedded images.

        The method is a coroutine but contains no ``await``: extraction, OCR
        and classification all run synchronously inside it.

        Args:
            file_path: Path of the .docx (zip) file.

        Returns:
            On success ``{"success": True, "content": str, "metadata": dict,
            "images": list}``; ``content`` starts with the document name and
            image count followed by one line per OCR / classification hit.
            On any exception ``{"success": False, "content": "",
            "metadata": {"error": str}, "images": []}``.

            NOTE: each image's ``"path"`` points into a temporary directory
            that has already been removed when this method returns.
        """
        try:
            images = []
            content_parts = []
            with tempfile.TemporaryDirectory() as temp_dir:
                # Extract images from docx
                image_files = self._extract_media_images(file_path, temp_dir)
                print(f"Found {len(image_files)} images in document")

                # Process each image while the temporary files still exist.
                for i, image_path in enumerate(image_files):
                    image_metadata = {"index": i, "path": image_path}

                    # OCR processing
                    if self.ocr_processor.available:
                        ocr_result = self.ocr_processor.extract_text_from_image(image_path)
                        if ocr_result["text"].strip():
                            image_metadata["ocr_text"] = ocr_result["text"]
                            image_metadata["ocr_confidence"] = ocr_result["confidence"]
                            content_parts.append(f"[Image {i+1} OCR]: {ocr_result['text']}")
                            print(f"✅ Image {i+1} OCR: {len(ocr_result['text'])} chars")

                    # Image classification
                    if self.image_classifier.available:
                        classification_results = self.image_classifier.classify_image(image_path)
                        image_metadata["classification"] = classification_results
                        # Placeholder results carry confidence 0.0 and are
                        # kept in the metadata but not reported as content.
                        if classification_results and classification_results[0]["confidence"] > 0:
                            top_label = classification_results[0]["label"]
                            top_confidence = classification_results[0]["confidence"]
                            content_parts.append(f"[Image {i+1} Classification]: {top_label} ({top_confidence:.3f})")
                            print(f"✅ Image {i+1} Classification: {top_label} ({top_confidence:.3f})")
                            # Check for bee
                            if "bee" in top_label.lower():
                                print(f"🎯 BEE DETECTED in image {i+1}!")

                    images.append(image_metadata)

            # Add some basic content
            content_parts.insert(0, f"Processed document: {os.path.basename(file_path)}")
            content_parts.insert(1, f"Total images: {len(images)}")
            full_content = "\n".join(content_parts)
            return {
                "success": True,
                "content": full_content,
                "metadata": {
                    "file_type": "word",
                    "images_count": len(images),
                    "processed_with_ocr": self.ocr_processor.available,
                    "processed_with_classification": self.image_classifier.available
                },
                "images": images
            }
        except Exception as e:
            print(f"❌ Document processing failed: {e}")
            return {
                "success": False,
                "content": "",
                "metadata": {"error": str(e)},
                "images": []
            }
async def test_fixed_solution():
    """Run the document processor on ``test.docx`` and print a summary.

    Builds a ``FixedDocumentProcessor``, processes ``test.docx`` from the
    current working directory and reports whether OCR produced text,
    whether classification produced a positive-confidence label, and
    whether any top label contains "bee".

    Returns:
        The result dictionary from ``process_document`` on success, or
        ``None`` when the test file is missing or processing failed.
    """
    print("🧪 TESTING FIXED SOLUTION")
    print("=" * 50)
    pipeline = FixedDocumentProcessor()

    # The sample document is looked up relative to the working directory.
    docx_name = "test.docx"
    if not os.path.exists(docx_name):
        print(f"❌ Test file not found: {docx_name}")
        return None

    print(f"\n📄 PROCESSING: {docx_name}")
    outcome = await pipeline.process_document(docx_name)
    if not outcome["success"]:
        print(f"❌ Processing failed: {outcome['metadata'].get('error', 'Unknown error')}")
        return None

    print(f"✅ Processing successful")
    print(f"📊 Metadata: {outcome['metadata']}")

    # Derive the three indicators from the per-image metadata.
    entries = outcome["images"]
    ocr_flags = [
        "ocr_text" in entry and bool(entry["ocr_text"].strip())
        for entry in entries
    ]
    top_hits = [
        entry["classification"][0]
        for entry in entries
        if "classification" in entry
        and entry["classification"]
        and entry["classification"][0]["confidence"] > 0
    ]
    bee_flags = ["bee" in hit["label"].lower() for hit in top_hits]

    ocr_working = any(ocr_flags)
    classification_working = bool(top_hits)
    bee_found = any(bee_flags)

    print(f"\n🎯 FINAL RESULTS:")
    print(f" OCR: {'✅ WORKING' if ocr_working else '❌ FAILED'}")
    print(f" Classification: {'✅ WORKING' if classification_working else '❌ FAILED'}")
    print(f" Bee Detection: {'✅ SUCCESS' if bee_found else '❌ NOT FOUND'}")
    print(f" Dependency Isolation: {'✅ ACHIEVED' if ocr_working and classification_working else '❌ FAILED'}")
    return outcome
if __name__ == "__main__":
    # Script entry point: drive the async smoke test to completion.
    asyncio.run(test_fixed_solution())