299 lines
13 KiB
Python
299 lines
13 KiB
Python
"""
|
||
Fix indexing to include image classification results in searchable content
|
||
"""
|
||
|
||
import asyncio
|
||
import sys
|
||
import os
|
||
from pathlib import Path
|
||
|
||
# Add paths
|
||
sys.path.insert(0, "LightRAG-main")
|
||
|
||
def test_current_indexing():
    """Test what content is currently being indexed.

    Runs ``test.docx`` through the project's document processor, then prints
    the processing metadata, previews of the extracted content, and whether
    image metadata and the word 'bee' appear in the indexed text.

    Returns:
        None. All output is printed; any failure (including a missing
        lightrag installation) is caught, reported, and swallowed.
    """
    print("🔍 Testing Current Indexing Behavior")
    print("=" * 50)

    try:
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        # Process test document
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return

        print(f"📄 Processing: {test_file}")
        result = asyncio.run(processor.process_document(test_file))

        print(f"✅ Processing Success: {result.success}")
        print(f"📊 Metadata: {result.metadata}")
        print(f"📝 Content Length: {len(result.content)} characters")

        # Show what content is actually being indexed
        print("\n📋 CONTENT PREVIEW (first 500 chars):")
        print(result.content[:500])

        print("\n📋 CONTENT PREVIEW (last 500 chars):")
        print(result.content[-500:])

        # BUG FIX: `lines` used to be computed only inside the "[Image"
        # branch below, so the bee search crashed with a NameError whenever
        # no image metadata was present. Compute it once, up front.
        lines = result.content.split('\n')

        # Check for image-related content
        print("\n🔍 SEARCHING FOR IMAGE CONTENT:")
        if "[Image" in result.content:
            print("✅ Found image metadata in content")
            # Extract all image-related lines
            image_lines = [line for line in lines if '[Image' in line]
            for line in image_lines:
                print(f"   {line}")
        else:
            print("❌ No image metadata found in content")

        # Check for bee-related content
        print("\n🐝 SEARCHING FOR BEE CONTENT:")
        if 'bee' in result.content.lower():
            print("✅ Found 'bee' in content")
            bee_lines = [line for line in lines if 'bee' in line.lower()]
            for line in bee_lines:
                print(f"   {line}")
        else:
            print("❌ No 'bee' found in content")

    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
||
def fix_document_processor():
    """Fix the document processor to include image classifications in searchable content.

    Patches ``LightRAG-main/lightrag/document_processor.py`` in place by
    literal-string replacement: (1) appends raw OCR text to the indexed
    content, and (2) appends every classification label (not just the top
    one) to the indexed content.

    Raises:
        OSError: if the target file cannot be read or written
            (e.g. FileNotFoundError when LightRAG-main is absent).
    """
    print("\n🔧 Fixing Document Processor for Better Indexing")
    print("=" * 50)

    processor_path = "LightRAG-main/lightrag/document_processor.py"

    # Read the current document processor
    with open(processor_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Find the _extract_and_process_images OCR section and enhance it.
    # NOTE(review): these literals must match the target file byte-for-byte
    # (including indentation) for the replacement to take effect.
    old_method = '''            # OCR processing - ensure it works properly
            if self.ocr_processor.ocr_available:
                try:
                    logger.info(f"Running OCR on image {i+1}")
                    ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                    logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")

                    if ocr_result["text"].strip():
                        image_metadata["ocr_text"] = ocr_result["text"]
                        image_metadata["ocr_confidence"] = ocr_result["confidence"]
                        additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                    else:
                        logger.warning(f"OCR returned empty text for image {i+1}")
                except Exception as ocr_error:
                    logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
                    image_metadata["ocr_error"] = str(ocr_error)'''

    new_method = '''            # OCR processing - ensure it works properly
            if self.ocr_processor.ocr_available:
                try:
                    logger.info(f"Running OCR on image {i+1}")
                    ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                    logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")

                    if ocr_result["text"].strip():
                        image_metadata["ocr_text"] = ocr_result["text"]
                        image_metadata["ocr_confidence"] = ocr_result["confidence"]
                        additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                        # Add OCR text directly to main content for better searchability
                        additional_content.append(ocr_result["text"])
                    else:
                        logger.warning(f"OCR returned empty text for image {i+1}")
                except Exception as ocr_error:
                    logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
                    image_metadata["ocr_error"] = str(ocr_error)'''

    # BUG FIX: str.replace silently does nothing when the pattern has
    # drifted from the target file; previously this function then claimed
    # success regardless. Detect and report a no-op replacement.
    patched = content.replace(old_method, new_method)
    if patched == content:
        print("⚠️ OCR snippet not found in target file - OCR patch NOT applied")
    content = patched

    # Also fix the classification part to add more searchable content
    old_classification = '''            # Image classification
            if self.image_classifier and self.image_classifier.available:
                try:
                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                    image_metadata["classification"] = classification_results
                    # Add classification to content for indexing
                    top_label = classification_results[0]["label"] if classification_results else "unknown"
                    image_metadata["primary_classification"] = top_label
                    additional_content.append(f"[Image {i+1} Classification]: {top_label}")
                except Exception as classify_error:
                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                    image_metadata["classification_error"] = str(classify_error)'''

    new_classification = '''            # Image classification
            if self.image_classifier and self.image_classifier.available:
                try:
                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                    image_metadata["classification"] = classification_results
                    # Add classification to content for indexing
                    top_label = classification_results[0]["label"] if classification_results else "unknown"
                    image_metadata["primary_classification"] = top_label
                    additional_content.append(f"[Image {i+1} Classification]: {top_label}")
                    # Add all classification labels for better searchability
                    for j, cls in enumerate(classification_results):
                        additional_content.append(f"Image {i+1} classified as: {cls['label']} with confidence {cls['confidence']:.3f}")
                except Exception as classify_error:
                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                    image_metadata["classification_error"] = str(classify_error)'''

    patched = content.replace(old_classification, new_classification)
    if patched == content:
        print("⚠️ Classification snippet not found in target file - classification patch NOT applied")
    content = patched

    # Write the fixed content back
    with open(processor_path, "w", encoding="utf-8") as f:
        f.write(content)

    print("✅ Document processor updated for better indexing")
||
def create_enhanced_test():
    """Create a test that simulates the full upload and search workflow.

    Writes a standalone script ``enhanced_search_test.py`` to the current
    directory. The emitted script processes ``test.docx`` and checks that
    image classifications and OCR text are searchable.

    BUG FIX: the emitted OCR-analysis print previously used a plain string
    (no ``f`` prefix) in its else branch, so the generated script printed
    the literal text ``{i+1}`` instead of the image number.
    """
    print("\n🚀 Creating Enhanced Search Test")
    print("=" * 50)

    test_code = '''
"""
Enhanced test that simulates upload, indexing, and search
"""

import asyncio
import sys
import os
from pathlib import Path

# Add paths
sys.path.insert(0, "LightRAG-main")

async def test_full_workflow():
    """Test the complete workflow including simulated search"""
    print("🔍 TESTING COMPLETE WORKFLOW WITH SEARCH")
    print("=" * 60)

    try:
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()

        # Process test document
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return

        print(f"📄 Processing: {test_file}")
        result = await processor.process_document(test_file)

        if not result.success:
            print(f"❌ Processing failed: {result.error}")
            return

        print(f"✅ Processing Success")
        print(f"📊 Metadata: {result.metadata}")

        # Simulate indexing and search
        print(f"\\n🔍 SIMULATING INDEXING AND SEARCH")
        print("=" * 40)

        # Extract all searchable content
        search_content = result.content.lower()

        # Test various search queries
        test_queries = [
            "bee", "insect", "animal", "clipart", "image",
            "docker", "windows", "autologin", "configuration"
        ]

        print("📋 SEARCH RESULTS:")
        for query in test_queries:
            if query in search_content:
                print(f"   ✅ '{query}': FOUND in indexed content")
                # Show context
                idx = search_content.find(query)
                context = result.content[max(0, idx-50):min(len(result.content), idx+50)]
                print(f"      Context: ...{context}...")
            else:
                print(f"   ❌ '{query}': NOT FOUND in indexed content")

        # Specifically check for image classifications
        print(f"\\n🖼️ IMAGE CLASSIFICATION SEARCH:")
        bee_found = False
        for i, img in enumerate(result.images):
            if 'primary_classification' in img:
                classification = img['primary_classification'].lower()
                print(f"   Image {i+1}: {classification}")
                if 'bee' in classification:
                    bee_found = True
                    print(f"   🎯 BEE DETECTED in image {i+1}")
            else:
                print(f"   Image {i+1}: No classification available")

        if not bee_found:
            print("   ❌ No bee detected in any image classifications")

        # Check if bee appears in any OCR text
        print(f"\\n🔤 OCR TEXT ANALYSIS:")
        bee_in_ocr = False
        for i, img in enumerate(result.images):
            if 'ocr_text' in img and img['ocr_text']:
                ocr_text = img['ocr_text'].lower()
                if 'bee' in ocr_text:
                    bee_in_ocr = True
                    print(f"   ✅ Image {i+1} OCR contains 'bee': {ocr_text[:100]}...")
                else:
                    print(f"   Image {i+1} OCR: {ocr_text[:50]}..." if ocr_text else f"   Image {i+1}: No OCR text")
            else:
                print(f"   Image {i+1}: No OCR text available")

        print(f"\\n🎯 FINAL BEE DETECTION STATUS:")
        if bee_found or bee_in_ocr or 'bee' in search_content:
            print("   ✅ BEE CONTENT IS SEARCHABLE AND INDEXED")
        else:
            print("   ❌ BEE CONTENT IS NOT PROPERLY INDEXED")
            print("   📝 Recommendations:")
            print("      - Ensure image classifications are included in main content")
            print("      - Add classification labels to searchable text")
            print("      - Include OCR text from images in search index")

    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(test_full_workflow())
'''

    with open("enhanced_search_test.py", "w", encoding="utf-8") as f:
        f.write(test_code)

    print("✅ Created enhanced search test")
||
def main():
    """Run all fixes: diagnose current indexing, patch the processor, emit a test.

    BUG FIX: the summary prints used ``"\\\\n"`` (an escaped backslash) in
    plain f-strings, printing the two characters ``\\n`` literally instead
    of a newline; other prints in this file (e.g. test_current_indexing)
    correctly use a real newline escape.
    """
    print("🎯 FIXING INDEXING FOR BEE DETECTION")
    print("=" * 60)

    # Step 1: inspect what the processor currently indexes.
    test_current_indexing()

    # Step 2: patch the document processor for better searchability.
    fix_document_processor()

    # Step 3: generate the standalone follow-up test script.
    create_enhanced_test()

    print("\n✅ FIXES APPLIED:")
    print("   - Enhanced OCR text inclusion in searchable content")
    print("   - Improved image classification metadata indexing")
    print("   - Created comprehensive search test")
    print("\n🚀 Run the test: python enhanced_search_test.py")
||
# Script entry point: run diagnosis, patching, and test generation in order.
if __name__ == "__main__":
    main()