Files
railseek6/fix_indexing.py

299 lines
13 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Fix indexing to include image classification results in searchable content
"""
import asyncio
import sys
import os
from pathlib import Path
# Add paths
sys.path.insert(0, "LightRAG-main")
def test_current_indexing():
    """Process test.docx and report what content would be indexed.

    Prints content previews plus any image-metadata and 'bee' lines found
    in the extracted text. All failures (including a missing lightrag
    install) are caught and printed rather than raised. Returns None.
    """
    print("🔍 Testing Current Indexing Behavior")
    print("=" * 50)
    try:
        from lightrag.document_processor import get_document_processor
        processor = get_document_processor()
        # Process test document
        test_file = "test.docx"
        if not os.path.exists(test_file):
            print(f"❌ Test file not found: {test_file}")
            return
        print(f"📄 Processing: {test_file}")
        result = asyncio.run(processor.process_document(test_file))
        print(f"✅ Processing Success: {result.success}")
        print(f"📊 Metadata: {result.metadata}")
        print(f"📝 Content Length: {len(result.content)} characters")
        # Show what content is actually being indexed
        print(f"\n📋 CONTENT PREVIEW (first 500 chars):")
        print(result.content[:500])
        print(f"\n📋 CONTENT PREVIEW (last 500 chars):")
        print(result.content[-500:])
        # BUG FIX: 'lines' was previously defined only inside the image
        # branch below, so the bee search raised NameError whenever no
        # image metadata was present. Split once, up front, for both checks.
        lines = result.content.split('\n')
        # Check for image-related content
        print(f"\n🔍 SEARCHING FOR IMAGE CONTENT:")
        if "[Image" in result.content:
            print("✅ Found image metadata in content")
            # Extract all image-related lines
            image_lines = [line for line in lines if '[Image' in line]
            for line in image_lines:
                print(f" {line}")
        else:
            print("❌ No image metadata found in content")
        # Check for bee-related content
        print(f"\n🐝 SEARCHING FOR BEE CONTENT:")
        if 'bee' in result.content.lower():
            print("✅ Found 'bee' in content")
            bee_lines = [line for line in lines if 'bee' in line.lower()]
            for line in bee_lines:
                print(f" {line}")
        else:
            print("❌ No 'bee' found in content")
    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
def fix_document_processor():
    """Patch LightRAG-main/lightrag/document_processor.py in place so OCR
    text and all image-classification labels are appended to the main
    searchable content.

    Raises FileNotFoundError/OSError if the processor source is missing.
    """
    print("\n🔧 Fixing Document Processor for Better Indexing")
    print("=" * 50)
    # Read the current document processor
    with open("LightRAG-main/lightrag/document_processor.py", "r", encoding="utf-8") as f:
        content = f.read()
    # NOTE(review): these snippets must match the target file byte-for-byte
    # (including indentation) or the replacements will not fire -- confirm
    # against the current document_processor.py.
    old_method = '''            # OCR processing - ensure it works properly
            if self.ocr_processor.ocr_available:
                try:
                    logger.info(f"Running OCR on image {i+1}")
                    ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                    logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")
                    if ocr_result["text"].strip():
                        image_metadata["ocr_text"] = ocr_result["text"]
                        image_metadata["ocr_confidence"] = ocr_result["confidence"]
                        additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                    else:
                        logger.warning(f"OCR returned empty text for image {i+1}")
                except Exception as ocr_error:
                    logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
                    image_metadata["ocr_error"] = str(ocr_error)'''
    new_method = '''            # OCR processing - ensure it works properly
            if self.ocr_processor.ocr_available:
                try:
                    logger.info(f"Running OCR on image {i+1}")
                    ocr_result = self.ocr_processor.extract_text_from_image(temp_path)
                    logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}")
                    if ocr_result["text"].strip():
                        image_metadata["ocr_text"] = ocr_result["text"]
                        image_metadata["ocr_confidence"] = ocr_result["confidence"]
                        additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}")
                        # Add OCR text directly to main content for better searchability
                        additional_content.append(ocr_result["text"])
                    else:
                        logger.warning(f"OCR returned empty text for image {i+1}")
                except Exception as ocr_error:
                    logger.error(f"OCR processing failed for image {i+1}: {ocr_error}")
                    image_metadata["ocr_error"] = str(ocr_error)'''
    # BUG FIX: str.replace() silently no-ops when the snippet is absent,
    # so the script would claim success while patching nothing. Check first.
    if old_method in content:
        content = content.replace(old_method, new_method)
    else:
        print("⚠️ OCR snippet not found in document_processor.py - OCR patch skipped")
    # Also fix the classification part to add more searchable content
    old_classification = '''            # Image classification
            if self.image_classifier and self.image_classifier.available:
                try:
                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                    image_metadata["classification"] = classification_results
                    # Add classification to content for indexing
                    top_label = classification_results[0]["label"] if classification_results else "unknown"
                    image_metadata["primary_classification"] = top_label
                    additional_content.append(f"[Image {i+1} Classification]: {top_label}")
                except Exception as classify_error:
                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                    image_metadata["classification_error"] = str(classify_error)'''
    new_classification = '''            # Image classification
            if self.image_classifier and self.image_classifier.available:
                try:
                    classification_results = self.image_classifier.classify_image(temp_path, top_k=3)
                    image_metadata["classification"] = classification_results
                    # Add classification to content for indexing
                    top_label = classification_results[0]["label"] if classification_results else "unknown"
                    image_metadata["primary_classification"] = top_label
                    additional_content.append(f"[Image {i+1} Classification]: {top_label}")
                    # Add all classification labels for better searchability
                    for j, cls in enumerate(classification_results):
                        additional_content.append(f"Image {i+1} classified as: {cls['label']} with confidence {cls['confidence']:.3f}")
                except Exception as classify_error:
                    logger.error(f"Image classification failed for image {i+1}: {classify_error}")
                    image_metadata["classification_error"] = str(classify_error)'''
    if old_classification in content:
        content = content.replace(old_classification, new_classification)
    else:
        print("⚠️ Classification snippet not found in document_processor.py - classification patch skipped")
    # Write the fixed content back
    with open("LightRAG-main/lightrag/document_processor.py", "w", encoding="utf-8") as f:
        f.write(content)
    print("✅ Document processor updated for better indexing")
def create_enhanced_test():
"""Create a test that simulates the full upload and search workflow"""
print("\n🚀 Creating Enhanced Search Test")
print("=" * 50)
test_code = '''
"""
Enhanced test that simulates upload, indexing, and search
"""
import asyncio
import sys
import os
from pathlib import Path
# Add paths
sys.path.insert(0, "LightRAG-main")
async def test_full_workflow():
"""Test the complete workflow including simulated search"""
print("🔍 TESTING COMPLETE WORKFLOW WITH SEARCH")
print("=" * 60)
try:
from lightrag.document_processor import get_document_processor
processor = get_document_processor()
# Process test document
test_file = "test.docx"
if not os.path.exists(test_file):
print(f"❌ Test file not found: {test_file}")
return
print(f"📄 Processing: {test_file}")
result = await processor.process_document(test_file)
if not result.success:
print(f"❌ Processing failed: {result.error}")
return
print(f"✅ Processing Success")
print(f"📊 Metadata: {result.metadata}")
# Simulate indexing and search
print(f"\\n🔍 SIMULATING INDEXING AND SEARCH")
print("=" * 40)
# Extract all searchable content
search_content = result.content.lower()
# Test various search queries
test_queries = [
"bee", "insect", "animal", "clipart", "image",
"docker", "windows", "autologin", "configuration"
]
print("📋 SEARCH RESULTS:")
for query in test_queries:
if query in search_content:
print(f"'{query}': FOUND in indexed content")
# Show context
idx = search_content.find(query)
context = result.content[max(0, idx-50):min(len(result.content), idx+50)]
print(f" Context: ...{context}...")
else:
print(f"'{query}': NOT FOUND in indexed content")
# Specifically check for image classifications
print(f"\\n🖼 IMAGE CLASSIFICATION SEARCH:")
bee_found = False
for i, img in enumerate(result.images):
if 'primary_classification' in img:
classification = img['primary_classification'].lower()
print(f" Image {i+1}: {classification}")
if 'bee' in classification:
bee_found = True
print(f" 🎯 BEE DETECTED in image {i+1}")
else:
print(f" Image {i+1}: No classification available")
if not bee_found:
print(" ❌ No bee detected in any image classifications")
# Check if bee appears in any OCR text
print(f"\\n🔤 OCR TEXT ANALYSIS:")
bee_in_ocr = False
for i, img in enumerate(result.images):
if 'ocr_text' in img and img['ocr_text']:
ocr_text = img['ocr_text'].lower()
if 'bee' in ocr_text:
bee_in_ocr = True
print(f" ✅ Image {i+1} OCR contains 'bee': {ocr_text[:100]}...")
else:
print(f" Image {i+1} OCR: {ocr_text[:50]}..." if ocr_text else " Image {i+1}: No OCR text")
else:
print(f" Image {i+1}: No OCR text available")
print(f"\\n🎯 FINAL BEE DETECTION STATUS:")
if bee_found or bee_in_ocr or 'bee' in search_content:
print(" ✅ BEE CONTENT IS SEARCHABLE AND INDEXED")
else:
print(" ❌ BEE CONTENT IS NOT PROPERLY INDEXED")
print(" 📝 Recommendations:")
print(" - Ensure image classifications are included in main content")
print(" - Add classification labels to searchable text")
print(" - Include OCR text from images in search index")
except Exception as e:
print(f"❌ Test failed: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_full_workflow())
'''
with open("enhanced_search_test.py", "w", encoding="utf-8") as f:
f.write(test_code)
print("✅ Created enhanced search test")
def main():
    """Run all fixes: diagnose current indexing, patch the document
    processor, and generate the follow-up search test script."""
    print("🎯 FIXING INDEXING FOR BEE DETECTION")
    print("=" * 60)
    # Test current state
    test_current_indexing()
    # Fix the document processor
    fix_document_processor()
    # Create enhanced test
    create_enhanced_test()
    # BUG FIX: these two prints used "\\n" (a literal backslash-n, evidently
    # pasted from inside the triple-quoted template above); outside a nested
    # string literal a real newline escape is intended.
    print(f"\n✅ FIXES APPLIED:")
    print(" - Enhanced OCR text inclusion in searchable content")
    print(" - Improved image classification metadata indexing")
    print(" - Created comprehensive search test")
    print(f"\n🚀 Run the test: python enhanced_search_test.py")
# Script entry point: diagnose, patch, and emit the verification script.
if __name__ == "__main__":
    main()