""" Fix indexing to include image classification results in searchable content """ import asyncio import sys import os from pathlib import Path # Add paths sys.path.insert(0, "LightRAG-main") def test_current_indexing(): """Test what content is currently being indexed""" print("šŸ” Testing Current Indexing Behavior") print("=" * 50) try: from lightrag.document_processor import get_document_processor processor = get_document_processor() # Process test document test_file = "test.docx" if not os.path.exists(test_file): print(f"āŒ Test file not found: {test_file}") return print(f"šŸ“„ Processing: {test_file}") result = asyncio.run(processor.process_document(test_file)) print(f"āœ… Processing Success: {result.success}") print(f"šŸ“Š Metadata: {result.metadata}") print(f"šŸ“ Content Length: {len(result.content)} characters") # Show what content is actually being indexed print(f"\nšŸ“‹ CONTENT PREVIEW (first 500 chars):") print(result.content[:500]) print(f"\nšŸ“‹ CONTENT PREVIEW (last 500 chars):") print(result.content[-500:]) # Check for image-related content print(f"\nšŸ” SEARCHING FOR IMAGE CONTENT:") if "[Image" in result.content: print("āœ… Found image metadata in content") # Extract all image-related lines lines = result.content.split('\n') image_lines = [line for line in lines if '[Image' in line] for line in image_lines: print(f" {line}") else: print("āŒ No image metadata found in content") # Check for bee-related content print(f"\nšŸ SEARCHING FOR BEE CONTENT:") if 'bee' in result.content.lower(): print("āœ… Found 'bee' in content") bee_lines = [line for line in lines if 'bee' in line.lower()] for line in bee_lines: print(f" {line}") else: print("āŒ No 'bee' found in content") except Exception as e: print(f"āŒ Test failed: {e}") import traceback traceback.print_exc() def fix_document_processor(): """Fix the document processor to include image classifications in searchable content""" print("\nšŸ”§ Fixing Document Processor for Better Indexing") print("=" * 50) # Read the current document processor with open("LightRAG-main/lightrag/document_processor.py", "r", encoding="utf-8") as f: content = f.read() # Find the _extract_and_process_images method and enhance it old_method = ''' # OCR processing - ensure it works properly if self.ocr_processor.ocr_available: try: logger.info(f"Running OCR on image {i+1}") ocr_result = self.ocr_processor.extract_text_from_image(temp_path) logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}") if ocr_result["text"].strip(): image_metadata["ocr_text"] = ocr_result["text"] image_metadata["ocr_confidence"] = ocr_result["confidence"] additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}") else: logger.warning(f"OCR returned empty text for image {i+1}") except Exception as ocr_error: logger.error(f"OCR processing failed for image {i+1}: {ocr_error}") image_metadata["ocr_error"] = str(ocr_error)''' new_method = ''' # OCR processing - ensure it works properly if self.ocr_processor.ocr_available: try: logger.info(f"Running OCR on image {i+1}") ocr_result = self.ocr_processor.extract_text_from_image(temp_path) logger.info(f"OCR result for image {i+1}: {len(ocr_result['text'])} characters, confidence: {ocr_result['confidence']}") if ocr_result["text"].strip(): image_metadata["ocr_text"] = ocr_result["text"] image_metadata["ocr_confidence"] = ocr_result["confidence"] additional_content.append(f"[Image {i+1} OCR Text]: {ocr_result['text']}") # Add OCR text directly to main content for better searchability additional_content.append(ocr_result["text"]) else: logger.warning(f"OCR returned empty text for image {i+1}") except Exception as ocr_error: logger.error(f"OCR processing failed for image {i+1}: {ocr_error}") image_metadata["ocr_error"] = str(ocr_error)''' content = content.replace(old_method, new_method) # Also fix the classification part to add more searchable content old_classification = ''' # Image classification if self.image_classifier and self.image_classifier.available: try: classification_results = self.image_classifier.classify_image(temp_path, top_k=3) image_metadata["classification"] = classification_results # Add classification to content for indexing top_label = classification_results[0]["label"] if classification_results else "unknown" image_metadata["primary_classification"] = top_label additional_content.append(f"[Image {i+1} Classification]: {top_label}") except Exception as classify_error: logger.error(f"Image classification failed for image {i+1}: {classify_error}") image_metadata["classification_error"] = str(classify_error)''' new_classification = ''' # Image classification if self.image_classifier and self.image_classifier.available: try: classification_results = self.image_classifier.classify_image(temp_path, top_k=3) image_metadata["classification"] = classification_results # Add classification to content for indexing top_label = classification_results[0]["label"] if classification_results else "unknown" image_metadata["primary_classification"] = top_label additional_content.append(f"[Image {i+1} Classification]: {top_label}") # Add all classification labels for better searchability for j, cls in enumerate(classification_results): additional_content.append(f"Image {i+1} classified as: {cls['label']} with confidence {cls['confidence']:.3f}") except Exception as classify_error: logger.error(f"Image classification failed for image {i+1}: {classify_error}") image_metadata["classification_error"] = str(classify_error)''' content = content.replace(old_classification, new_classification) # Write the fixed content back with open("LightRAG-main/lightrag/document_processor.py", "w", encoding="utf-8") as f: f.write(content) print("āœ… Document processor updated for better indexing") def create_enhanced_test(): """Create a test that simulates the full upload and search workflow""" print("\nšŸš€ Creating Enhanced Search Test") print("=" * 50) test_code = ''' """ Enhanced test that simulates upload, indexing, and search """ import asyncio import sys import os from pathlib import Path # Add paths sys.path.insert(0, "LightRAG-main") async def test_full_workflow(): """Test the complete workflow including simulated search""" print("šŸ” TESTING COMPLETE WORKFLOW WITH SEARCH") print("=" * 60) try: from lightrag.document_processor import get_document_processor processor = get_document_processor() # Process test document test_file = "test.docx" if not os.path.exists(test_file): print(f"āŒ Test file not found: {test_file}") return print(f"šŸ“„ Processing: {test_file}") result = await processor.process_document(test_file) if not result.success: print(f"āŒ Processing failed: {result.error}") return print(f"āœ… Processing Success") print(f"šŸ“Š Metadata: {result.metadata}") # Simulate indexing and search print(f"\\nšŸ” SIMULATING INDEXING AND SEARCH") print("=" * 40) # Extract all searchable content search_content = result.content.lower() # Test various search queries test_queries = [ "bee", "insect", "animal", "clipart", "image", "docker", "windows", "autologin", "configuration" ] print("šŸ“‹ SEARCH RESULTS:") for query in test_queries: if query in search_content: print(f" āœ… '{query}': FOUND in indexed content") # Show context idx = search_content.find(query) context = result.content[max(0, idx-50):min(len(result.content), idx+50)] print(f" Context: ...{context}...") else: print(f" āŒ '{query}': NOT FOUND in indexed content") # Specifically check for image classifications print(f"\\nšŸ–¼ļø IMAGE CLASSIFICATION SEARCH:") bee_found = False for i, img in enumerate(result.images): if 'primary_classification' in img: classification = img['primary_classification'].lower() print(f" Image {i+1}: {classification}") if 'bee' in classification: bee_found = True print(f" šŸŽÆ BEE DETECTED in image {i+1}") else: print(f" Image {i+1}: No classification available") if not bee_found: print(" āŒ No bee detected in any image classifications") # Check if bee appears in any OCR text print(f"\\nšŸ”¤ OCR TEXT ANALYSIS:") bee_in_ocr = False for i, img in enumerate(result.images): if 'ocr_text' in img and img['ocr_text']: ocr_text = img['ocr_text'].lower() if 'bee' in ocr_text: bee_in_ocr = True print(f" āœ… Image {i+1} OCR contains 'bee': {ocr_text[:100]}...") else: print(f" Image {i+1} OCR: {ocr_text[:50]}..." if ocr_text else " Image {i+1}: No OCR text") else: print(f" Image {i+1}: No OCR text available") print(f"\\nšŸŽÆ FINAL BEE DETECTION STATUS:") if bee_found or bee_in_ocr or 'bee' in search_content: print(" āœ… BEE CONTENT IS SEARCHABLE AND INDEXED") else: print(" āŒ BEE CONTENT IS NOT PROPERLY INDEXED") print(" šŸ“ Recommendations:") print(" - Ensure image classifications are included in main content") print(" - Add classification labels to searchable text") print(" - Include OCR text from images in search index") except Exception as e: print(f"āŒ Test failed: {e}") import traceback traceback.print_exc() if __name__ == "__main__": asyncio.run(test_full_workflow()) ''' with open("enhanced_search_test.py", "w", encoding="utf-8") as f: f.write(test_code) print("āœ… Created enhanced search test") def main(): """Run all fixes""" print("šŸŽÆ FIXING INDEXING FOR BEE DETECTION") print("=" * 60) # Test current state test_current_indexing() # Fix the document processor fix_document_processor() # Create enhanced test create_enhanced_test() print(f"\\nāœ… FIXES APPLIED:") print(" - Enhanced OCR text inclusion in searchable content") print(" - Improved image classification metadata indexing") print(" - Created comprehensive search test") print(f"\\nšŸš€ Run the test: python enhanced_search_test.py") if __name__ == "__main__": main()