""" Diagnostic Script for Web UI Bee Classification Issue Investigates why Web UI doesn't detect bee classification while test.py can """ import os import sys import requests import json from pathlib import Path # Configuration LIGHTRAG_URL = "http://localhost:3015" API_KEY = "jleu1212" HEADERS = {"X-API-Key": API_KEY} def check_server_status(): """Check if server is running and get basic info""" print("šŸ” CHECKING SERVER STATUS") print("=" * 50) try: response = requests.get(f"{LIGHTRAG_URL}/", headers=HEADERS, timeout=10) if response.status_code == 200: print("āœ… Server is running on port 3015") return True else: print(f"āŒ Server status: {response.status_code}") return False except Exception as e: print(f"āŒ Server not reachable: {e}") return False def list_documents(): """List all documents in the system""" print("\nšŸ“„ LISTING DOCUMENTS IN SYSTEM") print("=" * 50) try: response = requests.get(f"{LIGHTRAG_URL}/documents", headers=HEADERS, timeout=10) if response.status_code == 200: documents = response.json() print(f"šŸ“Š Found {len(documents)} documents:") for doc in documents: print(f" - {doc.get('filename', 'Unknown')}") print(f" Status: {doc.get('status', 'Unknown')}") print(f" ID: {doc.get('id', 'Unknown')}") # Check if it's test.docx if 'test.docx' in doc.get('filename', '').lower(): print(" šŸŽÆ THIS IS THE TEST DOCUMENT") return documents else: print(f"āŒ Failed to get documents: {response.status_code}") return [] except Exception as e: print(f"āŒ Error listing documents: {e}") return [] def check_document_content(doc_id): """Check the actual content of a document""" print(f"\nšŸ” CHECKING DOCUMENT CONTENT FOR ID: {doc_id}") print("=" * 50) try: # Try to get document details response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}", headers=HEADERS, timeout=10) if response.status_code == 200: doc_detail = response.json() print(f"šŸ“‹ Document details:") print(f" - Filename: {doc_detail.get('filename')}") print(f" - Status: {doc_detail.get('status')}") print(f" - Metadata: {doc_detail.get('metadata', {})}") # Try to get chunks to see actual content chunks_response = requests.get(f"{LIGHTRAG_URL}/documents/{doc_id}/chunks", headers=HEADERS, timeout=10) if chunks_response.status_code == 200: chunks = chunks_response.json() print(f"šŸ“ Found {len(chunks)} chunks:") bee_found = False for i, chunk in enumerate(chunks[:5]): # Check first 5 chunks content = chunk.get('content', '') print(f" Chunk {i+1}: {content[:200]}...") # Check for bee keywords if 'bee' in content.lower() or 'classification' in content.lower(): bee_found = True print(f" šŸŽÆ BEE CLASSIFICATION FOUND IN CHUNK {i+1}") if not bee_found: print(" āŒ No bee classification found in chunks") return bee_found else: print(f"āŒ Could not get chunks: {chunks_response.status_code}") else: print(f"āŒ Could not get document details: {response.status_code}") except Exception as e: print(f"āŒ Error checking document content: {e}") return False def test_search_methods(): """Test different search methods to see which one works""" print("\nšŸ” TESTING DIFFERENT SEARCH METHODS") print("=" * 50) search_queries = ["bee", "Bee", "classification", "image", "photo of a bee", "Entity: Bee"] for query in search_queries: print(f"\nšŸ” Searching for: '{query}'") # Method 1: Standard search try: search_payload = { "query": query, "top_k": 10, "mode": "local" } response = requests.post( f"{LIGHTRAG_URL}/search", json=search_payload, headers=HEADERS, timeout=30 ) if response.status_code == 200: results = response.json() if results.get('results'): print(f"āœ… STANDARD SEARCH: Found {len(results['results'])} results") for result in results['results']: content = result.get('content', '') score = result.get('score', 0) print(f" - Score: {score:.4f}, Content: {content[:100]}...") else: print(f"āŒ STANDARD SEARCH: No results") else: print(f"āŒ STANDARD SEARCH failed: {response.status_code}") except Exception as e: print(f"āŒ STANDARD SEARCH error: {e}") # Method 2: Try with different modes for mode in ["local", "hybrid", "semantic"]: try: search_payload = { "query": query, "top_k": 5, "mode": mode } response = requests.post( f"{LIGHTRAG_URL}/search", json=search_payload, headers=HEADERS, timeout=30 ) if response.status_code == 200: results = response.json() if results.get('results'): print(f"āœ… {mode.upper()} MODE: Found {len(results['results'])} results") else: print(f"āŒ {mode.upper()} MODE: No results") else: print(f"āŒ {mode.upper()} MODE failed: {response.status_code}") except Exception as e: print(f"āŒ {mode.upper()} MODE error: {e}") def check_processing_logs(): """Check if there are any processing logs or errors""" print("\nšŸ“‹ CHECKING PROCESSING STATUS") print("=" * 50) try: # Try to get processing status status_response = requests.get(f"{LIGHTRAG_URL}/status", headers=HEADERS, timeout=10) if status_response.status_code == 200: status = status_response.json() print(f"šŸ“Š Server status: {status}") else: print(f"āŒ Could not get status: {status_response.status_code}") except Exception as e: print(f"āŒ Error checking status: {e}") def reupload_test_document(): """Re-upload test.docx to ensure it's processed with the latest processor""" print("\nšŸ”„ RE-UPLOADING TEST DOCUMENT") print("=" * 50) test_file = "test.docx" if not os.path.exists(test_file): print(f"āŒ Test file {test_file} not found") return False try: with open(test_file, 'rb') as f: files = {'file': (test_file, f, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')} response = requests.post( f"{LIGHTRAG_URL}/documents/upload", files=files, headers=HEADERS, timeout=30 ) if response.status_code == 200: print("āœ… Document re-uploaded successfully") result = response.json() print(f" Upload result: {result}") return True else: print(f"āŒ Upload failed: {response.status_code} - {response.text}") return False except Exception as e: print(f"āŒ Upload error: {e}") return False def main(): """Main diagnostic function""" print("šŸ”§ WEB UI BEE CLASSIFICATION DIAGNOSTIC") print("=" * 60) print("This script investigates why the Web UI doesn't detect") print("bee classification while test.py can.") print() # Step 1: Check server status if not check_server_status(): print("āŒ Cannot proceed - server not available") return # Step 2: List documents documents = list_documents() # Step 3: Check if test.docx exists and its content test_doc_id = None for doc in documents: if 'test.docx' in doc.get('filename', '').lower(): test_doc_id = doc.get('id') break if test_doc_id: bee_in_content = check_document_content(test_doc_id) else: print("āŒ test.docx not found in documents") bee_in_content = False # Step 4: Test different search methods test_search_methods() # Step 5: Check processing logs check_processing_logs() # Step 6: If bee not found, re-upload the document if not bee_in_content: print("\nšŸ”„ Bee classification not found in current document") print(" Attempting to re-upload with enhanced processor...") reupload_test_document() print("\n" + "=" * 60) print("šŸ“Š DIAGNOSTIC SUMMARY") print("=" * 60) if bee_in_content: print("āœ… Bee classification is present in document content") print("āŒ But Web UI search is not finding it") print("\nšŸ’” Possible issues:") print(" - Web UI might be using different search parameters") print(" - Entity extraction might be filtering out classification text") print(" - Search indexing might need to be refreshed") else: print("āŒ Bee classification is NOT present in document content") print("\nšŸ’” Possible issues:") print(" - Document was processed before enhanced processor was active") print(" - Image classification is not running properly") print(" - Enhanced entity extraction is not working") print("\nšŸ”§ Recommended actions:") print(" 1. Check server logs for document processing details") print(" 2. Verify the enhanced document processor is being used") print(" 3. Try re-uploading test.docx") print(" 4. Check if OpenCLIP classifier is available and working") if __name__ == "__main__": main()