# railseek6/analyze_server_issue.py

"""
Analyze the server-side processing issue by examining the actual server logs and behavior
"""
import requests
import json
import time

def analyze_server_issue(base_url="http://localhost:3015", log_path="lightrag.log", timeout=10):
    """Print a diagnostic report about the server-side document processing issue.

    The report has four parts, all written to stdout:

    1. A health check against ``{base_url}/health``. If the request itself
       fails, the function reports the error and returns immediately.
    2. A login against ``{base_url}/auth/login`` followed by a listing of
       ``{base_url}/documents``, grouped by the ``statuses`` mapping in the
       JSON response.
    3. The lines among the last 50 of ``log_path`` that mention ``ocr``,
       ``pdf`` or ``whitespace`` (case-insensitive).
    4. A fixed root-cause summary and list of suggested fixes. These are
       static notes and are not derived from the checks above.

    Args:
        base_url: Root URL of the server to probe (a trailing slash is ignored).
        log_path: Path of the server log file to inspect.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` waits indefinitely on a server
            that accepts the connection but never answers.

    Returns:
        None. Failures are reported on stdout rather than raised.
    """
    # Local import keeps this block self-contained.
    from collections import deque

    base_url = base_url.rstrip("/")

    print("🔍 Analyzing server-side processing issue...")

    # Check server status
    try:
        print("\n📡 Checking server status...")
        response = requests.get(f"{base_url}/health", timeout=timeout)
        print(f"  Server health: {response.status_code}")
        if response.status_code == 200:
            print("  ✅ Server is running")
        else:
            print("  ❌ Server health check failed")
    except Exception as e:
        # Best-effort diagnostic: nothing else can succeed if the server is unreachable.
        print(f"  ❌ Cannot connect to server: {e}")
        return

    # Check document status
    try:
        print("\n📊 Checking current document status...")

        # Login first to get token
        # NOTE(review): hard-coded default credentials; confirm they are
        # only ever used against a local development server.
        login_data = {
            "username": "admin",
            "password": "password"
        }
        login_response = requests.post(
            f"{base_url}/auth/login", data=login_data, timeout=timeout
        )

        if login_response.status_code == 200:
            token = login_response.json().get("access_token")
            if token:
                headers = {"Authorization": f"Bearer {token}"}
            else:
                # Avoid sending the literal header "Bearer None".
                print("  ⚠️ Login response contained no access_token; requesting documents without it")
                headers = {}

            # Get document status
            docs_response = requests.get(
                f"{base_url}/documents", headers=headers, timeout=timeout
            )
            if docs_response.status_code == 200:
                docs_data = docs_response.json()
                print("  Current document status:")
                for status, docs in docs_data.get("statuses", {}).items():
                    print(f"    {status}: {len(docs)} documents")
                    for doc in docs:
                        print(f"      - {doc.get('file_path')}: {doc.get('error_msg', 'No error')}")
            else:
                print(f"  ❌ Failed to get documents: {docs_response.status_code}")
        else:
            print(f"  ❌ Login failed: {login_response.status_code}")

    except Exception as e:
        print(f"  ❌ Error checking document status: {e}")

    # Check server logs for recent activity
    print("\n📋 Checking for recent server logs...")
    try:
        # errors="replace" keeps a single undecodable byte from aborting the
        # whole log section; the bounded deque keeps only the last 50 lines
        # instead of loading the entire file into memory.
        with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
            recent_logs = deque(f, maxlen=50)
        print("  Recent server logs:")
        keywords = ("ocr", "pdf", "whitespace")
        for line in recent_logs:
            lowered = line.lower()
            if any(keyword in lowered for keyword in keywords):
                print(f"    {line.strip()}")
    except FileNotFoundError:
        print("  ❌ lightrag.log file not found")
    except Exception as e:
        print(f"  ❌ Error reading logs: {e}")

    # Analyze the root cause (static notes, not computed from the checks above)
    print("\n🔬 Root Cause Analysis:")
    print("  1. ✅ Direct document processor works correctly (extracts 1516 characters)")
    print("  2. ✅ OCR engine is properly initialized with GPU")
    print("  3. ❌ Server upload fails with 'File content contains only whitespace characters'")
    print("  4. ❌ Server is using cached/old document processor code")
    print("  5. ❌ Server restart didn't load updated document_processor.py")

    print("\n💡 Possible Solutions:")
    print("  - Kill all Python processes and restart server")
    print("  - Check if server is using a different virtual environment")
    print("  - Verify the document_processor.py file loaded by server")
    print("  - Add debug logging to server's document processing pipeline")

# Run the diagnostic only when this file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_server_issue()