railseek6/test_workflow.py

#!/usr/bin/env python3
"""
Test script for the complete workflow with clickable document references.
This tests:
1. Search API to get document references
2. Download endpoint functionality
3. Web UI integration
"""

import requests
import json
import sys
from pathlib import Path

def test_search_api():
    """Query the search API and collect the document references it returns.

    Sends a POST request to the local search endpoint and gathers the
    ``source`` value found in each result's ``metadata``.

    Returns:
        list: Unique ``source`` values in the order the API first returned
        them. An empty list is returned when the server answers with a
        non-200 status or when any error occurs.
    """
    print("Testing Search API...")

    url = "http://localhost:3015/api/search"
    headers = {
        "Authorization": "Bearer jleu1212",
        "Content-Type": "application/json"
    }

    # Test query
    data = {
        "query": "test document",
        "top_k": 5,
        "mode": "default"
    }

    try:
        response = requests.post(url, headers=headers, json=data, timeout=10)
        if response.status_code != 200:
            print(f"✗ Search API failed: {response.status_code}")
            print(f"  Response: {response.text}")
            return []

        result = response.json()
        # ``or []`` also covers an explicit null value for "results".
        results = result.get('results') or []
        print("✓ Search API successful")
        print(f"  Found {len(results)} results")

        # Dict keys de-duplicate while keeping first-seen order, so the
        # returned list (and the caller's ``references[0]``) is stable
        # between runs instead of depending on set iteration order.
        seen = {}
        for item in results:
            # A result whose metadata is null must not abort the whole scan.
            metadata = item.get('metadata') or {}
            source = metadata.get('source')
            if source:
                seen[source] = None
        references = list(seen)

        print(f"  Unique document references: {len(references)}")
        for ref in references[:5]:  # Show first 5
            print(f"    - {ref}")

        return references
    except Exception as e:
        # Best-effort check: any failure is reported and treated as
        # "no references" rather than crashing the workflow script.
        print(f"✗ Search API error: {e}")
        return []

def test_download_endpoint(filename):
    """Request ``filename`` from the download endpoint and save a sample.

    Args:
        filename: Name of the document to request from the local
            ``/api/documents/download/`` endpoint.

    Returns:
        bool: True when the endpoint answers 200 and the first chunk of the
        body (at most 1024 bytes requested) was written under
        ``download_test/``; False for any other status code or on any error.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    print(f"\nTesting Download Endpoint for: {filename}")

    name = str(filename)
    # Percent-encode the name so characters such as '#', '?' or '%' stay part
    # of the URL path instead of being parsed as a fragment or query string.
    url = f"http://localhost:3015/api/documents/download/{quote(name)}"
    headers = {
        "Authorization": "Bearer jleu1212"
    }

    try:
        # The context manager releases the streamed connection on every
        # exit path; with stream=True it is otherwise left open.
        with requests.get(url, headers=headers, timeout=10, stream=True) as response:
            if response.status_code != 200:
                print(f"✗ Download endpoint failed: {response.status_code}")
                print(f"  Response: {response.text[:200] if response.text else 'No response body'}")
                return False

            # Check content type
            content_type = response.headers.get('content-type', '')
            content_disposition = response.headers.get('content-disposition', '')

            print("✓ Download endpoint successful")
            print(f"  Content-Type: {content_type}")
            print(f"  Content-Disposition: {content_disposition}")
            print(f"  Content-Length: {response.headers.get('content-length', 'unknown')}")

            # Save a small sample to verify
            test_dir = Path("download_test")
            test_dir.mkdir(exist_ok=True)

            # Keep only the final path component so a name containing
            # directory separators or ".." cannot write outside test_dir.
            safe_name = Path(name.replace("\\", "/")).name or "download"
            sample_path = test_dir / f"sample_{safe_name}"

            # Default of b"" makes an empty body produce an empty sample
            # instead of raising StopIteration.
            chunk = next(response.iter_content(chunk_size=1024), b"")
            with open(sample_path, 'wb') as f:
                f.write(chunk)

            print(f"  Sample saved to: {sample_path}")
            print(f"  Sample size: {sample_path.stat().st_size} bytes")
            return True
    except Exception as e:
        # Best-effort check: report the problem and signal failure.
        print(f"✗ Download endpoint error: {e}")
        return False

def test_webui_integration():
    """Check that the Web UI page contains the clickable-reference markers.

    Reads ``LightRAG-main/webui/index.html`` (relative to the current working
    directory) and looks for three things: the ``downloadDocument`` name, a
    download link pointing at the local download endpoint, and a references
    section or container.

    Returns:
        bool: True only when the file can be read and all three markers are
        present; False when the file is missing, unreadable, or any marker
        is absent.
    """
    print("\nTesting Web UI Integration...")

    # Check if web UI index.html has been updated
    webui_path = Path("LightRAG-main/webui/index.html")
    if not webui_path.exists():
        print(f"✗ Web UI file not found: {webui_path}")
        return False

    try:
        content = webui_path.read_text(encoding='utf-8')
    except (OSError, UnicodeError) as e:
        print(f"✗ Web UI check error: {e}")
        return False

    # Check for clickable links in the displaySearchResults function
    has_download_fn = 'downloadDocument' in content
    if has_download_fn:
        print("✓ downloadDocument function found in Web UI")
    else:
        print("✗ downloadDocument function not found in Web UI")

    has_links = 'href="http://localhost:3015/api/documents/download/' in content
    if has_links:
        print("✓ Clickable download links found in Web UI")
    else:
        print("✗ Clickable download links not found in Web UI")

    has_section = 'References Section' in content or 'references-section' in content
    has_container = 'referencesContainer' in content or 'id="references"' in content
    if has_section:
        print("✓ References section found in Web UI")
    elif has_container:
        print("✓ References container found in Web UI")
    else:
        print("✗ References section not found in Web UI")

    # The result must reflect the checks above; a page missing any marker
    # is a failed integration check, not a pass.
    return has_download_fn and has_links and (has_section or has_container)

def test_complete_workflow():
    """Run the server, Web UI, search and download checks and summarise them.

    The server at ``http://localhost:3015/`` is probed first; when it cannot
    be reached (or answers with an unexpected status) the remaining checks
    are skipped. Otherwise the Web UI check, the search check and a download
    check are run. The download check uses the first search reference, or
    ``ocr.pdf`` when the search returned none.

    Returns:
        bool: True only when the Web UI check passed, the search returned at
        least one reference and the download succeeded; False otherwise.
    """
    print("=" * 60)
    print("Testing Complete Workflow with Clickable Document References")
    print("=" * 60)

    # Check if server is running
    print("\n1. Checking if server is running...")
    try:
        response = requests.get("http://localhost:3015/", timeout=5)
    except Exception as e:
        print(f"✗ Cannot connect to server: {e}")
        print("  Make sure the server is running on port 3015")
        return False

    if response.status_code not in (200, 307):
        print(f"✗ Server returned status: {response.status_code}")
        return False
    print("✓ Server is running on http://localhost:3015")

    # Test Web UI updates
    webui_ok = test_webui_integration()

    # Test search API
    references = test_search_api()

    # Download the first reference, or fall back to a known file so the
    # endpoint is still exercised when the search returned nothing.
    test_file = references[0] if references else "ocr.pdf"
    download_ok = test_download_endpoint(test_file)

    # Summary
    print("\n" + "=" * 60)
    print("WORKFLOW TEST SUMMARY")
    print("=" * 60)

    # Every step must succeed before claiming success: the summary below
    # states that search AND download work. bool() keeps the return value a
    # real boolean instead of leaking the references list.
    all_tests_passed = bool(webui_ok and references and download_ok)

    if all_tests_passed:
        print("✓ All tests passed!")
        print("\nThe workflow is working correctly:")
        print("1. Web UI has been updated with clickable document references")
        print("2. Search API returns document metadata with source information")
        print("3. Download endpoint serves files correctly")
        print("\nUsers can now:")
        print("- Search for documents in the Web UI")
        print("- See clickable document references in search results")
        print("- Download original files by clicking the links")
    else:
        print("⚠ Some tests failed or had warnings")
        print("\nIssues found:")
        if not webui_ok:
            print("- Web UI may not have been updated with clickable links")
        if not references:
            print("- Search API may not be returning document references")
        if not download_ok:
            print("- Download endpoint may not be working")

        print("\nPossible solutions:")
        print("1. Restart the LightRAG server to pick up new endpoint")
        print("2. Check server logs for errors")
        print("3. Verify the download endpoint is registered in document_routes.py")

    return all_tests_passed

def _build_readme(clone_url):
    """Return the README.md text for the project, embedding ``clone_url``."""
    # Literal braces in this f-string are doubled ({{ }}) so that the JSON
    # example and the {filename} path placeholder are emitted verbatim
    # instead of being evaluated as expressions.
    # SECURITY: the API examples use a placeholder rather than the real
    # bearer token, because this README is destined for a public repository.
    return f"""# LightRAG Project

GPU-accelerated RAG (Retrieval-Augmented Generation) system with OCR and image classification capabilities.

## Features

- **GPU-accelerated OCR**: Fast document processing with PaddleOCR
- **Image Classification**: OpenCLIP-based image analysis
- **Document Search**: Semantic search with vector embeddings
- **Web UI**: User-friendly interface with clickable document references
- **Auto-commit**: Automatic Git commits for major changes

## Setup

1. Clone the repository:
   ```bash
   git clone {clone_url}
   cd lightrag-project
   ```

2. Install dependencies:
   ```bash
   pip install -r requirements.txt
   ```

3. Start the server:
   ```bash
   python start_server_fixed.py
   ```

4. Access the Web UI:
   - Open http://localhost:3015 in your browser

## Auto-commit System

The project includes an auto-commit system that automatically commits major changes:
- Use `python git_auto_commit.py "Description of changes"` to commit changes
- Changes are automatically pushed to this repository

## Recent Updates

- Added document download endpoint to API
- Updated Web UI with clickable document references
- Implemented auto-commit functionality for Git
- Fixed OCR processing pipeline for better performance

## API Documentation

### Search API
```
POST /api/search
Authorization: Bearer <YOUR_API_KEY>
Content-Type: application/json

{{
  "query": "search text",
  "top_k": 5,
  "mode": "default"
}}
```

### Document Download
```
GET /api/documents/download/{{filename}}
Authorization: Bearer <YOUR_API_KEY>
```

Files are served from the `inputs/__enqueued__` directory.
"""


def create_gitea_repository_simple():
    """Create the project repository on Gitea via its REST API.

    Posts to ``/api/v1/user/repos`` using basic authentication. On success
    (HTTP 201) a ``README.md`` with setup instructions is written to the
    current working directory. When the repository already exists (HTTP 409)
    the existing repository is looked up instead.

    Returns:
        The repository's ``clone_url`` as reported by the API, or None when
        the repository could not be created or looked up, or on any error.
    """
    print("\n" + "=" * 60)
    print("Gitea Repository Setup (Simplified)")
    print("=" * 60)

    gitea_url = "https://git.mtrcompute.com"
    # SECURITY: credentials are hard-coded in source; they should be moved to
    # environment variables or a secrets store and rotated.
    username = "jleu3482"
    password = "jleu1212"
    repo_name = "lightrag-project"

    print(f"Creating repository: {repo_name}")
    print(f"URL: {gitea_url}")
    print(f"Username: {username}")

    # Create repository via API
    api_url = f"{gitea_url}/api/v1/user/repos"
    data = {
        "name": repo_name,
        "description": "LightRAG - GPU-accelerated RAG system with OCR and image classification",
        "private": False,
        "auto_init": True,
        "gitignores": "Python",
        "license": "mit",
        "readme": "Default"
    }

    try:
        response = requests.post(api_url, auth=(username, password), json=data, timeout=10)

        if response.status_code == 201:
            repo_info = response.json()
            clone_url = repo_info.get('clone_url')
            print("✓ Repository created successfully!")
            print(f"  URL: {repo_info.get('html_url')}")
            print(f"  Clone URL: {clone_url}")

            # Save README with setup instructions locally
            readme_path = Path("README.md")
            readme_path.write_text(_build_readme(clone_url), encoding='utf-8')
            print("✓ README.md created with setup instructions")

            return clone_url

        if response.status_code == 409:
            print("⚠ Repository already exists")
            # Get existing repository info
            repo_url = f"{gitea_url}/api/v1/repos/{username}/{repo_name}"
            repo_response = requests.get(repo_url, auth=(username, password), timeout=10)
            if repo_response.status_code == 200:
                repo_info = repo_response.json()
                print(f"  Existing repository: {repo_info.get('html_url')}")
                return repo_info.get('clone_url')
            return None

        print(f"✗ Failed to create repository: {response.status_code}")
        print(f"  Response: {response.text[:200]}")
        return None
    except Exception as e:
        # Best-effort setup step: report the problem and signal failure.
        print(f"✗ Error creating repository: {e}")
        return None

if __name__ == "__main__":
    # Script entry point: run the workflow checks, then set up the Gitea
    # repository and write Git setup instructions. The process exit code
    # reflects only the workflow result (0 = passed, 1 = issues).

    # Test the complete workflow
    workflow_ok = test_complete_workflow()

    # Create Gitea repository
    print("\n" + "=" * 60)
    print("Setting up Gitea Repository")
    print("=" * 60)

    clone_url = create_gitea_repository_simple()

    if clone_url:
        # Derive the browsable URL once. Only a trailing ".git" is removed;
        # a blanket replace would also mangle hosts that contain ".git"
        # (for example "my.gitea.io").
        web_url = clone_url[:-len('.git')] if clone_url.endswith('.git') else clone_url
        web_url = web_url.replace('git@', 'https://').replace('ssh://', 'https://')

        print("\n✓ Repository setup complete!")
        print(f"  Clone URL: {clone_url}")
        print(f"  Web URL: {web_url}")

        # Create simple auto-commit instructions
        instructions = f"""
## Next Steps for Git Setup:

1. Initialize local Git repository:
   ```
   git init
   git add .
   git commit -m "Initial commit: LightRAG project with document download and auto-commit"
   ```

2. Add remote repository:
   ```
   git remote add origin {clone_url}
   ```

3. Push to Gitea:
   ```
   git push -u origin main
   ```

4. For auto-committing future changes:
   ```
   python git_auto_commit.py "Description of changes made"
   ```

5. View your repository at:
   {web_url}
"""
        print(instructions)

        # Save instructions to file
        with open("GIT_SETUP_INSTRUCTIONS.txt", "w", encoding='utf-8') as f:
            f.write(instructions)
        print("✓ Git setup instructions saved to GIT_SETUP_INSTRUCTIONS.txt")

    print("\n" + "=" * 60)
    if workflow_ok:
        print("✓ COMPLETE WORKFLOW TEST SUCCESSFUL!")
    else:
        print("⚠ WORKFLOW TEST COMPLETED WITH ISSUES")
    print("=" * 60)

    sys.exit(0 if workflow_ok else 1)