#!/usr/bin/env python3
"""
Test script to verify the OCR fix is working with the server

Two checks are run, in order:

1. ``test_direct_processor`` loads the document processor in-process and
   runs it on ``TEST_PDF``.
2. ``test_server_ocr`` uploads ``TEST_PDF`` to the server at ``SERVER_URL``
   and polls the tracking endpoint until processing finishes.

The process exit code is 0 when both checks pass and 1 otherwise.
"""

import asyncio
import json
import sys
import time
from pathlib import Path

import requests

# Test configuration
SERVER_URL = "http://localhost:3015"
TEST_PDF = "ocr.pdf"

# Upper bounds (seconds) so an unresponsive server cannot hang the script.
REQUEST_TIMEOUT = 30      # health check and status polls
UPLOAD_TIMEOUT = 120      # the multipart upload may take longer
POLL_INTERVAL = 2         # pause before each status poll
PROCESSING_TIMEOUT = 120  # total time to wait for a terminal status


def _normalized_status(doc):
    """Return the document's ``status`` field as an upper-case string."""
    # NOTE(review): the casing used by the server is not visible from this
    # script; comparing case-insensitively accepts both 'PROCESSED' and
    # 'processed'. Confirm against the server's response schema.
    return str(doc.get('status') or '').upper()


def _report_documents(documents):
    """Print per-document details and classify the batch.

    Returns:
        True if every document is PROCESSED, False as soon as one is FAILED,
        and None while at least one document is still in progress.
    """
    all_processed = True
    for doc in documents:
        print(f"๐Ÿ“„ Document: {doc.get('id')}")
        print(f" Status: {doc.get('status')}")
        print(f" Content Length: {doc.get('content_length')}")
        print(f" Error: {doc.get('error_msg', 'None')}")

        status = _normalized_status(doc)
        if status == 'FAILED':
            print(f"โŒ OCR processing failed: {doc.get('error_msg')}")
            return False
        if status != 'PROCESSED':
            all_processed = False

    if all_processed:
        print("๐ŸŽ‰ OCR processing successful!")
        return True
    return None


async def _wait_for_processing(track_id):
    """Poll the track-status endpoint until a terminal result or timeout.

    Returns:
        True when all tracked documents were processed, False on a failed
        document, a non-200 status response, or when PROCESSING_TIMEOUT
        elapses first.
    """
    deadline = time.monotonic() + PROCESSING_TIMEOUT
    saw_documents = False

    while True:
        # Wait a bit for processing before each poll
        await asyncio.sleep(POLL_INTERVAL)

        status_response = requests.get(
            f"{SERVER_URL}/documents/track_status/{track_id}",
            timeout=REQUEST_TIMEOUT,
        )
        if status_response.status_code != 200:
            print(f"โŒ Failed to get processing status: {status_response.status_code}")
            return False

        status_data = status_response.json()
        print(f"๐Ÿ“ˆ Processing status: {status_data}")

        documents = status_data.get('documents', [])
        if documents:
            saw_documents = True
            outcome = _report_documents(documents)
            if outcome is not None:
                return outcome

        if time.monotonic() >= deadline:
            break

    if not saw_documents:
        print("โŒ No documents found in processing status")
    else:
        print(
            f"Timed out after {PROCESSING_TIMEOUT}s waiting for "
            "document processing to finish"
        )
    return False


async def test_server_ocr():
    """Test OCR PDF upload to the running server

    Returns:
        True if the upload succeeded and the server reported the document
        as processed; False on any failure (server unreachable, missing
        PDF, upload error, failed or timed-out processing).
    """
    print("๐Ÿงช Testing Server OCR Fix")
    print("=" * 50)

    # Check if server is running
    try:
        health_response = requests.get(f"{SERVER_URL}/docs", timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"โŒ Cannot connect to server: {e}")
        return False
    if health_response.status_code != 200:
        print("โŒ Server is not responding properly")
        return False
    print("โœ… Server is running")

    # Check if PDF file exists
    if not Path(TEST_PDF).exists():
        print(f"โŒ Test PDF file not found: {TEST_PDF}")
        return False
    print(f"โœ… Test PDF found: {TEST_PDF}")

    # Test document upload
    try:
        print(f"๐Ÿ“ค Uploading {TEST_PDF} to server...")
        with open(TEST_PDF, 'rb') as f:
            files = {'file': (TEST_PDF, f, 'application/pdf')}
            response = requests.post(
                f"{SERVER_URL}/documents/upload",
                files=files,
                timeout=UPLOAD_TIMEOUT,
            )

        if response.status_code != 200:
            print(f"โŒ Upload failed: {response.status_code} - {response.text}")
            return False

        result = response.json()
        print(f"โœ… Upload successful: {result}")

        # Check processing status
        track_id = result.get('track_id')
        if not track_id:
            print("โŒ No track_id returned from upload")
            return False

        print(f"๐Ÿ“Š Checking processing status for track_id: {track_id}")
        return await _wait_for_processing(track_id)

    except Exception as e:
        # Top-level boundary of this check: report and fail rather than crash.
        print(f"โŒ Error during upload test: {e}")
        return False


async def test_direct_processor():
    """Test the document processor directly

    Returns:
        True if the processor reports success with non-empty content;
        False if OCR is unavailable, processing fails, or any exception
        (including a failed import) is raised.
    """
    print("\n๐Ÿ”ง Testing Document Processor Directly")
    print("=" * 50)

    try:
        # The package is resolved relative to the current working directory.
        sys.path.append('LightRAG-main')
        from lightrag.document_processor import get_document_processor

        processor = get_document_processor()
        print("โœ… Document processor loaded successfully")

        # Test OCR initialization
        if processor.ocr_processor.ocr_available:
            print("โœ… OCR processor is available")
            print(f" GPU enabled: {processor.ocr_processor.use_gpu}")
        else:
            print("โŒ OCR processor not available")
            return False

        # Test processing the PDF
        print(f"๐Ÿ“„ Processing {TEST_PDF} directly...")
        result = await processor.process_document(TEST_PDF)

        # Treat missing content as empty so the summary below still prints.
        content = result.content or ""

        print("โœ… Direct processing result:")
        print(f" Success: {result.success}")
        print(f" Content length: {len(content)}")
        print(f" Error: {result.error}")
        print(f" Metadata: {result.metadata}")

        if result.success and len(content) > 0:
            print("๐ŸŽ‰ Direct OCR processing successful!")
            return True

        print("โŒ Direct OCR processing failed")
        return False

    except Exception as e:
        # Top-level boundary of this check: show the traceback and fail.
        print(f"โŒ Error in direct processor test: {e}")
        import traceback
        traceback.print_exc()
        return False


async def main():
    """Run all tests

    Returns:
        True only when both the direct-processor and server-upload checks
        passed.
    """
    print("๐Ÿš€ Starting OCR Fix Verification Tests")
    print("=" * 50)

    # Test 1: Direct processor test
    direct_result = await test_direct_processor()

    # Test 2: Server upload test
    server_result = await test_server_ocr()

    print("\n" + "=" * 50)
    print("๐Ÿ“Š TEST RESULTS SUMMARY")
    print("=" * 50)
    print(f"Direct Processor Test: {'โœ… PASS' if direct_result else 'โŒ FAIL'}")
    print(f"Server Upload Test: {'โœ… PASS' if server_result else 'โŒ FAIL'}")

    if direct_result and server_result:
        print("\n๐ŸŽ‰ ALL TESTS PASSED - OCR fix is working!")
        return True

    print("\n๐Ÿ’ฅ SOME TESTS FAILED - OCR fix needs more work")
    return False


if __name__ == "__main__":
    success = asyncio.run(main())
    sys.exit(0 if success else 1)