Files
railseek6/test_gpu_only_ocr.py

129 lines
4.9 KiB
Python

import requests
import json
import time
import os
def _wait_for_processing(status_url, headers, request_timeout, poll_interval, max_polls):
    """Poll *status_url* until the first tracked document reaches a final state.

    Sleeps ``poll_interval`` seconds before each of at most ``max_polls``
    status requests.

    Returns:
        True when the first document reports ``PROCESSED``; False when it
        reports ``FAILED`` or when no final state is seen within
        ``max_polls`` polls.
    """
    for _ in range(max_polls):
        time.sleep(poll_interval)
        status_response = requests.get(status_url, headers=headers, timeout=request_timeout)
        if status_response.status_code != 200:
            print(f"Error checking status: {status_response.status_code}")
            continue

        documents = status_response.json().get('documents', [])
        if not documents:
            print("No documents found in track status")
            continue

        # Only the first document of the track is inspected.
        document = documents[0]
        doc_status = document.get('status')
        print(f"Document status: {doc_status}")
        if doc_status == "PROCESSED":
            print("OCR processing completed successfully!")
            print(f"Content summary: {document.get('content_summary')}")
            print(f"Content length: {document.get('content_length')}")
            return True
        if doc_status == "FAILED":
            print(f"OCR processing failed: {document.get('error_msg')}")
            return False

    print("Processing timeout - checking final status...")
    return False


def test_gpu_only_ocr(
    base_url="http://localhost:3015",
    ocr_pdf_path="ocr.pdf",
    request_timeout=60,
    poll_interval=10,
    max_polls=30,
):
    """Test OCR PDF upload with GPU-only processing.

    Checks that the PDF exists, probes the server's pipeline-status endpoint,
    uploads the PDF, and then polls the track-status endpoint until the
    document is processed, fails, or the polling budget runs out. Progress
    is reported with ``print``.

    Args:
        base_url: Root URL of the server under test.
        ocr_pdf_path: Path of the PDF file to upload.
        request_timeout: Seconds to wait on each HTTP request before giving
            up, so a stalled server cannot hang the test indefinitely.
        poll_interval: Seconds to sleep before each status poll.
        max_polls: Maximum number of status polls (the defaults give a
            five-minute budget: 30 polls x 10 seconds).

    Returns:
        True if the document reached ``PROCESSED``; False on any failure
        (missing file, connection error, non-200 upload, missing track id,
        ``FAILED`` status, or polling timeout).
    """
    upload_url = f"{base_url}/documents/upload"
    # Try without authentication first
    headers = {}

    if not os.path.exists(ocr_pdf_path):
        print(f"Error: OCR PDF file not found at {ocr_pdf_path}")
        return False

    # First, check server status. A non-200 answer is only reported; the
    # upload is still attempted. Only a connection-level error aborts.
    try:
        status_response = requests.get(
            f"{base_url}/documents/pipeline_status",
            headers=headers,
            timeout=request_timeout,
        )
        if status_response.status_code == 200:
            print("Server is running and accessible")
        else:
            print(f"Server status check failed: {status_response.status_code}")
    except Exception as e:
        print(f"Error connecting to server: {e}")
        return False

    # Upload the OCR PDF
    print(f"Uploading OCR PDF: {ocr_pdf_path}")
    try:
        with open(ocr_pdf_path, 'rb') as file:
            files = {'file': (os.path.basename(ocr_pdf_path), file, 'application/pdf')}
            response = requests.post(
                upload_url,
                files=files,
                headers=headers,
                timeout=request_timeout,
            )

        if response.status_code != 200:
            print(f"Upload failed: {response.status_code} - {response.text}")
            return False

        result = response.json()
        print(f"Upload successful: {result}")

        track_id = result.get('track_id')
        if not track_id:
            print("No track ID returned")
            return False

        print(f"Tracking ID: {track_id}")
        print("Waiting for processing to complete...")
        # Monitor processing status
        status_url = f"{base_url}/documents/track_status/{track_id}"
        return _wait_for_processing(
            status_url, headers, request_timeout, poll_interval, max_polls
        )
    except Exception as e:
        # Top-level boundary of the test: any upload or polling error is
        # reported and turned into a failed result rather than a traceback.
        print(f"Error during upload: {e}")
        return False
def check_server_logs(
    log_file="lightrag.log",
    base_url="http://localhost:3015",
    request_timeout=30,
):
    """Check server logs for OCR-related messages.

    If ``log_file`` exists, scans it for ``PaddleOCR`` and ``GPU``
    (case-sensitive) and for ``fallback`` and ``error`` (case-insensitive),
    printing one line per kind of match found. Afterwards queries the
    server's pipeline-status endpoint and prints the decoded response when
    the server answers 200.

    Args:
        log_file: Path of the server log to scan.
        base_url: Root URL of the server whose pipeline status is queried.
        request_timeout: Seconds to wait for the pipeline-status request.

    Returns:
        None. All findings are reported with ``print``.
    """
    if os.path.exists(log_file):
        print(f"\nChecking server logs in {log_file}...")
        has_paddle = has_gpu = has_fallback = has_error = False
        # errors='replace' keeps a stray non-UTF-8 byte in the log from
        # crashing this diagnostic helper; the file is streamed line by line
        # so a large log is never held in memory at once.
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                lowered = line.lower()
                has_paddle = has_paddle or "PaddleOCR" in line
                has_gpu = has_gpu or "GPU" in line
                has_fallback = has_fallback or "fallback" in lowered
                has_error = has_error or "error" in lowered

        # Look for OCR-related messages
        if has_paddle:
            print("Found PaddleOCR references in logs")
        if has_gpu:
            print("Found GPU references in logs")
        if has_fallback:
            print("WARNING: Found fallback references in logs")
        if has_error:
            print("Found error messages in logs")

    # Also check for any recent errors
    print("\nChecking for recent errors...")
    try:
        response = requests.get(
            f"{base_url}/documents/pipeline_status",
            timeout=request_timeout,
        )
        if response.status_code == 200:
            pipeline_status = response.json()
            print(f"Pipeline status: {pipeline_status}")
    except Exception as e:
        # Best-effort diagnostic: report the problem instead of raising.
        print(f"Error checking pipeline status: {e}")
# Script entry point: run the upload test, print a banner with the outcome,
# inspect the server logs, and report the result through the exit status.
if __name__ == "__main__":
    print("Testing GPU-only OCR PDF upload...")
    print("=" * 50)
    success = test_gpu_only_ocr()
    print("\n" + "=" * 50)
    if success:
        print("✅ OCR PDF upload test completed successfully!")
    else:
        print("❌ OCR PDF upload test failed!")
    # NOTE(review): the log check is run on success as well, so that a
    # "fallback" warning is surfaced even when the upload passed — confirm
    # this matches the original nesting, which is not recoverable from the
    # de-indented source.
    check_server_logs()
    # Exit non-zero on failure so shells and CI can detect a failed run.
    raise SystemExit(0 if success else 1)