Files
railseek6/test_ocr_cpu_fallback.py

262 lines
9.3 KiB
Python

import os
import sys
import subprocess
import requests
import time
import fitz # PyMuPDF
from PIL import Image
import io
import numpy as np
def test_ocr_with_cpu():
    """Run PaddleOCR on every page of ``ocr.pdf`` using the CPU.

    Each page is rendered with PyMuPDF, decoded through PIL into a numpy
    array and handed to PaddleOCR.  The first ten recognized boxes of each
    page are printed as a preview; the text of *all* boxes is collected.

    Returns:
        tuple[bool, str]: ``(True, text)`` where ``text`` is the per-page
        texts joined by single spaces, if at least one page yielded text;
        otherwise ``(False, "")``.  Any exception raised along the way
        (missing dependency, unreadable PDF, OCR failure) is printed with
        its traceback and likewise reported as ``(False, "")``.
    """
    print("=== TESTING OCR ON OCR.PDF (CPU FALLBACK) ===")
    # Only this many boxes are echoed per page; every box is still collected.
    preview_limit = 10
    try:
        from paddleocr import PaddleOCR

        # Initialize PaddleOCR with CPU
        print("Initializing PaddleOCR with CPU...")
        ocr = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=False)
        print("✓ PaddleOCR initialized with CPU")

        all_text = []
        # The context manager closes the document even if OCR raises mid-way.
        with fitz.open('ocr.pdf') as pdf_document:
            print(f"✓ PDF opened successfully, {pdf_document.page_count} pages")
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                img_data = page.get_pixmap().tobytes("png")

                # Convert to PIL Image then to numpy array
                image_np = np.array(Image.open(io.BytesIO(img_data)))

                # Run OCR on the numpy array
                result = ocr.ocr(image_np, cls=False)
                if not (result and result[0]):
                    print(f"Page {page_num+1}: No text detected")
                    continue

                boxes = result[0]
                print(f"Page {page_num+1}: Found {len(boxes)} text boxes")
                # Each entry is indexed as line[1][0] (text) and line[1][1]
                # (confidence); show only the first few.
                for i, line in enumerate(boxes[:preview_limit]):
                    text, confidence = line[1][0], line[1][1]
                    print(f" {i+1}: '{text}' (confidence: {confidence:.3f})")

                # Keep the text of every box, not just the previewed ones.
                page_text = " ".join(str(line[1][0]) for line in boxes)
                all_text.append(page_text.strip())

        if not all_text:
            print("✗ No text extracted from PDF")
            return False, ""

        print(f"\n✓ Successfully extracted text from {len(all_text)} pages")
        full_text = " ".join(all_text)
        print(f"Total text length: {len(full_text)} characters")
        print(f"Text preview: {full_text[:500]}...")
        return True, full_text
    except Exception as e:
        # Top-level boundary of this check: report the failure, never raise.
        print(f"✗ Error in OCR test: {e}")
        import traceback
        traceback.print_exc()
        return False, ""
def start_lightrag_server_cpu():
    """Launch ``lightrag-server`` on port 3015 and wait until it answers.

    The server is spawned as a child process and ``http://localhost:3015/``
    is polled up to 60 times, one second apart, until it returns HTTP 200.

    Returns:
        subprocess.Popen | None: the running server process on success.
        ``None`` if the process could not be spawned, exited before becoming
        ready, or did not answer in time; in the latter cases the child is
        terminated so it is not left running in the background.

    NOTE(review): stdout/stderr are pipes that are not read here while the
    server runs; a very chatty server could fill them — confirm whether the
    caller should drain them.
    """
    print("\n=== STARTING LIGHTRAG SERVER WITH CPU OCR ===")
    cmd = [
        'lightrag-server',
        '--port', '3015',
        '--embedding-binding', 'ollama',
        '--rerank-binding', 'null',
        '--host', '0.0.0.0'
    ]
    print(f"Starting server: {' '.join(cmd)}")
    try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            errors='replace'
        )
    except Exception as e:
        # Typically the executable is missing from PATH.
        print(f"✗ Failed to start server: {e}")
        return None

    try:
        # Wait for server to start
        print("Waiting for server to start...")
        for _ in range(60):
            # Stop polling as soon as the child has died.
            if process.poll() is not None:
                print(f"✗ Server exited early with code {process.returncode}")
                break
            try:
                response = requests.get('http://localhost:3015/', timeout=5)
                if response.status_code == 200:
                    print("✓ Server started successfully!")
                    return process
            except requests.exceptions.RequestException:
                # Not accepting connections yet; retry after a short pause.
                pass
            time.sleep(1)
        else:
            print("✗ Server failed to start within timeout")
    except Exception as e:
        print(f"✗ Failed to start server: {e}")

    # Startup failed: stop the child and drain its pipes so nothing leaks.
    process.terminate()
    try:
        _, stderr_output = process.communicate(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
        _, stderr_output = process.communicate()
    if stderr_output:
        print(f"Server stderr (tail): {stderr_output[-1000:]}")
    return None
def test_ocr_upload_workflow(*, max_wait=300, poll_interval=10):
    """Upload ``ocr.pdf`` to the local server and wait for it to be processed.

    Steps: log in via ``POST /login``, clear documents via
    ``DELETE /documents``, upload the PDF via ``POST /documents/upload``,
    then poll ``GET /documents`` until ``ocr.pdf`` shows up under the
    ``completed`` or ``failed`` status, or the wait budget is exhausted.

    Args:
        max_wait: maximum number of seconds to wait for processing
            (default 300, i.e. 5 minutes).
        poll_interval: seconds to sleep between status polls (default 10).

    Returns:
        bool: ``True`` when the document is reported as completed; ``False``
        on login/upload failure, missing track id, reported processing
        failure, timeout, or any unexpected exception.
    """
    print("\n=== TESTING OCR UPLOAD WORKFLOW ===")
    base_url = 'http://localhost:3015'
    report_interval = 30  # seconds between "Still processing..." messages
    try:
        # Login
        # NOTE(review): credentials are hard-coded in source — consider
        # moving them to configuration or environment variables.
        login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=30)
        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False
        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Clear existing documents
        print("Clearing existing documents...")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=30)
        print(f"Clear status: {clear_response.status_code}")

        # Upload OCR PDF
        print("\n=== UPLOADING OCR.PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize('ocr.pdf')} bytes)")
        with open('ocr.pdf', 'rb') as f:
            files = {'file': ('ocr.pdf', f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload', files=files, headers=headers, timeout=60
            )
        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False
        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        track_id = upload_data.get('track_id')
        if not track_id:
            print("✗ No track ID returned")
            return False

        # Monitor processing
        print("\n=== MONITORING OCR PROCESSING ===")
        print("OCR processing with CPU...")
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        start_time = time.monotonic()
        last_report = None
        while time.monotonic() - start_time < max_wait:
            try:
                # Check document status
                docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
                if docs_response.status_code == 200:
                    statuses = docs_response.json().get('statuses', {})
                    completed = statuses.get('completed', [])
                    processing = statuses.get('processing', [])
                    failed = statuses.get('failed', [])
                    elapsed = int(time.monotonic() - start_time)

                    # Check for our file in completed
                    for doc in completed:
                        if doc.get('file_path') == 'ocr.pdf':
                            print(f"\n🎉 OCR PROCESSING COMPLETED in {elapsed} seconds!")
                            print(f" File: {doc.get('file_path')}")
                            print(f" Size: {doc.get('file_size')}")
                            print(f" Chunks: {doc.get('chunk_count')}")
                            return True

                    # Check if failed
                    for doc in failed:
                        if doc.get('file_path') == 'ocr.pdf':
                            error_msg = doc.get('error_msg', 'Unknown error')
                            print(f"✗ OCR processing failed: {error_msg}")
                            return False

                    # Still processing.  Report by time since the last
                    # message rather than `elapsed % 30 == 0`, which stops
                    # matching once request latency shifts `elapsed` off
                    # exact multiples of the poll interval.
                    if last_report is None or elapsed - last_report >= report_interval:
                        print(f" Still processing... ({elapsed}s elapsed, {len(processing)} files processing)")
                        last_report = elapsed
                time.sleep(poll_interval)
            except requests.exceptions.RequestException as e:
                # Transient connection problem: keep polling until timeout.
                print(f" Connection error: {e}")
                time.sleep(poll_interval)
        print(f"✗ OCR processing timed out after {max_wait} seconds")
        return False
    except Exception as e:
        # Top-level boundary of this check: report the failure, never raise.
        print(f"✗ Error during OCR workflow test: {e}")
        return False
def main():
    """Run the end-to-end OCR check: direct OCR, server start, upload workflow.

    The steps run in order and the function returns early (printing a
    failure message) if direct OCR fails or the server cannot be started.
    Once the server is running it is always stopped again, whether or not
    the upload workflow succeeds.
    """
    print("OCR PDF TEST WITH CPU FALLBACK")
    print("=" * 50)
    print("Testing: Direct OCR → Server Upload → Processing")
    print("Mode: CPU (GPU currently having cuDNN issues)")
    print("Document: ocr.pdf")
    print("=" * 50)

    # Step 1: Test OCR directly on ocr.pdf with CPU
    # (the extracted text itself is not needed here, only the outcome).
    success, _ = test_ocr_with_cpu()
    if not success:
        print("\n❌ Direct OCR test failed")
        return

    # Step 2: Start server
    server_process = start_lightrag_server_cpu()
    if not server_process:
        print("\n❌ Failed to start server")
        return

    try:
        # Step 3: Test complete upload workflow
        if test_ocr_upload_workflow():
            print("\n" + "=" * 50)
            print("🎉 SUCCESS: OCR PDF WORKFLOW COMPLETED!")
            print("=" * 50)
            print("The ocr.pdf document has been:")
            print("✓ Successfully processed with OCR (CPU)")
            print("✓ Uploaded to the LightRAG server")
            print("✓ Indexed and made searchable")
            print("\nYou can now access the web UI at: http://localhost:3015")
            print("and search for content from the scanned table document.")
        else:
            print("\n❌ OCR workflow failed")
    finally:
        # Clean up
        print("\nStopping server...")
        server_process.terminate()
        try:
            # communicate() instead of wait(): it drains any piped output,
            # so a child blocked on a full pipe cannot stall the shutdown.
            server_process.communicate(timeout=10)
        except subprocess.TimeoutExpired:
            # Graceful stop timed out: force-kill and reap the child.
            server_process.kill()
            server_process.communicate()
        print("Test completed.")
# Run the full CPU-fallback check only when this file is executed as a script.
if __name__ == "__main__":
    main()