"""Smoke test: OCR ``ocr.pdf`` on CPU, then index it through a LightRAG server.

The script runs three steps in order and stops at the first failure:

1. Run PaddleOCR (CPU mode) directly on every page of the PDF.
2. Launch ``lightrag-server`` as a child process and wait until it answers.
3. Log in, clear existing documents, upload the PDF and poll until the
   server reports the document as completed or failed.
"""
import io
import os
import subprocess
import sys
import time
import traceback

import fitz  # PyMuPDF
import numpy as np
import requests
from PIL import Image

# Document under test; also the ``file_path`` value matched in server status.
PDF_PATH = 'ocr.pdf'
SERVER_PORT = 3015
BASE_URL = f'http://localhost:{SERVER_PORT}'

# Only this many OCR boxes per page are echoed to the console.
PREVIEW_BOXES = 10
# Seconds to wait for the server to answer its first request.
SERVER_START_TIMEOUT = 60
# Seconds to wait for the uploaded document to finish processing.
PROCESSING_TIMEOUT = 300
# Seconds between status polls / between "still processing" messages.
POLL_INTERVAL = 10
PROGRESS_INTERVAL = 30


def test_ocr_with_cpu():
    """Run PaddleOCR in CPU mode on every page of ``ocr.pdf``.

    Each page is rendered to PNG with PyMuPDF, decoded with PIL, converted to
    a numpy array and passed to the OCR engine.

    Returns:
        A ``(success, text)`` tuple. ``success`` is True when at least one
        page produced text; ``text`` is the text of all such pages joined by
        single spaces, or ``""`` on failure.
    """
    print("=== TESTING OCR ON OCR.PDF (CPU FALLBACK) ===")

    try:
        # Imported lazily so a missing/broken PaddleOCR install is reported
        # as a test failure instead of breaking module import.
        from paddleocr import PaddleOCR

        print("Initializing PaddleOCR with CPU...")
        ocr = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=False)
        print("✓ PaddleOCR initialized with CPU")

        all_text = []
        pdf_document = fitz.open(PDF_PATH)
        try:
            print(f"✓ PDF opened successfully, {pdf_document.page_count} pages")

            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                img_data = page.get_pixmap().tobytes("png")

                # PNG bytes -> PIL image -> numpy array for the OCR engine.
                image_np = np.array(Image.open(io.BytesIO(img_data)))
                result = ocr.ocr(image_np, cls=False)

                if not (result and result[0]):
                    print(f"Page {page_num+1}: No text detected")
                    continue

                boxes = result[0]
                print(f"Page {page_num+1}: Found {len(boxes)} text boxes")

                # Only the preview is limited; the collected text below uses
                # every box so the returned text is not truncated.
                for i, line in enumerate(boxes[:PREVIEW_BOXES]):
                    text, confidence = line[1][0], line[1][1]
                    print(f" {i+1}: '{text}' (confidence: {confidence:.3f})")

                page_text = " ".join(str(line[1][0]) for line in boxes)
                all_text.append(page_text.strip())
        finally:
            # Release the file handle even if OCR raised part-way through.
            pdf_document.close()

        if not all_text:
            print("✗ No text extracted from PDF")
            return False, ""

        print(f"\n✓ Successfully extracted text from {len(all_text)} pages")
        full_text = " ".join(all_text)
        print(f"Total text length: {len(full_text)} characters")
        print(f"Text preview: {full_text[:500]}...")
        return True, full_text

    except Exception as e:  # top-level boundary of this test step
        print(f"✗ Error in OCR test: {e}")
        traceback.print_exc()
        return False, ""


def _stop_process(process, timeout=10):
    """Terminate *process*, escalating to kill if it ignores the request."""
    process.terminate()
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()


def start_lightrag_server_cpu():
    """Launch ``lightrag-server`` and wait until it answers on its root URL.

    Returns:
        The running ``subprocess.Popen`` object, or ``None`` when the server
        could not be launched, exited early, or did not answer with HTTP 200
        within ``SERVER_START_TIMEOUT`` attempts. No child process is left
        running when ``None`` is returned.
    """
    print("\n=== STARTING LIGHTRAG SERVER WITH CPU OCR ===")

    cmd = [
        'lightrag-server',
        '--port', str(SERVER_PORT),
        '--embedding-binding', 'ollama',
        '--rerank-binding', 'null',
        '--host', '0.0.0.0',
    ]
    print(f"Starting server: {' '.join(cmd)}")

    try:
        # The output is never consumed by this script. Leaving it on unread
        # pipes can block the server once the pipe buffer fills, so it is
        # discarded instead.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, ValueError) as e:
        print(f"✗ Failed to start server: {e}")
        return None

    print("Waiting for server to start...")
    for _ in range(SERVER_START_TIMEOUT):
        # Fail fast if the server died instead of polling a dead port.
        if process.poll() is not None:
            print(f"✗ Failed to start server: process exited with code {process.returncode}")
            return None

        try:
            response = requests.get(f'{BASE_URL}/', timeout=5)
        except requests.exceptions.RequestException:
            pass  # not accepting connections yet; retry
        else:
            if response.status_code == 200:
                print("✓ Server started successfully!")
                return process
        time.sleep(1)

    print("✗ Server failed to start within timeout")
    # Do not leak a half-started server when giving up.
    _stop_process(process)
    return None


def _find_document(docs, file_path):
    """Return the first status entry in *docs* whose file_path matches, else None."""
    for doc in docs:
        if doc.get('file_path') == file_path:
            return doc
    return None


def _wait_for_processing(headers, max_wait=PROCESSING_TIMEOUT):
    """Poll ``/documents`` until the uploaded PDF is completed or failed.

    Args:
        headers: HTTP headers carrying the bearer token.
        max_wait: Maximum number of seconds to keep polling.

    Returns:
        True if the document shows up under ``completed``; False if it shows
        up under ``failed`` or *max_wait* elapses first.
    """
    start_time = time.time()
    next_report_at = 0  # report immediately, then every PROGRESS_INTERVAL

    while time.time() - start_time < max_wait:
        try:
            docs_response = requests.get(
                f'{BASE_URL}/documents', headers=headers, timeout=30
            )
        except requests.exceptions.RequestException as e:
            print(f" Connection error: {e}")
            time.sleep(POLL_INTERVAL)
            continue

        if docs_response.status_code == 200:
            statuses = docs_response.json().get('statuses', {})
            processing = statuses.get('processing', [])
            elapsed = int(time.time() - start_time)

            done = _find_document(statuses.get('completed', []), PDF_PATH)
            if done is not None:
                print(f"\n🎉 OCR PROCESSING COMPLETED in {elapsed} seconds!")
                print(f" File: {done.get('file_path')}")
                print(f" Size: {done.get('file_size')}")
                print(f" Chunks: {done.get('chunk_count')}")
                return True

            broken = _find_document(statuses.get('failed', []), PDF_PATH)
            if broken is not None:
                error_msg = broken.get('error_msg', 'Unknown error')
                print(f"✗ OCR processing failed: {error_msg}")
                return False

            # A threshold is used rather than ``elapsed % 30 == 0`` because
            # request latency means elapsed rarely lands on an exact multiple.
            if elapsed >= next_report_at:
                print(f" Still processing... ({elapsed}s elapsed, {len(processing)} files processing)")
                next_report_at = elapsed + PROGRESS_INTERVAL

        time.sleep(POLL_INTERVAL)

    print(f"✗ OCR processing timed out after {max_wait} seconds")
    return False


def test_ocr_upload_workflow():
    """Log in, clear documents, upload ``ocr.pdf`` and wait for indexing.

    Returns:
        True when the server reports the uploaded document as completed;
        False on login/upload failure, processing failure, timeout, or any
        unexpected error.
    """
    print("\n=== TESTING OCR UPLOAD WORKFLOW ===")

    try:
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to environment variables or a config file.
        login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
        login_response = requests.post(f'{BASE_URL}/login', data=login_data, timeout=30)

        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        if not token:
            # Without this check every later request would send "Bearer None".
            print("✗ Login failed: no access token in response")
            return False

        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        print("Clearing existing documents...")
        clear_response = requests.delete(f'{BASE_URL}/documents', headers=headers, timeout=30)
        print(f"Clear status: {clear_response.status_code}")

        print("\n=== UPLOADING OCR.PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize(PDF_PATH)} bytes)")

        with open(PDF_PATH, 'rb') as f:
            files = {'file': (PDF_PATH, f, 'application/pdf')}
            upload_response = requests.post(
                f'{BASE_URL}/documents/upload',
                files=files,
                headers=headers,
                timeout=60,
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")

        if not upload_data.get('track_id'):
            print("✗ No track ID returned")
            return False

        print("\n=== MONITORING OCR PROCESSING ===")
        print("OCR processing with CPU...")
        return _wait_for_processing(headers)

    except Exception as e:  # top-level boundary of this test step
        print(f"✗ Error during OCR workflow test: {e}")
        return False


def main():
    """Run the direct OCR check, start the server, and test the upload flow.

    The server process is always stopped before returning once it has been
    started successfully.
    """
    print("OCR PDF TEST WITH CPU FALLBACK")
    print("=" * 50)
    print("Testing: Direct OCR → Server Upload → Processing")
    print("Mode: CPU (GPU currently having cuDNN issues)")
    print("Document: ocr.pdf")
    print("=" * 50)

    # Step 1: Test OCR directly on ocr.pdf with CPU
    success, _extracted_text = test_ocr_with_cpu()
    if not success:
        print("\n❌ Direct OCR test failed")
        return

    # Step 2: Start server
    server_process = start_lightrag_server_cpu()
    if not server_process:
        print("\n❌ Failed to start server")
        return

    try:
        # Step 3: Test complete upload workflow
        if test_ocr_upload_workflow():
            print("\n" + "=" * 50)
            print("🎉 SUCCESS: OCR PDF WORKFLOW COMPLETED!")
            print("=" * 50)
            print("The ocr.pdf document has been:")
            print("✓ Successfully processed with OCR (CPU)")
            print("✓ Uploaded to the LightRAG server")
            print("✓ Indexed and made searchable")
            print(f"\nYou can now access the web UI at: {BASE_URL}")
            print("and search for content from the scanned table document.")
        else:
            print("\n❌ OCR workflow failed")
    finally:
        # Clean up
        print("\nStopping server...")
        _stop_process(server_process)
        print("Test completed.")


if __name__ == "__main__":
    main()