322 lines
11 KiB
Python
322 lines
11 KiB
Python
import os
|
|
import sys
|
|
import subprocess
|
|
import requests
|
|
import time
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
import io
|
|
import numpy as np
|
|
|
|
def setup_cuda_environment(cuda_path=None):
    """Build a process environment that points at a CUDA 11.8 install.

    The returned mapping is a copy of ``os.environ`` with the CUDA location,
    the OCR engine selector and UTF-8 encoding hints added, and with the CUDA
    ``bin`` directory placed first on ``PATH``. ``os.environ`` itself is not
    modified.

    Args:
        cuda_path: Root directory of the CUDA toolkit. Defaults to the
            standard Windows install location for CUDA v11.8.

    Returns:
        The environment dict, or ``None`` when ``cuda_path`` does not exist.
    """
    print("=== SETTING UP CUDA 11.8 ENVIRONMENT ===")

    if cuda_path is None:
        cuda_path = r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8'

    if not os.path.exists(cuda_path):
        print(f"✗ CUDA 11.8 not found at: {cuda_path}")
        return None

    print(f"✓ CUDA 11.8 found at: {cuda_path}")

    # Work on a copy so the current process environment stays untouched.
    env = os.environ.copy()
    env['CUDA_PATH'] = cuda_path
    env['CUDA_HOME'] = cuda_path
    env['CUDA_VISIBLE_DEVICES'] = '0'
    env['LIGHTRAG_OCR_ENGINE'] = 'paddleocr'

    # Drop every existing CUDA/NVIDIA entry so another toolkit version cannot
    # shadow this one, then put this toolkit's bin directory first.
    # os.pathsep keeps the split/join correct for the running platform, and
    # empty entries are discarded rather than propagated.
    cuda_bin = os.path.join(cuda_path, 'bin')
    kept_entries = [
        entry
        for entry in env.get('PATH', '').split(os.pathsep)
        if entry and 'CUDA' not in entry and 'NVIDIA' not in entry
    ]
    env['PATH'] = os.pathsep.join([cuda_bin, *kept_entries])

    # Encoding hints for the child process.
    env['PYTHONIOENCODING'] = 'utf-8'
    env['LANG'] = 'en_US.UTF-8'
    env['LC_ALL'] = 'en_US.UTF-8'

    print("✓ Environment configured for CUDA 11.8")
    print(f"✓ PATH includes CUDA bin: {cuda_bin}")

    # Informational only: a missing cuDNN DLL is reported but does not make
    # the setup fail.
    cudnn_dll = os.path.join(cuda_bin, 'cudnn_ops_infer64_8.dll')
    if os.path.exists(cudnn_dll):
        print(f"✓ cuDNN DLL found: {cudnn_dll}")
    else:
        print(f"✗ cuDNN DLL not found: {cudnn_dll}")

    return env
|
|
|
|
def test_paddleocr_gpu_direct():
    """Run PaddleOCR directly on ``ocr.pdf``, one rendered page at a time.

    Each page of ``ocr.pdf`` (looked up in the current working directory) is
    rendered to a PNG with PyMuPDF, decoded into a numpy array via PIL and
    passed to PaddleOCR. Recognised lines and their confidence values are
    printed as they are found.

    Returns:
        ``True`` if text was recognised on at least one page, ``False`` if no
        text was found or if any exception (including a failed import of
        ``paddle``/``paddleocr``) occurred. Exceptions are printed with a
        traceback and never propagate.
    """
    print("\n=== TESTING PADDLEOCR GPU DIRECTLY ON OCR.PDF ===")

    try:
        # Imported lazily so a missing Paddle install is reported as a test
        # failure instead of breaking the import of this script.
        import paddle
        from paddleocr import PaddleOCR

        print(f"✓ PaddlePaddle version: {paddle.__version__}")
        print(f"✓ GPU available: {paddle.is_compiled_with_cuda()}")

        if paddle.is_compiled_with_cuda():
            paddle.device.set_device('gpu')
            print("✓ Using GPU for PaddleOCR")

        # Method 1: Convert PDF to images first, then run OCR
        print("\n--- Method 1: Converting PDF to images first ---")

        pdf_document = fitz.open('ocr.pdf')
        all_text = []
        try:
            print(f"✓ PDF opened successfully, {pdf_document.page_count} pages")

            # Build the OCR engine once; constructing it per page repeats the
            # model initialisation for every page.
            ocr = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=True)

            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                png_bytes = page.get_pixmap().tobytes("png")

                # PNG bytes -> PIL image -> numpy array for PaddleOCR.
                image_np = np.array(Image.open(io.BytesIO(png_bytes)))

                result = ocr.ocr(image_np, cls=False)

                if result and result[0]:
                    page_lines = []
                    for line in result[0]:
                        # line[1] is read as a (text, confidence) pair.
                        text = line[1][0]
                        confidence = line[1][1]
                        page_lines.append(text)
                        print(f" Page {page_num+1}: '{text}' (confidence: {confidence:.3f})")

                    all_text.append(" ".join(page_lines).strip())
                else:
                    print(f" Page {page_num+1}: No text detected")
        finally:
            # Release the document even when rendering or OCR raises.
            pdf_document.close()

        if all_text:
            print(f"\n✓ Successfully extracted text from {len(all_text)} pages")
            full_text = " ".join(all_text)
            print(f"Total text length: {len(full_text)} characters")
            print(f"Text preview: {full_text[:200]}...")
            return True

        print("✗ No text extracted from PDF")
        return False

    except Exception as e:
        print(f"✗ Error in direct PaddleOCR test: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
|
|
def _stop_server_process(process):
    """Terminate *process*, escalating to kill if it does not exit in 10 s."""
    if process.poll() is not None:
        return
    process.terminate()
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()


def start_lightrag_server_with_ocr_fix(env):
    """Launch ``lightrag-server`` on port 3015 and wait until it answers.

    The server is started with the given environment and then polled once per
    second, for up to 60 attempts, with a GET on ``http://localhost:3015/``.

    Args:
        env: Environment mapping passed to the child process.

    Returns:
        The running ``subprocess.Popen`` object once the server responds with
        HTTP 200, otherwise ``None``. When ``None`` is returned after the
        child was spawned, the child has been stopped so it is not leaked.
    """
    print("\n=== STARTING LIGHTRAG SERVER WITH OCR FIX ===")

    process = None
    try:
        cmd = [
            'lightrag-server',
            '--port', '3015',
            '--embedding-binding', 'ollama',
            '--rerank-binding', 'null',
            '--host', '0.0.0.0',
        ]

        print(f"Starting server: {' '.join(cmd)}")
        # NOTE(review): stdout/stderr are piped but nothing in this function
        # reads them; a very chatty server could fill the pipe buffers and
        # block. Kept as PIPE so callers can still read process.stdout/stderr.
        process = subprocess.Popen(
            cmd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            errors='replace',
        )

        # Wait for server to start
        print("Waiting for server to start...")
        for _ in range(60):
            # A child that already exited will never answer; without this
            # check a different server on the same port could be mistaken
            # for ours.
            if process.poll() is not None:
                print(f"✗ Server process exited early (code {process.returncode})")
                return None

            try:
                response = requests.get('http://localhost:3015/', timeout=5)
                if response.status_code == 200:
                    print("✓ Server started successfully!")
                    return process
            except requests.exceptions.RequestException:
                # Not accepting connections yet; retry on the next tick.
                pass
            time.sleep(1)

        print("✗ Server failed to start within timeout")
        _stop_server_process(process)
        return None

    except Exception as e:
        print(f"✗ Failed to start server: {e}")
        if process is not None:
            _stop_server_process(process)
        return None
|
|
|
|
def test_ocr_upload_workflow():
    """Exercise login, document reset, upload and processing of ``ocr.pdf``.

    Talks to the server at ``http://localhost:3015``: logs in, deletes the
    existing documents, uploads ``ocr.pdf`` from the current working
    directory, then polls ``/documents`` every 10 seconds for up to 5 minutes
    until the file appears under the ``completed`` or ``failed`` status.

    Returns:
        ``True`` when ``ocr.pdf`` shows up as completed; ``False`` on login or
        upload failure, a missing track id, a failed document, a timeout, or
        any unexpected exception (which is printed, not raised).
    """
    print("\n=== TESTING OCR UPLOAD WORKFLOW ===")
    base_url = 'http://localhost:3015'

    try:
        # Login
        # NOTE(review): credentials are hard-coded in the script; consider
        # supplying them from outside the source file.
        login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
        login_response = requests.post(f'{base_url}/login', data=login_data, timeout=30)

        if login_response.status_code != 200:
            print(f"✗ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        headers = {'Authorization': f'Bearer {token}'}
        print("✓ Login successful")

        # Clear existing documents
        print("Clearing existing documents...")
        clear_response = requests.delete(f'{base_url}/documents', headers=headers, timeout=30)
        print(f"Clear status: {clear_response.status_code}")

        # Upload OCR PDF
        print("\n=== UPLOADING OCR.PDF ===")
        print(f"File: ocr.pdf ({os.path.getsize('ocr.pdf')} bytes)")

        with open('ocr.pdf', 'rb') as f:
            files = {'file': ('ocr.pdf', f, 'application/pdf')}
            upload_response = requests.post(
                f'{base_url}/documents/upload',
                files=files,
                headers=headers,
                timeout=60,
            )

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"✗ Upload failed: {upload_response.text}")
            return False

        upload_data = upload_response.json()
        print(f"Upload response: {upload_data}")
        track_id = upload_data.get('track_id')

        if not track_id:
            print("✗ No track ID returned")
            return False

        # Monitor processing
        print("\n=== MONITORING OCR PROCESSING ===")
        print("OCR processing with GPU acceleration...")

        max_wait = 300  # 5 minutes
        report_every = 30  # seconds between "still processing" messages
        next_report_at = 0
        start_time = time.time()

        while time.time() - start_time < max_wait:
            try:
                # Check document status
                docs_response = requests.get(f'{base_url}/documents', headers=headers, timeout=30)
                if docs_response.status_code == 200:
                    statuses = docs_response.json().get('statuses', {})

                    completed = statuses.get('completed', [])
                    processing = statuses.get('processing', [])
                    failed = statuses.get('failed', [])

                    elapsed = int(time.time() - start_time)

                    # Check for our file in completed
                    for doc in completed:
                        if doc.get('file_path') == 'ocr.pdf':
                            print(f"\n🎉 OCR PROCESSING COMPLETED in {elapsed} seconds!")
                            print(f" File: {doc.get('file_path')}")
                            print(f" Size: {doc.get('file_size')}")
                            print(f" Chunks: {doc.get('chunk_count')}")
                            return True

                    # Check if failed
                    for doc in failed:
                        if doc.get('file_path') == 'ocr.pdf':
                            print(f"✗ OCR processing failed: {doc.get('error_msg', 'Unknown error')}")
                            return False

                    # Still processing. A threshold is used instead of
                    # "elapsed % 30 == 0" because request latency makes the
                    # elapsed time drift off exact multiples of 30.
                    if elapsed >= next_report_at:
                        print(f" Still processing... ({elapsed}s elapsed, {len(processing)} files processing)")
                        next_report_at += report_every
                else:
                    print(f" Status check returned HTTP {docs_response.status_code}")

            except requests.exceptions.RequestException as e:
                print(f" Connection error: {e}")

            # Pause between polls on every path so the loop never spins.
            time.sleep(10)

        print(f"✗ OCR processing timed out after {max_wait} seconds")
        return False

    except Exception as e:
        print(f"✗ Error during OCR workflow test: {e}")
        return False
|
|
|
|
def main():
    """Run the full check: CUDA setup, direct OCR, server start, upload.

    The steps run in order and the function returns early, after printing a
    failure message, as soon as one of them fails. Once the server has been
    started it is always stopped again before the function returns.
    """
    print("OCR PDF TEST WITH GPU PADDLEOCR")
    print("=" * 50)
    print("Testing: Direct OCR → Server Upload → Processing")
    print("CUDA 11.8: Enabled")
    print("Document: ocr.pdf")
    print("=" * 50)

    # Step 1: Setup CUDA environment
    env = setup_cuda_environment()
    if not env:
        print("\n❌ CUDA setup failed")
        return

    # Step 2: Test PaddleOCR GPU directly on ocr.pdf
    if not test_paddleocr_gpu_direct():
        print("\n❌ Direct PaddleOCR test failed")
        return

    # Step 3: Start server
    server_process = start_lightrag_server_with_ocr_fix(env)
    if not server_process:
        print("\n❌ Failed to start server")
        return

    try:
        # Step 4: Test complete upload workflow
        success = test_ocr_upload_workflow()

        if success:
            print("\n" + "=" * 50)
            print("🎉 SUCCESS: OCR PDF WORKFLOW COMPLETED!")
            print("=" * 50)
            print("The ocr.pdf document has been:")
            print("✓ Successfully processed with GPU-accelerated OCR")
            print("✓ Uploaded to the LightRAG server")
            print("✓ Indexed and made searchable")
            print("\nYou can now access the web UI at: http://localhost:3015")
        else:
            print("\n❌ OCR workflow failed")

    finally:
        # Clean up: ask the server to exit, force it if it does not comply.
        print("\nStopping server...")
        server_process.terminate()
        try:
            server_process.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # Only the timeout is handled here, so Ctrl-C during shutdown is
            # no longer swallowed.
            server_process.kill()
            server_process.wait()

    print("Test completed.")
|
|
|
|
# Run the end-to-end check only when executed as a script, not on import.
if __name__ == "__main__":
    main()