Files
railseek6/test_ocr_pdf_directly.py

31 lines
868 B
Python

from paddleocr import PaddleOCR
import os
# Default input document, resolved against the current working directory.
pdf_path = 'ocr.pdf'


def format_report(result) -> list:
    """Return the printable report lines for a PaddleOCR result.

    Args:
        result: The value returned by ``ocr.ocr(...)``. It is treated as a
            sequence of pages, each page being a sequence of detected lines
            or a falsy value (``None`` / empty list) when nothing was found.
            NOTE(review): the exact per-line structure depends on the
            installed PaddleOCR version; lines are echoed verbatim.

    Returns:
        A list of strings, one per line of output, in print order.
    """
    if not result:
        return ["No result returned"]

    report = []
    for page_num, page in enumerate(result, start=1):
        report.append(f"Page {page_num}:")
        if page:
            report.extend(
                f" Line {line_num}: {line}"
                for line_num, line in enumerate(page, start=1)
            )
        else:
            report.append(" No text detected")
    return report


def is_blank(result) -> bool:
    """Return True when the OCR result contains no detected text at all.

    Pages are tested by truthiness rather than ``len()`` so that a page
    reported as ``None`` counts as empty instead of raising ``TypeError``.
    """
    return not result or all(not page for page in result)


def main(path=pdf_path) -> int:
    """Run PaddleOCR on *path* and print what it detected.

    Args:
        path: Path of the PDF to scan; defaults to ``pdf_path``.

    Returns:
        Process exit status: 1 when *path* does not exist, otherwise 0.
    """
    # Check the input first so a missing file does not pay for engine start-up.
    if not os.path.exists(path):
        print(f"{path} not found")
        return 1

    # Initialize PaddleOCR with CPU to avoid GPU conflict with server
    ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)

    print(f"Running PaddleOCR on {path}...")
    result = ocr.ocr(path, cls=True)

    for text in format_report(result):
        print(text)

    # Summary line emitted in addition to the per-page report.
    if is_blank(result):
        print("No text found in the PDF by PaddleOCR")
    return 0


if __name__ == "__main__":
    # SystemExit instead of the site-provided exit(), which may be absent.
    raise SystemExit(main())