Files
railseek6/test_paddleocr_on_pdf.py

49 lines
1.9 KiB
Python

import paddleocr
import os
def test_paddleocr_on_pdf(pdf_path: str) -> str:
    """Run PaddleOCR on a document and return the recognized text.

    Diagnostic output (file details, per-page and per-line results) is
    printed to stdout while the OCR result is walked.

    Args:
        pdf_path: Path of the file handed to ``PaddleOCR.ocr``.

    Returns:
        The recognized text of every line, separated by single spaces and
        stripped. An empty string is returned when the file does not exist,
        when no text is recognized, or when OCR raises an exception.
    """
    print(f"Testing PaddleOCR on: {pdf_path}")
    file_exists = os.path.exists(pdf_path)
    print(f"File exists: {file_exists}")
    if not file_exists:
        # os.path.getsize() raises for a missing path; report the problem
        # and use the same "" result as every other failure instead.
        print(f"Error during OCR: file not found: {pdf_path}")
        return ""
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    # Initialize PaddleOCR
    ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')
    try:
        # Run OCR on the PDF
        result = ocr.ocr(pdf_path)
        print(f"OCR result type: {type(result)}")
        print(f"OCR result length: {len(result) if result else 0}")

        # Extract and display text; pieces are joined once at the end
        # rather than growing a string with += inside the loops.
        pieces = []
        for page_num, page_result in enumerate(result or []):
            print(f"\n--- Page {page_num + 1} ---")
            if not page_result:
                print("No text detected on this page")
                continue
            for line_num, line in enumerate(page_result):
                # Each line is [coordinates, (text, confidence)]; skip
                # entries that are empty or too short to carry text.
                if not line or len(line) < 2:
                    continue
                text_info = line[1]
                if len(text_info) < 1:
                    continue
                text = text_info[0]
                confidence = text_info[1] if len(text_info) > 1 else "N/A"
                pieces.append(text + " ")
                print(f"Line {line_num}: '{text}' (confidence: {confidence})")

        extracted_text = "".join(pieces)
        print(f"\nTotal extracted text length: {len(extracted_text)}")
        print(f"Extracted text: '{extracted_text.strip()}'")
        return extracted_text.strip()
    except Exception as e:
        # Best-effort smoke test: report the failure and signal it with "".
        print(f"Error during OCR: {e}")
        return ""


# This is a manually run script helper, not a pytest test. Its name matches
# pytest's ``test_*`` discovery pattern, so opt out of collection; otherwise
# pytest would treat ``pdf_path`` as a missing fixture and report an error.
test_paddleocr_on_pdf.__test__ = False
if __name__ == "__main__":
    # Script entry point: OCR the sample document in the working directory
    # and report whether any text came back.
    ocr_output = test_paddleocr_on_pdf('ocr.pdf')
    found_text = len(ocr_output) > 0
    print(f"\nFinal result: Text extracted: {found_text}")