# railseek6/test_paddleocr_on_pdf.py

import paddleocr
import os

def test_paddleocr_on_pdf(pdf_path: str) -> str:
    """Run PaddleOCR on a PDF, print what it recognised, and return the text.

    Diagnostic output (file existence, file size, result type/length and every
    recognised line with its confidence) is printed to stdout.

    Args:
        pdf_path: Path of the PDF file handed to ``PaddleOCR.ocr``.

    Returns:
        All recognised text fragments joined by single spaces, stripped of
        surrounding whitespace. An empty string is returned when the file
        cannot be read, when OCR raises, or when nothing is recognised.

    NOTE(review): despite the ``test_`` prefix this is a manual smoke-check
    helper; it takes a plain ``pdf_path`` argument and is driven from the
    ``__main__`` guard, not written as a pytest test case.
    """
    print(f"Testing PaddleOCR on: {pdf_path}")
    print(f"File exists: {os.path.exists(pdf_path)}")

    # getsize raises OSError for a missing/unreadable path; report that and
    # bail out here instead of crashing before the OCR error handling below
    # is even reached (and before paying for model initialisation).
    try:
        file_size = os.path.getsize(pdf_path)
    except OSError as e:
        print(f"Error reading file: {e}")
        return ""
    print(f"File size: {file_size} bytes")

    # Initialize PaddleOCR
    ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')

    # Best-effort boundary: any failure inside the OCR run or while walking
    # its result is reported and turned into an empty result.
    try:
        # Run OCR on the PDF
        result = ocr.ocr(pdf_path)

        print(f"OCR result type: {type(result)}")
        print(f"OCR result length: {len(result) if result else 0}")

        # Collect "<text> " pieces and join once at the end.
        pieces = []
        for page_num, page_result in enumerate(result or []):
            print(f"\n--- Page {page_num + 1} ---")
            if not page_result:
                print("No text detected on this page")
                continue

            for line_num, line in enumerate(page_result):
                # Each line is expected to be [coordinates, (text, confidence)];
                # entries that are empty or too short are skipped.
                if not line or len(line) < 2:
                    continue
                text_info = line[1]
                if len(text_info) < 1:
                    continue

                text = text_info[0]
                confidence = text_info[1] if len(text_info) > 1 else "N/A"
                pieces.append(text + " ")
                print(f"Line {line_num}: '{text}' (confidence: {confidence})")

        extracted_text = "".join(pieces)

        # The reported length includes the separator after every fragment.
        print(f"\nTotal extracted text length: {len(extracted_text)}")
        print(f"Extracted text: '{extracted_text.strip()}'")

        return extracted_text.strip()

    except Exception as e:
        print(f"Error during OCR: {e}")
        return ""

if __name__ == "__main__":
    # Script entry point: OCR the sample 'ocr.pdf' (resolved against the
    # current working directory) and report whether any text came back.
    ocr_text = test_paddleocr_on_pdf('ocr.pdf')
    found_text = len(ocr_text) > 0
    print(f"\nFinal result: Text extracted: {found_text}")