"""Diagnostic script: run PaddleOCR on a PDF and print what it recognises."""

import os

try:
    import paddleocr
except ImportError as exc:
    # Defer the failure until OCR is actually requested, so the path
    # diagnostics below still work (and the module still imports) on a
    # machine without the OCR stack installed.
    paddleocr = None
    _PADDLEOCR_IMPORT_ERROR = exc
else:
    _PADDLEOCR_IMPORT_ERROR = None


def _parse_ocr_line(line):
    """Return ``(text, confidence)`` for one OCR entry, or ``None`` if unusable.

    The entry is expected to look like ``[coordinates, (text, confidence)]``;
    entries that are empty or too short are skipped rather than raising.
    ``confidence`` falls back to ``"N/A"`` when the entry carries only text.
    """
    if not line or len(line) < 2:
        return None
    text_info = line[1]
    if len(text_info) < 1:
        return None
    confidence = text_info[1] if len(text_info) > 1 else "N/A"
    return text_info[0], confidence


def test_paddleocr_on_pdf(pdf_path):
    """Run PaddleOCR on ``pdf_path``, print diagnostics, and return the text.

    Args:
        pdf_path: Path of the PDF file to feed to ``PaddleOCR.ocr``.

    Returns:
        The recognised text fragments joined by single spaces, or ``""`` when
        the path is not an existing file, nothing was recognised, or the OCR
        run raised an exception (the error is printed, not propagated).

    Raises:
        ImportError: If the file exists but the ``paddleocr`` package could
            not be imported.
    """
    print(f"Testing PaddleOCR on: {pdf_path}")
    print(f"File exists: {os.path.exists(pdf_path)}")

    # Guard before getsize(): it raises OSError for a missing path, which
    # previously crashed the script right after reporting "File exists: False".
    if not os.path.isfile(pdf_path):
        print(f"Error during OCR: not an existing file: {pdf_path}")
        return ""
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    if paddleocr is None:
        raise ImportError(
            "paddleocr is required to run OCR"
        ) from _PADDLEOCR_IMPORT_ERROR

    # Initialize PaddleOCR
    ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')

    try:
        # Run OCR on the PDF
        result = ocr.ocr(pdf_path)

        print(f"OCR result type: {type(result)}")
        print(f"OCR result length: {len(result) if result else 0}")

        # Extract and display text, one page of results at a time.
        fragments = []
        for page_num, page_result in enumerate(result or []):
            print(f"\n--- Page {page_num + 1} ---")
            if not page_result:
                print("No text detected on this page")
                continue
            for line_num, line in enumerate(page_result):
                parsed = _parse_ocr_line(line)
                if parsed is None:
                    continue
                text, confidence = parsed
                fragments.append(text)
                print(f"Line {line_num}: '{text}' (confidence: {confidence})")

        # Each fragment keeps its trailing space so the reported length is
        # the same as when the text was accumulated with ``+= text + " "``.
        extracted_text = "".join(f"{text} " for text in fragments)
        print(f"\nTotal extracted text length: {len(extracted_text)}")
        print(f"Extracted text: '{extracted_text.strip()}'")

        return extracted_text.strip()

    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic, so any OCR or
        # result-parsing failure is reported and turned into an empty result.
        print(f"Error during OCR: {e}")
        return ""


# The name starts with "test_", so pytest would try to collect this function
# and fail on the unknown "pdf_path" fixture; opt it out of collection.
test_paddleocr_on_pdf.__test__ = False


if __name__ == "__main__":
    extracted = test_paddleocr_on_pdf('ocr.pdf')
    print(f"\nFinal result: Text extracted: {len(extracted) > 0}")