"""Render a PDF to page images and print the text PaddleOCR finds in them."""

import os
import shutil
import sys
import tempfile

# Make modules in the current working directory importable before the
# third-party imports below are resolved (kept from the original script).
sys.path.append('.')

import fitz  # PyMuPDF
from paddleocr import PaddleOCR


def pdf_to_images(pdf_path, output_dir, dpi=150):
    """Convert PDF pages to PNG images.

    Args:
        pdf_path: Path of the PDF file to open.
        output_dir: Existing directory that receives one ``page_<n>.png``
            file per page (``n`` is the zero-based page index).
        dpi: Resolution passed to ``page.get_pixmap``; defaults to 150.

    Returns:
        List of the image paths written, in page order.
    """
    images = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            pix = page.get_pixmap(dpi=dpi)
            img_path = os.path.join(output_dir, f"page_{page_num}.png")
            pix.save(img_path)
            images.append(img_path)
    finally:
        # Close the document even if rendering or saving a page fails.
        doc.close()
    return images


def _extract_texts(result):
    """Flatten one ``ocr.ocr`` result into a list of recognized strings.

    ``result`` is treated as a list of groups, each group being a list of
    entries whose ``[1][0]`` element is the text. Presumably each entry is
    ``(box, (text, confidence))`` -- confirm against the installed
    PaddleOCR version. ``None``/empty results and empty groups yield nothing.
    """
    texts = []
    if not result:
        return texts
    for line in result:
        if line:
            texts.extend(word_info[1][0] for word_info in line)
    return texts


def ocr_pdf(pdf_path):
    """Run OCR over every page of a PDF and print a summary.

    The pages are rendered into a temporary directory, each image is passed
    to PaddleOCR, and the function prints the number of text boxes found per
    image, the first 20 extracted strings, and the overall total. The
    temporary directory is removed afterwards, even on error.

    Args:
        pdf_path: Path of the PDF file to process.
    """
    print(f"Processing {pdf_path}")

    # Create temp directory for images
    temp_dir = tempfile.mkdtemp()
    try:
        images = pdf_to_images(pdf_path, temp_dir)

        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)

        all_text = []
        for img_path in images:
            result = ocr.ocr(img_path, cls=True)
            page_texts = _extract_texts(result)
            all_text.extend(page_texts)
            # Report the boxes actually extracted from this image; the
            # length of the outer result list is not the number of lines.
            print(f"Image {img_path} extracted {len(page_texts)} lines")

        print("\nExtracted text samples:")
        for i, text in enumerate(all_text[:20]):
            print(f"  {i}: {text}")

        print(f"Total text boxes: {len(all_text)}")
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == "__main__":
    pdf_path = "test/safedistance.pdf"
    if not os.path.exists(pdf_path):
        print(f"File not found: {pdf_path}")
    else:
        ocr_pdf(pdf_path)