# OCR check script: renders each page of a PDF to a PNG with PyMuPDF and runs PaddleOCR on the images.
import sys
|
|
import os
|
|
sys.path.append('.')
|
|
from paddleocr import PaddleOCR
|
|
import fitz # PyMuPDF
|
|
import tempfile
|
|
import shutil
|
|
|
|
def pdf_to_images(pdf_path, output_dir, dpi=150):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        output_dir: Directory the ``page_<n>.png`` files are written into
            (``n`` is the zero-based page index). It is not created here.
        dpi: Rendering resolution passed to ``Page.get_pixmap``. Defaults
            to 150.

    Returns:
        List of the written image paths, in page order.
    """
    doc = fitz.open(pdf_path)
    try:
        images = []
        for page_num, page in enumerate(doc):
            img_path = os.path.join(output_dir, f"page_{page_num}.png")
            page.get_pixmap(dpi=dpi).save(img_path)
            images.append(img_path)
        return images
    finally:
        # Release the document even when rendering or saving a page fails.
        doc.close()
|
|
|
|
def ocr_pdf(pdf_path):
    """Run PaddleOCR over every page of a PDF and print what was recognised.

    The PDF is rendered to PNG files in a temporary directory, each image is
    passed to PaddleOCR, and the recognised strings are collected. A count
    per image, the first 20 strings and the overall total are printed. The
    temporary directory is removed afterwards, even if processing fails.

    Args:
        pdf_path: Path of the PDF to process.
    """
    print(f"Processing {pdf_path}")
    # Scratch directory for the rendered page images.
    temp_dir = tempfile.mkdtemp()
    try:
        images = pdf_to_images(pdf_path, temp_dir)
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
        all_text = []
        for img_path in images:
            result = ocr.ocr(img_path, cls=True)
            # Each non-empty entry of ``result`` is a list of detections
            # whose item [1][0] is the recognised string; a missing result
            # or an empty entry contributes nothing.
            page_text = [
                word_info[1][0]
                for line in result or []
                if line
                for word_info in line
            ]
            all_text.extend(page_text)
            # Report the strings actually extracted for this image;
            # len(result) would only count the outer entries.
            print(f"Image {img_path} extracted {len(page_text)} lines")
        print("\nExtracted text samples:")
        for i, text in enumerate(all_text[:20]):
            print(f" {i}: {text}")
        print(f"Total text boxes: {len(all_text)}")
    finally:
        # Best-effort cleanup: never let a removal error mask the real one.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
if __name__ == "__main__":
    # An optional first command-line argument overrides the default PDF path.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "test/safedistance.pdf"
    if os.path.exists(pdf_path):
        ocr_pdf(pdf_path)
    else:
        print(f"File not found: {pdf_path}")