ocr improved
This commit is contained in:
52
test_ocr_safedistance.py
Normal file
52
test_ocr_safedistance.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import sys
|
||||
import os
|
||||
sys.path.append('.')
|
||||
from paddleocr import PaddleOCR
|
||||
import fitz # PyMuPDF
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
def pdf_to_images(pdf_path, output_dir):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF document to open.
        output_dir: Directory that receives the page images. It is not
            created here, so it must already exist.

    Returns:
        List of the written image paths in page order
        (``page_0.png``, ``page_1.png``, ...).
    """
    doc = fitz.open(pdf_path)
    images = []
    try:
        for page_num, page in enumerate(doc):
            # Rasterise the page at 150 DPI before handing it to OCR.
            pix = page.get_pixmap(dpi=150)
            img_path = os.path.join(output_dir, f"page_{page_num}.png")
            pix.save(img_path)
            images.append(img_path)
    finally:
        # Close the document even when rendering or saving a page fails,
        # so the underlying file handle is never leaked.
        doc.close()
    return images
|
||||
|
||||
def ocr_pdf(pdf_path):
    """Run OCR over every page of a PDF and print a summary of the text.

    The PDF is rendered page by page into a temporary directory, each page
    image is passed to PaddleOCR, and the recognised strings are collected.
    The first 20 strings and the total count are printed. The temporary
    directory is removed afterwards, whether or not processing succeeded.

    Args:
        pdf_path: Path of the PDF document to process.
    """
    print(f"Processing {pdf_path}")
    # Create temp directory for images
    temp_dir = tempfile.mkdtemp()
    try:
        images = pdf_to_images(pdf_path, temp_dir)
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
        all_text = []
        for img_path in images:
            result = ocr.ocr(img_path, cls=True)
            # NOTE(review): assumes the result is a list with one entry per
            # image, each entry being a list of (box, (text, confidence))
            # items -- that is the shape the indexing below relies on.
            # ``None`` results and empty entries contribute no text.
            page_text = [
                word_info[1][0]
                for line in result or []
                if line
                for word_info in line
            ]
            all_text.extend(page_text)
            # Report the number of recognised text items, not the length of
            # the outer result list (which only counts per-image entries).
            print(f"Image {img_path} extracted {len(page_text)} lines")

        print("\nExtracted text samples:")
        for i, text in enumerate(all_text[:20]):
            print(f"  {i}: {text}")
        print(f"Total text boxes: {len(all_text)}")
    finally:
        # Best-effort cleanup of the rendered page images.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: OCR the sample PDF at a fixed relative path,
    # or report that it is missing.
    pdf_path = "test/safedistance.pdf"
    if os.path.exists(pdf_path):
        ocr_pdf(pdf_path)
    else:
        print(f"File not found: {pdf_path}")
|
||||
Reference in New Issue
Block a user