ocr improved

This commit is contained in:
2026-01-13 18:25:49 +08:00
parent 9745ca2476
commit a5eb381384
104 changed files with 818 additions and 229 deletions

52
test_ocr_safedistance.py Normal file
View File

@@ -0,0 +1,52 @@
import sys
import os
sys.path.append('.')
from paddleocr import PaddleOCR
import fitz # PyMuPDF
import tempfile
import shutil
def pdf_to_images(pdf_path, output_dir):
    """Render every page of a PDF to a PNG file at 150 dpi.

    Args:
        pdf_path: Path of the PDF, opened with ``fitz.open``.
        output_dir: Directory that receives the ``page_<n>.png`` files; it is
            not created here.

    Returns:
        List of the written image paths in page order (``n`` starts at 0).
    """
    images = []
    # The context manager closes the document even when rendering or saving
    # a page raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            pix = page.get_pixmap(dpi=150)
            img_path = os.path.join(output_dir, f"page_{page_num}.png")
            pix.save(img_path)
            images.append(img_path)
    return images
def _texts_from_result(result):
    """Collect the recognized strings from one ``ocr.ocr(...)`` return value."""
    if not result:
        return []
    # Every truthy entry of ``result`` is iterated as a group of detections,
    # and the text is read from ``detection[1][0]``.
    # NOTE(review): presumably one group per image/page with detections shaped
    # ``[box, (text, confidence)]`` -- confirm against the installed PaddleOCR.
    return [
        detection[1][0]
        for group in result
        if group
        for detection in group
    ]


def ocr_pdf(pdf_path):
    """Run OCR over every page of a PDF and print a summary of the text found.

    The pages are rendered to PNG files in a temporary directory via
    ``pdf_to_images``, each image is passed to PaddleOCR, and the first 20
    recognized strings plus the total count are printed.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        None. All results are written to stdout.
    """
    print(f"Processing {pdf_path}")
    # Create temp directory for images
    temp_dir = tempfile.mkdtemp()
    try:
        images = pdf_to_images(pdf_path, temp_dir)
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
        all_text = []
        for img_path in images:
            texts = _texts_from_result(ocr.ocr(img_path, cls=True))
            all_text.extend(texts)
            # Report the number of text boxes found in this image rather than
            # the length of the outer result list.
            print(f"Image {img_path} extracted {len(texts)} lines")
        print("\nExtracted text samples:")
        for i, text in enumerate(all_text[:20]):
            print(f" {i}: {text}")
        print(f"Total text boxes: {len(all_text)}")
    finally:
        # Best-effort cleanup: never let a failed delete mask the real error.
        shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":
    # Script entry point: OCR the bundled sample PDF when it is present,
    # otherwise report that it is missing.
    pdf_path = "test/safedistance.pdf"
    if os.path.exists(pdf_path):
        ocr_pdf(pdf_path)
    else:
        print(f"File not found: {pdf_path}")