ocr improved

2026-01-13 18:25:49 +08:00
parent 9745ca2476
commit a5eb381384
104 changed files with 818 additions and 229 deletions
--- a/test_ocr_safedistance.py
+++ b/test_ocr_safedistance.py
@@ -0,0 +1,52 @@
+import sys
+import os
+sys.path.append('.')
+from paddleocr import PaddleOCR
+import fitz  # PyMuPDF
+import tempfile
+import shutil
+
+def pdf_to_images(pdf_path, output_dir):
+    """Convert PDF pages to images"""
+    doc = fitz.open(pdf_path)
+    images = []
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        pix = page.get_pixmap(dpi=150)
+        img_path = os.path.join(output_dir, f"page_{page_num}.png")
+        pix.save(img_path)
+        images.append(img_path)
+    doc.close()
+    return images
+
+def ocr_pdf(pdf_path):
+    print(f"Processing {pdf_path}")
+    # Create temp directory for images
+    temp_dir = tempfile.mkdtemp()
+    try:
+        images = pdf_to_images(pdf_path, temp_dir)
+        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
+        all_text = []
+        for img_path in images:
+            result = ocr.ocr(img_path, cls=True)
+            if result is None:
+                continue
+            for line in result:
+                if line:
+                    for word_info in line:
+                        text = word_info[1][0]
+                        all_text.append(text)
+            print(f"Image {img_path} extracted {len(result) if result else 0} lines")
+        print("\nExtracted text samples:")
+        for i, text in enumerate(all_text[:20]):
+            print(f"  {i}: {text}")
+        print(f"Total text boxes: {len(all_text)}")
+    finally:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+if __name__ == "__main__":
+    pdf_path = "test/safedistance.pdf"
+    if not os.path.exists(pdf_path):
+        print(f"File not found: {pdf_path}")
+    else:
+        ocr_pdf(pdf_path)