96 lines
3.5 KiB
Python
96 lines
3.5 KiB
Python
import os
|
|
import sys
|
|
import fitz # PyMuPDF
|
|
import tempfile
|
|
import paddleocr
|
|
from pathlib import Path
|
|
|
|
def debug_ocr_on_pdf(pdf_path):
    """Print a page-by-page trace of what PaddleOCR extracts from a PDF.

    Each page is rendered to a temporary PNG with PyMuPDF, passed to
    PaddleOCR, and every detection is printed together with its confidence.
    The combined text of all pages is printed at the end.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. All output goes to stdout. Failures (missing file, OCR
        initialization error, unreadable PDF) are reported with a printed
        message instead of being raised.
    """
    print(f"Testing OCR extraction on: {pdf_path}")
    file_exists = os.path.exists(pdf_path)
    print(f"File exists: {file_exists}")
    if not file_exists:
        # os.path.getsize() would raise FileNotFoundError here; report the
        # problem and stop before loading the (slow) OCR models.
        print(f"Error processing PDF: file not found: {pdf_path}")
        return
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    # Initialize PaddleOCR with current configuration
    try:
        ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')
        print("PaddleOCR initialized successfully")
    except Exception as e:
        print(f"Error initializing PaddleOCR: {e}")
        return

    # Open PDF and process each page
    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            print(f"PDF has {page_count} pages")

            page_texts = []
            for page_number in range(1, page_count + 1):
                print(f"\n--- Processing Page {page_number} ---")
                page = doc.load_page(page_number - 1)
                page_text = _ocr_page(ocr, page, page_number)
                # None means "no OCR result" or "OCR failed" for this page;
                # such pages contribute nothing to the final text.
                if page_text is not None:
                    page_texts.append(page_text)

            all_extracted_text = "".join(f"{text}\n" for text in page_texts)

            print("\n--- FINAL EXTRACTED TEXT ---")
            print(all_extracted_text)
            print(f"Total characters extracted: {len(all_extracted_text)}")
        finally:
            # Close the document even when a page fails to render.
            doc.close()
    except Exception as e:
        print(f"Error processing PDF: {e}")


def _ocr_page(ocr, page, page_number):
    """Render one PDF page to a temporary PNG, OCR it, and return its text.

    Args:
        ocr: Initialized PaddleOCR instance; only its ``ocr(path)`` method
            is used.
        page: PyMuPDF page object to render.
        page_number: 1-based page number, used only in printed messages.

    Returns:
        The stripped text of the page, or None when OCR produced no result
        or raised an exception (the error is printed, not propagated).
    """
    pix = page.get_pixmap()
    img_data = pix.tobytes("png")

    # delete=False: the file is closed before the OCR engine opens it by
    # path, so it has to be removed manually in the ``finally`` below.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
        temp_img.write(img_data)
        temp_img_path = temp_img.name

    try:
        # Run OCR on the temporary image
        result = ocr.ocr(temp_img_path)
        print(f"OCR result structure: {type(result)}")

        if not result:
            print(f"No OCR results for page {page_number}")
            return None

        print(f"Number of lines detected: {len(result)}")
        page_text = _collect_page_text(result)
        print(f"Extracted text from page {page_number}: '{page_text}'")
        return page_text
    except Exception as e:
        print(f"Error during OCR processing page {page_number}: {e}")
        return None
    finally:
        # Clean up temp file (best effort: a failure here must not abort
        # the run, but it should not be silently hidden either).
        try:
            os.unlink(temp_img_path)
        except OSError as e:
            print(f"Warning: could not remove temp file {temp_img_path}: {e}")


def _collect_page_text(result):
    """Print every detection in an OCR result and return the joined text.

    Args:
        result: Sequence returned by ``ocr.ocr(...)``. Each non-empty entry
            is iterated as a sequence of detections indexed as
            ``detection[1][0]`` (text) and ``detection[1][1]`` (confidence)
            -- NOTE(review): this matches the list-based PaddleOCR result
            layout; confirm against the installed PaddleOCR version.

    Returns:
        All detected texts joined by single spaces, with surrounding
        whitespace stripped.
    """
    words = []
    for line_num, line in enumerate(result):
        print(f"Line {line_num}: {line}")
        if not line:  # entry may be None/empty when nothing was detected
            continue
        for word_info in line:
            if len(word_info) < 2:
                continue
            text = word_info[1][0]
            confidence = word_info[1][1] if len(word_info[1]) > 1 else "N/A"
            print(f" Word: '{text}' (confidence: {confidence})")
            words.append(text)
    return " ".join(words).strip()
|
|
|
|
def find_ocr_pdf(default_path="ocr.pdf", search_dir="."):
    """Locate the PDF to run the OCR debug on.

    Args:
        default_path: Path that is used as-is when it exists.
        search_dir: Directory scanned for a fallback when ``default_path``
            does not exist.

    Returns:
        ``default_path`` if it exists; otherwise the path of the first entry
        in ``search_dir`` (in ``os.listdir`` order) whose name ends with
        ".pdf" and contains "ocr", both compared case-insensitively; or
        None when there is no such entry.
    """
    if os.path.exists(default_path):
        return default_path

    print(f"File {default_path} not found. Looking for similar files...")
    # Look for any PDF files that might be the test file
    for name in os.listdir(search_dir):
        lowered = name.lower()
        if not lowered.endswith(".pdf"):
            continue
        print(f"Found PDF: {name}")
        if "ocr" in lowered:
            # normpath drops the leading "./" so that searching the current
            # directory yields the bare file name.
            return os.path.normpath(os.path.join(search_dir, name))
    return None


if __name__ == "__main__":
    # Test on the ocr.pdf file, or on a similarly named PDF next to it.
    pdf_path = find_ocr_pdf("ocr.pdf")
    if pdf_path is None:
        # Nothing to test: stop here instead of passing a nonexistent path on.
        print("No PDF with 'ocr' in its name was found in the current directory.")
        sys.exit(1)

    debug_ocr_on_pdf(pdf_path)