import os
import sys
import tempfile
from pathlib import Path

import fitz  # PyMuPDF
import paddleocr


def _collect_page_text(result):
    """Print every detection in an OCR result and return the joined text.

    Args:
        result: The value returned by ``ocr.ocr(...)`` for one image. Each
            non-empty entry is iterated, and for every item with at least two
            elements the recognized text is read from ``item[1][0]`` and the
            confidence from ``item[1][1]`` (when present).

    Returns:
        The recognized texts joined by single spaces, with surrounding
        whitespace stripped.
    """
    words = []
    for line_num, line in enumerate(result):
        print(f"Line {line_num}: {line}")
        if not line:
            # Entries can be empty/None; nothing to read from them.
            continue
        for word_info in line:
            if len(word_info) < 2:
                continue
            text = word_info[1][0]
            confidence = word_info[1][1] if len(word_info[1]) > 1 else "N/A"
            print(f" Word: '{text}' (confidence: {confidence})")
            words.append(text)
    return " ".join(words).strip()


def debug_ocr_on_pdf(pdf_path):
    """Print what PaddleOCR extracts from each page of a PDF.

    Every page is rendered to a temporary PNG, passed to PaddleOCR, and the
    raw result plus the recognized text are printed. All diagnostics go to
    stdout; errors are reported there as well instead of being raised.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None.
    """
    print(f"Testing OCR extraction on: {pdf_path}")
    file_exists = os.path.exists(pdf_path)
    print(f"File exists: {file_exists}")
    if not file_exists:
        # os.path.getsize would raise on a missing path; report and stop
        # instead of crashing before any diagnostics are produced.
        print(f"Error processing PDF: file not found: {pdf_path}")
        return
    print(f"File size: {os.path.getsize(pdf_path)} bytes")

    # Initialize PaddleOCR with current configuration
    try:
        ocr = paddleocr.PaddleOCR(use_angle_cls=True, lang='en')
        print("PaddleOCR initialized successfully")
    except Exception as e:
        print(f"Error initializing PaddleOCR: {e}")
        return

    # Open PDF and process each page
    try:
        page_texts = []
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            print(f"PDF has {len(doc)} pages")

            for page_num, page in enumerate(doc, start=1):
                print(f"\n--- Processing Page {page_num} ---")
                img_data = page.get_pixmap().tobytes("png")

                # delete=False so the file survives the `with` and can be
                # reopened by path; it is removed in the `finally` below.
                with tempfile.NamedTemporaryFile(
                    suffix='.png', delete=False
                ) as temp_img:
                    temp_img.write(img_data)
                    temp_img_path = temp_img.name

                try:
                    # Run OCR on the temporary image
                    result = ocr.ocr(temp_img_path)
                    print(f"OCR result structure: {type(result)}")

                    if result:
                        print(f"Number of lines detected: {len(result)}")
                        page_text = _collect_page_text(result)
                        print(
                            f"Extracted text from page {page_num}: "
                            f"'{page_text}'"
                        )
                        page_texts.append(page_text)
                    else:
                        print(f"No OCR results for page {page_num}")
                except Exception as e:
                    print(f"Error during OCR processing page {page_num}: {e}")
                finally:
                    # Clean up temp file; cleanup stays best-effort, but a
                    # failure is reported rather than silently ignored.
                    try:
                        os.unlink(temp_img_path)
                    except OSError as e:
                        print(
                            "Warning: could not remove temp file "
                            f"{temp_img_path}: {e}"
                        )

        all_extracted_text = "".join(text + "\n" for text in page_texts)
        print(f"\n--- FINAL EXTRACTED TEXT ---")
        print(all_extracted_text)
        print(f"Total characters extracted: {len(all_extracted_text)}")
    except Exception as e:
        print(f"Error processing PDF: {e}")


def main():
    """Run the OCR debug pass on ``ocr.pdf`` or a similarly named PDF."""
    # Test on the ocr.pdf file
    pdf_path = "ocr.pdf"
    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} not found. Looking for similar files...")
        # Look for any PDF files that might be the test file
        for entry in os.listdir("."):
            if entry.lower().endswith(".pdf"):
                print(f"Found PDF: {entry}")
                if "ocr" in entry.lower():
                    pdf_path = entry
                    break

    debug_ocr_on_pdf(pdf_path)


if __name__ == "__main__":
    main()