"""Smoke test: inspect the first page of a PDF and run PaddleOCR on it.

Steps:
1. Open ``ocr.pdf`` and report its page count and the number of images
   embedded in the first page.
2. Render the first page at 2x zoom and save it as ``ocr_high_res.png``.
3. Run PaddleOCR on the rendered image and print the first few
   recognised text boxes per page.
"""

import os

import fitz
from paddleocr import PaddleOCR

PDF_PATH = 'ocr.pdf'
IMAGE_PATH = 'ocr_high_res.png'
ZOOM = 2  # scale factor applied on both axes when rasterising the page
PREVIEW_BOXES = 5  # how many text boxes to print per OCR page

# Test 1: Extract images from PDF
print('=== PDF Analysis ===')
# The context manager closes the document even if a step below fails.
with fitz.open(PDF_PATH) as doc:
    print(f'Pages: {doc.page_count}')
    if doc.page_count == 0:
        # doc[0] would raise a bare IndexError; fail with a clear message.
        raise SystemExit(f'{PDF_PATH} has no pages to analyse')
    page = doc[0]

    # Extract images
    image_list = page.get_images()
    print(f'Images found: {len(image_list)}')

    # Test 2: Convert page to image and save
    pix = page.get_pixmap(matrix=fitz.Matrix(ZOOM, ZOOM))  # Higher resolution
    pix.save(IMAGE_PATH)
    print(f'Saved high-res image: {IMAGE_PATH}')

# Test 3: Try PaddleOCR on the image
print('\n=== PaddleOCR Test ===')
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
result = ocr.ocr(IMAGE_PATH)

if result:
    print(f'OCR result length: {len(result)}')
    for page_num, page_result in enumerate(result):
        # PaddleOCR can return None for a page where no text was detected;
        # treat that as an empty list instead of crashing on len(None).
        page_result = page_result or []
        print(f'Page {page_num}: {len(page_result)} text boxes')
        for i, line in enumerate(page_result[:PREVIEW_BOXES]):  # Show first 5
            if not line or len(line) < 2:
                continue
            # The recognised text is read from the first item of line[1].
            text_info = line[1]
            if len(text_info) >= 1:
                text = text_info[0]
                print(f' Box {i}: "{text}"')
else:
    print('No OCR results')