"""Smoke test: inspect ``ocr.pdf`` with PyMuPDF and run PaddleOCR on page 0.

Steps:
1. Open the PDF, report its page count and the number of images that
   ``get_images()`` lists for the first page.
2. Render the first page at 2x scale and save it as ``ocr_high_res.png``.
3. Run PaddleOCR on the saved PNG and print the first few recognized texts.
"""
import os

import fitz
from paddleocr import PaddleOCR

# Test 1: Extract images from PDF
print('=== PDF Analysis ===')
# Use the document as a context manager so the file handle is released even
# if rendering or saving raises.
with fitz.open('ocr.pdf') as doc:
    print(f'Pages: {doc.page_count}')
    page = doc[0]

    # Extract images
    image_list = page.get_images()
    print(f'Images found: {len(image_list)}')

    # Test 2: Convert page to image and save
    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # Higher resolution
    pix.save('ocr_high_res.png')
    print('Saved high-res image: ocr_high_res.png')

# Test 3: Try PaddleOCR on the image
print('\n=== PaddleOCR Test ===')
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
result = ocr.ocr('ocr_high_res.png')

if result:
    print(f'OCR result length: {len(result)}')
    for page_num, page_result in enumerate(result):
        # NOTE(review): PaddleOCR appears to yield None (not an empty list)
        # for an image with no detected text; treat that as "no boxes"
        # instead of crashing on len(None).
        boxes = page_result or []
        print(f'Page {page_num}: {len(boxes)} text boxes')
        for i, line in enumerate(boxes[:5]):  # Show first 5
            # Each entry is expected to carry its recognition info at index 1,
            # with the recognized text as that info's first element.
            if line and len(line) >= 2:
                text_info = line[1]
                if len(text_info) >= 1:
                    text = text_info[0]
                    print(f' Box {i}: "{text}"')
else:
    print('No OCR results')