Files
railseek6/test_ocr_detailed.py

36 lines
1.1 KiB
Python

import fitz
from paddleocr import PaddleOCR
import os
# Smoke test for the OCR pipeline: inspect the first page of 'ocr.pdf',
# render it to a PNG, then run PaddleOCR on that PNG and print a short
# summary of the recognised text.

# Test 1: Extract images from PDF
print('=== PDF Analysis ===')
# The context manager closes the document even if rendering or saving fails.
with fitz.open('ocr.pdf') as doc:
    print(f'Pages: {doc.page_count}')
    page = doc[0]

    # Extract images
    image_list = page.get_images()
    print(f'Images found: {len(image_list)}')

    # Test 2: Convert page to image and save
    # Matrix(2, 2) scales both axes by 2 for a higher-resolution render.
    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
    pix.save('ocr_high_res.png')
print('Saved high-res image: ocr_high_res.png')

# Test 3: Try PaddleOCR on the image
print('\n=== PaddleOCR Test ===')
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
result = ocr.ocr('ocr_high_res.png')

if result:
    print(f'OCR result length: {len(result)}')
    for page_num, page_result in enumerate(result):
        # PaddleOCR can yield None for a page on which no text was detected;
        # treat that as an empty list instead of crashing on len(None).
        boxes = page_result or []
        print(f'Page {page_num}: {len(boxes)} text boxes')
        for i, line in enumerate(boxes[:5]):  # Show first 5
            # Each entry is indexed as line[1][0] for the recognised text;
            # skip entries that are too short to carry it.
            if not line or len(line) < 2:
                continue
            text_info = line[1]
            if len(text_info) >= 1:
                text = text_info[0]
                print(f' Box {i}: "{text}"')
else:
    print('No OCR results')