Files
railseek6/create_test_ocr_pdf.py

33 lines
1.2 KiB
Python

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import os
def create_test_ocr_pdf(pdf_path='test_ocr_content.pdf'):
    """Write a one-page, letter-sized PDF of sample text and return its path.

    The page holds three groups of lines drawn with different fonts and
    sizes (body text, a bold heading, and smaller text with digits and
    punctuation) so the output can be fed to an OCR pipeline as a fixture.

    Args:
        pdf_path: Destination of the generated PDF. It is passed straight
            to ``canvas.Canvas``; defaults to ``'test_ocr_content.pdf'``
            in the current working directory.

    Returns:
        The ``pdf_path`` value that was written to.
    """
    # Every line starts at the same x offset; only the y position changes.
    left_margin = 100

    # (font name, font size, ((y position, text), ...)) in drawing order.
    sections = (
        ("Helvetica", 12, (
            (750, "This is a test PDF document for OCR processing."),
            (730, "It contains sample text that should be extracted by PaddleOCR."),
            (710, "Using GPU acceleration for faster processing."),
            (690, "Page 1 of the test document."),
        )),
        ("Helvetica-Bold", 14, (
            (650, "Important Heading"),
        )),
        ("Helvetica", 10, (
            (630, "This is smaller text to test OCR accuracy."),
            (610, "Numbers: 1234567890"),
            (590, "Special characters: !@#$%^&*()"),
        )),
    )

    c = canvas.Canvas(pdf_path, pagesize=letter)
    for font_name, font_size, lines in sections:
        c.setFont(font_name, font_size)
        for y, text in lines:
            c.drawString(left_margin, y, text)

    # save() finalizes the document and writes it to pdf_path.
    c.save()
    print(f"✅ Created test PDF: {pdf_path}")
    return pdf_path
# Generate the fixture PDF only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    create_test_ocr_pdf()