# railseek6/create_test_ocr_pdf.py

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import os

def create_test_ocr_pdf(pdf_path='test_ocr_content.pdf'):
    """Write a one-page, letter-size PDF of sample text for OCR testing.

    The page holds three groups of lines, all starting at x = 100 pt:
    four body lines in 12 pt Helvetica, one heading in 14 pt
    Helvetica-Bold, and three lines in 10 pt Helvetica that include
    digits and punctuation characters.

    Args:
        pdf_path: Destination file name for the generated PDF. Defaults to
            'test_ocr_content.pdf' (a relative path), which matches the
            name this function always used before it took a parameter.

    Returns:
        The ``pdf_path`` value that was passed in (or the default).
    """
    left_margin = 100

    # Page layout as data: (font name, font size, ((baseline y, text), ...)).
    # Keeping it in one table makes the content easy to scan and extend.
    sections = (
        ("Helvetica", 12, (
            (750, "This is a test PDF document for OCR processing."),
            (730, "It contains sample text that should be extracted by PaddleOCR."),
            (710, "Using GPU acceleration for faster processing."),
            (690, "Page 1 of the test document."),
        )),
        ("Helvetica-Bold", 14, (
            (650, "Important Heading"),
        )),
        ("Helvetica", 10, (
            (630, "This is smaller text to test OCR accuracy."),
            (610, "Numbers: 1234567890"),
            (590, "Special characters: !@#$%^&*()"),
        )),
    )

    pdf = canvas.Canvas(pdf_path, pagesize=letter)
    for font_name, font_size, lines in sections:
        pdf.setFont(font_name, font_size)
        for baseline_y, text in lines:
            pdf.drawString(left_margin, baseline_y, text)
    pdf.save()

    print(f"✅ Created test PDF: {pdf_path}")
    return pdf_path

# Script entry point: generate the test PDF only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_test_ocr_pdf()