33 lines
1.2 KiB
Python
33 lines
1.2 KiB
Python
from reportlab.pdfgen import canvas
|
|
from reportlab.lib.pagesizes import letter
|
|
import os
|
|
|
|
def create_test_ocr_pdf(pdf_path='test_ocr_content.pdf'):
    """Write a one-page PDF containing sample text for OCR testing.

    The page uses the ``letter`` page size and holds three groups of text,
    each drawn at x=100: regular 12-point Helvetica, a 14-point
    Helvetica-Bold heading, and 10-point Helvetica lines including digits
    and punctuation.

    Args:
        pdf_path: Destination of the generated file, as a string or
            ``os.PathLike``. Defaults to ``'test_ocr_content.pdf'``,
            which is a relative path.

    Returns:
        The destination path as a ``str``.
    """
    # Normalise path-like objects so the canvas, the log line and the
    # return value all see the same plain string.
    pdf_path = os.fspath(pdf_path)
    c = canvas.Canvas(pdf_path, pagesize=letter)

    # Each entry: (font name, font size, ((y, text), ...)). The order is
    # the order in which the text is drawn.
    sections = (
        ("Helvetica", 12, (
            (750, "This is a test PDF document for OCR processing."),
            (730, "It contains sample text that should be extracted by PaddleOCR."),
            (710, "Using GPU acceleration for faster processing."),
            (690, "Page 1 of the test document."),
        )),
        ("Helvetica-Bold", 14, (
            (650, "Important Heading"),
        )),
        ("Helvetica", 10, (
            (630, "This is smaller text to test OCR accuracy."),
            (610, "Numbers: 1234567890"),
            (590, "Special characters: !@#$%^&*()"),
        )),
    )

    left_x = 100  # every line starts at the same x position
    for font_name, font_size, lines in sections:
        c.setFont(font_name, font_size)
        for y, text in lines:
            c.drawString(left_x, y, text)

    c.save()
    print(f"✅ Created test PDF: {pdf_path}")
    return pdf_path
|
|
|
|
# Generate the sample PDF only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    create_test_ocr_pdf()