35 lines
1.2 KiB
Python
35 lines
1.2 KiB
Python
from reportlab.lib.pagesizes import letter
|
|
from reportlab.pdfgen import canvas
|
|
import os
|
|
|
|
def create_test_pdf(filename: str = "test_meaningful.pdf") -> str:
    """Write a one-page sample PDF containing named entities and return its path.

    The page holds a title line followed by sentences that mention people,
    organizations, locations and a date, one sentence per drawn line.  After
    the file is saved, its name and size in bytes are printed to stdout.

    Args:
        filename: Path of the PDF to create.  Defaults to
            ``"test_meaningful.pdf"``, the name the script always used.

    Returns:
        The ``filename`` that was written.
    """
    # Layout in canvas units: every line starts at the same x, the first line
    # sits at ``top_y`` and each following line is ``line_spacing`` lower.
    left_x = 100
    top_y = 700
    line_spacing = 20

    # Sample text with entities.
    text_lines = [
        "Test Document for OCR Processing",
        "This document contains various entities for testing.",
        "John Smith works at Microsoft Corporation in Seattle.",
        "The company was founded by Bill Gates and Paul Allen.",
        "Microsoft develops software products like Windows and Office.",
        "The headquarters is located in Redmond, Washington.",
        "This document was created on October 28, 2025.",
        "It contains names of people, organizations, and locations.",
        "These should be extracted as entities by the system.",
    ]

    c = canvas.Canvas(filename, pagesize=letter)

    # Draw the text top-down on the page.
    for index, line in enumerate(text_lines):
        c.drawString(left_x, top_y - index * line_spacing, line)

    # save() writes the PDF to disk; the size lookup below depends on it.
    c.save()

    print(f"Created test PDF: {filename}")
    print(f"File size: {os.path.getsize(filename)} bytes")

    return filename
|
|
|
|
# Script entry point: generate the sample PDF only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    create_test_pdf()