# railseek6/analyze_ocr_pdf_table.py

import PyPDF2
import os
import fitz  # PyMuPDF
from PIL import Image
import io

def analyze_pdf_structure(pdf_path):
    """Print a diagnostic report about the structure of a PDF file.

    The report is written to stdout and consists of three parts:

    1. Page count and per-page extracted text, using PyPDF2.
    2. Page count, per-page extracted text, a rough "table-like" line
       heuristic and the number of embedded images, using PyMuPDF.
    3. The file size in bytes, with a warning for very small or very
       large files.

    Failures inside the PyPDF2 or PyMuPDF parts are printed and do not stop
    the remaining parts from running.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        None. If ``pdf_path`` does not exist, a "not found" message is
        printed and nothing else is done.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ {pdf_path} not found")
        return

    print(f"📊 Analyzing {pdf_path} structure...")

    _report_pypdf2_text(pdf_path)
    _report_pymupdf_text(pdf_path)
    _report_file_size(pdf_path)


def _report_pypdf2_text(pdf_path):
    """Print the page count and per-page text summary obtained via PyPDF2."""
    preview_chars = 200

    # Best-effort diagnostic: any failure is reported and analysis continues.
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            print(f"  Pages: {len(reader.pages)}")

            for page_number, page in enumerate(reader.pages, start=1):
                # Coalesce a falsy result so len()/strip() below cannot fail.
                text = page.extract_text() or ""
                print(f"  Page {page_number} - PyPDF2 text length: {len(text)}")
                if text.strip():
                    print(f"    Text preview: {repr(text[:preview_chars])}")
                else:
                    print("    No text found with PyPDF2")
    except Exception as e:
        print(f"  PyPDF2 error: {e}")


def _report_pymupdf_text(pdf_path):
    """Print per-page text, table-like lines and image counts via PyMuPDF."""
    preview_chars = 200
    min_tokens_exclusive = 3  # a line needs more tokens than this to count
    min_lines_exclusive = 2   # more such lines than this suggests a table
    sample_lines = 3

    # Best-effort diagnostic: any failure is reported and analysis continues.
    try:
        doc = fitz.open(pdf_path)
        try:
            print("  PyMuPDF analysis:")
            print(f"  - Page count: {len(doc)}")

            for page_number, page in enumerate(doc, start=1):
                text = page.get_text()
                print(f"  Page {page_number} - PyMuPDF text length: {len(text)}")

                if text.strip():
                    print(f"    Text preview: {repr(text[:preview_chars])}")

                    # Heuristic: a line with several whitespace-separated
                    # tokens is treated as a possible table row.
                    table_like_lines = [
                        line
                        for line in text.split('\n')
                        if len(line.split()) > min_tokens_exclusive
                    ]
                    if len(table_like_lines) > min_lines_exclusive:
                        print(f"    Potential table detected with {len(table_like_lines)} table-like lines")
                        print("    Sample table lines:")
                        for i, line in enumerate(table_like_lines[:sample_lines], start=1):
                            print(f"      {i}: {line}")
                else:
                    print("    No text found with PyMuPDF")

                # Embedded images on a page hint at a scanned document.
                image_list = page.get_images()
                if image_list:
                    print(f"    Images found: {len(image_list)}")
        finally:
            # Close the document even when page processing raises, so the
            # underlying file handle is not leaked.
            doc.close()
    except Exception as e:
        print(f"  PyMuPDF error: {e}")


def _report_file_size(pdf_path):
    """Print the file size and warn when it is unusually small or large."""
    small_file_bytes = 1000
    large_file_bytes = 10000000

    file_size = os.path.getsize(pdf_path)
    print(f"  File size: {file_size} bytes")

    if file_size < small_file_bytes:
        print("  ⚠️  Very small file - might be corrupted")
    elif file_size > large_file_bytes:
        print("  ⚠️  Very large file - might be complex")

if __name__ == "__main__":
    import sys

    # Analyze the path given as the first command-line argument; fall back to
    # the previous hard-coded default of "ocr.pdf" when none is supplied.
    analyze_pdf_structure(sys.argv[1] if len(sys.argv) > 1 else "ocr.pdf")