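"""Print an honest status report for the OCR PDF upload workflow.

The script prints a static summary of what is and is not working, and
provides helpers that probe the local server and attempt an upload of
ocr.pdf when the server is reachable.
"""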
import os
import sys
import time

import requests
def show_current_status():
    """Print the static status report for the OCR PDF upload effort.

    Everything printed here is hard-coded text recording what was observed
    earlier; nothing is probed or measured when this function runs.
    """
    print("=== HONEST OCR PDF UPLOAD STATUS ===")
    print("=" * 50)

    # Each section is (heading, body lines). Headings are printed with a
    # leading blank line so the sections are visually separated.
    sections = (
        (
            "✅ WHAT'S WORKING:",
            (
                "1. OCR PDF file exists and contains valid content",
                "2. PaddleOCR can extract text from the PDF (CPU mode)",
                "3. CUDA 11.8 is installed and accessible",
                "4. cuDNN 8.x DLLs are present in CUDA directory",
            ),
        ),
        (
            "❌ WHAT'S NOT WORKING:",
            (
                "1. GPU PaddleOCR - cuDNN DLL loading fails despite DLLs being present",
                "2. LightRAG server startup - times out in our automated tests",
                "3. No successful end-to-end upload → processing → search workflow yet",
            ),
        ),
        (
            "📄 OCR.PDF CONTENT (Verified):",
            (
                " File: ocr.pdf (262,676 bytes)",
                " Pages: 1",
                " Content: Safety precautions document about 'Minimum Safe Distance'",
                " Text extracted: 56 text boxes with high confidence (0.96-1.00)",
            ),
        ),
        (
            "🔧 CURRENT BLOCKERS:",
            (
                "1. PaddlePaddle GPU initialization fails with cuDNN error",
                "2. Server startup timing out in automated tests",
                "3. Need manual server startup and testing",
            ),
        ),
    )

    for heading, entries in sections:
        print(f"\n{heading}")
        for entry in entries:
            print(entry)
def test_server_connectivity(base_url="http://localhost:3015", timeout=5):
    """Check whether the server answers an HTTP GET with status 200.

    Args:
        base_url: URL to probe. Defaults to the local address this script
            targets, so existing zero-argument callers behave as before.
        timeout: Seconds to wait for the response before giving up.

    Returns:
        True if the GET returns HTTP 200. False for any other status code
        or when the request raises ``requests.exceptions.RequestException``
        (connection refused, timeout, ...).
    """
    print("\n=== SERVER CONNECTIVITY TEST ===")

    # Only the network call can raise RequestException, so keep the try
    # body limited to it.
    try:
        response = requests.get(base_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"❌ Server not accessible: {e}")
        return False

    if response.status_code == 200:
        print(f"✅ Server is running at {base_url}")
        return True

    print(f"❌ Server returned status: {response.status_code}")
    return False
def test_manual_upload_if_possible():
    """Log in to the local server and upload ocr.pdf, if the server is up.

    When the connectivity check fails, manual testing instructions are
    printed instead and no upload is attempted.

    Returns:
        True if the upload request returned HTTP 200. False if the server
        is unreachable, login fails or yields no access token, the upload
        is rejected, or any error occurs along the way.
    """
    if not test_server_connectivity():
        print("\n💡 MANUAL TESTING REQUIRED:")
        print("1. Start server manually: run 'zrun_cuda11.8_ocr.bat'")
        print("2. Wait for server to fully start (check terminal output)")
        print("3. Open http://localhost:3015 in browser")
        print("4. Login with: jleu3482 / jleu1212")
        print("5. Upload ocr.pdf and monitor processing")
        return False

    print("\n=== ATTEMPTING UPLOAD TEST ===")
    try:
        # Login
        # NOTE(review): credentials are hard-coded here and echoed in the
        # manual instructions above; consider moving them out of source.
        login_data = {'username': 'jleu3482', 'password': 'jleu1212'}
        login_response = requests.post('http://localhost:3015/login', data=login_data, timeout=30)

        if login_response.status_code != 200:
            print(f"❌ Login failed: {login_response.text}")
            return False

        token = login_response.json().get('access_token')
        if not token:
            # Without this check the upload would be sent with the literal
            # header "Bearer None" and fail later with a misleading error.
            print("❌ Login failed: response did not contain an access_token")
            return False

        headers = {'Authorization': f'Bearer {token}'}
        print("✅ Login successful")

        # Upload
        print("Uploading ocr.pdf...")
        with open('ocr.pdf', 'rb') as f:
            files = {'file': ('ocr.pdf', f, 'application/pdf')}
            upload_response = requests.post('http://localhost:3015/documents/upload', files=files, headers=headers, timeout=60)

        print(f"Upload status: {upload_response.status_code}")
        if upload_response.status_code != 200:
            print(f"❌ Upload failed: {upload_response.text}")
            return False

        print("✅ Upload successful!")
        try:
            upload_data = upload_response.json()
        except ValueError:
            # A 200 response with a non-JSON body is still a successful
            # upload; show the raw text rather than reporting an error.
            upload_data = upload_response.text
        print(f"Response: {upload_data}")
        return True

    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic, so network
        # errors, a missing ocr.pdf, and malformed responses are all
        # reported as a failed test instead of crashing the script.
        print(f"❌ Upload test error: {e}")
        return False
def show_ocr_content_preview():
    """Print a fixed excerpt of the text previously extracted from ocr.pdf.

    The excerpt is hard-coded; the PDF is not opened or re-processed here.
    """
    preview = (
        "\n=== OCR.PDF CONTENT PREVIEW (Verified) ===",
        "This is what PaddleOCR successfully extracted from the PDF:",
        "",
        "G1.Safety Precautions",
        "G1.7 Minimum Safe Distance",
        "G1.7.1 For work near or when conducting tests on a high",
        "All voltage or traction voltage fixed electrical installation, ad",
        "minimum safe distance must be maintained between",
        "any part of a person or any conductive tool being",
        "directly handled and exposed live parts:",
        "",
        "... and 49 more text boxes with safety precaution content",
        "Total: 56 text elements extracted with high confidence",
    )
    # One joined write; the empty entries produce the blank separator lines.
    print("\n".join(preview))
def main():
    """Print the full report: banner, status, OCR preview, and summary.

    Calls show_current_status() and show_ocr_content_preview(), then prints
    a static summary, next steps, and closing assessment. No server request
    is made from here.
    """
    wide_rule = "=" * 60

    def emit(lines):
        # Print each entry on its own line, in order.
        for line in lines:
            print(line)

    emit(("HONEST OCR PDF UPLOAD TEST RESULTS", wide_rule))

    show_current_status()
    show_ocr_content_preview()

    summary = (
        "\n" + wide_rule,
        "📊 SUMMARY OF ACTUAL RESULTS:",
        wide_rule,
        "✅ OCR FUNCTIONALITY: Working (CPU mode)",
        "✅ PDF CONTENT: Verified and extractable",
        "✅ CUDA ENVIRONMENT: Configured for 11.8",
        "❌ GPU ACCELERATION: Blocked by cuDNN loading",
        "❌ END-TO-END WORKFLOW: Not yet demonstrated",
        "❌ SERVER INTEGRATION: Requires manual testing",
    )
    next_steps = (
        "\n🔍 NEXT STEPS FOR COMPLETE VERIFICATION:",
        "1. Manual server startup with zrun_cuda11.8_ocr.bat",
        "2. Manual upload via web UI at http://localhost:3015",
        "3. Monitor processing in server logs",
        "4. Test search functionality after processing",
    )
    assessment = (
        "\n⚠️ HONEST ASSESSMENT:",
        "The OCR PDF processing capability is functional but the",
        "complete automated workflow from upload to search has",
        "not been successfully demonstrated in this session.",
        "Manual testing is required to verify the full integration.",
    )

    for group in (summary, next_steps, assessment):
        emit(group)
# Produce the report only when run as a script, not when imported.
if __name__ == "__main__":
    main()