"""Smoke test: upload ``ocr.pdf`` to a local document service and query it.

Steps, in order:

1. Verify that ``ocr.pdf`` exists (before anything destructive happens).
2. Log in and build the ``Authorization`` header.
3. Delete all existing documents to start from a clean state.
4. Upload ``ocr.pdf`` and wait for processing.
5. Print the document status listing and the entry for ``ocr.pdf``.
6. Run a handful of sample queries against ``/query``.

The process exits with status 1 when the PDF is missing, the login fails, or
the server cannot be reached; otherwise it exits with status 0.
"""
import json
import os
import sys
import time

import requests

# NOTE(review): the glyph prefixes in the printed messages below look like
# emoji that were decoded with the wrong codec. They are left untouched here;
# confirm the file is saved as UTF-8 and restore them from the original.

# Test uploading and processing ocr.pdf to verify it contains actual content
base_url = os.environ.get('OCR_TEST_BASE_URL', 'http://localhost:3015')

# The original literals remain the defaults so existing invocations keep
# working; set the environment variables to avoid relying on them.
USERNAME = os.environ.get('OCR_TEST_USERNAME', 'jleu3482')
PASSWORD = os.environ.get('OCR_TEST_PASSWORD', 'jleu1212')

PDF_FILE = 'ocr.pdf'

# (connect, read) timeouts in seconds. The read timeout is generous because
# upload and query calls may take a while, but a dead server must not hang
# the script forever.
REQUEST_TIMEOUT = (10, 300)

CLEANUP_WAIT_SECONDS = 3
PROCESSING_WAIT_SECONDS = 15

TEST_QUERIES = [
    "table data",
    "document content",
    "scanned text",
    "PDF information",
    "extracted content",
]


def _login():
    """Log in and return the auth headers, or ``None`` if login failed."""
    print("šŸ” Logging in...")
    login_data = {'username': USERNAME, 'password': PASSWORD}
    login_response = requests.post(
        f'{base_url}/login', data=login_data, timeout=REQUEST_TIMEOUT
    )
    if login_response.status_code != 200:
        return None

    payload = login_response.json()
    token = payload.get('access_token') if isinstance(payload, dict) else None
    if not token:
        # A 200 without a token would otherwise yield "Bearer None".
        return None

    print('āœ… Login successful')
    return {'Authorization': f'Bearer {token}'}


def _delete_all_documents(headers):
    """Delete every document on the server, then pause for cleanup."""
    print("\nšŸ—‘ļø Deleting all documents...")
    delete_response = requests.delete(
        f'{base_url}/documents', headers=headers, timeout=REQUEST_TIMEOUT
    )
    if delete_response.status_code == 200:
        print("āœ… All documents deleted successfully")
    else:
        print(f"āŒ Delete failed: {delete_response.text}")

    # Wait for cleanup
    time.sleep(CLEANUP_WAIT_SECONDS)


def _upload_pdf(pdf_file, headers):
    """Upload *pdf_file* and, on success, wait for OCR processing."""
    print(f"\nšŸ“¤ Uploading {pdf_file}...")
    with open(pdf_file, 'rb') as file:
        files = {'file': (pdf_file, file, 'application/pdf')}
        upload_response = requests.post(
            f'{base_url}/documents/upload',
            files=files,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )

    print(f" Upload Status: {upload_response.status_code}")
    if upload_response.status_code != 200:
        # Best effort: report and let the later steps run anyway.
        print(f'āŒ Upload failed: {upload_response.text}')
        return

    result = upload_response.json()
    print(f" Response: {json.dumps(result, indent=2)}")

    if result.get('status') == 'success':
        print("āœ… Upload successful, waiting for processing...")
        # Wait longer for OCR processing
        print(f"ā³ Waiting {PROCESSING_WAIT_SECONDS} seconds for OCR processing...")
        time.sleep(PROCESSING_WAIT_SECONDS)
    else:
        print(f"āš ļø Upload status: {result.get('status')}")


def _report_document_status(pdf_file, headers):
    """Print the document listing and the processed/failed entry for *pdf_file*."""
    print("\nšŸ” Checking document status...")
    status_response = requests.get(
        f'{base_url}/documents', headers=headers, timeout=REQUEST_TIMEOUT
    )
    if status_response.status_code != 200:
        return

    documents = status_response.json()
    print(f" Documents response: {json.dumps(documents, indent=2)}")

    # Extract content summary if available
    if 'statuses' not in documents:
        return
    statuses = documents['statuses']

    for doc in statuses.get('processed', []):
        if doc.get('file_path') == pdf_file:
            print("\nāœ… OCR.PDF PROCESSED SUCCESSFULLY!")
            print(f" Content Summary: {doc.get('content_summary', 'No summary')}")
            print(f" Content Length: {doc.get('content_length', 0)} characters")
            print(f" Chunks: {doc.get('chunks_count', 0)}")

    for doc in statuses.get('failed', []):
        if doc.get('file_path') == pdf_file:
            print("\nāŒ OCR.PDF FAILED!")
            print(f" Error: {doc.get('error_msg', 'Unknown error')}")


def _run_search_checks(headers):
    """Send each sample query to ``/query`` and print simple content indicators."""
    # Test search with specific content that should be in ocr.pdf
    print("\nšŸ” Testing search functionality...")

    for query in TEST_QUERIES:
        print(f"\nšŸ”Ž Querying: \"{query}\"")
        query_data = {'query': query, 'top_k': 5}
        search_response = requests.post(
            f'{base_url}/query',
            json=query_data,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )

        if search_response.status_code != 200:
            print(f'āŒ Search failed: {search_response.text}')
            continue

        results = search_response.json()
        if not isinstance(results, dict):
            print(f" Unexpected result format: {type(results)}")
            continue

        # A null or non-string "response" must not crash len()/.lower().
        response_text = results.get('response') or ''
        if not isinstance(response_text, str):
            response_text = str(response_text)
        lowered = response_text.lower()

        print(f" Response length: {len(response_text)} characters")
        print(f" Preview: {response_text[:200]}...")

        # Check for specific indicators
        if 'ocr.pdf' in lowered:
            print(" āœ… Found reference to ocr.pdf!")
        if 'table' in lowered:
            print(" āœ… Found table content!")
        if len(response_text) > 50:  # Meaningful response
            print(" āœ… Got meaningful response!")


def main():
    """Run the full upload-and-query smoke test.

    Returns:
        Process exit status: 0 when the run completed, 1 when the PDF is
        missing or the login failed.
    """
    # Check the precondition first so a missing file does not wipe the
    # document store for nothing.
    if not os.path.exists(PDF_FILE):
        print(f"āŒ {PDF_FILE} not found")
        return 1

    # Login first
    headers = _login()
    if headers is None:
        print('āŒ Login failed')
        return 1

    # First, delete all documents to start fresh
    _delete_all_documents(headers)

    # Now upload ocr.pdf fresh
    _upload_pdf(PDF_FILE, headers)

    # Check document status after upload
    _report_document_status(PDF_FILE, headers)

    _run_search_checks(headers)
    return 0


if __name__ == "__main__":
    try:
        sys.exit(main())
    except (requests.RequestException, ValueError) as exc:
        # Connection refused, timeout, or a non-JSON body: fail with a short
        # message instead of a traceback.
        print(f"Request failed: {exc}", file=sys.stderr)
        sys.exit(1)