MilanM committed · verified · commit 1602ff5 · 1 Parent(s): 8d8c41d

Upload 2 files

cos_stream_schema_examples.py ADDED
@@ -0,0 +1,101 @@
# Define input data schema as a proper Python list (not a string)
input_schema = [
    {
        'id': '1',
        'type': 'struct',
        'fields': [
            {
                'name': 'cos_config',
                'type': 'object',
                'nullable': False,
                'metadata': {
                    'description': 'Cloud Object Storage configuration'
                }
            },
            {
                'name': 'source_urls',
                'type': 'array',
                'nullable': False,
                'metadata': {
                    'description': 'URLs of files to download and upload to COS'
                }
            },
            {
                'name': 'prefix',
                'type': 'string',
                'nullable': True,
                'metadata': {
                    'description': 'Optional prefix to add to the file names in COS'
                }
            },
            {
                'name': 'http_method',
                'type': 'string',
                'nullable': True,
                'metadata': {
                    'description': 'HTTP method to use for downloading files'
                }
            }
        ]
    }
]

# Define output data schema as a proper Python list (not a string)
output_schema = [
    {
        'id': '1',
        'type': 'struct',
        'fields': [
            {
                'name': 'status',
                'type': 'string',
                'nullable': False,
                'metadata': {
                    'description': 'Status of the operation (success or error)'
                }
            },
            {
                'name': 'data',
                'type': 'object',
                'nullable': True,
                'metadata': {
                    'description': 'Response data containing upload details'
                }
            },
            {
                'name': 'message',
                'type': 'string',
                'nullable': True,
                'metadata': {
                    'description': 'Error message if status is error'
                }
            }
        ]
    }
]

# Define sample scoring input
sample_input = {
    'input_data': [
        {
            'fields': ['cos_config', 'source_urls', 'prefix', 'http_method'],
            'values': [
                [
                    {
                        'bucket_name': 'my-bucket',
                        'api_key': '<your-cos-api-key>',
                        'instance_id': '<your-cos-instance-id>',
                        'auth_endpoint': 'https://iam.cloud.ibm.com/identity/token',
                        'endpoint_url': 'https://s3.eu-de.cloud-object-storage.appdomain.cloud'
                    },
                    [
                        'https://example.com/sample-file.pdf',
                        'https://example.com/another-file.csv'
                    ],
                    'uploads/files',
                    'GET'
                ]
            ]
        }
    ]
}
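These schema and sample-input definitions are presumably meant to be attached when the deployable function from stream_files_to_cos.py is stored in a watsonx.ai repository. A minimal sketch of that wiring follows; it is not part of the committed files and assumes the ibm_watsonx_ai SDK, with credentials, space id, and runtime name as placeholders. Exact meta-prop and method names can differ between SDK versions, so check your installed version.

# Sketch only (assumption): store the deployable function with the schemas above attached.
from ibm_watsonx_ai import APIClient, Credentials
from stream_files_to_cos import stream_file_to_cos  # the second file in this commit

client = APIClient(Credentials(url='https://us-south.ml.cloud.ibm.com',
                               api_key='<your-watsonx-api-key>'))
client.set.default_space('<your-space-id>')

meta_props = {
    client.repository.FunctionMetaNames.NAME: 'stream_files_to_cos',
    # SOFTWARE_SPEC_ID / get_id_by_name may be named differently in older SDK releases
    client.repository.FunctionMetaNames.SOFTWARE_SPEC_ID:
        client.software_specifications.get_id_by_name('runtime-24.1-py3.11'),
    client.repository.FunctionMetaNames.INPUT_DATA_SCHEMAS: input_schema,
    client.repository.FunctionMetaNames.OUTPUT_DATA_SCHEMAS: output_schema,
    client.repository.FunctionMetaNames.SAMPLE_SCORING_INPUT: sample_input,
}

stored_fn = client.repository.store_function(stream_file_to_cos, meta_props)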
stream_files_to_cos.py ADDED
@@ -0,0 +1,300 @@
def stream_file_to_cos():
    # # Install required dependencies
    # import subprocess
    # subprocess.check_output('pip install ibm-cos-sdk requests', shell=True)
    ### ^^^ Not necessary in this case since both are part of the default Python 'runtime-24.1-py3.11' environment on watsonx.ai

    # Import dependencies
    import ibm_boto3
    import requests
    from ibm_botocore.client import Config
    import json
    import os
    import re
    from urllib.parse import unquote

    def extract_filename_from_headers(response):
        """
        Extract the actual filename from response headers.
        Checks Content-Disposition and falls back to other methods if needed.
        """
        # Try Content-Disposition header first
        content_disposition = response.headers.get('Content-Disposition')
        if content_disposition:
            # Look for filename= or filename*= parameters
            matches = re.findall(r'filename\*?=(?:([^\']*\'\')?([^;\n]*))', content_disposition)
            if matches:
                # Take the last match and handle encoded filenames
                encoding, filename = matches[-1]
                if encoding:
                    filename = unquote(filename)
                return filename.strip('"\'')

        # Try Content-Type for file extension
        content_type = response.headers.get('Content-Type', '').split(';')[0]
        extension_map = {
            # Documents
            'application/pdf': '.pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
            'text/csv': '.csv',
            'application/xml': '.xml',
            'text/xml': '.xml',
            'application/yaml': '.yaml',
            'text/yaml': '.yaml',
            'application/toml': '.toml',
            'text/plain': '.txt',

            # Archives
            'application/x-rar-compressed': '.rar',
            'application/x-7z-compressed': '.7z',
            'application/zip': '.zip',
            'application/x-tar': '.tar',
            'application/gzip': '.gz',
            'application/x-gzip': '.gz',

            # Executables
            'application/x-msdownload': '.exe',
            'application/x-apple-diskimage': '.dmg',

            # Data formats
            'application/json': '.json',
            'application/x-jsonlines': '.jsonl',
            'application/parquet': '.parquet',

            # Images
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'image/tiff': '.tiff',
            'image/gif': '.gif',

            # Code and notebooks
            'application/x-ipynb+json': '.ipynb',
            'text/x-python': '.py',
            'application/x-python-code': '.py'
        }

        # If we have a valid content type with extension mapping
        if content_type in extension_map:
            # Try to find a filename in the URL path
            url_path = response.url.split('/')[-1]
            # Remove query parameters if any
            url_path = url_path.split('?')[0]
            # If the URL path has no extension, add the appropriate one
            if '.' not in url_path:
                return f"{url_path}{extension_map[content_type]}"

        # Fallback to URL filename
        return response.url.split('/')[-1].split('?')[0]

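    # Illustrative behavior of extract_filename_from_headers (hypothetical examples,
    # not part of the original commit): a response carrying
    #   Content-Disposition: attachment; filename*=UTF-8''report%202024.pdf
    # yields 'report 2024.pdf', while a 'text/csv' response fetched from
    # https://example.com/export?id=1 falls back to 'export.csv'.
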
    def score(payload, token=None):
        """
        watsonx.ai deployable function to stream files from HTTP to Cloud Object Storage

        Expected payload format:
        {
            "input_data": [{
                "fields": ["cos_config", "source_urls", "prefix", "http_method"],
                "values": [[{
                    "bucket_name": "my-bucket",
                    "api_key": "my-api-key",
                    "instance_id": "my-instance-id",
                    "auth_endpoint": "https://iam.cloud.ibm.com/identity/token",
                    "endpoint_url": "https://s3.us-south.cloud-object-storage.appdomain.cloud"
                },
                ["https://example.com/file1.pdf", "https://example.com/file2.csv"],
                "my/prefix",
                "GET"]]
            }]
        }
        """
        try:
            # Extract input parameters from payload
            input_data = payload.get("input_data")[0]
            fields = input_data.get("fields")
            values = input_data.get("values")[0]

            # Map fields to values
            params = dict(zip(fields, values))

            # Extract COS configuration
            cos_config = params.get('cos_config', {})

            # Verify all required config values are present (and not empty)
            required_keys = ['bucket_name', 'api_key', 'instance_id', 'auth_endpoint', 'endpoint_url']
            missing_configs = [k for k in required_keys if not cos_config.get(k)]
            if missing_configs:
                return {
                    'predictions': [{
                        'fields': ['status', 'message'],
                        'values': [['error', f"Missing required configuration: {', '.join(missing_configs)}"]]
                    }]
                }

            # Get function parameters
            source_urls = params.get('source_urls', [])
            if not source_urls:
                return {
                    'predictions': [{
                        'fields': ['status', 'message'],
                        'values': [['error', "Missing required parameter: source_urls"]]
                    }]
                }

            # Convert single URL to list if necessary
            if isinstance(source_urls, str):
                source_urls = [source_urls]

            prefix = params.get('prefix', '')
            http_method = params.get('http_method', 'GET')

            # Initialize COS client
            cos_client = ibm_boto3.client(
                "s3",
                ibm_api_key_id=cos_config['api_key'],
                ibm_service_instance_id=cos_config['instance_id'],
                ibm_auth_endpoint=cos_config['auth_endpoint'],
                config=Config(signature_version="oauth"),
                endpoint_url=cos_config['endpoint_url']
            )

            # Normalize prefix
            if prefix:
                prefix = prefix.strip('/')
            if prefix:
                prefix = f"{prefix}/"

            # Track results for each URL
            results = []
            errors = []

            for source_url in source_urls:
                try:
                    # Setup download stream
                    session = requests.Session()
                    response = session.request(http_method, source_url, stream=True)
                    response.raise_for_status()

                    # Extract actual filename from response
                    filename = extract_filename_from_headers(response)

                    # Combine prefix with filename for the full COS key
                    target_key = f"{prefix}{filename}" if prefix else filename

                    # Upload file to COS
                    conf = ibm_boto3.s3.transfer.TransferConfig(
                        multipart_threshold=1024**2,  # 1MB
                        max_concurrency=100
                    )

                    cos_client.upload_fileobj(
                        response.raw,
                        cos_config['bucket_name'],
                        target_key,
                        Config=conf
                    )

                    results.append({
                        "source_url": source_url,
                        "bucket": cos_config['bucket_name'],
                        "key": target_key,
                        "filename": filename,
                        "status": "success"
                    })

                except Exception as e:
                    errors.append({
                        "source_url": source_url,
                        "error": str(e)
                    })

            # Prepare response in watsonx.ai format
            response_data = {
                "successful_uploads": results,
                "failed_uploads": errors,
                "total_processed": len(source_urls),
                "successful_count": len(results),
                "failed_count": len(errors)
            }

            return {
                'predictions': [{
                    'fields': ['status', 'data'],
                    'values': [['success' if results else 'error', response_data]]
                }]
            }

        except Exception as e:
            return {
                'predictions': [{
                    'fields': ['status', 'message'],
                    'values': [['error', f"Error processing request: {str(e)}"]]
                }]
            }

    return score

# For testing in notebook
score = stream_file_to_cos()

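# Local smoke test (a commented sketch, not part of the original commit; it assumes the
# placeholders in cos_stream_schema_examples.py have been replaced with real COS credentials):
# from cos_stream_schema_examples import sample_input
# print(score(sample_input))
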

# ------------------------------------------------------------------------------------------------------------
### Example Usage:
# import requests
# import json

# wx_api_key = ""
# wx_region = "us-south"  ### watsonx.ai region
# serving_name = ""       ### Serving name or id of your deployment


# ## Retrieve a bearer token
# token_response = requests.post('https://iam.cloud.ibm.com/identity/token',
#     data={
#         "apikey": wx_api_key,
#         "grant_type": 'urn:ibm:params:oauth:grant-type:apikey'
#     }
# )
# bearer_tk = token_response.json()["access_token"]


# # Example run of the function
# scoring_inputs = {
#     "input_data": [
#         {
#             "fields": [
#                 "cos_config",
#                 "source_urls",
#                 "prefix",
#                 "http_method"],
#             "values": [
#                 [
#                     {
#                         "api_key": "<insert_api_key>",
#                         "auth_endpoint": "https://iam.cloud.ibm.com/identity/token",
#                         "bucket_name": "<target_bucket_name>",
#                         "endpoint_url": "https://s3.eu-de.cloud-object-storage.appdomain.cloud",  ### preset for the Frankfurt region here
#                         "instance_id": "<resource_instance_id starts with crn:...>"
#                     },
#                     [
#                         "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/8145e2c0-83f8-4367-87d7-6778a7bc2e5f/file_downloaded",  ### Example data links
#                         "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/136853fb-52b3-457f-94cf-c79821ed5145/file_downloaded",
#                         "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/8be42620-b4c2-4535-b9ce-e9b62190202f/file_downloaded",
#                         "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/f88087d7-4d29-444a-b9ec-e203c41ec52b/file_downloaded"
#                     ],
#                     "cos_stream_test_run_batch",  ### Folder path (prefix) to save to
#                     "GET"
#                 ]
#             ]
#         }
#     ]
# }

# function_run = requests.post(
#     url=f'https://{wx_region}.ml.cloud.ibm.com/ml/v4/deployments/{serving_name}/predictions?version=2021-05-01',
#     json=scoring_inputs,
#     headers={'Authorization': 'Bearer ' + bearer_tk}
# )
# print(function_run.json())
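# The deployment's JSON reply mirrors the return value of score(); an illustrative shape
# (values below are hypothetical, not actual output):
# {
#     "predictions": [{
#         "fields": ["status", "data"],
#         "values": [["success", {"successful_uploads": [...], "failed_uploads": [],
#                                 "total_processed": 4, "successful_count": 4, "failed_count": 0}]]
#     }]
# }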