MilanM committed on
Commit
696e220
·
verified ·
1 Parent(s): 65ce127

Update stream_files_to_cos.py

Browse files
Files changed (1) hide show
  1. stream_files_to_cos.py +72 -143
stream_files_to_cos.py CHANGED
@@ -1,9 +1,4 @@
1
  def stream_file_to_cos():
2
- # # Install required dependencies
3
- # import subprocess
4
- # subprocess.check_output('pip install ibm-cos-sdk requests', shell=True)
5
- ### ^^^ Not necessary in this case since both are part of the default python 'runtime-24.1-py3.11' environment on watsonx.ai
6
-
7
  # Import dependencies
8
  import ibm_boto3
9
  import requests
@@ -17,7 +12,27 @@ def stream_file_to_cos():
17
  """
18
  Extract the actual filename from response headers.
19
  Checks Content-Disposition and falls back to other methods if needed.
 
20
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Try Content-Disposition header first
22
  content_disposition = response.headers.get('Content-Disposition')
23
  if content_disposition:
@@ -30,99 +45,75 @@ def stream_file_to_cos():
30
  filename = unquote(filename)
31
  return filename.strip('"\'')
32
 
 
 
 
33
  # Try Content-Type for file extension
34
  content_type = response.headers.get('Content-Type', '').split(';')[0]
35
- extension_map = {
36
- # Documents
37
- 'application/pdf': '.pdf',
38
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
39
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
40
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
41
- 'text/csv': '.csv',
42
- 'application/xml': '.xml',
43
- 'text/xml': '.xml',
44
- 'application/yaml': '.yaml',
45
- 'text/yaml': '.yaml',
46
- 'application/toml': '.toml',
47
- 'text/plain': '.txt',
48
-
49
- # Archives
50
- 'application/x-rar-compressed': '.rar',
51
- 'application/x-7z-compressed': '.7z',
52
- 'application/zip': '.zip',
53
- 'application/x-tar': '.tar',
54
- 'application/gzip': '.gz',
55
- 'application/x-gzip': '.gz',
56
-
57
- # Executables
58
- 'application/x-msdownload': '.exe',
59
- 'application/x-apple-diskimage': '.dmg',
60
-
61
- # Data formats
62
- 'application/json': '.json',
63
- 'application/x-jsonlines': '.jsonl',
64
- 'application/parquet': '.parquet',
65
-
66
- # Images
67
- 'image/jpeg': '.jpg',
68
- 'image/png': '.png',
69
- 'image/tiff': '.tiff',
70
- 'image/gif': '.gif',
71
-
72
- # Code and notebooks
73
- 'application/x-ipynb+json': '.ipynb',
74
- 'text/x-python': '.py',
75
- 'application/x-python-code': '.py'
76
- }
77
-
78
- # If we have a valid content type with extension mapping
79
- if content_type in extension_map:
80
- # Try to find a filename in the URL path
81
- url_path = response.url.split('/')[-1]
82
- # Remove query parameters if any
83
- url_path = url_path.split('?')[0]
84
- # If the URL path has no extension, add the appropriate one
85
- if '.' not in url_path:
86
- return f"{url_path}{extension_map[content_type]}"
87
 
88
  # Fallback to URL filename
89
- return response.url.split('/')[-1].split('?')[0]
90
 
91
- def score(payload, token=None):
92
  """
93
  WatsonX.ai deployable function to stream files from HTTP to Cloud Object Storage
94
 
95
- Expected payload format:
 
96
  {
97
- "input_data": [{
98
- "fields": ["cos_config", "source_urls", "prefix", "http_method"],
99
- "values": [[{
100
- "bucket_name": "my-bucket",
101
- "api_key": "my-api-key",
102
- "instance_id": "my-instance-id",
103
- "auth_endpoint": "https://iam.cloud.ibm.com/identity/token",
104
- "endpoint_url": "https://s3.us-south.cloud-object-storage.appdomain.cloud"
105
- },
106
- ["https://example.com/file1.pdf", "https://example.com/file2.csv"],
107
- "my/prefix",
108
- "GET"]]
109
- }]
110
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  """
112
  try:
113
- # Extract input parameters from payload
114
- input_data = payload.get("input_data")[0]
115
- fields = input_data.get("fields")
116
- values = input_data.get("values")[0]
117
-
118
- # Map fields to values
119
  params = dict(zip(fields, values))
120
 
121
  # Extract COS configuration
122
  cos_config = params.get('cos_config', {})
123
 
124
  # Verify all required config values are present
125
- missing_configs = [k for k, v in cos_config.items() if not v]
 
126
  if missing_configs:
127
  return {
128
  'predictions': [{
@@ -234,67 +225,5 @@ def stream_file_to_cos():
234
 
235
  return score
236
 
237
- # For testing in notebook
238
- score = stream_file_to_cos()
239
-
240
-
241
- # ------------------------------------------------------------------------------------------------------------
242
- ### Example Usage:
243
- # try:
244
- # import requests
245
- # import json
246
-
247
- # wx_api_key = ""
248
- # wx_region = "us-south" ### watsonx.ai region
249
- # serving_name = "" ### Serving name or id of your deployment
250
-
251
-
252
- # ## Retrieve a bearer token
253
- # token_response = requests.post('https://iam.cloud.ibm.com/identity/token',
254
- # data={
255
- # "apikey": wx_api_key,
256
- # "grant_type": 'urn:ibm:params:oauth:grant-type:apikey'
257
- # }
258
- # )
259
- # bearer_tk = token_response.json()["access_token"]
260
-
261
-
262
- # # Example run of function
263
- # scoring_inputs = {
264
- # "input_data": [
265
- # {
266
- # "fields": [
267
- # "cos_config",
268
- # "source_urls",
269
- # "prefix",
270
- # "http_method"],
271
- # "values": [
272
- # [
273
- # {
274
- # "api_key": "<insert_api_key>",
275
- # "auth_endpoint": "https://iam.cloud.ibm.com/identity/token",
276
- # "bucket_name": "<target_bucket_name>",
277
- # "endpoint_url": "https://s3.eu-de.cloud-object-storage.appdomain.cloud", ### preset for Frankfurt Regional Here
278
- # "instance_id": "<resource_instance_id starts with crn:...>"
279
- # },
280
- # [
281
- # "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/8145e2c0-83f8-4367-87d7-6778a7bc2e5f/file_downloaded", ### Example Data Links
282
- # "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/136853fb-52b3-457f-94cf-c79821ed5145/file_downloaded",
283
- # "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/8be42620-b4c2-4535-b9ce-e9b62190202f/file_downloaded",
284
- # "https://data.mendeley.com/public-files/datasets/27c8pwsd6v/files/f88087d7-4d29-444a-b9ec-e203c41ec52b/file_downloaded"
285
- # ],
286
- # "cos_stream_test_run_batch", ### "Folder path to save to"
287
- # "GET"
288
- # ]
289
- # ]
290
- # }
291
- # ]
292
- # }
293
 
294
- # function_run = requests.post(
295
- # url = f'https://{wx_region}.ml.cloud.ibm.com/ml/v4/deployments/{serving_name}/predictions?version=2021-05-01',
296
- # json = scoring_inputs,
297
- # headers = {'Authorization': 'Bearer ' + bearer_tk}
298
- # )
299
- # finally:
300
- # print(function_run.json())
 
1
  def stream_file_to_cos():
 
 
 
 
 
2
  # Import dependencies
3
  import ibm_boto3
4
  import requests
 
12
  """
13
  Extract the actual filename from response headers.
14
  Checks Content-Disposition and falls back to other methods if needed.
15
+ Uses mimetypes library for extension mapping.
16
  """
17
+ import mimetypes
18
+
19
+ # Ensure mimetypes database is initialized with common types
20
+ mimetypes.init()
21
+
22
+ # Add any missing but common MIME types that might not be in the default database
23
+ if not mimetypes.guess_extension('application/x-jsonlines'):
24
+ mimetypes.add_type('application/x-jsonlines', '.jsonl')
25
+ if not mimetypes.guess_extension('application/parquet'):
26
+ mimetypes.add_type('application/parquet', '.parquet')
27
+ if not mimetypes.guess_extension('application/x-ipynb+json'):
28
+ mimetypes.add_type('application/x-ipynb+json', '.ipynb')
29
+ if not mimetypes.guess_extension('application/yaml'):
30
+ mimetypes.add_type('application/yaml', '.yaml')
31
+ if not mimetypes.guess_extension('text/yaml'):
32
+ mimetypes.add_type('text/yaml', '.yaml')
33
+ if not mimetypes.guess_extension('application/toml'):
34
+ mimetypes.add_type('application/toml', '.toml')
35
+
36
  # Try Content-Disposition header first
37
  content_disposition = response.headers.get('Content-Disposition')
38
  if content_disposition:
 
45
  filename = unquote(filename)
46
  return filename.strip('"\'')
47
 
48
+ # Get the URL path as fallback filename
49
+ url_path = response.url.split('/')[-1].split('?')[0]
50
+
51
  # Try Content-Type for file extension
52
  content_type = response.headers.get('Content-Type', '').split(';')[0]
53
+ if content_type and '.' not in url_path:
54
+ # Get extension from mimetype
55
+ extension = mimetypes.guess_extension(content_type)
56
+ if extension:
57
+ return f"{url_path}{extension}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # Fallback to URL filename
60
+ return url_path
61
 
62
+ def score(payload): ### or def score(payload, token=None) if you want to add authentication
63
  """
64
  WatsonX.ai deployable function to stream files from HTTP to Cloud Object Storage
65
 
66
+ Expected simplified format:
67
+ [
68
  {
69
+ "cos_config": {
70
+ "bucket_name": "my-bucket",
71
+ "api_key": "my-api-key",
72
+ "instance_id": "my-instance-id",
73
+ "auth_endpoint": "https://iam.cloud.ibm.com/identity/token",
74
+ "endpoint_url": "https://s3.us-south.cloud-object-storage.appdomain.cloud"
75
+ },
76
+ "source_urls": ["https://example.com/file1.pdf", "https://example.com/file2.csv"],
77
+ "prefix": "my/prefix",
78
+ "http_method": "GET"
 
 
 
79
  }
80
+ ]
81
+
82
+ Which you can run through this kind of helper function:
83
+ ### --- --- ---
84
+
85
+ def reformat_for_wxai_scoring(input_data):
86
+ '''Converts input data to WatsonX.ai scoring payload format.'''
87
+ # Convert single dict to list
88
+ inputs = [input_data] if isinstance(input_data, dict) else input_data
89
+
90
+ if not inputs:
91
+ return {"input_data": [{"fields": [], "values": [[]]}]}
92
+
93
+ # Extract fields from first object
94
+ fields = list(inputs[0].keys())
95
+
96
+ # Build values array
97
+ values = [[obj.get(field, None) for field in fields] for obj in inputs]
98
+
99
+ return {"input_data": [{"fields": fields, "values": values}]}
100
+
101
+ ### --- --- ---
102
  """
103
  try:
104
+ # Extract the actual payload from input_data format
105
+ fields = payload["input_data"][0]["fields"]
106
+ values = payload["input_data"][0]["values"][0]
107
+
108
+ # Create a dictionary from fields and values
 
109
  params = dict(zip(fields, values))
110
 
111
  # Extract COS configuration
112
  cos_config = params.get('cos_config', {})
113
 
114
  # Verify all required config values are present
115
+ required_configs = ['bucket_name', 'api_key', 'instance_id', 'auth_endpoint', 'endpoint_url']
116
+ missing_configs = [k for k in required_configs if k not in cos_config or not cos_config[k]]
117
  if missing_configs:
118
  return {
119
  'predictions': [{
 
225
 
226
  return score
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
+ score = stream_file_to_cos()