from typing import Dict, List
import re

# Provider names recognised either as the first '/'-separated segment of an
# entry or as a substring of the model segment. Order matters: when the
# provider has to be inferred from the model segment, the first match wins.
_KNOWN_PROVIDERS = (
    'azure', 'bedrock', 'anthropic', 'openai', 'cohere', 'google',
    'mistral', 'meta', 'amazon', 'ai21', 'anyscale', 'stability',
    'cloudflare', 'databricks', 'cerebras', 'assemblyai',
)

# Providers whose second path segment is interpreted as a deployment region.
_REGION_PROVIDERS = ('bedrock', 'azure')

# Substrings (matched against the lower-cased full entry) marking image models.
# NOTE(review): these are plain substring tests, so 'x-' also matches names
# such as 'pplx-7b'. Kept as-is to preserve existing classification.
_IMAGE_INDICATORS = ('dall-e', 'stable-diffusion', 'image', 'canvas', 'x-', 'steps')

# Substrings (matched against the lower-cased full entry) marking audio models.
_AUDIO_INDICATORS = ('whisper', 'tts', 'audio', 'voice')

# Trailing "v<number>" version such as "-v2", ".v1.5" or "-v1:0"; a trailing
# ":<digits>" suffix is consumed but is not part of the captured version.
_VERSION_RE = re.compile(r'[:.-]v(\d+(?:\.\d+)*(?:-\d+)?|\d+)(?::\d+)?$')

# Trailing date-style version such as "-2024-08-06".
_DATE_RE = re.compile(r'-(\d{4}-\d{2}-\d{2})$')


def _detect_model_type(lowered_entry: str) -> str:
    """Return 'image', 'audio' or 'text' based on indicator substrings."""
    # Image indicators take precedence over audio indicators.
    if any(indicator in lowered_entry for indicator in _IMAGE_INDICATORS):
        return 'image'
    if any(indicator in lowered_entry for indicator in _AUDIO_INDICATORS):
        return 'audio'
    return 'text'


def _split_version(name: str):
    """Split a trailing version off *name*; return (base_name, version).

    A "v<number>" suffix is preferred; a date suffix is only tried when no
    "v<number>" suffix is present. The version is '' when neither matches.
    """
    match = _VERSION_RE.search(name) or _DATE_RE.search(name)
    if match is None:
        return name, ''
    return name[:match.start()], match.group(1)


def parse_model_entries(model_entries: List[str]) -> List[Dict[str, str]]:
    """
    Parse a list of model entries into structured dictionaries with provider,
    model name, version, region, and type.

    Surrounding whitespace (for example the newline left on lines read from a
    file) is ignored when parsing each entry.

    Args:
        model_entries: Model entry strings as found in models.txt, e.g.
            'bedrock/us-east-1/anthropic.claude-v2' or 'gpt-4o-2024-08-06'.

    Returns:
        One dictionary per input entry, in input order, with the keys:
        - provider: Lower-cased known provider name, or '' if none was found
        - model_name: Last path segment with provider prefix and version removed
        - version: Version string without the leading 'v', or '' if absent
        - region: Second path segment for bedrock/azure entries with at least
          three segments (unless it contains 'commitment'), otherwise ''
        - model_type: 'image', 'audio' or 'text' (the default)
    """
    parsed_models = []

    for raw_entry in model_entries:
        # Strip so trailing newlines/spaces neither leak into the model name
        # nor defeat the end-anchored version patterns.
        entry = raw_entry.strip()

        provider = ''
        region = ''
        parts = entry.split('/')

        if len(parts) >= 2:
            head = parts[0].lower()
            if head in _KNOWN_PROVIDERS:
                provider = head
            # For bedrock and azure the second segment is the region, unless
            # it is a commitment marker.
            if (
                head in _REGION_PROVIDERS
                and len(parts) >= 3
                and 'commitment' not in parts[1]
            ):
                region = parts[1]
            # The last segment carries the model name and possibly a version.
            model_with_version = parts[-1]
        else:
            # Single-part entry: the whole string is the model segment.
            model_with_version = entry

        if not provider:
            # Fall back to inferring the provider from the model segment.
            lowered_model = model_with_version.lower()
            for candidate in _KNOWN_PROVIDERS:
                if candidate in lowered_model:
                    provider = candidate
                    # Drop a leading "<provider>." prefix from the model name.
                    if lowered_model.startswith(f'{candidate}.'):
                        model_with_version = model_with_version[len(candidate) + 1:]
                    break

        model_name, version = _split_version(model_with_version)

        parsed_models.append({
            'provider': provider,
            # Remove separators left dangling at either end.
            'model_name': model_name.strip('.-:'),
            'version': version,
            'region': region,
            'model_type': _detect_model_type(entry.lower()),
        })

    return parsed_models


def create_model_hierarchy(model_entries: List[str]) -> Dict[str, Dict[str, Dict[str, Dict[str, str]]]]:
    """
    Organize model entries into a nested dictionary structure by provider,
    model, version, and region.

    Args:
        model_entries: Model entry strings as found in models.txt. Any
            iterable of strings is accepted; it is materialised once.

    Returns:
        Nested dictionary with the structure:
        Provider -> Model -> Version -> Region = full model string (the entry
        exactly as passed in).
        A missing provider is stored as "unknown"; a missing version or
        region is stored as "NA". If several entries map to the same
        provider/model/version/region path, the last one wins.
    """
    # Materialise once so the entries can be both parsed and paired with
    # their parsed form, even when the caller passes an iterator.
    entries = list(model_entries)
    hierarchy = {}

    for entry, model_info in zip(entries, parse_model_entries(entries)):
        provider = model_info['provider'] or 'unknown'
        version = model_info['version'] or 'NA'
        # Azure entries are always filed under 'NA': the parsed region is
        # deliberately ignored for them.
        if provider == 'azure':
            region = 'NA'
        else:
            region = model_info['region'] or 'NA'

        versions = hierarchy.setdefault(provider, {}).setdefault(model_info['model_name'], {})
        # Store the full model string at the leaf node.
        versions.setdefault(version, {})[region] = entry

    return hierarchy