import os
import re

import pandas as pd


def save_df_to_dir(results_df, base_dir, sub_dirs, file_name_format, add_context, model_name):
    """
    Saves a DataFrame as a CSV file in a directory located relative to this module.

    The output directory is ``<directory of this file>/<base_dir>/<sub_dirs...>``
    and is created if it does not exist yet. The path is assembled with
    ``os.path.join``, so an absolute ``base_dir`` replaces the module directory.

    Args:
        results_df (pd.DataFrame): The DataFrame to write (the index is not written).
        base_dir (str): Directory, relative to this module's directory, that holds the outputs.
        sub_dirs (iterable of str): Nested sub-directory names appended below ``base_dir``.
        file_name_format (str): Format string for the file name; it may use the
            ``{model_name}`` and ``{context}`` placeholders.
        add_context (bool): Selects the ``{context}`` value: "with_context" when
            truthy, "without_context" otherwise.
        model_name (str): Value substituted for ``{model_name}``.
    """
    # Anchor outputs at the directory containing this module, not the current working directory
    root_dir = os.path.dirname(os.path.abspath(__file__))

    output_dir = os.path.join(root_dir, base_dir, *sub_dirs)
    os.makedirs(output_dir, exist_ok=True)

    file_name = file_name_format.format(
        model_name=model_name,
        context="with_context" if add_context else "without_context",
    )
    file_path = os.path.join(output_dir, file_name)

    results_df.to_csv(file_path, index=False)


def _natural_sort_key(name):
    """Returns a sort key that orders embedded integers numerically (part_2 before part_10)."""
    # re.split with a capturing group alternates text / digit runs, so odd positions are the numbers
    tokens = re.split(r"(\d+)", name)
    return [int(token) if position % 2 else token for position, token in enumerate(tokens)]


def merge_dfs(base_dir, exp_name, part_format="part_{i}_", output_dir=None,
              filename="patchscopes_results.parquet", output_filename="patchscopes_results.parquet"):
    """
    Merges DataFrames from directories matching the part format into a single DataFrame,
    and optionally saves the result to a file.

    A directory inside ``base_dir`` is used when its name starts with ``part_format``
    (with ``{i}`` standing for one or more digits), ends with ``exp_name`` and contains
    ``filename``. Parts are processed in natural order of their directory names
    (``part_2_...`` before ``part_10_...``) and stacked row-wise with a fresh
    0..n-1 index.

    Args:
        base_dir (str): The base directory containing the data.
        exp_name (str): The experiment name to look for within part directories.
        part_format (str): The general format for identifying parts (e.g., "part_{i}_").
            Apart from the ``{i}`` placeholder it is used as a regular expression.
        output_dir (str, optional): Directory to save the merged DataFrame. Default is None.
        filename (str): The filename of the Parquet file to read in each part directory.
        output_filename (str): Name of the output file if saving is enabled.

    Returns:
        tuple: ``(merged_df, dataframes)`` where ``merged_df`` is a single pd.DataFrame
        containing the rows from all parts and ``dataframes`` is the list of per-part
        DataFrames in the order they were merged.

    Raises:
        ValueError: If no matching part directory containing ``filename`` is found.
    """
    part_regex = re.compile(part_format.replace("{i}", r"\d+"))

    dataframes = []
    # os.listdir returns entries in arbitrary order; sort so the merged row order is reproducible
    for dir_name in sorted(os.listdir(base_dir), key=_natural_sort_key):
        part_dir = os.path.join(base_dir, dir_name)
        if not os.path.isdir(part_dir):
            continue
        if not (part_regex.match(dir_name) and dir_name.endswith(exp_name)):
            continue

        file_path = os.path.join(part_dir, filename)
        if os.path.exists(file_path):
            dataframes.append(pd.read_parquet(file_path))

    if not dataframes:
        # pd.concat would raise an opaque "No objects to concatenate"; say what was searched for
        raise ValueError(
            f"No '{filename}' files found for experiment '{exp_name}' "
            f"in part directories under '{base_dir}'"
        )

    # Stack the parts as rows. Concatenating along axis=1 would place the parts side by side,
    # duplicating every column name, which to_parquet rejects.
    merged_df = pd.concat(dataframes, axis=0, ignore_index=True)

    # Save the result to file if output_dir is given
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, output_filename)
        merged_df.to_parquet(output_path, index=False)

    return merged_df, dataframes


def parse_string_list_from_file(file_path, delimiter=None):
    """
    Parses a list of strings from a file, handling various list formats.

    Args:
        file_path (str): Path to the file containing the list (read as UTF-8).
        delimiter (str, optional): Separator between items. When None (default), the
            content is split on any run of commas, brackets, parentheses, braces,
            quotes and whitespace. Otherwise the content is split on this exact
            string and each item is stripped of surrounding whitespace; the special
            value "newline" stands for "\\n".

    Returns:
        list: A list of parsed strings, with empty items removed.

    Raises:
        ValueError: If ``delimiter`` is an empty string.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would otherwise
    # end up glued to the first item
    with open(file_path, 'r', encoding="utf-8-sig") as file:
        content = file.read()

    if delimiter is None:
        # Split on common list notations (commas, brackets, quotes, ...) and whitespace;
        # runs of separators collapse and the resulting empty strings are filtered below
        items = re.split(r'[,\[\]\(\)\{\}"\'\s]+', content)
    else:
        if delimiter == "newline":
            # TODO: "newline" is a string alias for "\n"; flagged for cleanup by the original author
            delimiter = "\n"
        items = [item.strip() for item in content.split(delimiter)]

    # Filter out any empty strings from the list
    return [item for item in items if item]