import os import re import glob def extract_assistant_answers(input_file): """Extract the text after 'Assistant:' from the input file.""" with open(input_file, 'r', encoding='utf-8') as f: content = f.read() # Split content by "Assistant:" to get all sections after it sections = content.split("Assistant:") # Process each section to get clean answers answers = [] for section in sections[1:]: # Skip the first split as it's before first "Assistant:" # Get text up to next "Q" or "User:" or end of string answer = section.split("Q")[0].split("User:")[0].strip() if answer: answers.append(answer) return answers def process_all_files(): """Process all image_*.txt files in the qa_outputs directory.""" # Get all image_*.txt files input_files = glob.glob("qa_outputs/image_*.txt") for input_file in input_files: # Extract the base name without extension base_name = os.path.splitext(input_file)[0] output_file = f"{base_name}_extr.txt" # Extract answers answers = extract_assistant_answers(input_file) # Write answers to the output file with open(output_file, 'w', encoding='utf-8') as f: for i, answer in enumerate(answers, 1): f.write(f"{answer}\n") print(f"Processed {input_file} -> {output_file}") if __name__ == "__main__": process_all_files() print("Extraction complete! Check the files with '_extr' suffix.")