# Spaces: Sleeping
import pandas as pd | |
import json | |
import re | |
def remove_prefix(text, prefix_pattern):
    """Delete matches of ``prefix_pattern`` from ``text`` and trim the result.

    Every match of the regular expression is replaced with the empty
    string, so the pattern should be anchored with ``^`` when only a
    leading prefix is meant to be removed. Leading and trailing whitespace
    is stripped from the remaining text.

    Args:
        text: The string to clean.
        prefix_pattern: Regular expression describing the text to remove.

    Returns:
        The cleaned, whitespace-trimmed string.
    """
    matcher = re.compile(prefix_pattern)
    without_prefix = matcher.sub("", text)
    return without_prefix.strip()
def _cell_to_text(value):
    """Return a spreadsheet cell as text, mapping missing values to ""."""
    # str(NaN) would yield the literal text "nan", which must not leak into
    # the exported questions/answers.
    return "" if pd.isna(value) else str(value)


def main():
    """Convert the Q&A spreadsheet into a cleaned JSON file.

    Reads ``data/manabi.xlsx``, strips the ``Q<number>.`` prefix from each
    question and the ``A.`` prefix from each answer, and writes the
    resulting list of ``{"question": ..., "answer": ...}`` records to
    ``data/qa_data.json``. Empty cells are exported as empty strings.

    Failures (missing file, unreadable workbook, missing columns, write
    errors) are reported on stdout and the function returns without
    raising.
    """
    # NOTE(review): these header names look like mis-decoded text (possibly
    # UTF-8 Japanese headers read with a Greek code page) -- confirm them
    # against the actual workbook before relying on this script.
    question_col = "θ³ͺε"
    answer_col = "εη"

    # Read the Excel file
    try:
        df = pd.read_excel("data/manabi.xlsx")
        print("Excel file read successfully.")
    except FileNotFoundError:
        print("The file 'data/manabi.xlsx' was not found. Please check the file path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return

    # Check if the necessary columns exist
    if question_col not in df.columns or answer_col not in df.columns:
        print(f"The Excel file must contain '{question_col}' and '{answer_col}' columns.")
        return

    # Build one record per row, removing the "Q<n>." / "A." prefixes.
    qa_list = [
        {
            "question": remove_prefix(_cell_to_text(raw_question), r"^Q\d+\.\s*"),
            "answer": remove_prefix(_cell_to_text(raw_answer), r"^A\.\s*"),
        }
        for raw_question, raw_answer in zip(df[question_col], df[answer_col])
    ]

    # Save the list to a JSON file
    try:
        with open("data/qa_data.json", "w", encoding="utf-8") as json_file:
            json.dump(qa_list, json_file, ensure_ascii=False, indent=2)
        print("Data has been successfully saved to 'data/qa_data.json'.")
    except Exception as e:
        print(f"An error occurred while writing to JSON file: {e}")
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()