"""Generate natural-language instructions for geospatial task samples.

Reads task samples from ``input_path``, asks a chat model to write an
``instruction`` for the first ``n`` samples whose instruction is empty, and
stores the generated samples in ``output_path``.

The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable.
"""

import json
import os
import re
import time
from typing import Optional

# ========= Parameters =========
n = 100  # Only generate for the first n data points with empty instruction
input_path = "localization_samples.json"
output_path = "localization_with_instruction.json"
model = "gpt-4o"
# ==============================

system_prompt = (
    "You are a geospatial task design expert. The user will provide a JSON data point containing a series of function-based steps (`steps`). "
    "Your job is to generate a natural and meaningful instruction (`instruction`) based on these steps.\n\n"
    "There are four types of functions you might encounter:\n"
    "1. Coords: Input is a location name, output is the coordinates of that location.\n"
    "2. Relative: Input is a region (or place name), a direction, and a distance. Output is a new region offset by that direction and distance.\n"
    "3. Between: Input is two coordinates (or place names), output is the midpoint between them.\n"
    "4. Azimuth: Similar to Relative, but the direction is represented as an angle instead of a word.\n\n"
    "Your tasks are:\n"
    "- Complete the `instruction` field in the given JSON data point;\n"
    "- Replace all placeholders like LOC_1, LOC_2 with real global place names;\n"
    "- Also update the `steps` so that LOC_1, LOC_2 are replaced accordingly;\n"
    "- The final `instruction` should NOT contain any LOC_x placeholders;\n"
    "- Create a realistic scenario, such as navigation, trip planning, or station setup;\n"
    "- First explain your reasoning: why you chose this scenario and these places;\n"
    "- Then output a JSON object with the following format, enclosed between ```json and ```:\n\n"
    "Format:\n"
    "```json\n"
    "{\n"
    ' "index": xxx,\n'
    ' "instruction": "....",\n'
    ' "steps": [\n'
    " {\"id\": 1, \"function\": ..., \"inputs\": [...]},\n"
    " ...\n"
    " ]\n"
    "}\n"
    "```"
)

# Compiled once: the same pattern is applied to every model reply.
_JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)


def extract_json_block(text: Optional[str]) -> Optional[dict]:
    """Return the first fenced ```json object found in *text*.

    Args:
        text: Raw model reply. ``None`` or an empty string is accepted.

    Returns:
        The decoded JSON object, or ``None`` when *text* is empty, contains no
        fenced ```json block, or the block is not valid JSON.
    """
    if not text:
        return None
    match = _JSON_BLOCK_RE.search(text)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None


def format_step(step: dict) -> str:
    """Serialize one step as a single-line JSON object.

    Every field goes through ``json.dumps`` so that quotes inside strings,
    booleans, ``None`` and nested lists are emitted as valid JSON.

    Args:
        step: Mapping with the keys ``"id"``, ``"function"`` and ``"inputs"``.

    Returns:
        Text of the form ``{"id": ..., "function": ..., "inputs": [...]}``.

    Raises:
        KeyError: If one of the three keys is missing.
    """
    step_id = json.dumps(step["id"], ensure_ascii=False)
    function = json.dumps(step["function"], ensure_ascii=False)
    inputs = json.dumps(step["inputs"], ensure_ascii=False)
    return f'{{"id": {step_id}, "function": {function}, "inputs": {inputs}}}'


def write_custom_json(data, filename):
    """Write *data* to *filename* as a JSON array with one step per line.

    The whole document is rendered in memory before the file is opened, so a
    malformed item raises without truncating an existing file.

    Args:
        data: Sequence of mappings with the keys ``"index"``,
            ``"instruction"`` and ``"steps"``.
        filename: Destination path; overwritten on success.

    Raises:
        KeyError: If an item or one of its steps lacks a required key.
    """
    chunks = ["[\n"]
    last = len(data) - 1
    for i, item in enumerate(data):
        index = json.dumps(item["index"], ensure_ascii=False)
        instruction = json.dumps(item["instruction"], ensure_ascii=False)
        step_lines = ",\n".join(f" {format_step(step)}" for step in item["steps"])
        terminator = ",\n" if i < last else "\n"
        chunks.append(
            " {\n"
            f' "index": {index},\n'
            f' "instruction": {instruction},\n'
            ' "steps": [\n'
            f"{step_lines}\n"
            " ]\n"
            f" }}{terminator}"
        )
    chunks.append("]\n")

    with open(filename, "w", encoding="utf-8") as f:
        f.write("".join(chunks))


def _build_client():
    """Create the OpenAI client from the ``OPENAI_API_KEY`` environment variable."""
    # Imported here so the formatting helpers can be used without the SDK.
    from openai import OpenAI

    # SECURITY: the key used to be hard-coded in this file. Treat that key as
    # leaked, revoke it, and supply a new one through the environment only.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it before running this script."
        )
    return OpenAI(api_key=api_key)


def _load_existing(path):
    """Return the list already stored at *path*, or ``[]`` if absent or not valid JSON."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            existing = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return []
    if not isinstance(existing, list):
        raise ValueError(f"{path} must contain a JSON array")
    return existing


def _is_usable(candidate):
    """Tell whether *candidate* has the shape ``write_custom_json`` needs."""
    if not isinstance(candidate, dict):
        return False
    steps = candidate.get("steps")
    if not isinstance(candidate.get("instruction"), str) or not isinstance(steps, list):
        return False
    return all(
        isinstance(step, dict) and {"id", "function", "inputs"} <= step.keys()
        for step in steps
    )


def _request_instruction(client, item):
    """Ask the model to complete *item* and return the parsed reply.

    Raises:
        ValueError: If the reply has no usable ```json block.
    """
    user_prompt = (
        "Here is a geospatial task. Please generate an appropriate instruction "
        f"according to the rules:\n{json.dumps(item, ensure_ascii=False)}"
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    parsed = extract_json_block(response.choices[0].message.content)
    if not _is_usable(parsed):
        raise ValueError("model reply did not contain a usable ```json block")
    # Keep the sample traceable even if the model left the index out.
    parsed.setdefault("index", item.get("index"))
    return parsed


def main():
    """Generate instructions for up to ``n`` samples and save them."""
    client = _build_client()

    # Load original data
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Entries from earlier runs are kept; new ones are appended after them.
    results = _load_existing(output_path)
    completed = 0

    for item in data:
        # `or ""` also covers an explicit null instruction.
        if (item.get("instruction") or "").strip():
            continue
        if completed >= n:
            break

        try:
            generated = _request_instruction(client, item)
        except Exception as e:  # one failing sample must not abort the batch
            print(f"[✗] index {item.get('index')} error: {e}")
            continue

        completed += 1
        results.append(generated)
        # Checkpoint after every sample so an interrupted run keeps its progress.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        time.sleep(1.5)  # pause between API calls

    # Regenerate custom formatted JSON
    write_custom_json(results, output_path)
    print(f"\n✅ Completed {completed} data points. Output saved to {output_path}")


if __name__ == "__main__":
    main()