Spaces:
Runtime error
Runtime error
import json | |
import re | |
import time | |
from openai import OpenAI | |
# ========= Parameters =========
n = 100  # Only generate for the first n data points with empty instruction
input_path = "localization_samples.json"
output_path = "localization_with_instruction.json"
model = "gpt-4o"
# ==============================

# SECURITY: never commit API keys. With no explicit `api_key`, the OpenAI
# client reads the key from the OPENAI_API_KEY environment variable.
# The key that used to be hard-coded on this line has been published and
# must be revoked in the OpenAI dashboard.
client = OpenAI()
# System prompt sent with every request. It is assembled from four sections
# (role, function catalogue, task rules, output template) that are joined
# without any separator, so each section carries its own trailing newlines.
_PROMPT_SECTIONS = (
    # Role and overall goal.
    "You are a geospatial task design expert. The user will provide a JSON data point containing a series of function-based steps (`steps`). "
    "Your job is to generate a natural and meaningful instruction (`instruction`) based on these steps.\n\n",
    # Catalogue of the step functions that may appear in `steps`.
    "There are four types of functions you might encounter:\n"
    "1. Coords: Input is a location name, output is the coordinates of that location.\n"
    "2. Relative: Input is a region (or place name), a direction, and a distance. Output is a new region offset by that direction and distance.\n"
    "3. Between: Input is two coordinates (or place names), output is the midpoint between them.\n"
    "4. Azimuth: Similar to Relative, but the direction is represented as an angle instead of a word.\n\n",
    # Rules the model has to follow when writing the instruction.
    "Your tasks are:\n"
    "- Complete the `instruction` field in the given JSON data point;\n"
    "- Replace all placeholders like LOC_1, LOC_2 with real global place names;\n"
    "- Also update the `steps` so that LOC_1, LOC_2 are replaced accordingly;\n"
    "- The final `instruction` should NOT contain any LOC_x placeholders;\n"
    "- Create a realistic scenario, such as navigation, trip planning, or station setup;\n"
    "- First explain your reasoning: why you chose this scenario and these places;\n"
    "- Then output a JSON object with the following format, enclosed between ```json and ```:\n\n",
    # Output template; the reply is later parsed out of the ```json fence.
    "Format:\n"
    "```json\n"
    "{\n"
    ' "index": xxx,\n'
    ' "instruction": "....",\n'
    ' "steps": [\n'
    ' {"id": 1, "function": ..., "inputs": [...]},\n'
    " ...\n"
    " ]\n"
    "}\n"
    "```",
)
system_prompt = "".join(_PROMPT_SECTIONS)
def extract_json_block(text):
    """Return the first JSON object that follows a ```json fence in *text*.

    Every opening ```json fence that is directly followed by ``{`` is tried
    in order, and the object is decoded straight from that position. Decoding
    stops at the object's own closing brace, so a malformed earlier block, a
    missing closing fence, or backticks inside a string value do not prevent
    a later (or the same) block from being parsed.

    Args:
        text: Raw model reply. Anything that is not a string (e.g. ``None``
            when the API returns no content) is treated as "nothing found".

    Returns:
        The decoded ``dict``, or ``None`` when no fenced block holds a valid
        JSON object.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    # The lookahead keeps the match end exactly on the opening brace, which
    # is where raw_decode has to start.
    for fence in re.finditer(r"```json\s*(?=\{)", text):
        try:
            obj, _end = decoder.raw_decode(text, fence.end())
        except json.JSONDecodeError:
            continue  # This block is malformed; try the next fenced block.
        return obj
    return None
def format_step(step):
    """Render one step as a compact, single-line JSON object string.

    All three values are serialized with ``json.dumps`` so that quotes,
    backslashes, ``None`` and booleans are emitted as valid JSON instead of
    being interpolated verbatim.

    Args:
        step: Mapping with the keys ``"id"``, ``"function"`` and ``"inputs"``
            (an iterable of JSON-serializable values).

    Returns:
        A string of the form
        ``{"id": 1, "function": "Coords", "inputs": ["Paris"]}``.
    """
    step_id = json.dumps(step["id"], ensure_ascii=False)
    function = json.dumps(step["function"], ensure_ascii=False)
    # list(...) keeps accepting any iterable of inputs, as the original did.
    inputs = json.dumps(list(step["inputs"]), ensure_ascii=False)
    return f'{{"id": {step_id}, "function": {function}, "inputs": {inputs}}}'
# Load original data
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Results are appended to whatever a previous run already saved. A missing,
# empty or corrupt output file simply means starting from an empty list.
try:
    with open(output_path, "r", encoding="utf-8") as f:
        datapoint = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    datapoint = []
if not isinstance(datapoint, list):
    datapoint = []

# Keys a generated item must carry to be usable by the final formatting pass.
required_keys = {"index", "instruction", "steps"}

completed = 0
output_data = []  # In-memory copy: untouched items plus newly generated ones.
for item in data:
    # Items that already have an instruction are kept as they are.
    if (item.get("instruction") or "").strip():
        output_data.append(item)
        continue

    if completed >= n:
        break

    user_prompt = (
        "Here is a geospatial task. Please generate an appropriate instruction according to the rules:\n"
        f"{json.dumps(item, ensure_ascii=False)}"
    )
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        content = response.choices[0].message.content
    except Exception as e:  # Boundary around the network call: log and move on.
        print(f"[✗] index {item.get('index')} error: {e}")
        continue

    # The reply may contain no parsable JSON block; such a reply must not be
    # counted as completed nor written to the output file.
    parsed = extract_json_block(content or "")
    if not isinstance(parsed, dict) or not required_keys <= parsed.keys():
        print(f"[✗] index {item.get('index')} error: no valid JSON block in model response")
        continue

    item = parsed
    completed += 1

    # NOTE(review): only newly generated items are persisted, right after each
    # successful call, so an interrupted run keeps its progress.
    datapoint.append(item)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(datapoint, f, ensure_ascii=False, indent=2)

    time.sleep(1.5)  # Simple pacing between API requests.
    output_data.append(item)

print(f"\n✅ Completed {completed} data points. Output saved to {output_path}")
def write_custom_json(data, filename):
    """Write *data* to *filename* as JSON with one step per line.

    Each item is written with its ``index``, ``instruction`` and ``steps``;
    every step is rendered as a single-line object holding ``id``,
    ``function`` and ``inputs``. All values are serialized with
    ``json.dumps`` so that string indexes and names containing quotes or
    backslashes still produce a valid JSON document.

    Args:
        data: Sequence of dicts with the keys ``"index"``, ``"instruction"``
            and ``"steps"`` (a list of step dicts).
        filename: Path of the file to create or overwrite (UTF-8).

    Raises:
        KeyError: If an item or step lacks one of the expected keys.
    """

    def encode(value):
        # Non-ASCII place names stay readable instead of becoming \uXXXX.
        return json.dumps(value, ensure_ascii=False)

    def format_step(step):
        return (
            f'{{"id": {encode(step["id"])}, '
            f'"function": {encode(step["function"])}, '
            f'"inputs": {encode(step["inputs"])}}}'
        )

    last_position = len(data) - 1
    with open(filename, "w", encoding="utf-8") as f:
        f.write("[\n")
        for position, item in enumerate(data):
            f.write(" {\n")
            f.write(f' "index": {encode(item["index"])},\n')
            f.write(f' "instruction": {encode(item["instruction"])},\n')
            f.write(' "steps": [\n')
            f.write(",\n".join(f" {format_step(step)}" for step in item["steps"]))
            f.write("\n ]\n")
            # No trailing comma after the final item.
            f.write(" }" + (",\n" if position < last_position else "\n"))
        f.write("]\n")
# Regenerate custom formatted JSON
# The file is addressed through `output_path` so this pass always rewrites
# the same file the generation loop wrote to.
try:
    with open(output_path, "r", encoding="utf-8") as f:
        data = json.load(f)
except FileNotFoundError:
    # Nothing was generated and no earlier output exists: nothing to reformat.
    print(f"No output file at {output_path}; skipping reformatting.")
else:
    # Entries that are not objects (e.g. `null` left behind by a failed
    # extraction) cannot be formatted and are dropped with a notice.
    records = [entry for entry in data if isinstance(entry, dict)]
    if len(records) != len(data):
        print(f"Skipped {len(data) - len(records)} invalid entries in {output_path}")
    data = records
    write_custom_json(data, output_path)