SpatialParse / 题目转化.py
Shunfeng Zheng
Upload 89 files
17e77ea verified
raw
history blame
5.35 kB
import json
import re
import time
from openai import OpenAI
# ========= Parameters =========
n = 100  # Only generate for the first n data points with empty instruction
input_path = "localization_samples.json"
output_path = "localization_with_instruction.json"
model = "gpt-4o"
# ==============================

# SECURITY: never commit an API key to source control. When no `api_key`
# argument is given, the OpenAI client reads the key from the OPENAI_API_KEY
# environment variable, so export that variable before running this script.
# NOTE: any key that was ever committed in this file must be treated as
# compromised and revoked in the OpenAI dashboard.
client = OpenAI()
# System prompt sent as the "system" message of every chat completion request.
# It tells the model to:
#   - read a JSON data point whose `steps` use four function types
#     (Coords, Relative, Between, Azimuth);
#   - replace LOC_x placeholders with real place names in both the
#     `instruction` and the `steps`;
#   - explain its reasoning first, then emit the result as a JSON object with
#     "index", "instruction" and "steps" inside a ```json fenced block.
# The fenced block is the part extract_json_block() looks for in the reply.
# The text below is runtime data: changing any character changes the prompt.
system_prompt = (
"You are a geospatial task design expert. The user will provide a JSON data point containing a series of function-based steps (`steps`). "
"Your job is to generate a natural and meaningful instruction (`instruction`) based on these steps.\n\n"
"There are four types of functions you might encounter:\n"
"1. Coords: Input is a location name, output is the coordinates of that location.\n"
"2. Relative: Input is a region (or place name), a direction, and a distance. Output is a new region offset by that direction and distance.\n"
"3. Between: Input is two coordinates (or place names), output is the midpoint between them.\n"
"4. Azimuth: Similar to Relative, but the direction is represented as an angle instead of a word.\n\n"
"Your tasks are:\n"
"- Complete the `instruction` field in the given JSON data point;\n"
"- Replace all placeholders like LOC_1, LOC_2 with real global place names;\n"
"- Also update the `steps` so that LOC_1, LOC_2 are replaced accordingly;\n"
"- The final `instruction` should NOT contain any LOC_x placeholders;\n"
"- Create a realistic scenario, such as navigation, trip planning, or station setup;\n"
"- First explain your reasoning: why you chose this scenario and these places;\n"
"- Then output a JSON object with the following format, enclosed between ```json and ```:\n\n"
"Format:\n"
"```json\n"
"{\n"
' "index": xxx,\n'
' "instruction": "....",\n'
' "steps": [\n'
" {\"id\": 1, \"function\": ..., \"inputs\": [...]},\n"
" ...\n"
" ]\n"
"}\n"
"```"
)
# Matches the first ```json ... ``` fence whose payload is a JSON object.
# DOTALL lets the object span several lines.
_JSON_FENCE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)


def extract_json_block(text):
    """Pull the JSON object out of the first ```json fenced block in *text*.

    Args:
        text: Raw model reply; it may contain free-form prose around the fence.

    Returns:
        The decoded object, or None when there is no ```json fence or the
        fenced payload is not valid JSON.
    """
    fenced = _JSON_FENCE.search(text)
    if fenced is None:
        return None

    try:
        payload = json.loads(fenced.group(1))
    except json.JSONDecodeError:
        return None
    return payload
def format_step(step):
    """Serialize one step as a compact, single-line JSON object string.

    Every value goes through ``json.dumps`` so that quotes, backslashes,
    booleans, ``None`` and nested lists are emitted as valid JSON. For plain
    strings and numbers the output is the same as simple string formatting.

    Args:
        step: Mapping with the keys "id", "function" and "inputs".

    Returns:
        A string such as
        ``{"id": 1, "function": "Coords", "inputs": ["Paris"]}``.

    Raises:
        KeyError: If one of the three keys is missing from *step*.
    """
    step_id = json.dumps(step["id"], ensure_ascii=False)
    function = json.dumps(step["function"], ensure_ascii=False)
    # list() keeps accepting any iterable of inputs, as a plain loop would.
    inputs = json.dumps(list(step["inputs"]), ensure_ascii=False)
    return f'{{"id": {step_id}, "function": {function}, "inputs": {inputs}}}'
# Load original data
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Records saved by an earlier run are kept and appended to. A missing, empty
# or corrupt output file simply means there is nothing to resume from.
try:
    with open(output_path, "r", encoding="utf-8") as f:
        datapoint = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    datapoint = []

# Keys every generated record must carry before it is persisted; the system
# prompt asks the model for exactly these three fields.
_REQUIRED_KEYS = ("index", "instruction", "steps")

completed = 0  # number of records successfully generated and saved
output_data = []  # in-memory list: untouched items plus generated ones
for item in data:
    # Items that already have an instruction are passed through unchanged.
    # `or ""` also covers an explicit null instruction in the input file.
    if (item.get("instruction") or "").strip():
        output_data.append(item)
        continue

    # Stop once the quota of generated instructions is reached.
    if completed >= n:
        break

    user_prompt = (
        "Here is a geospatial task. Please generate an appropriate instruction according to the rules:\n"
        f"{json.dumps(item, ensure_ascii=False)}"
    )
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        content = response.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: one failed request must not abort the whole batch.
        print(f"[✗] index {item.get('index')} error: {e}")
        continue

    generated = extract_json_block(content)
    if generated is None or any(key not in generated for key in _REQUIRED_KEYS):
        # Do not count or persist an unusable reply; storing it would put a
        # null/incomplete record into the output file.
        print(f"[✗] index {item.get('index')} error: response has no usable JSON block")
    else:
        datapoint.append(generated)
        # Rewrite the file after every record so progress survives an
        # interruption of a long run.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(datapoint, f, ensure_ascii=False, indent=2)
        output_data.append(generated)
        completed += 1

    time.sleep(1.5)  # throttle consecutive API requests

print(f"\n✅ Completed {completed} data points. Output saved to {output_path}")
def write_custom_json(data, filename):
    """Write *data* to *filename* as a JSON array with one step per line.

    The whole document is rendered in memory first and written in a single
    call, so a malformed record raises before the target file is opened and
    an existing file is never left truncated or half-written.

    Args:
        data: Sized sequence of records, each with the keys "index",
            "instruction" and "steps"; every step has "id", "function" and
            "inputs".
        filename: Path of the file to create or overwrite (UTF-8).

    Raises:
        KeyError: If a record or step lacks one of the keys listed above.
    """

    def format_step(step):
        # json.dumps on every value keeps quotes, backslashes and non-integer
        # ids valid JSON; plain values render exactly as raw formatting would.
        step_id = json.dumps(step["id"], ensure_ascii=False)
        function = json.dumps(step["function"], ensure_ascii=False)
        inputs = json.dumps(step["inputs"], ensure_ascii=False)
        return f'{{"id": {step_id}, "function": {function}, "inputs": {inputs}}}'

    last = len(data) - 1
    parts = ["[\n"]
    for i, item in enumerate(data):
        index = json.dumps(item["index"], ensure_ascii=False)
        instruction = json.dumps(item["instruction"], ensure_ascii=False)
        step_lines = [f" {format_step(step)}" for step in item["steps"]]
        parts.append(" {\n")
        parts.append(f' "index": {index},\n')
        parts.append(f' "instruction": {instruction},\n')
        parts.append(' "steps": [\n')
        parts.append(",\n".join(step_lines))
        parts.append("\n ]\n")
        # JSON forbids a trailing comma after the last array element.
        parts.append(" }" + (",\n" if i < last else "\n"))
    parts.append("]\n")
    text = "".join(parts)

    # Open the file only after rendering succeeded.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
# Regenerate custom formatted JSON
# Use output_path rather than a repeated literal so the parameter at the top
# of the script stays the single source of truth for the output location.
with open(output_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# A saved file may contain null entries (extract_json_block returns None for
# replies it cannot parse); write_custom_json cannot format those, so they are
# dropped here instead of failing in the middle of the rewrite.
valid_records = [entry for entry in data if isinstance(entry, dict)]
if len(valid_records) != len(data):
    print(f"Skipped {len(data) - len(valid_records)} non-object entries in {output_path}")
data = valid_records

write_custom_json(data, output_path)