ReallyFloppyPenguin committed
Commit bffeb3e · verified · 1 Parent(s): 2c79988

Update synthgen.py

Files changed (1):
  synthgen.py +229 -61
synthgen.py CHANGED
@@ -1,61 +1,229 @@
- import os
- from openai import OpenAI
-
- # Ensure the OPENROUTER_API_KEY environment variable is set
- api_key = os.getenv("OPENROUTER_API_KEY")
- if not api_key:
-     raise ValueError("OPENROUTER_API_KEY environment variable not set.")
-
- # Point the OpenAI client to the OpenRouter API
- client = OpenAI(
-     base_url="https://openrouter.ai/api/v1",
-     api_key=api_key,
- )
-
- def generate_synthetic_text(prompt: str, model: str = "deepseek/deepseek-chat-v3-0324:free") -> str:
-     """
-     Generates synthetic text using an OpenRouter model.
-
-     Args:
-         prompt: The input prompt to guide the text generation.
-         model: The model to use on OpenRouter (default: deepseek/deepseek-chat-v3-0324:free).
-             You can find model names on the OpenRouter website.
-
-     Returns:
-         The generated text string.
-     """
-     try:
-         response = client.chat.completions.create(
-             extra_headers={
-                 # "HTTP-Referer": "https://www.google.com",  # Optional. Site URL for rankings on openrouter.ai.
-                 "X-Title": "SynthGen",  # Optional. Site title for rankings on openrouter.ai.
-             },
-             model=model,
-             messages=[
-                 {"role": "system", "content": "You are a helpful assistant generating synthetic data."},
-                 {"role": "user", "content": prompt},
-             ],
-         )
-         if response.choices and response.choices[0].message.content:
-             return response.choices[0].message.content.strip()
-         else:
-             return "Error: No content generated."
-     except Exception as e:
-         return f"Error during API call: {e}"
-
- # --- Main Execution ---
- if __name__ == "__main__":
-     # TODO: Define the kind of text and number of samples needed
-     num_samples = 5  # Example: generate 5 samples
-     prompt_template = "Generate a short, positive product review for a fictional gadget."  # Example prompt
-
-     print(f"Generating {num_samples} synthetic text samples...")
-
-     for i in range(num_samples):
-         # You might want to vary the prompt slightly for each sample;
-         # for now, we use the same template.
-         generated_text = generate_synthetic_text(prompt_template)
-         print(f"\n--- Sample {i+1} ---")
-         print(generated_text)
-
-     print("\nGeneration complete.")
+ import os
+ import re  # Regex for validating conversation turn format
+ from typing import Optional  # Optional for nullable sampling settings
+ from openai import OpenAI
+
+ # Ensure the OPENROUTER_API_KEY environment variable is set
+ api_key = os.getenv("OPENROUTER_API_KEY")
+ if not api_key:
+     raise ValueError("OPENROUTER_API_KEY environment variable not set.")
+
+ # Point the OpenAI client to the OpenRouter API
+ client = OpenAI(
+     base_url="https://openrouter.ai/api/v1",
+     api_key=api_key,
+ )
+
+ # --- Core Generation Functions ---
+
+ def generate_synthetic_text(
+     prompt: str,
+     model: str = "deepseek/deepseek-chat-v3-0324:free",
+     system_message: str = "You are a helpful assistant generating synthetic data.",
+     temperature: Optional[float] = 0.7,  # Default temperature
+     top_p: Optional[float] = None,  # Let the API decide if None
+     max_tokens: Optional[int] = None,  # Let the API decide if None
+ ) -> str:
+     """
+     Generates synthetic text using an OpenRouter model via Chat Completions,
+     including model parameter controls.
+
+     Args:
+         prompt: The user's input prompt.
+         model: The model ID.
+         system_message: The system message context.
+         temperature: Controls randomness (0.0 to 2.0). None means API default.
+         top_p: Nucleus sampling probability. None means API default.
+         max_tokens: Maximum number of tokens to generate. None means API default.
+
+     Returns:
+         The generated text string or an error message.
+     """
+     if not api_key:
+         return "Error: OPENROUTER_API_KEY not configured. Please set the environment variable."
+
+     # Prepare parameters, only including the optional ones that are not None
+     params = {
+         "model": model,
+         "messages": [
+             {"role": "system", "content": system_message},
+             {"role": "user", "content": prompt},
+         ],
+         "extra_headers": {
+             # "HTTP-Referer": "YOUR_SITE_URL",
+             "X-Title": "SynthGen",
+         },
+     }
+     if temperature is not None:
+         params["temperature"] = temperature
+     if top_p is not None:
+         params["top_p"] = top_p
+     if max_tokens is not None:
+         params["max_tokens"] = max_tokens
+
+     try:
+         response = client.chat.completions.create(**params)  # Dictionary unpacking
+
+         if response.choices and response.choices[0].message and response.choices[0].message.content:
+             return response.choices[0].message.content.strip()
+         else:
+             print(f"Warning: No content in response for model {model}. Response: {response}")
+             return "Error: No content generated by the model."
+     except Exception as e:
+         print(f"Error during API call to model {model}: {e}")
+         return f"Error during API call: {e}"
+
+ def generate_prompts(
+     num_prompts: int,
+     model: str,
+     topic_hint: str = "diverse and interesting",
+     temperature: Optional[float] = 0.7,  # Pass settings through
+     top_p: Optional[float] = None,
+     max_tokens: Optional[int] = 200,  # A reasonable default cap for prompt lists
+ ) -> list[str]:
+     """
+     Generates a list of conversation prompts using an AI model.
+
+     Args:
+         num_prompts: The number of prompts to generate.
+         model: The model ID to use for generation.
+         topic_hint: Optional hint for the kind of topics (e.g., "related to technology").
+         temperature: Controls randomness (0.0 to 2.0). None means API default.
+         top_p: Nucleus sampling probability. None means API default.
+         max_tokens: Maximum number of tokens to generate. None means API default.
+
+     Returns:
+         A list of generated prompts.
+
+     Raises:
+         ValueError: If the API call fails or no prompts can be parsed.
+     """
+     instruction = (
+         f"Generate exactly {num_prompts} unique, {topic_hint} system prompts or starting topics suitable "
+         f"for generating synthetic conversations between a user and an AI assistant. "
+         f"Each prompt should be concise (ideally one sentence) and focus on a clear task or subject. "
+         f"Present each prompt on a new line, with no other introductory or concluding text."
+         f"\n\nExamples:\n"
+         f"- Act as a travel agent planning a trip to Japan.\n"
+         f"- Explain the concept of black holes to a 5-year-old.\n"
+         f"- Write a python function to reverse a string."
+     )
+     system_msg = "You are an expert prompt generator. Follow the user's instructions precisely."
+
+     # Pass the settings down to generate_synthetic_text
+     generated_text = generate_synthetic_text(
+         instruction,
+         model,
+         system_message=system_msg,
+         temperature=temperature,
+         top_p=top_p,
+         max_tokens=max_tokens,
+     )
+
+     if generated_text.startswith("Error:"):
+         raise ValueError(generated_text)
+
+     # Split into lines, strip whitespace, drop empty lines and leading bullets
+     prompts = [p.strip() for p in generated_text.strip().split("\n") if p.strip()]
+     prompts = [p.removeprefix("- ") for p in prompts]
+     if not prompts:
+         # Log the raw generated text if parsing failed
+         print(f"Warning: Failed to parse prompts from generated text. Raw text:\n{generated_text}")
+         raise ValueError("AI failed to generate prompts in the expected format.")
+
+     # Truncate in case the model generated more prompts than requested
+     return prompts[:num_prompts]
+
+
+ def generate_synthetic_conversation(
+     system_prompt: str,
+     model: str,
+     num_turns: int,
+     temperature: Optional[float] = 0.7,  # Pass settings through
+     top_p: Optional[float] = None,
+     max_tokens: Optional[int] = 1000,  # A reasonable default cap for conversations
+ ) -> str:
+     """
+     Generates a synthetic conversation with a specified number of turns.
+
+     Args:
+         system_prompt: The initial system prompt defining the context or AI persona.
+         model: The model ID to use for generation.
+         num_turns: The desired number of conversational turns (1 turn = 1 User + 1 Assistant message).
+         temperature: Controls randomness (0.0 to 2.0). None means API default.
+         top_p: Nucleus sampling probability. None means API default.
+         max_tokens: Maximum number of tokens to generate. None means API default.
+
+     Returns:
+         A string containing the formatted conversation, or an error message.
+     """
+     # Ask the model to generate the whole conversation in one call for simplicity.
+     # More complex approaches could build the conversation up turn by turn.
+     instruction = (
+         f"Generate a realistic conversation between a 'User' and an 'Assistant'. "
+         f"The conversation should start based on the following system prompt/topic: '{system_prompt}'.\n"
+         f"The conversation should have approximately {num_turns} pairs of User/Assistant turns.\n"
+         f"Format the output clearly, starting each line with 'User:' or 'Assistant:'.\n\n"
+         f"Example Format:\n"
+         f"User: Hello!\n"
+         f"Assistant: Hi there! How can I help you today?\n"
+         f"User: Can you explain photosynthesis?\n"
+         f"Assistant: Certainly! Photosynthesis is the process..."
+     )
+
+     # Use the caller's system prompt as the *conversation's* context,
+     # wrapped in a generic system message for the generation *task* itself.
+     system_msg_for_generation = (
+         f"You are an AI assistant simulating a conversation. "
+         f"The context for the conversation you generate is: {system_prompt}"
+     )
+
+     # Pass the settings down to generate_synthetic_text
+     conversation_text = generate_synthetic_text(
+         prompt=instruction,
+         model=model,
+         system_message=system_msg_for_generation,
+         temperature=temperature,
+         top_p=top_p,
+         max_tokens=max_tokens,
+     )
+
+     if conversation_text.startswith("Error:"):
+         # Propagate the error message
+         return f"Error generating conversation for prompt '{system_prompt}':\n{conversation_text}"
+
+     # Basic format validation (optional)
+     if not re.search(r"User:|Assistant:", conversation_text, re.IGNORECASE):
+         print(f"Warning: Generated text for conversation '{system_prompt}' might not be in the expected format. Raw text:\n{conversation_text}")
+         # Return the raw text anyway; the model's format may just differ slightly
+         return f"Generated conversation for prompt '{system_prompt}':\n(Format might vary)\n\n{conversation_text}"
+
+     return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
+
+
+ # --- Main Execution (Example Usage) ---
+ if __name__ == "__main__":
+     print("--- Testing Basic Text Generation ---")
+     test_prompt = "Describe the benefits of using synthetic data."
+     text_result = generate_synthetic_text(test_prompt, temperature=0.5, max_tokens=100)  # Example with settings
+     print(f"Prompt: {test_prompt}\nResult:\n{text_result}\n")
+
+     print("\n--- Testing Prompt Generation ---")
+     try:
+         num_prompts_to_gen = 3
+         prompts_result = generate_prompts(num_prompts_to_gen, "deepseek/deepseek-chat-v3-0324:free")
+         print(f"Generated {len(prompts_result)} prompts:")
+         for i, p in enumerate(prompts_result):
+             print(f"{i+1}. {p}")
+     except ValueError as e:
+         print(f"Error generating prompts: {e}")
+
+     print("\n--- Testing Conversation Generation ---")
+     conv_prompt = "Act as a helpful expert explaining the difference between nuclear fission and fusion."
+     num_conv_turns = 3
+     conv_result = generate_synthetic_conversation(conv_prompt, "deepseek/deepseek-chat-v3-0324:free", num_conv_turns)
+     print(f"{conv_result}\n")
+
+     print("\n--- Testing with Invalid API Key ---")
+     # Temporarily swap in an invalid key to exercise the error handling
+     original_key = client.api_key
+     client.api_key = "invalid-key"
+     error_text_result = generate_synthetic_text("Test prompt")
+     print(f"Result with invalid key: {error_text_result}")
+     client.api_key = original_key  # Restore the original key
+
+     print("\nGeneration tests complete.")