import os
import re  # Regex for parsing conversation turns and cleaning prompt lists
from typing import Optional  # Optional for nullable model settings

from openai import OpenAI

# Read the key from the environment; never hard-code API keys in source.
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
    raise ValueError("OPENROUTER_API_KEY environment variable not set.")

# Point the OpenAI client at the OpenRouter API
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)
# --- Core Generation Functions ---

def generate_synthetic_text(
    prompt: str,
    model: str = "deepseek/deepseek-chat-v3-0324:free",
    system_message: str = "You are a helpful assistant generating synthetic data.",
    temperature: Optional[float] = 0.7,  # Default temperature
    top_p: Optional[float] = None,       # Default top_p (let API decide if None)
    max_tokens: Optional[int] = None,    # Default max_tokens (let API decide if None)
) -> str:
    """
    Generates synthetic text using an OpenRouter model via Chat Completions,
    including model parameter controls.

    Args:
        prompt: The user's input prompt.
        model: The model ID.
        system_message: The system message context.
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        The generated text string or an error message.
    """
    if not api_key:
        return "Error: OPENROUTER_API_KEY not configured properly. Please set the environment variable."

    # Prepare parameters, only including sampling settings that are not None,
    # so the API falls back to its own defaults for anything unset.
    params = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        "extra_headers": {
            # Optional OpenRouter attribution headers:
            # "HTTP-Referer": "YOUR_SITE_URL",
            "X-Title": "SynthGen",
        },
    }
    if temperature is not None:
        params["temperature"] = temperature
    if top_p is not None:
        params["top_p"] = top_p
    if max_tokens is not None:
        params["max_tokens"] = max_tokens

    try:
        response = client.chat.completions.create(**params)  # Use dictionary unpacking
        if response.choices and response.choices[0].message and response.choices[0].message.content:
            return response.choices[0].message.content.strip()
        else:
            print(f"Warning: No content in response for model {model}. Response: {response}")
            return "Error: No content generated by the model."
    except Exception as e:
        print(f"Error during API call to model {model}: {e}")
        return f"Error during API call: {e}"
def generate_prompts(
    num_prompts: int,
    model: str,
    topic_hint: str = "diverse and interesting",
    temperature: Optional[float] = 0.7,  # Pass settings through
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 200,     # Reasonable default cap for short prompts
) -> list[str]:
    """
    Generates a list of conversation prompts using an AI model.

    Args:
        num_prompts: The number of prompts to generate.
        model: The model ID to use for generation.
        topic_hint: Optional hint for the kind of topics (e.g., "related to technology").
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A list of generated prompts.
    """
    instruction = (
        f"Generate exactly {num_prompts} unique, {topic_hint} system prompts or starting topics suitable "
        f"for generating synthetic conversations between a user and an AI assistant. "
        f"Each prompt should be concise (ideally one sentence) and focus on a clear task or subject. "
        f"Present each prompt on a new line, with no other introductory or concluding text."
        f"\n\nExamples:\n"
        f"- Act as a travel agent planning a trip to Japan.\n"
        f"- Explain the concept of black holes to a 5-year-old.\n"
        f"- Write a python function to reverse a string."
    )
    system_msg = "You are an expert prompt generator. Follow the user's instructions precisely."

    # Pass the settings down to generate_synthetic_text
    generated_text = generate_synthetic_text(
        instruction,
        model,
        system_message=system_msg,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if generated_text.startswith("Error:"):
        raise ValueError(generated_text)

    # Split into lines and clean up any extra whitespace or empty lines
    prompts = [p.strip() for p in generated_text.strip().split('\n') if p.strip()]
    # Strip only a *leading* bullet or number marker; a bare
    # str.replace("- ", "") would also mangle hyphenated text inside a prompt.
    prompts = [re.sub(r"^(?:[-*]|\d+\.)\s+", "", p) for p in prompts]

    if not prompts:
        # Log the raw generated text if parsing failed
        print(f"Warning: Failed to parse prompts from generated text. Raw text:\n{generated_text}")
        raise ValueError("AI failed to generate prompts in the expected format.")

    # Truncate in case the model generated more than the exact number requested
    return prompts[:num_prompts]
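
# Example (output is hypothetical; actual prompts vary by model and sampling):
#
#   prompts = generate_prompts(3, "deepseek/deepseek-chat-v3-0324:free",
#                              topic_hint="related to science")
#   # -> ["Explain how vaccines train the immune system.",
#   #     "Act as a physicist describing quantum entanglement.",
#   #     "Describe the water cycle for a middle-school class."]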
def generate_synthetic_conversation(
    system_prompt: str,
    model: str,
    num_turns: int,
    temperature: Optional[float] = 0.7,  # Pass settings through
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = 1000,    # Reasonable default cap for conversations
) -> str:
    """
    Generates a synthetic conversation with a specified number of turns.

    Args:
        system_prompt: The initial system prompt defining the context or AI persona.
        model: The model ID to use for generation.
        num_turns: The desired number of conversational turns (1 turn = 1 User + 1 Assistant).
        temperature: Controls randomness (0.0 to 2.0). None means API default.
        top_p: Nucleus sampling probability. None means API default.
        max_tokens: Maximum number of tokens to generate. None means API default.

    Returns:
        A string containing the formatted conversation.
    """
    # We ask the model to generate the whole conversation in one go for
    # simplicity. More complex approaches could involve iterative calls.
    instruction = (
        f"Generate a realistic conversation between a 'User' and an 'Assistant'. "
        f"The conversation should start based on the following system prompt/topic: '{system_prompt}'.\n"
        f"The conversation should have approximately {num_turns} pairs of User/Assistant turns.\n"
        f"Format the output clearly, starting each line with 'User:' or 'Assistant:'.\n\n"
        f"Example Format:\n"
        f"User: Hello!\n"
        f"Assistant: Hi there! How can I help you today?\n"
        f"User: Can you explain photosynthesis?\n"
        f"Assistant: Certainly! Photosynthesis is the process..."
    )

    # Use the user-provided system prompt for the *conversation's* context,
    # but a generic one for the generation *task* itself.
    system_msg_for_generation = (
        f"You are an AI assistant simulating a conversation. "
        f"The context for the conversation you generate is: {system_prompt}"
    )

    # Pass the settings down to generate_synthetic_text
    conversation_text = generate_synthetic_text(
        prompt=instruction,
        model=model,
        system_message=system_msg_for_generation,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    if conversation_text.startswith("Error:"):
        # Propagate the error message
        return f"Error generating conversation for prompt '{system_prompt}':\n{conversation_text}"

    # Basic validation/cleanup (optional)
    if not re.search(r"User:|Assistant:", conversation_text, re.IGNORECASE):
        print(f"Warning: Generated text for conversation '{system_prompt}' might not be in the expected format. Raw text:\n{conversation_text}")
        # Return the raw text anyway; the model's format may just differ slightly
        return f"Generated conversation for prompt '{system_prompt}':\n(Format might vary)\n\n{conversation_text}"

    return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
# Function to generate different types of content based on a topic
def generate_corpus_content(
    topic: str,
    content_type: str,   # e.g., "Corpus Snippets", "Short Story", "Article"
    length_param: int,   # Meaning depends on type (e.g., num snippets, approx words)
    model: str,
    system_message_base: str = "You are a helpful assistant generating synthetic content.",
    temperature: Optional[float] = 0.7,
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = None,  # Estimated per content type below if None
) -> str:
    """
    Generates different types of synthetic content based on a topic.

    Args:
        topic: The central topic for the content.
        content_type: The type of content to generate.
        length_param: A parameter controlling length/quantity (meaning depends on type).
        model: The model ID.
        system_message_base: Base system message (will be specialized).
        temperature: Model temperature.
        top_p: Model top_p.
        max_tokens: Model max_tokens.

    Returns:
        The generated content string or an error message.
    """
    prompt = ""
    system_message = system_message_base  # Start with base

    # --- Construct Prompt based on Content Type ---
    if content_type == "Corpus Snippets":
        if length_param <= 0:
            length_param = 5  # Default number of snippets
        prompt = (
            f"Generate exactly {length_param} distinct text snippets related to the topic: '{topic}'. "
            f"Each snippet should be a few sentences long and focus on a different aspect if possible. "
            f"Present each snippet clearly, perhaps separated by a blank line or a marker like '---'."
        )
        system_message = "You are an AI generating diverse text snippets for a data corpus."
        # Adjust max_tokens based on the expected number of snippets if not set
        if max_tokens is None:
            max_tokens = length_param * 150  # Estimate
    elif content_type == "Short Story":
        if length_param <= 0:
            length_param = 300  # Default approx. words
        prompt = (
            f"Write a short story (approximately {length_param} words) centered around the topic: '{topic}'. "
            f"The story should have a clear beginning, middle, and end."
        )
        system_message = "You are a creative AI writing a short story."
        if max_tokens is None:
            max_tokens = int(length_param * 2.5)  # Estimate
    elif content_type == "Article":
        if length_param <= 0:
            length_param = 500  # Default approx. words
        prompt = (
            f"Write an informative article (approximately {length_param} words) about the topic: '{topic}'. "
            f"The article should be well-structured, factual (to the best of your ability), and engaging."
        )
        system_message = "You are an AI assistant writing an informative article."
        if max_tokens is None:
            max_tokens = int(length_param * 2.5)  # Estimate
    else:
        return f"Error: Unknown content type '{content_type}'."

    if not prompt:
        return "Error: Could not construct a valid prompt."

    # --- Call the core generation function ---
    generated_text = generate_synthetic_text(
        prompt=prompt,
        model=model,
        system_message=system_message,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )

    # Return the result (errors from generate_synthetic_text propagate as-is);
    # add a title for clarity on success.
    if not generated_text.startswith("Error:"):
        return f"Generated {content_type} for topic '{topic}':\n\n{generated_text}"
    else:
        return generated_text  # Propagate the error
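
# Example (a sketch; the topic and length here are purely illustrative):
#
#   article = generate_corpus_content(
#       topic="urban beekeeping",
#       content_type="Article",
#       length_param=400,   # approx. words; max_tokens estimated as 400 * 2.5
#       model="deepseek/deepseek-chat-v3-0324:free",
#   )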
# --- Main Execution (Example Usage) ---
if __name__ == "__main__":
    print("--- Testing Basic Text Generation ---")
    test_prompt = "Describe the benefits of using synthetic data."
    text_result = generate_synthetic_text(test_prompt, temperature=0.5, max_tokens=100)  # Example with settings
    print(f"Prompt: {test_prompt}\nResult:\n{text_result}\n")

    print("\n--- Testing Prompt Generation ---")
    try:
        num_prompts_to_gen = 3
        prompts_result = generate_prompts(num_prompts_to_gen, "deepseek/deepseek-chat-v3-0324:free")
        print(f"Generated {len(prompts_result)} prompts:")
        for i, p in enumerate(prompts_result):
            print(f"{i + 1}. {p}")
    except ValueError as e:
        print(f"Error generating prompts: {e}")

    print("\n--- Testing Conversation Generation ---")
    conv_prompt = "Act as a helpful expert explaining the difference between nuclear fission and fusion."
    num_conv_turns = 3
    conv_result = generate_synthetic_conversation(conv_prompt, "deepseek/deepseek-chat-v3-0324:free", num_conv_turns)
    print(f"{conv_result}\n")

    print("\n--- Testing with Invalid API Key ---")
    # Temporarily use an invalid key for testing error handling
    original_key = client.api_key
    client.api_key = "invalid-key"
    error_text_result = generate_synthetic_text("Test prompt")
    print(f"Result with invalid key: {error_text_result}")
    client.api_key = original_key  # Restore original key

    print("\nGeneration tests complete.")