Spaces:

AtlaAI
/

selene

Running

App Files Community

kaikaidai committed 5 days ago

Commit

1fc8c4c

verified ·

1 Parent(s): b477b2d

Synced repo using 'sync_with_huggingface' Github Action

Browse files

Files changed (3) hide show

.DS_Store +0 -0
random_sample/arena_interface.py +31 -62
random_sample/gen_api_answer.py +2 -75

.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

random_sample/arena_interface.py CHANGED Viewed

@@ -6,17 +6,13 @@ from dotenv import load_dotenv
 load_dotenv()
 from .gen_api_answer import (
-    get_atla_response,
-    get_selene_mini_response,
-    parse_selene_mini_response
 )
 from .prompts import (
     DEFAULT_EVAL_CRITERIA,
     DEFAULT_EVAL_PROMPT,
-    DEFAULT_EVAL_PROMPT_EDITABLE,
-    ATLA_PROMPT,
-    ATLA_PROMPT_WITH_REFERENCE
 )
 from .random_sample_generation import (
@@ -255,62 +251,35 @@ def create_arena_interface():
             ai_response,
             ground_truth,
         ):
-            if model_choice == "Selene Mini":
-                # Prepare prompt based on reference mode
-                prompt_template = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
-                prompt = prompt_template.format(
-                    human_input=human_input,
-                    ai_response=ai_response,
-                    eval_criteria=eval_criteria_text,
-                    ground_truth=ground_truth if use_reference else ""
-                )
-                print("\n=== Debug: Prompt being sent to Selene Mini ===")
-                print(prompt)
-                print("============================================\n")
-                # Get and parse response
-                raw_response = get_selene_mini_response(
-                    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
-                    prompt=prompt,
-                    max_tokens=500,
-                    temperature=0.01
-                )
-                response = parse_selene_mini_response(raw_response)
-            else:
-                # Selene API logic
-                prompt_data = {
-                    'human_input': human_input,
-                    'ai_response': ai_response,
-                    'ground_truth': ground_truth if use_reference else None,
-                    'eval_criteria': eval_criteria_text,
-                }
-                print("\n=== Debug: Prompt data being sent to Selene API ===")
-                print(json.dumps(prompt_data, indent=2))
-                print("============================================\n")
-                response = get_atla_response(
-                    model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
-                    prompt=prompt_data,
-                    max_tokens=500,
-                    temperature=0.01
-                )
-            # Response now contains score and critique directly
-            if isinstance(response, dict) and 'score' in response and 'critique' in response:
-                score = str(response['score'])
-                critique = response['critique']
-            else:
-                score = "Error"
-                critique = str(response)
-            return [
-                score,
-                critique,
-                gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
-                gr.update(value="🎲"),
-            ]
         # Update the send_btn click handler with new input
         send_btn.click(

 load_dotenv()
 from .gen_api_answer import (
+    get_atla_response
 )
 from .prompts import (
     DEFAULT_EVAL_CRITERIA,
     DEFAULT_EVAL_PROMPT,
+    DEFAULT_EVAL_PROMPT_EDITABLE
 )
 from .random_sample_generation import (
             ai_response,
             ground_truth,
         ):
+            # Prepare prompt data for both models
+            prompt_data = {
+                'human_input': human_input,
+                'ai_response': ai_response,
+                'ground_truth': ground_truth if use_reference else None,
+                'eval_criteria': eval_criteria_text,
+            }
+            print("\n=== Debug: Prompt data being sent to Selene API ===")
+            print(json.dumps(prompt_data, indent=2))
+            print("============================================\n")
+            # Use appropriate model ID based on selection
+            model_id = "atla-selene-mini" if model_choice == "Selene Mini" else "atla-selene"
+            response = get_atla_response(
+                model_name=model_id,
+                prompt=prompt_data,
+                max_tokens=500,
+                temperature=0.01
+            )
+            # Format the response for display
+            score_text = f"{response['score']}/5"
+            critique_text = f"{response['critique']}"
+            # Return all required values for the UI components
+            return score_text, critique_text, gr.update(value="Regenerate evaluation", variant="secondary", interactive=True), gr.update(value="🎲", variant="primary")
         # Update the send_btn click handler with new input
         send_btn.click(

random_sample/gen_api_answer.py CHANGED Viewed

@@ -7,10 +7,6 @@ from dotenv import load_dotenv
 from .prompts import (
     JUDGE_SYSTEM_PROMPT
 )
-from transformers import AutoTokenizer
-import requests
-import json
-import re
 load_dotenv()
@@ -63,7 +59,7 @@ def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, te
         evaluation_criteria = prompt.get('eval_criteria', '')
         response = atla_client.evaluation.create(
-            model_id="atla-selene",
             model_input=model_input,
             model_output=model_output,
             expected_model_output=expected_output if expected_output else None,
@@ -76,73 +72,4 @@ def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, te
             "critique": response.result.evaluation.critique
         }
     except Exception as e:
-        return f"Error with Atla model {model_name}: {str(e)}"
-def get_selene_mini_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
-    """Get response from HF endpoint for Atla model"""
-    try:
-        headers = {
-            "Accept": "application/json",
-            "Authorization": f"Bearer {hf_api_key}",
-            "Content-Type": "application/json"
-        }
-        # Create messages list for chat template
-        messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append({"role": "user", "content": prompt})
-        # Apply chat template
-        model_id = "AtlaAI/Selene-1-Mini-Llama-3.1-8B"
-        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
-        formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        payload = {
-            "inputs": formatted_prompt,
-            "parameters": {
-                "max_new_tokens": max_tokens,
-                "return_full_text": False,
-                "temperature": temperature,
-                "seed": 42,
-                "add_generation_prompt": True
-            }
-        }
-        response = requests.post(
-            "https://bkp9p28gri93egqh.us-east-1.aws.endpoints.huggingface.cloud",
-            headers=headers,
-            json=payload
-        )
-        return response.json()[0]["generated_text"]
-    except Exception as e:
-        return f"Error with Atla model {model_name}: {str(e)}"
-def parse_selene_mini_response(response_text):
-    """Parse the response from Selene Mini to extract score and critique"""
-    try:
-        # Clean up the response text
-        response_text = response_text.strip()
-        # More flexible regex patterns
-        reasoning_pattern = r'\*\*Reasoning:?\*\*\s*(.*?)(?=\*\*Result|$)'
-        result_pattern = r'\*\*Result:?\*\*\s*(\d+)'
-        reasoning_match = re.search(reasoning_pattern, response_text, re.DOTALL | re.IGNORECASE)
-        result_match = re.search(result_pattern, response_text, re.IGNORECASE)
-        if reasoning_match and result_match:
-            critique = reasoning_match.group(1).strip()
-            score = result_match.group(1)
-            return {"score": score, "critique": critique}
-        else:
-            # If we can't parse it properly, let's return the raw response as critique
-            return {
-                "score": "Error",
-                "critique": f"Failed to parse response. Raw response:\n{response_text}"
-            }
-    except Exception as e:
-        return {
-            "score": "Error",
-            "critique": f"Error parsing response: {str(e)}\nRaw response:\n{response_text}"
-        }

 from .prompts import (
     JUDGE_SYSTEM_PROMPT
 )
 load_dotenv()
         evaluation_criteria = prompt.get('eval_criteria', '')
         response = atla_client.evaluation.create(
+            model_id=model_name,  # Will be either "atla-selene" or "atla-selene-mini"
             model_input=model_input,
             model_output=model_output,
             expected_model_output=expected_output if expected_output else None,
             "critique": response.result.evaluation.critique
         }
     except Exception as e:
+        return f"Error with Atla model {model_name}: {str(e)}"