|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Any |
|
|
|
from camel.prompts import TextPrompt, TextPromptDict |
|
from camel.types import RoleType |
|
|
|
|
|
|
|
class GenerateTextEmbeddingDataPromptTemplateDict(TextPromptDict): |
|
r"""A :obj:`TextPrompt` dictionary containing text embedding tasks |
|
generation, query, positive and hard negative samples generation, |
|
from the `"Improving Text Embeddings with Large Language Models" |
|
<https://arxiv.org/abs/2401.00368>`_ paper. |
|
|
|
|
|
Attributes: |
|
GENERATE_TASKS (TextPrompt): A prompt to generate a list |
|
of :obj:`num_tasks` synthetic text_embedding tasks. |
|
ASSISTANT_PROMPT (TextPrompt): A system prompt for the AI assistant |
|
to generate synthetic :obj:`user_query`, :obj:`positive document`, |
|
and :obj:`hard_negative_document` for a specific :obj:`task` with |
|
specified parameters including :obj:`query_type`, |
|
:obj:`query_length`, :obj:`clarity`, :obj:`num_words`, |
|
:obj:`language` and :obj:`difficulty`. |
|
""" |
|
|
|
GENERATE_TASKS = TextPrompt( |
|
"""You are an expert to brainstorm a list of {num_tasks} potentially useful text retrieval tasks |
|
Here are a few examples for your reference: |
|
- Provided a scientific claim as query, retrieve documents that help verify or refute the claim. |
|
- Search for documents that answers a FAQ-style query on children's nutrition. |
|
Please adhere to the following guidelines: |
|
- Specify what the query is, and what the desired documents are. |
|
- Each retrieval task should cover a wide range of queries, and should not be too specific. |
|
Your output should always be a python list of strings starting with `1.`, `2.` etc. |
|
And each element corresponds to a distinct retrieval task in one sentence. |
|
Do not explain yourself or output anything else. |
|
Be creative!""" |
|
) |
|
|
|
ASSISTANT_PROMPT = TextPrompt( |
|
"""You have been assigned a retrieval task: {task} |
|
Your mission is to write one text retrieval example for this task in JSON format. The JSON object must |
|
contain the following keys: |
|
- "user_query": a string, a random user search query specified by the retrieval task. |
|
- "positive_document": a string, a relevant document for the user query. |
|
- "hard_negative_document": a string, a hard negative document that only appears relevant to the query. |
|
Please adhere to the following guidelines: |
|
- The "user_query" should be {query_type}, {query_length}, {clarity}, and diverse in topic. |
|
- All documents must be created independent of the query. Avoid copying the query verbatim. |
|
It's acceptable if some parts of the "positive_document" are not topically related to the query. |
|
- All documents should be at least {num_words} words long. |
|
- The "hard_negative_document" contains some useful information, but it should be less useful or comprehensive compared to the "positive_document". |
|
- Both the query and documents should be in {language}. |
|
- Do not provide any explanation in any document on why it is relevant or not relevant to the query. |
|
- Both the query and documents require {difficulty} level education to understand. |
|
Your output must always be a JSON object only (starting and ending with curly brackets), do not explain yourself or output anything else. Be creative!""" |
|
) |
|
|
|
def __init__(self, *args: Any, **kwargs: Any) -> None: |
|
super().__init__(*args, **kwargs) |
|
self.update( |
|
{ |
|
"generate_tasks": self.GENERATE_TASKS, |
|
RoleType.ASSISTANT: self.ASSISTANT_PROMPT, |
|
} |
|
) |
|
|