File size: 4,545 Bytes
ed4d993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from __future__ import annotations

from typing import Dict, List, Optional, cast

import requests
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Extra, SecretStr, root_validator
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env


class ClovaEmbeddings(BaseModel, Embeddings):
    """
    Clova's embedding service.

    To use this service, set the following environment variables with your
    API tokens and application ID, or pass the values as named parameters
    to the constructor:

    - ``CLOVA_EMB_API_KEY``: API key for accessing Clova's embedding service.
    - ``CLOVA_EMB_APIGW_API_KEY``: API gateway key for enhanced security.
    - ``CLOVA_EMB_APP_ID``: Application ID for identifying your application.

    Each text is embedded with its own HTTP POST request to
    ``{endpoint_url}/{model}/{app_id}``.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import ClovaEmbeddings
            embeddings = ClovaEmbeddings(
                clova_emb_api_key='your_clova_emb_api_key',
                clova_emb_apigw_api_key='your_clova_emb_apigw_api_key',
                app_id='your_app_id'
            )

            query_text = "This is a test query."
            query_result = embeddings.embed_query(query_text)

            document_text = "This is a test document."
            document_result = embeddings.embed_documents([document_text])

    """

    endpoint_url: str = (
        "https://clovastudio.apigw.ntruss.com/testapp/v1/api-tools/embedding"
    )
    """Endpoint URL to use."""
    model: str = "clir-emb-dolphin"
    """Embedding model name to use."""
    clova_emb_api_key: Optional[SecretStr] = None
    """API key for accessing Clova's embedding service."""
    clova_emb_apigw_api_key: Optional[SecretStr] = None
    """API gateway key for enhanced security."""
    app_id: Optional[SecretStr] = None
    """Application ID for identifying your application."""
    timeout: Optional[float] = 60.0
    """Seconds to wait for the API per request; ``None`` waits indefinitely."""

    class Config:
        extra = Extra.forbid

    @root_validator(pre=True, allow_reuse=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the credentials and store them as secret strings.

        Each credential is looked up in ``values`` first and then in its
        environment variable (``CLOVA_EMB_API_KEY``,
        ``CLOVA_EMB_APIGW_API_KEY``, ``CLOVA_EMB_APP_ID``).

        Args:
            values: The raw keyword arguments passed to the constructor.

        Returns:
            The same mapping with the three credentials wrapped by
            ``convert_to_secret_str``.
        """
        values["clova_emb_api_key"] = convert_to_secret_str(
            get_from_dict_or_env(values, "clova_emb_api_key", "CLOVA_EMB_API_KEY")
        )
        values["clova_emb_apigw_api_key"] = convert_to_secret_str(
            get_from_dict_or_env(
                values, "clova_emb_apigw_api_key", "CLOVA_EMB_APIGW_API_KEY"
            )
        )
        values["app_id"] = convert_to_secret_str(
            get_from_dict_or_env(values, "app_id", "CLOVA_EMB_APP_ID")
        )
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed a list of texts and return their embeddings.

        One API request is issued per text, in order.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text (empty if ``texts`` is empty).

        Raises:
            ValueError: If any request does not yield an embedding.
        """
        return [self._embed_text(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """
        Embed a single query text and return its embedding.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.

        Raises:
            ValueError: If the request does not yield an embedding.
        """
        return self._embed_text(text)

    def _embed_text(self, text: str) -> List[float]:
        """
        Internal method to call the embedding API and handle the response.

        Args:
            text: The text to embed.

        Returns:
            The value found at ``result.embedding`` in the JSON response.

        Raises:
            ValueError: If the status code is not 200, or the response body
                does not contain ``result.embedding``.
            requests.exceptions.RequestException: If the HTTP request itself
                fails, including when ``timeout`` is exceeded.
        """
        payload = {"text": text}

        # HTTP headers for authorization
        headers = {
            "X-NCP-CLOVASTUDIO-API-KEY": cast(
                SecretStr, self.clova_emb_api_key
            ).get_secret_value(),
            "X-NCP-APIGW-API-KEY": cast(
                SecretStr, self.clova_emb_apigw_api_key
            ).get_secret_value(),
            "Content-Type": "application/json",
        }

        # send request; the timeout keeps a stalled connection from blocking
        # the caller forever (requests applies no timeout by default)
        app_id = cast(SecretStr, self.app_id).get_secret_value()
        response = requests.post(
            f"{self.endpoint_url}/{self.model}/{app_id}",
            headers=headers,
            json=payload,
            timeout=self.timeout,
        )

        # check for errors
        if response.status_code == 200:
            response_data = response.json()
            # Guard the shape explicitly: a null/non-dict body or "result"
            # must end in the ValueError below, not a TypeError from `in`.
            result = (
                response_data.get("result")
                if isinstance(response_data, dict)
                else None
            )
            if isinstance(result, dict) and "embedding" in result:
                return result["embedding"]
        raise ValueError(
            f"API request failed with status {response.status_code}: {response.text}"
        )