aleksandrrnt committed on
Commit
8cf08be
·
verified ·
1 Parent(s): d6d7cc9

Upload 3 files

Browse files
Files changed (3) hide show
  1. db.py +63 -0
  2. rag.py +72 -0
  3. sources.txt +3 -0
db.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import requests
from chromadb import Client, Settings, PersistentClient
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import SentenceTransformerEmbeddingFunction

import logging

# Module-level logger for the vector-DB layer.
logger = logging.getLogger("db")
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Jina Reader API key taken from the environment.
# NOTE(review): if JINA_API_KEY is unset this is None and the header below
# becomes "Bearer None" — confirm the variable is always set at deploy time.
JINA_KEY = os.getenv('JINA_API_KEY')

# Authorization header sent with every Jina Reader request.
jina_headers = {
    "Authorization": f"Bearer {JINA_KEY}"
}
21
def get_data_url(url):
    """Fetch the readable text of *url* via the Jina Reader proxy.

    Parameters
    ----------
    url : str
        Page to scrape; appended verbatim to the r.jina.ai reader endpoint.

    Returns
    -------
    str
        Response body as returned by Jina Reader.

    Raises
    ------
    requests.Timeout
        If the reader does not respond within the timeout.
    """
    # Lazy %-args: the message is only built if INFO is enabled.
    logger.info("Scraping %s", url)
    # SECURITY: verify=False disables TLS certificate validation — kept to
    # preserve existing behavior, but it should be re-enabled if possible.
    # A timeout is added so one stalled request cannot hang ingestion forever.
    jina_response = requests.get(
        f"https://r.jina.ai/{url}",
        headers=jina_headers,
        verify=False,
        timeout=60,
    )

    return jina_response.text
26
+
27
class HacatonDB:
    """Persistent Chroma collection of scraped pages, keyed by source URL.

    Documents are embedded with the BAAI/bge-m3 sentence-transformer and
    compared with cosine distance (hnsw:space = "cosine").
    """

    def __init__(self):
        # On-disk Chroma client (default path), anonymized telemetry off.
        self.client = PersistentClient(settings=Settings(anonymized_telemetry=False))
        self.embed = SentenceTransformerEmbeddingFunction(
            model_name="BAAI/bge-m3"
        )
        # get_or_create=True makes repeated start-ups idempotent.
        self.collection = self.client.create_collection(
            'test_hakaton',
            embedding_function=self.embed,
            metadata={"hnsw:space": "cosine"},
            get_or_create=True,
        )

    def add(self, urls):
        """Scrape and index every URL in *urls* that is not already stored.

        Each document's id is the URL itself, so membership is checked with a
        cheap collection.get() before the expensive scrape.
        """
        logger.info("Add info to collection")
        texts = []
        meta = []
        new_urls = []
        for url in urls:
            # Skip URLs already indexed (document id == url).
            if len(self.collection.get(ids=[url])["ids"]) > 0:
                logger.info("URL %s already exist", url)
                continue

            new_urls.append(url)
            texts.append(get_data_url(url))
            # file_name derives from the second-to-last path segment;
            # assumes source URLs end with a trailing slash — TODO confirm.
            meta.append({"file_name": f"file_{url.split('/')[-2]}"})
            logger.info("URL %s added", url)

        if len(new_urls) > 0:
            self.collection.add(documents=texts, ids=new_urls, metadatas=meta)
            logger.info("Addition %s sources completed", len(new_urls))
        else:
            logger.info("No new sources")

    def update(self, urls):
        """Placeholder for re-scraping already-indexed URLs; not implemented."""
        pass

    def query(self, query, top_k):
        """Return the *top_k* nearest documents for *query* (Chroma result dict)."""
        return self.collection.query(query_texts=query, n_results=top_k)
61
+
62
+
63
# Module-level singleton: importing this module initialises the Chroma client
# and loads the embedding model as a side effect.
db = HacatonDB()
rag.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import openai
from db import db
import logging

# Module-level logger for the RAG layer.
logger = logging.getLogger("rag")
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)


# Mistral is reached through its OpenAI-compatible endpoint.
# NOTE(review): if MISTRAL_API_KEY is unset the client is built with
# api_key=None and every call will fail — confirm the env var is set.
MISTRAL_KEY=os.getenv('MISTRAL_API_KEY')
MISTRAL_URL="https://api.mistral.ai/v1"
MISTRAL_MODEL="mistral-small-latest"


client = openai.OpenAI(api_key=MISTRAL_KEY, base_url=MISTRAL_URL)

# Prompt template (Russian): company-experience context followed by the
# incoming project request; instructs the model to rely only on this material.
message_template = """\
Далее представлена информацию по опыту нашей компании
---------------------
{retrieved_chunks}
---------------------
Далее представлен запрос потенциального проекта
---------------------
{request_content}
---------------------
При проведении анализа опирайся только на представленную информацию"""
31
+
32
# Handle a request to the LLM (RAG over the company-experience collection).
def process_query(req_file, system_prompt):
    """Answer a project request using retrieval-augmented generation.

    Parameters
    ----------
    req_file : file-like or None
        Object with a ``.name`` attribute pointing at a UTF-8 text file with
        the project request (presumably a UI upload — verify against caller).
        ``None`` means no file was provided; an empty request is used instead.
    system_prompt : str
        System message steering the analysis.

    Returns
    -------
    str
        The LLM's answer text.
    """
    logger.info("Process query")

    if req_file is not None:
        with open(req_file.name, 'r', encoding='utf-8') as f:
            req_file_content = f.read()
    else:
        logger.warning("File is not loaded!")
        req_file_content = ""

    logger.info("Retrieve docs")

    # The request text itself is the retrieval query; only the single best
    # match is used as context.
    docs = db.query(req_file_content, top_k=1)

    logger.info("Retrieved %s docs", len(docs['ids'][0]))

    doc_context = '\n\n'.join(docs['documents'][0])

    # Build the user message: retrieved context + the incoming request.
    user_message = message_template.format(retrieved_chunks=doc_context, request_content=req_file_content)

    # Assemble the chat messages for the LLM.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message}
    ]

    logger.info("LLM call")

    response = client.chat.completions.create(
        messages=messages,
        model=MISTRAL_MODEL,
    )

    logger.info("LLM call completed")

    # Extract the answer from the first completion choice.
    llm_response = response.choices[0].message.content
    return llm_response
sources.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ https://www.reksoft.ru/blog/portfolio/iserver-ea-otp-bank/
2
+ https://www.reksoft.ru/blog/portfolio/iserver-otkrytie-broker/
3
+ https://www.reksoft.ru/blog/portfolio/func_test/