leo-pasi committed on
Commit
5de4570
·
1 Parent(s): ee601ba

updated source code

Browse files
src/mythesis_chatbot/evaluation.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ from trulens.apps.llamaindex import TruLlama
6
+ from trulens.core import Feedback
7
+ from trulens.providers.openai import OpenAI
8
+
9
+ from mythesis_chatbot.utils import get_config_hash
10
+
11
+
12
def run_evals(eval_questions_path: Path, tru_recorder, query_engine):
    """Run every evaluation question through the query engine while recording.

    Args:
        eval_questions_path: Text file holding one evaluation question per
            line. Surrounding whitespace is stripped from each line.
        tru_recorder: Context manager entered around each individual query
            (e.g. a TruLens recorder).
        query_engine: Object exposing ``query(question)``.

    Blank lines (such as a trailing newline or spacing between questions) are
    skipped so that no empty prompt is sent to the engine.
    """
    # Read explicitly as UTF-8 so the result does not depend on the locale.
    with open(eval_questions_path, encoding="utf-8") as file:
        eval_questions = [line.strip() for line in file if line.strip()]

    for question in tqdm(eval_questions):
        # The recorder captures the call as a side effect of the context
        # manager; neither the recording nor the response is needed here.
        with tru_recorder:
            query_engine.query(question)
23
+
24
+
25
+ # Feedback function
26
def f_answer_relevance(provider=None, name="Answer Relevance"):
    """Build the feedback that scores the answer against the question.

    Args:
        provider: Feedback provider exposing ``relevance_with_cot_reasons``.
            When omitted, an ``OpenAI()`` provider is created at call time.
        name: Label given to the resulting feedback.

    Returns:
        A ``Feedback`` over ``provider.relevance_with_cot_reasons`` applied
        with ``on_input_output()``.
    """
    # Defaults in a signature are evaluated once, when the module is imported.
    # Creating the provider here instead avoids building an OpenAI provider
    # merely by importing this module.
    if provider is None:
        provider = OpenAI()
    return Feedback(provider.relevance_with_cot_reasons, name=name).on_input_output()
28
+
29
+
30
+ # Feedback function
31
def f_context_relevance(
    provider=None,
    context=None,
    name="Context Relevance",
):
    """Build the feedback that scores retrieved context against the question.

    Args:
        provider: Feedback provider exposing ``relevance``. When omitted, an
            ``OpenAI()`` provider is created at call time.
        context: Selector passed to ``.on(...)``. When omitted,
            ``TruLlama.select_source_nodes().node.text`` is used.
        name: Label given to the resulting feedback.

    Returns:
        A ``Feedback`` over ``provider.relevance`` applied with ``on_input()``
        and ``on(context)``, aggregated with ``np.mean``.
    """
    # Defaults in a signature are evaluated once, when the module is imported.
    # Resolving them here avoids building an OpenAI provider merely by
    # importing this module.
    if provider is None:
        provider = OpenAI()
    if context is None:
        context = TruLlama.select_source_nodes().node.text
    return (
        Feedback(provider.relevance, name=name)
        .on_input()
        .on(context)
        .aggregate(np.mean)
    )
42
+
43
+
44
+ # Feedback function
45
def f_groundedness(
    provider=None,
    context=None,
    name="Groundedness",
):
    """Build the feedback that scores the answer against retrieved context.

    Args:
        provider: Feedback provider exposing
            ``groundedness_measure_with_cot_reasons``. When omitted, an
            ``OpenAI()`` provider is created at call time.
        context: Selector passed to ``.on(...)``. When omitted,
            ``TruLlama.select_source_nodes().node.text`` is used.
        name: Label given to the resulting feedback.

    Returns:
        A ``Feedback`` over ``provider.groundedness_measure_with_cot_reasons``
        applied with ``on(context)`` and ``on_output()``.
    """
    # Defaults in a signature are evaluated once, when the module is imported.
    # Resolving them here avoids building an OpenAI provider merely by
    # importing this module.
    if provider is None:
        provider = OpenAI()
    if context is None:
        context = TruLlama.select_source_nodes().node.text
    return (
        Feedback(
            provider.groundedness_measure_with_cot_reasons,
            name=name,
        )
        .on(context)
        .on_output()
    )
58
+
59
+
60
def get_prebuilt_trulens_recorder(
    query_engine, query_engine_config: dict[str, str | int]
):
    """Wrap ``query_engine`` in a ``TruLlama`` recorder with this module's feedbacks.

    Args:
        query_engine: Engine handed to ``TruLlama`` as the recorded app.
        query_engine_config: Engine configuration. Its ``"rag_mode"`` entry
            becomes the app name, ``get_config_hash`` of the whole mapping
            becomes the app version, and the mapping itself is attached as
            metadata.

    Returns:
        The ``TruLlama`` recorder, configured with the answer relevance,
        context relevance and groundedness feedbacks.
    """
    # A dict literal evaluates its values top to bottom, so the config lookup
    # and hash still happen before any feedback is constructed.
    recorder_options = {
        "app_name": query_engine_config["rag_mode"],
        "app_version": get_config_hash(query_engine_config),
        "metadata": query_engine_config,
        "feedbacks": [
            f_answer_relevance(),
            f_context_relevance(),
            f_groundedness(),
        ],
    }
    return TruLlama(query_engine, **recorder_options)
src/mythesis_chatbot/rag_setup.py CHANGED
@@ -25,9 +25,11 @@ from llama_index.core.retrievers import AutoMergingRetriever
25
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
26
  from llama_index.llms.openai import OpenAI
27
 
28
- from src.mythesis_chatbot.utils import get_config_hash, get_openai_api_key
29
 
30
- SupportedRags = Literal["basic", "sentence window retrieval", "auto-merging retrieval"]
 
 
31
  SupportedOpenAIllms = Literal["gpt-4o-mini", "gpt-3.5-turbo"]
32
  SupportedEmbedModels = Literal["BAAI/bge-small-en-v1.5"]
33
  SupportedRerankModels = Literal["cross-encoder/ms-marco-MiniLM-L-2-v2"]
@@ -167,6 +169,7 @@ def sentence_window_retrieval_setup(
167
  similarity_top_k: int = 6,
168
  rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
169
  rerank_top_n: int = 2,
 
170
  ):
171
 
172
  openai.api_key = get_openai_api_key()
@@ -204,6 +207,7 @@ def automerging_retrieval_setup(
204
  similarity_top_k: int = 6,
205
  rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
206
  rerank_top_n: int = 2,
 
207
  ):
208
  openai.api_key = get_openai_api_key()
209
 
@@ -239,6 +243,7 @@ def basic_rag_setup(
239
  similarity_top_k: int = 6,
240
  rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
241
  rerank_top_n: int = 2,
 
242
  ):
243
  openai.api_key = get_openai_api_key()
244
 
 
25
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
26
  from llama_index.llms.openai import OpenAI
27
 
28
+ from mythesis_chatbot.utils import get_config_hash, get_openai_api_key
29
 
30
+ SupportedRags = Literal[
31
+ "classic retrieval", "sentence window retrieval", "auto-merging retrieval"
32
+ ]
33
  SupportedOpenAIllms = Literal["gpt-4o-mini", "gpt-3.5-turbo"]
34
  SupportedEmbedModels = Literal["BAAI/bge-small-en-v1.5"]
35
  SupportedRerankModels = Literal["cross-encoder/ms-marco-MiniLM-L-2-v2"]
 
169
  similarity_top_k: int = 6,
170
  rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
171
  rerank_top_n: int = 2,
172
+ **kwargs
173
  ):
174
 
175
  openai.api_key = get_openai_api_key()
 
207
  similarity_top_k: int = 6,
208
  rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
209
  rerank_top_n: int = 2,
210
+ **kwargs
211
  ):
212
  openai.api_key = get_openai_api_key()
213
 
 
243
  similarity_top_k: int = 6,
244
  rerank_model: SupportedRerankModels = "cross-encoder/ms-marco-MiniLM-L-2-v2",
245
  rerank_top_n: int = 2,
246
+ **kwargs
247
  ):
248
  openai.api_key = get_openai_api_key()
249
 
src/mythesis_chatbot/run_evaluation.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ import os
3
+ import pandas as pd
4
+ import nest_asyncio
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ sys.path.append(str(Path(__file__).resolve().parents[1]))
9
+
10
+ from mythesis_chatbot import evaluation
11
+ from trulens.core import TruSession
12
+ from mythesis_chatbot.rag_setup import (
13
+ sentence_window_retrieval_setup,
14
+ )
15
+ import yaml
16
+ from trulens.dashboard.display import get_feedback_result
17
+ from trulens.dashboard import run_dashboard
18
+
19
# %%
# Build the sentence-window query engine from its YAML configuration.
# NOTE(review): every relative path in this script is resolved against the
# current working directory; presumably the cells are meant to be run from
# this script's own folder — confirm.
with open(
    os.path.join("../../configs", "sentence_window.yaml"), "r", encoding="utf-8"
) as f:
    config = yaml.safe_load(f)

engine = sentence_window_retrieval_setup(
    input_file="../../data/Master_Thesis.pdf", save_dir="../../data/indices", **config
)

# NOTE(review): reset_database() runs on every execution of this cell against
# whatever database SUPABASE_CONNECTION_STRING selects; presumably it discards
# previously stored records — confirm before pointing at a shared database.
tru = TruSession(database_url=os.getenv("SUPABASE_CONNECTION_STRING"))
tru.reset_database()
nest_asyncio.apply()


# %%
# Recorder carrying the answer relevance / context relevance / groundedness
# feedbacks defined in the evaluation module.
tru_recorder = evaluation.get_prebuilt_trulens_recorder(engine, config)

# %%
# Single smoke-test query recorded through the recorder.
query = "Why?"
with tru_recorder as recording:
    response = engine.query(query)

# %%
# Kept as a top-level name for interactive inspection.
database = tru_recorder.db

# %%
# Block until the feedbacks of the smoke-test record are available, then
# print each one.
rec = recording.get()

for feedback, feedback_result in rec.wait_for_feedback_results().items():
    print(feedback.name, feedback_result.result)


# %%
# Run the full list of evaluation questions through the recorder.
evaluation.run_evals(
    os.path.join("../../data/", "eval_questions.txt"), tru_recorder, engine
)

# %%
records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()

# %%
# Show full cell contents so long inputs/outputs are not truncated.
pd.set_option("display.max_colwidth", None)
records[["input", "output"] + feedback]

# %%
tru.get_leaderboard(app_ids=[])

# %%
# Use the dashboard entry point this script already imports rather than the
# session method.
run_dashboard(tru)