georad committed on
Commit
1b200e2
·
verified ·
1 Parent(s): b9153c6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from io import StringIO
4
+ import json
5
+ import torch
6
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM #AutoModelForTokenClassification
7
+ from sentence_transformers import SentenceTransformer, util
8
+ #import lmdeploy
9
+ #import turbomind as tm
10
+
11
+ from huggingface_hub import login
12
+ #login(token =)
13
+
14
def on_click():
    """Button callback: reset the ``user_input`` session-state entry to an empty string."""
    st.session_state["user_input"] = ""
16
+
17
+ #@st.cache
18
def convert_df(df: pd.DataFrame):
    """Serialise *df* to CSV text without the index column and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
20
+
21
+ #@st.cache
22
def convert_json(df: pd.DataFrame):
    """Return *df* as a JSON string keyed by row index.

    The frame is dumped with ``orient="index"`` and then round-tripped through
    ``json.loads`` / ``json.dumps``, so the returned text uses the ``json``
    module's default formatting.
    """
    return json.dumps(json.loads(df.to_json(orient="index")))
28
+
29
+ #st.title("📘SBS mapper")
30
+
31
# Free-text internal description; stored in session state under "user_input".
INTdesc_input = st.text_input(
    label="Type internal description and hit Enter",
    key="user_input",
)

# Two side-by-side buttons: run the mapping (left) and reset via on_click (right).
createSBScodes, right_column = st.columns(2)
with createSBScodes:
    createSBScodes_clicked = st.button("Create SBS codes", key="user_createSBScodes")
with right_column:
    st.button("Reset", on_click=on_click)

# Number of candidate mappings to keep; hard-coded rather than read from the UI.
numMAPPINGS_input = 5
40
+
41
+
42
@st.cache_resource
def _load_search_index(model_name, csv_path, first_line, last_line):
    """Load the sentence encoder and the SBS table slice, and embed the slice once.

    Streamlit re-executes the whole script on every interaction. Caching keeps
    the model initialisation, the CSV read and the corpus encoding out of each
    rerun.

    Args:
        model_name: SentenceTransformer model identifier.
        csv_path: Path to the SBS table CSV; its first row is the header.
        first_line: First line of the slice to read.
            NOTE(review): presumably a 1-based line number in the CSV file;
            confirm against the file.
        last_line: Last line of the slice to read (inclusive).

    Returns:
        Tuple ``(encoder, table, descriptions, embeddings)`` where
        ``descriptions`` is the ``Long_Description`` column as a list and
        ``embeddings`` is ``encoder.encode(descriptions)``.
    """
    encoder = SentenceTransformer(model_name)
    table = pd.read_csv(
        csv_path,
        header=0,
        skip_blank_lines=False,
        # Row 0 is the header; skip everything between it and the slice start.
        skiprows=range(1, first_line - 1),
        nrows=last_line - first_line + 1,
    )
    descriptions = table["Long_Description"].values.tolist()
    return encoder, table, descriptions, encoder.encode(descriptions)


@st.cache_resource
def _load_llm(llm_id):
    """Build the text-generation pipeline for *llm_id* once and reuse it across reruns."""
    return pipeline(
        "text-generation",
        model=llm_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )


from_line = 7727  # Imaging services chapter start, adjust as needed
to_line = 8239  # Imaging services chapter end, adjust as needed

# 'all-MiniLM-L6-v2' was chosen as the fastest encoder; 'all-mpnet-base-v2'
# was noted as the best-performing alternative.
model, df_SBS, SBScorpus, SBScorpus_embeddings = _load_search_index(
    "all-MiniLM-L6-v2", "SBS_V2_Table.csv", from_line, to_line
)

INTdesc_embedding = model.encode(INTdesc_input)

# Semantic search: cosine similarity between the query and every SBS description.
# semantic_search returns one list of hits per query, already ordered by
# decreasing score, so no extra sort is needed; top_k limits the hits kept.
HF_model_results = util.semantic_search(
    INTdesc_embedding,
    SBScorpus_embeddings,
    top_k=numMAPPINGS_input,
)
# Kept as aliases so downstream code sees the same names and the same
# list-of-hit-lists shape as before.
HF_model_results_sorted = HF_model_results
HF_model_results_displayed = HF_model_results

model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = _load_llm(model_id)
76
+
77
+
78
+
79
# Three result columns: similarity score, SBS code, SBS long description.
col1, col2, col3 = st.columns([1, 1, 2.5])
col1.subheader("Score")
col2.subheader("SBS code")
col3.subheader("SBS description V2.0")

# Column-oriented accumulator that feeds the downloadable DataFrame.
dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}

# Only run when the button was pressed AND the query is non-blank
# (st.text_input yields "" rather than None for an empty box).
if createSBScodes_clicked and (INTdesc_input or "").strip():
    shortlist = []
    for result in HF_model_results_displayed:
        # `result` is the hit list for one query; take at most the configured
        # number of hits instead of assuming exactly five exist.
        for hit in result[:numMAPPINGS_input]:
            corpus_id = hit["corpus_id"]
            score_text = "%.4f" % hit["score"]
            # corpus_id is the position of the description in SBScorpus, which
            # was built row-for-row from df_SBS, so a positional lookup returns
            # the right code even when two rows share the same description.
            sbs_code = df_SBS["SBS_Code_Hyphenated"].iloc[corpus_id]
            description = SBScorpus[corpus_id]

            col1.write(score_text)
            col2.write(sbs_code)
            col3.write(description)

            dictA["Score"].append(score_text)
            dictA["SBS Code"].append(sbs_code)
            dictA["SBS Description V2.0"].append(description)
            shortlist.append(description)

    dfA = pd.DataFrame.from_dict(dictA)

    display_format = "ask REASONING MODEL: Which, if any, of the above Saudi Billing System descriptions corresponds best to " + INTdesc_input +"? "
    st.write(display_format)
    question = "Which, if any, of the below Saudi Billing System descriptions corresponds best to " + INTdesc_input +"? "
    # Single-element list: the question followed by the shortlisted descriptions.
    prompt = [" ".join([question, *shortlist])]
    st.write(prompt)

    # NOTE(review): the prompt built above is displayed but never sent; the
    # model still receives this placeholder chat. Confirm whether `prompt`
    # should become the user message.
    messages = [
        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
        {"role": "user", "content": "Who are you?"},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=256,
    )
    # Show the last entry of the returned chat.
    st.write(outputs[0]["generated_text"][-1])

    # The CSV payload is shared by the .csv and .txt downloads; build it once.
    csv_bytes = convert_df(dfA)

    # Outer columns are spacers that centre the three download buttons.
    bs, b1, b2, b3, bLast = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
    with b1:
        csvbutton = st.download_button(label="📥 Download .csv", data=csv_bytes, file_name="results.csv", mime='text/csv', key='csv_b')
    with b2:
        textbutton = st.download_button(label="📥 Download .txt", data=csv_bytes, file_name="results.txt", mime='text/plain', key='text_b')
    with b3:
        jsonbutton = st.download_button(label="📥 Download .json", data=convert_json(dfA), file_name="results.json", mime='application/json', key='json_b')