|
import streamlit as st |
|
import pandas as pd |
|
from io import StringIO |
|
import json |
|
|
|
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM |
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
|
|
|
|
import os |
|
# Look up the Hugging Face access token in the environment. The returned value
# is neither stored nor passed on by this statement.
# NOTE(review): presumably the Hugging Face libraries read HF_TOKEN from the
# environment themselves -- confirm, otherwise this line has no effect.
os.environ.get("HF_TOKEN")

# Sidebar navigation registry: page label -> page entry.
# NOTE(review): `Pages` is not defined or imported in the visible part of this
# file -- confirm where it comes from.
PAGES = {
    "Home": Pages.home,
    "Demo": Pages.demo,
    "About": Pages.about,
}

st.sidebar.title("SBSmapper")
# NOTE(review): `selection` is not read again in the visible code -- TODO
# confirm whether the chosen page is supposed to be dispatched via PAGES.
selection = st.sidebar.radio("Pages", list(PAGES))
|
|
|
|
|
|
|
|
|
def on_click():
    """Reset callback: blank the session-state entry stored under "user_input"."""
    st.session_state["user_input"] = ""
|
|
|
|
|
def convert_df(df: pd.DataFrame):
    """Serialize *df* to CSV (index column omitted) and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
|
|
|
|
|
def convert_json(df: pd.DataFrame):
    """Return *df* as a JSON string keyed by row index.

    The frame is first serialized by pandas (``orient="index"``), then parsed
    and re-serialized with the standard ``json`` module, so the returned text
    uses ``json.dumps`` default formatting.
    """
    by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(by_index)
|
|
|
|
|
|
|
# Result-count setting; the visible code later uses it to cap how many search
# hits are shown.
numMAPPINGS_input = 5

# Free-text query box. It is registered under the session-state key
# "user_input", which is the entry the Reset callback clears.
INTdesc_input = st.text_input(
    "Type internal description and hit Enter",
    key="user_input",
)

# Two side-by-side buttons: run the mapping (left) and reset the query (right).
createSBScodes, right_column = st.columns(2)
createSBScodes_clicked = createSBScodes.button(
    "Map to SBS codes",
    key="user_createSBScodes",
)
right_column.button("Reset", on_click=on_click)
|
|
|
|
|
|
|
|
|
# Streamlit re-executes this whole script on every widget interaction, so the
# expensive objects below are built once and served from Streamlit's caches
# instead of being recreated on each rerun.


@st.cache_resource
def _load_embedding_model():
    """Create the sentence-embedding model once per server process."""
    return SentenceTransformer('all-MiniLM-L6-v2')


@st.cache_data
def _load_sbs_corpus(first_line, last_line):
    """Read one slice of the SBS table and embed its long descriptions.

    Args:
        first_line: 1-based line number in the CSV file (the header is line 1)
            of the first data row to read.
        last_line: 1-based line number of the last data row to read, inclusive.

    Returns:
        A ``(table, corpus, embeddings)`` tuple: the DataFrame slice, the list
        of its ``Long_Description`` values, and their embeddings in the same
        row order.
    """
    row_count = last_line - first_line + 1
    # Row 0 is the header; skip every data row in front of `first_line`.
    # Blank lines are kept (skip_blank_lines=False) so that file line numbers
    # keep matching row positions.
    rows_to_skip = list(range(1, first_line - 1))
    table = pd.read_csv(
        "SBS_V2_Table.csv",
        header=0,
        skip_blank_lines=False,
        skiprows=rows_to_skip,
        nrows=row_count,
    )
    corpus = table['Long_Description'].values.tolist()
    embeddings = _load_embedding_model().encode(corpus)
    return table, corpus, embeddings


model = _load_embedding_model()

# The query embedding depends on the current input, so it is recomputed on
# every rerun (cheap compared with loading the model or embedding the corpus).
INTdesc_embedding = model.encode(INTdesc_input)

# Slice of the SBS table used as the search corpus (inclusive file lines).
from_line = 7727
to_line = 8239

df_SBS, SBScorpus, SBScorpus_embeddings = _load_sbs_corpus(from_line, to_line)
|
|
|
|
|
@st.cache_resource
def _load_reasoning_pipeline(model_name):
    """Build the text-generation pipeline once and reuse it across reruns."""
    return pipeline("text-generation", model=model_name, device_map="auto")


model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = _load_reasoning_pipeline(model_id)

# Result table header: one column each for score, code and description.
col1, col2, col3 = st.columns([1, 1, 2.5])
col1.subheader("Score")
col2.subheader("SBS code")
col3.subheader("SBS description V2.0")

# Column-oriented copy of the displayed rows, used for the download buttons.
dictA = {"Score": [], "SBS Code": [], "SBS Description V2.0": []}

# st.text_input yields "" (not None) for an empty box, so test for real text.
query_text = (INTdesc_input or "").strip()

if createSBScodes_clicked and query_text:
    # semantic_search returns one hit list per query, already ordered by
    # descending score; there is a single query here, so take entry 0.
    HF_model_results = util.semantic_search(
        INTdesc_embedding, SBScorpus_embeddings, top_k=numMAPPINGS_input
    )
    HF_model_results_displayed = HF_model_results[0]

    shortlist = []
    for hit in HF_model_results_displayed:
        corpus_id = hit["corpus_id"]
        score_text = f"{hit['score']:.4f}"
        description = SBScorpus[corpus_id]
        # SBScorpus is built from df_SBS in row order, so the hit position
        # identifies the row directly; matching on the description text would
        # pick the wrong code whenever two rows share a description.
        sbs_code = df_SBS["SBS_Code_Hyphenated"].iloc[corpus_id]

        col1.write(score_text)
        col2.write(sbs_code)
        col3.write(description)

        dictA["Score"].append(score_text)
        dictA["SBS Code"].append(sbs_code)
        dictA["SBS Description V2.0"].append(description)
        shortlist.append(description)

    dfA = pd.DataFrame.from_dict(dictA)

    display_format = "ask REASONING MODEL: Which, if any, of the above Saudi Billing System descriptions corresponds best to " + INTdesc_input + "? "
    st.write(display_format)

    question = "Which, if any, of the below Saudi Billing System descriptions corresponds best to " + INTdesc_input + "? "
    # The chat "content" field must be a plain string, not a list wrapping one.
    prompt = question + " " + " ".join(shortlist)

    messages = [
        {"role": "system", "content": "You are a knowledgable AI assistant who always answers truthfully and precisely!"},
        {"role": "user", "content": prompt},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=256,
    )
    # For chat input the pipeline returns the whole conversation; the last
    # message is the newly generated assistant reply.
    st.write(outputs[0]["generated_text"][-1]["content"])

    # Outer columns are spacers that centre the three download buttons.
    _, b1, b2, b3, _ = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
    csv_bytes = convert_df(dfA)
    # "\U0001F4E5" is the inbox-tray emoji; written as an escape so the label
    # cannot be garbled again by a wrong file encoding.
    with b1:
        st.download_button(label="\U0001F4E5 Download .csv", data=csv_bytes, file_name="results.csv", mime='text/csv', key='csv_b')
    with b2:
        st.download_button(label="\U0001F4E5 Download .txt", data=csv_bytes, file_name="results.txt", mime='text/plain', key='text_b')
    with b3:
        st.download_button(label="\U0001F4E5 Download .json", data=convert_json(dfA), file_name="results.json", mime='application/json', key='json_b')
elif createSBScodes_clicked:
    # Button pressed with an empty/blank query: explain instead of searching.
    st.warning("Please type an internal description first.")