Jasper Sands committed on
Commit
e72bb6f
·
1 Parent(s): 53c7ecd

Add application file

Files changed (3)
  1. Clean Missouri Data.csv +0 -0
  2. app.py +120 -0
  3. requirements.txt +8 -0
Clean Missouri Data.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,120 @@
+ import gradio as gr
+ import pandas as pd
+ import nltk
+ from nltk.corpus import stopwords
+ from sentence_transformers import SentenceTransformer, util
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from unsloth import FastLanguageModel
+ from peft import PeftModel
+ from unsloth.chat_templates import get_chat_template
+
+ # Download NLTK stopwords if not already downloaded
+ nltk.download("stopwords")
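+ # Note: `stopwords` is imported above but never used below; the actual
+ # stop-word filtering happens via TfidfVectorizer's built-in 'english' list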
+
+ # 1. Load model + tokenizer
+ base_model_name = "unsloth/Llama-3.2-3B-Instruct"
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name=base_model_name,
+     max_seq_length=2048,
+     dtype=None,        # None lets Unsloth auto-detect the dtype
+     load_in_4bit=True  # 4-bit quantization keeps the 3B model's memory footprint small
+ )
+
+ # 2. Load the LoRA adapter
+ adapter_path = "jaspersands/model"  # Adjust if needed
+ model = PeftModel.from_pretrained(model, adapter_path)
+
+ # 3. Load data
+ file_path = "Clean Missouri Data.csv"  # Ensure this CSV is in your repo
+ df = pd.read_csv(file_path, encoding="MacRoman")
+
+ # 4. Define helper functions
+ def search_relevant_policies(query, df, top_n=10):
+     # Rank all policies by TF-IDF cosine similarity to the query
+     tfidf = TfidfVectorizer(stop_words='english')
+     tfidf_matrix = tfidf.fit_transform(df['Content'])
+     query_vector = tfidf.transform([query])
+     cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()
+     top_indices = cosine_sim.argsort()[-top_n:][::-1]
+     return df.iloc[top_indices]
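+ # e.g. search_relevant_policies("kinship care placement", df) returns the ten
+ # rows of df whose 'Content' best matches the query (example query illustrative)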
+
+ def get_content_after_query(response_text, query):
+     # Return only the text the model generated after echoing the query
+     query_position = response_text.lower().find(query.lower())
+     if query_position != -1:
+         res = response_text[query_position + len(query):].strip()
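+         # Assumption: the 11 skipped characters are the "assistant" role label
+         # (9 letters plus two newlines) that the llama-3.1 chat template places
+         # between the echoed query and the model's answer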
+         return res[11:]
+     else:
+         return response_text.strip()
+
+ def process_query(query, tokenizer):
+     # 1. Get relevant policies
+     relevant_policies = search_relevant_policies(query, df)
+
+     # 2. Format relevant policies
+     formatted_policies = []
+     for index, row in relevant_policies.iterrows():
+         formatted_policy = (
+             f"Title: {row['Title']}\nTerritory: {row['Territory']}\n"
+             f"Type: {row['Type']}\nYear: {row['Year']}\nCategory: {row['Category']}\n"
+             f"From: {row['From']}\nTo: {row['To']}\nContent: {row['Content']}\n"
+             f"Link: {row['Link to Content']}\n"
+         )
+         formatted_policies.append(formatted_policy)
+     relevant_policy_text = "\n\n".join(formatted_policies)
+
+     # 3. Create messages for model
+     messages_with_relevant_policies = [
+         {"role": "system", "content": relevant_policy_text},
+         {"role": "user", "content": query},
+     ]
+
+     # 4. Tokenize with chat template
+     tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
+     inputs = tokenizer.apply_chat_template(
+         messages_with_relevant_policies,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_tensors="pt"
+     ).to("cuda")
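+     # Note: .to("cuda") assumes a CUDA GPU is available; on CPU-only
+     # hardware this (and generation below) would need to target "cpu"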
+
+     # 5. Generate output
+     FastLanguageModel.for_inference(model)
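+     # temperature=1.5 with min_p=0.1 mirrors the sampling combination
+     # suggested in Unsloth's Llama-3 example notebooks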
+     outputs = model.generate(
+         input_ids=inputs,
+         max_new_tokens=256,
+         use_cache=True,
+         temperature=1.5,
+         min_p=0.1
+     )
+     generated_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+     response = get_content_after_query(generated_response, query)
+
+     # 6. Rank the top 10 policies using SBERT
+     model_sbert = SentenceTransformer("all-MiniLM-L6-v2")
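+     # (Loading the SBERT encoder here means it is re-loaded on every query;
+     # hoisting it to module level would cut per-request latency)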
+     response_embedding = model_sbert.encode(generated_response, convert_to_tensor=True)
+     policy_embeddings = model_sbert.encode(relevant_policies['Content'].tolist(), convert_to_tensor=True)
+     cosine_similarities = util.cos_sim(response_embedding, policy_embeddings).flatten()
+     most_relevant_index = cosine_similarities.argmax().item()
+     most_relevant_link = relevant_policies.iloc[most_relevant_index]['Link to Content']
+
+     return {"response": response, "most_relevant_link": most_relevant_link}
+
+ # 5. Gradio interface
+ def answer_query(u_query):
+     result = process_query(u_query, tokenizer)
+     return result["response"], result["most_relevant_link"]
+
+ demo = gr.Interface(
+     fn=answer_query,
+     inputs="text",
+     outputs=[
+         gr.Textbox(label="System Response"),
+         gr.Textbox(label="Relevant Link")
+     ],
+     title="Foster Questions",
+     description="Enter your question about the US foster system"
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ # requirements.txt
+ unsloth
+ peft
+ gradio
+ scikit-learn
+ pandas
+ nltk
+ sentence-transformers
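
Usage note: a minimal sketch for exercising the pipeline without the Gradio UI, assuming a CUDA GPU and the dependencies above are installed (importing app loads the base model and adapter at import time; the query string is illustrative only):

    from app import process_query, tokenizer

    result = process_query("What support exists for kinship caregivers in Missouri?", tokenizer)
    print(result["response"])
    print(result["most_relevant_link"])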