Update app.py
app.py
CHANGED
@@ -1,45 +1,29 @@
 import streamlit as st
+from hazm import Normalizer, SentenceTokenizer
 import os
-import torch
-import numpy as np
-from hazm import *
 import docx
-from transformers import AutoTokenizer, AutoModel
-from langchain.llms import OpenAI
-from langchain.chat_models import ChatOpenAI
+from openai import OpenAI
 
-
-
-tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
-model = AutoModel.from_pretrained("HooshvareLab/bert-fa-base-uncased")
-
-@st.cache
-def get_embedding(text):
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    embeddings = outputs.last_hidden_state.mean(dim=1)
-    return embeddings.squeeze().numpy()
-
-def cosine_similarity(vec1, vec2):
-    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
-
-llm = ChatOpenAI(
+# LLM setup
+llm = OpenAI(
     base_url="https://api.together.xyz/v1",
     api_key='0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979',
     model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
 )
+
 def rewrite_answer_with_llm(answer, user_input):
-    prompt = f"پاسخی که باید بازنویسی
-    response = llm(prompt)
-    return response['choices'][0]['text'].strip()
+    prompt = f"پاسخی که باید بازنویسی شود:\n{answer}\n\nلطفاً این پاسخ را با در نظر گرفتن محتوای سوال زیر و لحن آن بازنویسی کن:\n\nسوال: {user_input}"
 
-
-
+    response = llm.chat.completions.create(
+        messages=[{"role": "user", "content": prompt}],
+        model=llm.model
+    )
+    return response.choices[0].message.content.strip()
 
-# Loading
+# 📁 Load the book files
 folder_path = '46'
 texts = []
+
 for filename in os.listdir(folder_path):
     if filename.endswith(".docx"):
         full_path = os.path.join(folder_path, filename)
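Note on the rewritten LLM code above: in the openai v1 client, OpenAI() accepts no model keyword argument, and the client object has no .model attribute, so both the constructor call and model=llm.model in chat.completions.create would fail at runtime. A minimal working sketch of the same setup, with the model name kept in a plain constant and the key read from an environment variable (TOGETHER_API_KEY is an assumed name, not part of this commit):

import os
from openai import OpenAI

MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"

llm = OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ["TOGETHER_API_KEY"],  # assumed env var; avoids hard-coding the key
)

def rewrite_answer_with_llm(answer, user_input):
    # Same Persian prompt as in the diff: the retrieved passage plus the user's question.
    prompt = f"پاسخی که باید بازنویسی شود:\n{answer}\n\nلطفاً این پاسخ را با در نظر گرفتن محتوای سوال زیر و لحن آن بازنویسی کن:\n\nسوال: {user_input}"
    response = llm.chat.completions.create(
        model=MODEL_NAME,  # the model is chosen per request, not stored on the client
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content.strip()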
@@ -48,30 +32,42 @@ for filename in os.listdir(folder_path):
         if file_text.strip():
             texts.append(file_text)
 
+# 🌀 Split the whole book into sentences
 normalizer = Normalizer()
 sentence_tokenizer = SentenceTokenizer()
+
 all_sentences = []
 for text in texts:
     normalized = normalizer.normalize(text)
     sentences = sentence_tokenizer.tokenize(normalized)
     all_sentences.extend(sentences)
 
-
-
-
-
-
+# 📌 Get the user's input
+query = st.text_input("🔎 کلمه یا عبارت موردنظر خود را وارد کنید:")
+
+# ✅ Show the sentence and the next 5 sentences + rewrite with the LLM
+if query:
+    found = False
+    for idx, sentence in enumerate(all_sentences):
+        if query in sentence:
+            st.success("✅ جمله یافت شد:")
+            st.write(sentence)
+
+            next_sentences = []
+            st.markdown("📌 پنج جمله بعدی:")
+            for i in range(1, 6):
+                if idx + i < len(all_sentences):
+                    st.write(all_sentences[idx + i])
+                    next_sentences.append(all_sentences[idx + i])
+
+            # ↪️ Prepare the text for rewriting
+            total_text = sentence + " " + " ".join(next_sentences)
+            rewritten = rewrite_answer_with_llm(total_text, query)
+            st.markdown("🎨 **بازنویسی شده با LLM:**")
+            st.write(rewritten)
+
+            found = True
+            break
 
-
-
-with st.spinner("در حال محاسبه شباهت‌ها..."):
-    user_embedding = get_embedding(user_input)
-    similarities = [cosine_similarity(user_embedding, get_embedding(chunk)) for chunk in chunks]
-    most_similar_index = np.argmax(similarities)
-    most_similar_chunk = chunks[most_similar_index]
-
-    # Rewrite the answer with the LLM
-    rewritten_answer = rewrite_answer_with_llm(most_similar_chunk, user_input)
-
-    st.subheader("📌 پاسخ بازنویسی‌شده:")
-    st.write(rewritten_answer)
+    if not found:
+        st.warning("عبارت موردنظر در متن یافت نشد.")
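With the embedding pipeline removed, the old @st.cache decorator went away too, so the .docx loading and hazm sentence-splitting now rerun on every Streamlit interaction. A hedged sketch of caching the corpus build with st.cache_data (a current Streamlit API); the file-reading lines that the diff context omits are reconstructed here with python-docx's standard Document(...).paragraphs, and load_sentences is an illustrative name:

import os
import docx
import streamlit as st
from hazm import Normalizer, SentenceTokenizer

@st.cache_data  # recompute only when folder_path changes, not on every widget event
def load_sentences(folder_path):
    # Read every .docx file in the folder into one string per document.
    texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):
            document = docx.Document(os.path.join(folder_path, filename))
            file_text = "\n".join(p.text for p in document.paragraphs)
            if file_text.strip():
                texts.append(file_text)
    # Normalize and split the whole corpus into sentences with hazm.
    normalizer = Normalizer()
    sentence_tokenizer = SentenceTokenizer()
    all_sentences = []
    for text in texts:
        all_sentences.extend(sentence_tokenizer.tokenize(normalizer.normalize(text)))
    return all_sentences

all_sentences = load_sentences('46')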
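The retrieval step itself (the first sentence containing the query, plus up to five sentences of following context) is also easy to pull out of the UI code so it can be tested without Streamlit. A framework-free sketch; find_with_context and window are illustrative names, not part of the commit:

def find_with_context(sentences, query, window=5):
    # Return the first matching sentence plus up to `window` following
    # sentences, or None when the query occurs nowhere in the corpus.
    for idx, sentence in enumerate(sentences):
        if query in sentence:
            return sentences[idx : idx + window + 1]
    return None

# Example: with sentences ["سلام دنیا.", "حال شما چطور است؟"] the call
# find_with_context(sentences, "دنیا") returns both sentences.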