|
import streamlit as st |
|
import os |
|
import torch |
|
import numpy as np |
|
from hazm import * |
|
import docx |
|
from transformers import AutoTokenizer, AutoModel |
|
from langchain.llms import OpenAI |
|
from langchain.chat_models import ChatOpenAI |
|
|
|
|
|
|
|
# Single checkpoint name shared by the tokenizer and the encoder so the two
# are always loaded from the same place.
_BERT_CHECKPOINT = "HooshvareLab/bert-fa-base-uncased"

# Module-level encoder and tokenizer used by get_embedding().
model = AutoModel.from_pretrained(_BERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
|
|
|
# st.cache is deprecated. Its replacement, st.cache_data, keys the cache on the
# call arguments and does not try to hash the module-level model/tokenizer the
# function refers to. Fall back to st.cache where cache_data is unavailable so
# older Streamlit installs keep working.
_cache_embedding = getattr(st, "cache_data", None) or st.cache


@_cache_embedding
def get_embedding(text):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level ``model`` without gradient tracking, and the last hidden
    states are averaged over the token axis.

    Args:
        text: The string to embed. A list of strings is also accepted by the
            tokenizer; padded positions are then excluded from the average.

    Returns:
        ``embeddings.squeeze().numpy()`` -- for a single string this is a 1-D
        vector (assumes the usual (batch, tokens, hidden) layout of
        ``last_hidden_state``).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight every token by its attention mask so padding never contributes.
    # For a single unpadded text the mask is all ones and this equals the
    # plain mean over tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (hidden * mask).sum(dim=1) / token_counts
    return embeddings.squeeze().numpy()
|
|
|
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero norm.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-length vector has no direction. Dividing would produce NaN,
    # which np.argmax treats as the maximum, so report "no similarity".
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator
|
|
|
# The Together API key is a secret: it is read from the environment so that it
# never has to be committed alongside the source.
_together_api_key = os.environ.get("TOGETHER_API_KEY")
if not _together_api_key:
    raise RuntimeError(
        "TOGETHER_API_KEY is not set; export it before starting the app."
    )

# Chat client used by rewrite_answer_with_llm(), pointed at the Together
# endpoint through ChatOpenAI's base_url override.
llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=_together_api_key,
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
)
|
def rewrite_answer_with_llm(answer, user_input):
    """Ask the module-level ``llm`` to rephrase ``answer`` in the question's tone.

    Args:
        answer: The retrieved text that should be rewritten.
        user_input: The user's question, included in the prompt as the tone
            reference.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
    """
    prompt = f"پاسخی که باید بازنویسی شود: {answer}\n\nلطفاً این پاسخ را با لحن مشابه به سوال پرسیده شده بازنویسی کن:\n\nسوال: {user_input}"
    # A chat model returns a message object, not an OpenAI-style dict, so the
    # text lives in `.content`. Fall back to the raw value if a plain string
    # comes back.
    response = llm.invoke(prompt)
    reply_text = getattr(response, "content", response)
    return str(reply_text).strip()
|
|
|
|
|
# Text box for the user's sentence; the value is empty until something has
# been entered.
_INPUT_LABEL = "✅ لطفاً جمله خود را وارد کنید: "
user_input = st.text_input(_INPUT_LABEL)
|
|
|
|
|
# Folder holding the source .docx files (relative to the working directory).
folder_path = '46'

# Plain text of every non-empty document, one entry per file.
texts = []

# os.listdir() returns names in arbitrary order. The documents are later
# concatenated into one sentence stream and cut into fixed-size chunks, so the
# order decides the chunk contents; sorting keeps them identical on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".docx"):
        continue
    full_path = os.path.join(folder_path, filename)
    doc = docx.Document(full_path)
    file_text = "\n".join(para.text for para in doc.paragraphs)
    # Skip documents that contain nothing but whitespace.
    if file_text.strip():
        texts.append(file_text)
|
|
|
# hazm helpers: one normalizer and one sentence splitter reused for all texts.
normalizer = Normalizer()
sentence_tokenizer = SentenceTokenizer()

# Flat list of sentences from every loaded document, in the order of `texts`.
all_sentences = []
for document_text in texts:
    cleaned_text = normalizer.normalize(document_text)
    all_sentences += sentence_tokenizer.tokenize(cleaned_text)
|
|
|
# Group the sentences into consecutive windows of five, each joined with
# single spaces; a window that joins to an empty string is dropped.
_window_texts = (
    " ".join(all_sentences[start:start + 5])
    for start in range(0, len(all_sentences), 5)
)
chunks = [window for window in _window_texts if window]
|
|
|
|
|
# Query flow: runs only once the user has typed something.
if user_input:
    if not chunks:
        # np.argmax() raises ValueError on an empty sequence, so tell the user
        # that no text was loaded instead of crashing.
        st.warning("هیچ متنی برای جستجو پیدا نشد.")
    else:
        with st.spinner("در حال محاسبه شباهتها..."):
            user_embedding = get_embedding(user_input)
            # Score every chunk against the query and keep the best match.
            similarities = [
                cosine_similarity(user_embedding, get_embedding(chunk))
                for chunk in chunks
            ]
            most_similar_index = int(np.argmax(similarities))
            most_similar_chunk = chunks[most_similar_index]

        # Let the LLM rephrase the retrieved chunk in the tone of the question.
        rewritten_answer = rewrite_answer_with_llm(most_similar_chunk, user_input)

        st.subheader("📌 پاسخ بازنویسیشده:")
        st.write(rewritten_answer)