"""Streamlit app: find the document chunk closest to a user query and rewrite it.

Flow, top to bottom:
1. Load a Persian BERT encoder (cached across Streamlit reruns).
2. Read every ``.docx`` file in ``folder_path``, normalize and sentence-split
   the text with hazm, and group the sentences into fixed-size chunks.
3. Embed the user's input and every chunk, pick the chunk with the highest
   cosine similarity, and ask a chat LLM to rewrite that chunk in the tone of
   the question.

Configuration:
    TOGETHER_API_KEY (environment variable): API key passed to the chat model.
"""
import os

import docx
import numpy as np
import streamlit as st
import torch
from hazm import *  # noqa: F401,F403 - supplies Normalizer and SentenceTokenizer
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI  # noqa: F401 - unused here; kept from the original import list
from transformers import AutoModel, AutoTokenizer

EMBEDDING_MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"
SENTENCES_PER_CHUNK = 5


# Load the tokenizer and encoder model.
@st.cache_resource
def _load_encoder():
    """Load the tokenizer and encoder once and reuse them across reruns.

    Streamlit re-executes the whole script on every interaction; without the
    resource cache the model would be loaded again each time.
    """
    return (
        AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
        AutoModel.from_pretrained(EMBEDDING_MODEL_NAME),
    )


tokenizer, model = _load_encoder()


@st.cache_data
def get_embedding(text):
    """Return a 1-D numpy embedding for ``text``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    encoder without gradient tracking, and ``last_hidden_state`` is averaged
    over dim 1. Results are cached per input string.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.squeeze().numpy()


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Returns 0.0 when either vector has zero norm, so a degenerate embedding
    cannot inject NaN into the later ``argmax``.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator


# SECURITY: the API key is read from the environment instead of being embedded
# in the source. Any key that was previously committed should be rotated.
_api_key = os.environ.get("TOGETHER_API_KEY")
if not _api_key:
    st.error("❌ کلید API یافت نشد. لطفاً متغیر محیطی TOGETHER_API_KEY را تنظیم کنید.")
    st.stop()

llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=_api_key,
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
)


def rewrite_answer_with_llm(answer, user_input):
    """Ask the chat model to rewrite ``answer`` in the tone of ``user_input``.

    Args:
        answer: The retrieved text to be rewritten.
        user_input: The user's original question.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    prompt = (
        f"پاسخی که باید بازنویسی شود: {answer}\n\n"
        "لطفاً این پاسخ را با لحن مشابه به سوال پرسیده شده بازنویسی کن:\n\n"
        f"سوال: {user_input}"
    )
    # A chat model returns a message object; the text lives in `.content`,
    # not in an OpenAI-style `choices` dictionary.
    response = llm.invoke(prompt)
    return response.content.strip()


def _read_docx_texts(directory):
    """Return the non-blank text of every ``.docx`` file in ``directory``.

    Returns an empty list when the directory does not exist.
    """
    if not os.path.isdir(directory):
        return []
    collected = []
    # Sorted so chunk order (and therefore tie-breaking) is deterministic.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".docx"):
            continue
        document = docx.Document(os.path.join(directory, filename))
        file_text = "\n".join(para.text for para in document.paragraphs)
        if file_text.strip():
            collected.append(file_text)
    return collected


def _build_chunks(raw_texts, sentences_per_chunk=SENTENCES_PER_CHUNK):
    """Normalize and sentence-split ``raw_texts`` and join the sentences into chunks."""
    normalizer = Normalizer()
    sentence_tokenizer = SentenceTokenizer()

    all_sentences = []
    for text in raw_texts:
        all_sentences.extend(sentence_tokenizer.tokenize(normalizer.normalize(text)))

    grouped = (
        " ".join(all_sentences[start:start + sentences_per_chunk])
        for start in range(0, len(all_sentences), sentences_per_chunk)
    )
    return [chunk for chunk in grouped if chunk]


# Read the sentence from the user.
user_input = st.text_input("✅ لطفاً جمله خود را وارد کنید: ")

# Load the documents and split them into chunks.
folder_path = '46'
texts = _read_docx_texts(folder_path)
chunks = _build_chunks(texts)

# Compute similarities.
if user_input:
    if not chunks:
        # Nothing to compare against; np.argmax on an empty list would raise.
        st.warning("⚠️ هیچ متنی برای جستجو در پوشه یافت نشد.")
    else:
        with st.spinner("در حال محاسبه شباهت‌ها..."):
            user_embedding = get_embedding(user_input)
            similarities = [
                cosine_similarity(user_embedding, get_embedding(chunk))
                for chunk in chunks
            ]
            most_similar_index = int(np.argmax(similarities))
            most_similar_chunk = chunks[most_similar_index]

        # Rewrite the answer with the LLM.
        rewritten_answer = rewrite_answer_with_llm(most_similar_chunk, user_input)
        st.subheader("📌 پاسخ بازنویسی‌شده:")
        st.write(rewritten_answer)