import os

import docx
import numpy as np
import streamlit as st
import torch
from hazm import Normalizer, SentenceTokenizer
from langchain.chat_models import ChatOpenAI
from transformers import AutoModel, AutoTokenizer
# Load the Persian BERT model and tokenizer. (Streamlit reruns this script on every
# interaction; wrapping these calls in st.cache_resource would avoid repeated reloads.)
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-base-uncased")
model = AutoModel.from_pretrained("HooshvareLab/bert-fa-base-uncased")
@st.cache_data  # st.cache is deprecated; st.cache_data memoizes per input text
def get_embedding(text):
    # Encode the text and mean-pool BERT's last hidden states into one vector.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.squeeze().numpy()
def cosine_similarity(vec1, vec2):
    # Cosine similarity: dot product scaled by the product of the vector norms.
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
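# Illustrative sketch (not in the original script): once the chunk embeddings are
# stacked into an (n, d) matrix, the same similarity can be computed for all chunks
# in one vectorized step. The helper name cosine_similarity_batch is hypothetical.
def cosine_similarity_batch(query_vec, matrix):
    # Row-wise cosine similarity between one query vector and each matrix row.
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
    return matrix @ query_vec / norms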
# Chat model served through Together AI's OpenAI-compatible endpoint. The API key is
# read from the environment instead of being hard-coded in the source; the variable
# name TOGETHER_API_KEY is a common convention, adjust to your deployment.
llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ.get("TOGETHER_API_KEY"),
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
)
def rewrite_answer_with_llm(answer, user_input):
    # Persian prompt: "Answer to be rewritten: {answer} / Please rewrite this answer
    # in a tone matching the question asked: / Question: {user_input}"
    prompt = f"پاسخی که باید بازنویسی شود: {answer}\n\nلطفاً این پاسخ را با لحن مشابه به سوال پرسیده شده بازنویسی کن:\n\nسوال: {user_input}"
    # ChatOpenAI returns a message object, not an OpenAI-style dict; use .content.
    response = llm.invoke(prompt)
    return response.content.strip()
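# Hedged alternative (not in the original): the same call expressed with langchain's
# structured chat messages, which separate the instruction from the user content.
# The helper name rewrite_with_messages and the system wording are illustrative.
from langchain.schema import HumanMessage, SystemMessage

def rewrite_with_messages(answer, user_input):
    messages = [
        SystemMessage(content="پاسخ زیر را با لحن سوال کاربر بازنویسی کن."),  # "Rewrite the answer below in the tone of the user's question."
        HumanMessage(content=f"سوال: {user_input}\n\nپاسخ: {answer}"),
    ]
    return llm.invoke(messages).content.strip()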
# Read the user's query. (Persian label: "Please enter your sentence.")
user_input = st.text_input("✅ لطفاً جمله خود را وارد کنید: ")
# Load the .docx corpus from the folder and collect each non-empty file's text.
folder_path = '46'
texts = []
for filename in os.listdir(folder_path):
    if filename.endswith(".docx"):
        full_path = os.path.join(folder_path, filename)
        doc = docx.Document(full_path)
        file_text = "\n".join(para.text for para in doc.paragraphs)
        if file_text.strip():
            texts.append(file_text)
# Normalize the Persian text with hazm and split it into sentences.
normalizer = Normalizer()
sentence_tokenizer = SentenceTokenizer()
all_sentences = []
for text in texts:
    normalized = normalizer.normalize(text)
    sentences = sentence_tokenizer.tokenize(normalized)
    all_sentences.extend(sentences)
# Group consecutive sentences into chunks of five for coarse-grained retrieval.
chunks = []
for i in range(0, len(all_sentences), 5):
    chunk = " ".join(all_sentences[i:i + 5])
    if chunk:
        chunks.append(chunk)
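# Hedged optimization sketch (not in the original): the query block below embeds every
# chunk on each run, which dominates latency. A helper such as this one (the name
# embed_chunks is illustrative) lets Streamlit memoize all chunk embeddings at once.
@st.cache_data
def embed_chunks(chunk_list):
    # One BERT forward pass per chunk, cached across reruns for the same corpus.
    return [get_embedding(c) for c in chunk_list]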
# Compute the similarity between the query and every chunk, then rewrite the best match.
if user_input:
    with st.spinner("در حال محاسبه شباهت‌ها..."):  # "Computing similarities..."
        user_embedding = get_embedding(user_input)
        similarities = [cosine_similarity(user_embedding, get_embedding(chunk)) for chunk in chunks]
        most_similar_index = int(np.argmax(similarities))
        most_similar_chunk = chunks[most_similar_index]
        # Rewrite the retrieved chunk with the LLM so it matches the question's tone.
        rewritten_answer = rewrite_answer_with_llm(most_similar_chunk, user_input)
    st.subheader("📌 پاسخ بازنویسی‌شده:")  # "Rewritten answer:"
    st.write(rewritten_answer)