File size: 2,925 Bytes
bc23008
f6afc8f
5264200
 
145151e
bc23008
5264200
bc23008
5a87c2f
 
145151e
bc23008
 
 
5264200
cc9574d
5264200
cc9574d
 
 
 
 
5264200
 
 
 
19a84e6
 
 
 
 
bc23008
 
 
 
 
 
 
5264200
bc23008
f5b923e
bc23008
 
 
 
 
 
 
 
5264200
bc23008
 
 
 
 
 
 
5264200
bc23008
 
 
 
 
5264200
bc23008
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import streamlit as st
import os
import torch
import numpy as np
from hazm import *
import docx
from transformers import AutoTokenizer, AutoModel
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI


# Load the Persian BERT tokenizer and encoder.
# Streamlit re-executes this script from the top on every widget interaction,
# so the load is wrapped in a resource cache to keep one shared copy per
# server process instead of re-loading the weights on each rerun.
@st.cache_resource
def _load_encoder(model_name="HooshvareLab/bert-fa-base-uncased"):
    """Return the ``(tokenizer, model)`` pair loaded for *model_name*."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModel.from_pretrained(model_name),
    )


tokenizer, model = _load_encoder()

@st.cache_data
def get_embedding(text):
    """Return a mean-pooled BERT embedding for *text*.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the module-level ``model`` with gradient tracking disabled, and the last
    hidden states are averaged over dimension 1.

    Results are memoized with ``st.cache_data`` (the replacement for the
    deprecated ``st.cache`` decorator), so each distinct text is only
    embedded once.

    Args:
        text: Input string to embed.

    Returns:
        A NumPy array with the pooled embedding, singleton dimensions
        squeezed out.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.squeeze().numpy()

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero norm. Without that guard the division yields NaN,
        and ``np.argmax`` treats NaN as the maximum, so a degenerate vector
        would always be selected as the best match.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Chat model reached through the Together endpoint via the OpenAI-style client.
# SECURITY: an API key is committed in source below. Supply the key through
# the TOGETHER_API_KEY environment variable instead; the literal is kept only
# as a fallback so existing deployments keep working. It should be rotated
# and removed from the repository.
llm = ChatOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=(
        os.environ.get("TOGETHER_API_KEY")
        or '0291f33aee03412a47fa5d8e562e515182dcc5d9aac5a7fb5eefdd1759005979'
    ),
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
)
def rewrite_answer_with_llm(answer, user_input):
    """Ask the module-level ``llm`` to rephrase *answer* in the tone of the question.

    Args:
        answer: Retrieved text that should be rewritten.
        user_input: The user's original question, included in the prompt.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
    """
    prompt = f"پاسخی که باید بازنویسی شود: {answer}\n\nلطفاً این پاسخ را با لحن مشابه به سوال پرسیده شده بازنویسی کن:\n\nسوال: {user_input}"
    response = llm.invoke(prompt)
    # A LangChain chat model returns a message object whose text is in
    # ``.content``; it is not an OpenAI-style dict with a ``choices`` list, so
    # indexing it as one raises. Fall back to the object itself for plain
    # string-returning models.
    text = getattr(response, "content", response)
    return str(text).strip()

# Read the query sentence from the user through a text box.
query_label = "✅ لطفاً جمله خود را وارد کنید: "
user_input = st.text_input(query_label)

# Load every Word document in the corpus folder, one text per file.
folder_path = '46'
texts = []
# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# order in which files are collected determines how their sentences are later
# grouped, so an unsorted listing makes results vary between machines/runs.
for filename in sorted(os.listdir(folder_path)):
    # Skip "~$name.docx" lock files that Word leaves next to open documents;
    # they match the suffix but are not valid .docx archives.
    if not filename.endswith(".docx") or filename.startswith("~$"):
        continue
    doc = docx.Document(os.path.join(folder_path, filename))
    file_text = "\n".join(para.text for para in doc.paragraphs)
    # Ignore documents that contain only whitespace.
    if file_text.strip():
        texts.append(file_text)

# Normalize each loaded document, split it into sentences, and flatten the
# sentences of all documents into a single list (document order preserved).
normalizer = Normalizer()
sentence_tokenizer = SentenceTokenizer()
all_sentences = [
    sentence
    for document in texts
    for sentence in sentence_tokenizer.tokenize(normalizer.normalize(document))
]

# Group consecutive sentences into space-joined chunks of up to five
# sentences each, dropping any chunk that ends up as an empty string.
joined_groups = (
    " ".join(all_sentences[start:start + 5])
    for start in range(0, len(all_sentences), 5)
)
chunks = [group for group in joined_groups if group]

# Find the chunk most similar to the user's query and show an LLM rewrite.
if user_input:
    if not chunks:
        # np.argmax raises ValueError on an empty sequence, so report the
        # empty corpus to the user instead of crashing.
        st.warning("⚠️ هیچ متنی برای جستجو پیدا نشد.")
    else:
        with st.spinner("در حال محاسبه شباهت‌ها..."):
            user_embedding = get_embedding(user_input)
            similarities = [
                cosine_similarity(user_embedding, get_embedding(chunk))
                for chunk in chunks
            ]
            most_similar_index = int(np.argmax(similarities))
            most_similar_chunk = chunks[most_similar_index]

            # Rewrite the retrieved chunk with the LLM before displaying it.
            rewritten_answer = rewrite_answer_with_llm(most_similar_chunk, user_input)

            st.subheader("📌 پاسخ بازنویسی‌شده:")
            st.write(rewritten_answer)