M17idd committed on
Commit
b5be236
·
verified ·
1 Parent(s): c4259a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -12
app.py CHANGED
@@ -10,6 +10,8 @@ from langchain.chat_models import ChatOpenAI
10
  from typing import List
11
  from together import Together
12
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
13
 
14
 
15
  import streamlit as st
@@ -107,8 +109,8 @@ from transformers import AutoTokenizer, AutoModel
107
 
108
  class HuggingFaceEmbeddings(Embeddings):
109
  def __init__(self, model_name: str):
110
- self.tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")
111
- self.model = AutoModel.from_pretrained("FacebookAI/xlm-roberta-large") # Use AutoModel instead of AutoModelForMaskedLM
112
 
113
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
114
  embeddings = []
@@ -125,24 +127,27 @@ class HuggingFaceEmbeddings(Embeddings):
125
  @st.cache_resource
126
  def get_pdf_index():
127
  with st.spinner('📄 در حال پردازش فایل PDF...'):
 
128
  loader = PyPDFLoader('test1.pdf')
129
  pages = loader.load()
 
 
130
  full_text = "\n".join([page.page_content for page in pages])
 
 
131
  text_splitter = RecursiveCharacterTextSplitter(
132
- chunk_size=1024,
133
- chunk_overlap=128
134
  )
135
  texts = text_splitter.split_text(full_text)
136
 
137
- embeddings = HuggingFaceEmbeddings(
138
- model_name="FacebookAI/xlm-roberta-large"
139
- )
140
 
141
- return VectorstoreIndexCreator(
142
- embedding=embeddings,
143
- text_splitter=text_splitter
144
- ).from_texts(texts)
145
-
146
  index = get_pdf_index()
147
 
148
  llm = ChatOpenAI(
 
10
  from typing import List
11
  from together import Together
12
  from transformers import AutoTokenizer, AutoModelForCausalLM
13
+ from transformers import AutoTokenizer, AutoModel
14
+ import torch
15
 
16
 
17
  import streamlit as st
 
109
 
110
  class HuggingFaceEmbeddings(Embeddings):
111
  def __init__(self, model_name: str):
112
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
113
+ self.model = AutoModel.from_pretrained(model_name)
114
 
115
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
116
  embeddings = []
 
127
  @st.cache_resource
128
  def get_pdf_index():
129
  with st.spinner('📄 در حال پردازش فایل PDF...'):
130
+ # Load the PDF file
131
  loader = PyPDFLoader('test1.pdf')
132
  pages = loader.load()
133
+
134
+ # Extract text from each page
135
  full_text = "\n".join([page.page_content for page in pages])
136
+
137
+ # Split the text into chunks
138
  text_splitter = RecursiveCharacterTextSplitter(
139
+ chunk_size=1024, # Chunk size
140
+ chunk_overlap=128 # Overlap between chunks
141
  )
142
  texts = text_splitter.split_text(full_text)
143
 
144
+ # Create embeddings
145
+ embeddings = HuggingFaceEmbeddings(model_name="FacebookAI/xlm-roberta-large")
 
146
 
147
+ # Create FAISS vector store
148
+ vector_store = FAISS.from_texts(texts, embeddings)
149
+
150
+ return vector_store
 
151
  index = get_pdf_index()
152
 
153
  llm = ChatOpenAI(