Mojo3 committed on
Commit 7318f74 · verified · 1 Parent(s): 65c9e8e

Update app.py

Files changed (1)
  1. app.py +207 -14
app.py CHANGED
@@ -1,19 +1,212 @@
  import streamlit as st
- from transformers import pipeline

- @st.cache_resource
- def load_model():
-     return pipeline("text-generation", model="gpt2")

- model = load_model()

  st.title("Simple Text Generator")
- user_input = st.text_input("Enter your prompt here:")
-
- if st.button("Generate"):
-     if user_input:
-         with st.spinner("Generating..."):
-             output = model(user_input, max_length=100, do_sample=True)[0]['generated_text']
-             st.write(output)
-     else:
-         st.warning("Please enter a prompt")

  import streamlit as st
+ from docx import Document
+ import os
+ from langchain_core.prompts import PromptTemplate
+ from langchain_community.vectorstores import Chroma
+ from langchain.docstore.document import Document as Document2
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+
+ import cohere
+
+ # Load the Hugging Face token from the environment
+ token = os.getenv("HF_TOKEN")
+ print("HF_TOKEN is set:", token is not None)  # avoid logging the token itself
+
+ docs_folder = "./converted_docs"
+
+
+ # Load .docx files from a local folder of converted documents
+ def load_docx_files_from_drive(drive_folder):
+     docx_files = [f for f in os.listdir(drive_folder) if f.endswith(".docx")]
+     documents = []
+
+     for file_name in docx_files:
+         file_path = os.path.join(drive_folder, file_name)
+         doc = Document(file_path)
+         content = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
+         documents.append(content)
+
+     return documents
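+ # Note: python-docx's `doc.paragraphs` covers body paragraphs only; any text in
+ # tables, headers, or footers of the source files is not extracted here.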
+
+
+ # Load the .docx files from the local folder
+ documents = load_docx_files_from_drive(docs_folder)
+
+
+ def split_extracted_text_into_chunks(documents):
+     print("Splitting text into chunks")
+     # List to hold all chunks
+     chunks = []
+
+     for doc_text in documents:
+         # Split the document text into lines
+         lines = doc_text.splitlines()
+
+         # Initialize variables for splitting
+         current_chunk = []
+         for line in lines:
+             # A line starting with "File Name:" marks the start of a new chunk
+             if line.startswith("File Name:"):
+                 # If there's a current chunk, save it before starting a new one
+                 if current_chunk:
+                     chunks.append("\n".join(current_chunk))
+                     current_chunk = []  # Reset the current chunk
+
+             # Add the line to the current chunk
+             current_chunk.append(line)
+
+         # Add the last chunk for the current document
+         if current_chunk:
+             chunks.append("\n".join(current_chunk))
+
+     return chunks
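+ # Worked example: a document whose text is
+ # "File Name: a.docx\n...\nFile Name: b.docx\n..." yields one chunk per
+ # "File Name:" header; any text before the first header becomes its own chunk.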
+
+
+ # Split the extracted documents into chunks
+ chunks = split_extracted_text_into_chunks(documents)
+
+
+ def save_chunks_to_file(chunks, output_file_path):
+     print("Saving chunks to file")
+     # Open the file in write mode
+     with open(output_file_path, "w", encoding="utf-8") as file:
+         for i, chunk in enumerate(chunks, start=1):
+             # Write each chunk with a header for easy identification
+             file.write(f"Chunk {i}:\n")
+             file.write(chunk)
+             file.write("\n" + "=" * 50 + "\n")
+
+
+ # Path to save the chunks file
+ output_file_path = "./chunks_output.txt"
+
+ # Save the chunks to the file
+ save_chunks_to_file(chunks, output_file_path)
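+ # chunks_output.txt is a debugging artifact for inspecting chunk boundaries;
+ # nothing else in the app reads it.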
+
+
+ # Step 1: Load the embedding model through LangChain's wrapper
+ embedding_model = HuggingFaceEmbeddings(
+     model_name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
+ )
+ print("#0")
+
+
+ # Step 2: Embed the chunks
+ def embed_chunks(chunks):
+     print("Embedding the chunks")
+     return [
+         {"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
+         for chunk in chunks
+     ]
+
+
+ embeddings = embed_chunks(chunks)
+ print("#1")
+
+ # Step 3: Prepare documents for Chroma
+ def prepare_documents_for_chroma(embeddings):
+     print("Preparing documents for chroma")
+     return [
+         Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
+         for i, entry in enumerate(embeddings, start=1)
+     ]
+
+
+ print("#2")
+ documents = prepare_documents_for_chroma(embeddings)
+ print("Creating the vector store")
+ # Step 4: Create the Chroma store
+ vectorstore = Chroma.from_documents(
+     documents=documents,
+     embedding=embedding_model,  # Proper embedding object
+     persist_directory="./chroma_db",  # Optional persistence
+ )
+
+
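+ # Note: Streamlit re-runs this script from the top on every interaction, so the
+ # document loading, chunking, and Chroma build above are repeated on each rerun.
+ # Wrapping this setup in a @st.cache_resource function (as the previous version
+ # did for its model) would compute it only once per process.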
+ class RAGPipeline:
+     def __init__(self, vectorstore, api_key, model_name="c4ai-aya-expanse-8b", k=3):
+         print("Initializing RAG Pipeline")
+         self.vectorstore = vectorstore
+         self.model_name = model_name
+         self.k = k
+         self.api_key = api_key
+         self.client = cohere.Client(api_key)  # Initialize the Cohere client
+         self.retriever = self.vectorstore.as_retriever(
+             search_type="mmr", search_kwargs={"k": self.k}  # use the configured k
+         )
+         self.prompt_template = PromptTemplate.from_template(self._get_template())
+
+     def _get_template(self):
+         return """<s>[INST] <<SYS>>
+ أنت مساعد مفيد يقدم إجابات باللغة العربية بناءً على السياق المقدم.
+ - أجب فقط باللغة العربية
+ - إذا لم تجد إجابة في السياق، قل أنك لا تعرف
+ - كن دقيقاً وواضحاً في إجاباتك
+ - جاوب من السياق حصريا
+ <</SYS>>
+
+ السياق: {context}
+
+ السؤال: {question}
+ الإجابة: [/INST]\
+
+ """
+
+     def generate_response(self, question):
+         retrieved_docs = self._retrieve_documents(question)
+         prompt = self._create_prompt(retrieved_docs, question)
+         response = self._generate_response_cohere(prompt)
+         return response
+
+     def _retrieve_documents(self, question):
+         retrieved_docs = self.retriever.invoke(question)
+         # print("\n=== Retrieved documents ===")
+         # for i, doc in enumerate(retrieved_docs):
+         #     print(f"Document {i+1}: {doc.page_content}")
+         # print("==========================\n")
+
+         # Merge the retrieved texts into a single context
+         return " ".join([doc.page_content for doc in retrieved_docs])
+
+     def _create_prompt(self, docs, question):
+         return self.prompt_template.format(context=docs, question=question)
+
+     def _generate_response_cohere(self, prompt):
+         # Call Cohere's Generate API
+         response = self.client.generate(
+             model=self.model_name,
+             prompt=prompt,
+             max_tokens=2000,  # Adjust token limit based on requirements
+             temperature=0.3,  # Control creativity
+             stop_sequences=None,
+         )
+
+         if response.generations:
+             return response.generations[0].text.strip()
+         else:
+             raise Exception("No response generated by Cohere API.")
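+     # Note: client.generate targets Cohere's legacy Generate endpoint; newer
+     # SDK versions recommend the Chat endpoint (client.chat), so this call may
+     # need migrating if the SDK is upgraded.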
+

  st.title("Simple Text Generator")
+ api_key = os.getenv("API_KEY")
+ print("API_KEY is set:", api_key is not None)  # avoid logging the key itself
+ rag_pipeline = RAGPipeline(vectorstore=vectorstore, api_key=api_key)
+ question = st.text_input("أدخل سؤالك هنا")  # "Enter your question here"
+ if st.button("Generate Answer"):
+     if question:
+         response = rag_pipeline.generate_response(question)
+         st.write(response)
+         print("Question: ", question)
+         print("Response: ", response)
+     else:
+         st.warning("Please enter a question")