Update app.py

app.py CHANGED
@@ -1,4 +1,3 @@
-from huggingface_hub import HfApi, snapshot_download
 from helper import download_hugging_face_embeddings
 from url import md_files_url
 from get_data import extract_repo_details, fetch_md_file_via_api, data_loader, chunk_text
@@ -24,29 +23,49 @@ from datetime import datetime
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-
 base = {}
 last_messages = 4
 documents = []
+HF_ORG_NAME = 'HumbleBeeAI'
+DATASET_NAME = 'faiss_index'
+repo_id = f"{HF_ORG_NAME}/{DATASET_NAME}"
 
 load_dotenv()
 AUTH_TOKEN_KEY = os.environ.get('AUTH_TOKEN_KEY')
-
+
 os.environ['AUTH_TOKEN_KEY'] = AUTH_TOKEN_KEY
-os.environ['BASE_URL'] = BASE_URL
 HF_TOKEN = os.environ.get("HF_TOKEN")
-os.environ['HF_TOKEN'] = HF_TOKEN
-HF_USERNAME = "HumbleBeeAI" # Replace with your HF username
-DATASET_NAME = "faiss_index"
-index_path = "faiss_index"
 
-
-
-
+if not HF_TOKEN:
+    raise ValueError("🚨 HF_TOKEN is not set! Ensure you have the right permissions.")
+
+from huggingface_hub import HfApi
+
+api = HfApi()
+
+# 🔹 Check if the dataset exists in the organization
+try:
+    api.repo_info(repo_id, repo_type="dataset", token=HF_TOKEN)
+    print(f"✅ Dataset '{repo_id}' already exists in the organization.")
+except Exception:
+    print(f"🔴 Dataset '{repo_id}' not found. Creating it now...")
+
+    # Create repo inside the organization
+    api.create_repo(
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=True, # Ensure it's private
+        token=HF_TOKEN,
+        organization=HF_ORG_NAME # Specify organization
+    )
+    print(f"✅ Dataset '{repo_id}' created successfully in the organization.")
+
+db_path = "chatbot.db"
 
-# 🔹 Ensure
+# 🔹 Ensure `chatbot.db` exists before uploading
 if not os.path.exists(db_path):
     print("🔴 chatbot.db does not exist! Creating it now...")
+    import sqlite3
     conn = sqlite3.connect(db_path)
     cursor = conn.cursor()
     cursor.execute('''CREATE TABLE IF NOT EXISTS users (
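
Note: the repo_info try/except above works, but recent huggingface_hub releases take the namespace from the repo_id prefix and may no longer accept a separate organization keyword on create_repo; passing exist_ok=True also collapses the probe-then-create sequence into one idempotent call. A minimal sketch of that variant, reusing repo_id and HF_TOKEN from the diff:

    from huggingface_hub import HfApi

    api = HfApi(token=HF_TOKEN)

    # exist_ok=True makes this a no-op when the dataset already exists,
    # so no repo_info() probe is needed; the org lives in the repo_id prefix.
    api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
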
@@ -58,40 +77,33 @@ if not os.path.exists(db_path):
     conn.close()
     print("✅ chatbot.db created successfully!")
 
-# 🔹 Confirm file existence
-if os.path.exists(db_path):
-    print(f"✅ File chatbot.db found at {db_path}")
-else:
-    raise FileNotFoundError("🚨 chatbot.db was not found!")
-api = HfApi()
-
 # 🔹 Upload chatbot.db as a private dataset
 api.upload_file(
-    path_or_fileobj=db_path,
-    path_in_repo="chatbot.db",
-    repo_id=
+    path_or_fileobj=db_path,
+    path_in_repo="chatbot.db",
+    repo_id=repo_id,
     repo_type="dataset",
     token=HF_TOKEN
 )
 
-print("✅ chatbot.db successfully uploaded to
+print("✅ chatbot.db successfully uploaded to the organization's private dataset.")
+
+from huggingface_hub import snapshot_download
+import os
 
 # 🔹 Download chatbot.db securely
 db_folder = snapshot_download(
-    repo_id=
-    allow_patterns=["chatbot.db"], # Only download
+    repo_id=repo_id,
+    allow_patterns=["chatbot.db"], # Only download chatbot.db
     use_auth_token=HF_TOKEN
 )
 
-# 🔹 Define the database path
 DB_PATH = os.path.join(db_folder, "chatbot.db")
 
-# 🔹 Confirm database was downloaded
 if os.path.exists(DB_PATH):
     print(f"✅ Database downloaded at {DB_PATH}")
 else:
-    raise FileNotFoundError("🚨 Failed to download chatbot.db from
-
+    raise FileNotFoundError("🚨 Failed to download chatbot.db from the organization's dataset.")
 
 # ---- Database part ----- #
 # Database Connection
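
Note: snapshot_download defaults to the model namespace, so without repo_type="dataset" the call above will look for a model repo named HumbleBeeAI/faiss_index and fail; use_auth_token is also deprecated in favor of token. For a single file, hf_hub_download is the narrower tool. A sketch under the same repo_id/HF_TOKEN assumptions:

    from huggingface_hub import hf_hub_download

    # Fetch only chatbot.db from the private dataset; returns the cached local path.
    DB_PATH = hf_hub_download(
        repo_id=repo_id,
        filename="chatbot.db",
        repo_type="dataset",  # required: the default namespace is "model"
        token=HF_TOKEN,       # newer releases prefer token= over use_auth_token=
    )
    print(f"✅ Database available at {DB_PATH}")
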
@@ -181,63 +193,80 @@ for url in md_files_url:
         logging.error(f"Error processing URL {url}: {ve}")
 print(f"Fetched {len(documents)} documents.")
 
-
-
-
-# 🔹 Ensure FAISS index exists before uploading
-if not os.path.exists(faiss_index_path):
-    print("🔴 FAISS index not found! Creating a new FAISS index...")
+text_chunk = chunk_text(documents)
+# Define paths
+FAISS_LOCAL_PATH = "/tmp/faiss_index"
 
-
-
-    index = faiss.IndexFlatL2(d) # Create an empty FAISS index
-    faiss.write_index(index, os.path.join(faiss_index_path, "index.faiss"))
+# 🔹 Ensure FAISS directory exists
+os.makedirs(FAISS_LOCAL_PATH, exist_ok=True)
 
-
-
-
-
-
-
-
-
-
+try:
+    # 🔹 Create FAISS index
+    faiss_index = FAISS.from_documents(text_chunk, download_hugging_face_embeddings())
+
+    # 🔹 Save FAISS locally
+    faiss_index.save_local(FAISS_LOCAL_PATH)
+    print(f"✅ FAISS index successfully saved to {FAISS_LOCAL_PATH}")
+
+except Exception as e:
+    logging.error(f"🚨 Error creating or saving FAISS index: {e}")
+
+try:
+    api.repo_info(repo_id, repo_type="dataset", token=HF_TOKEN)
+    print(f"✅ Dataset '{repo_id}' already exists in the organization.")
+except Exception:
+    print(f"🔴 Dataset '{repo_id}' not found. Creating it now...")
+
+    # Create dataset in the organization
+    api.create_repo(
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=True,
+        token=HF_TOKEN,
+        organization=HF_ORG_NAME
+    )
+    print(f"✅ Dataset '{repo_id}' created successfully in the organization.")
 
-# 🔹 Upload FAISS
+# 🔹 Upload FAISS to the organization dataset
 api.upload_folder(
-    folder_path=
-    repo_id=
+    folder_path=FAISS_LOCAL_PATH,
+    repo_id=repo_id,
     repo_type="dataset",
     token=HF_TOKEN
 )
 
-print("✅ FAISS index successfully uploaded to
+print("✅ FAISS index successfully uploaded to the organization's private dataset.")
 
-# 🔹 Download FAISS index
-
-    repo_id=
-    allow_patterns=["faiss_index/*"],
+# 🔹 Download FAISS index from the private organization dataset
+faiss_download_folder = snapshot_download(
+    repo_id=repo_id,
+    allow_patterns=["faiss_index/*"],
     use_auth_token=HF_TOKEN
 )
 
-
-faiss_file_path = os.path.join(faiss_folder, "index.faiss")
+FAISS_PATH = os.path.join(faiss_download_folder, "faiss_index")
 
-
-
-    print(f"✅ FAISS index downloaded at {faiss_file_path}")
+if os.path.exists(FAISS_PATH):
+    print(f"✅ FAISS index found at {FAISS_PATH}, loading it now...")
 else:
-    raise FileNotFoundError("🚨
+    raise FileNotFoundError("🚨 FAISS index not found in the organization's dataset.")
+try:
+    # 🔹 Load FAISS index with LangChain
+    docsearch = FAISS.load_local(
+        FAISS_PATH,
+        download_hugging_face_embeddings(),
+        allow_dangerous_deserialization=True
+    )
 
-
-index = faiss.read_index(faiss_file_path)
+    print("✅ FAISS index successfully loaded for retrieval.")
 
-
-
-
+except Exception as e:
+    logging.error(f"🚨 Error loading FAISS index: {e}")
+
+
 retriever = docsearch.as_retriever(search_type='similarity', search_kwargs={'k':2})
 
-llm = Ollama(model='llama3.2'
+llm = Ollama(model='llama3.2')
 
 prompt = ChatPromptTemplate.from_messages(
     [
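
Note: upload_folder pushes the contents of folder_path to the repo root unless path_in_repo is set, so the download side's allow_patterns=["faiss_index/*"] would match nothing as written, and repo_type="dataset" is again missing from snapshot_download. A sketch of a consistent round trip, assuming the api, repo_id, HF_TOKEN, FAISS_LOCAL_PATH, FAISS, snapshot_download, and download_hugging_face_embeddings names defined in the diff:

    # Upload: pin the files under faiss_index/ so the download patterns match.
    api.upload_folder(
        folder_path=FAISS_LOCAL_PATH,
        path_in_repo="faiss_index",
        repo_id=repo_id,
        repo_type="dataset",
        token=HF_TOKEN,
    )

    # Download: same prefix, explicit dataset repo type.
    folder = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        allow_patterns=["faiss_index/*"],
        token=HF_TOKEN,
    )

    # Load the LangChain FAISS store that save_local() wrote above.
    docsearch = FAISS.load_local(
        os.path.join(folder, "faiss_index"),
        download_hugging_face_embeddings(),
        allow_dangerous_deserialization=True,
    )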