Spaces:
Runtime error
Runtime error
Commit
·
a018b00
1
Parent(s):
6e651a2
Fresh start
Browse files- app.py +0 -35
- config.py +0 -14
- utils/data_loader.py +0 -42
- utils/logger.py +0 -11
- utils/vector_utils.py +0 -52
- vector_db/metadata.json +0 -0
app.py
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
# app.py
"""Flask entry point exposing health and vector-DB initialization endpoints."""

import os

from flask import Flask, jsonify

from utils.data_loader import download_dataset, save_metadata
from utils.vector_utils import create_vector_db
from config import v_auth_token, v_vector_folder, v_metadata_file

app = Flask(__name__)


@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always reports healthy."""
    return jsonify({"status": "healthy"}), 200


@app.route('/initialize', methods=['POST'])
def initialize():
    """Build or refresh the vector database from the configured dataset."""
    if not v_auth_token:
        return jsonify({"error": "Authentication token not found"}), 500

    # Ensure writable base directory and metadata file exist before any work.
    os.makedirs(v_vector_folder, exist_ok=True)
    if not os.path.exists(v_metadata_file):
        save_metadata(v_metadata_file, {})

    print("Starting Vector Database Creation...")

    v_dataset_path = download_dataset(v_auth_token, v_metadata_file, v_vector_folder)
    if not v_dataset_path:
        # download_dataset returns a falsy value when metadata is unchanged.
        return jsonify({"message": "Vector database is up-to-date"}), 200

    create_vector_db(v_dataset_path, v_vector_folder, v_auth_token)
    return jsonify({"message": "Vector database created successfully"}), 200


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
# config.py
"""Central configuration: dataset location, auth token, and storage paths."""

import os

# Dataset configuration
v_dataset_url = "https://huggingface.co/datasets/vishalsh13/Dataset1/tree/main"

# Authentication token retrieved from Hugging Face secret
v_auth_token = os.getenv("hkey")  # The secret name is `hkey`

# Paths for vector database and metadata
v_base_path = "/tmp/vector_db"  # Writable directory
v_vector_folder = os.path.join(v_base_path, "vectors")
v_metadata_file = os.path.join(v_base_path, "metadata.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/data_loader.py
DELETED
@@ -1,42 +0,0 @@
|
|
1 |
-
# utils/data_loader.py
|
2 |
-
|
3 |
-
import os
|
4 |
-
import json
|
5 |
-
from huggingface_hub import hf_hub_download
|
6 |
-
|
7 |
-
def download_dataset(v_auth_token, v_metadata_file, v_vector_folder):
    """Download the dataset file when its metadata changed since the last run.

    Returns the local dataset path on success, or False when the stored
    metadata already matches the current one (no rebuild needed).
    v_vector_folder is unused here but kept for interface compatibility.
    """
    v_current_metadata = fetch_metadata()
    v_existing_metadata = load_metadata(v_metadata_file)
    if v_current_metadata == v_existing_metadata:
        print("No updates detected. Skipping vector creation.")
        return False

    # Define the specific file to download
    v_filename = "train.csv"  # Replace this with the actual filename in your repository
    v_dataset_path = hf_hub_download(
        repo_id="vishalsh13/Dataset1",
        repo_type="dataset",
        # subfolder="data",  # Adjust or remove subfolder as needed
        filename=v_filename,
        token=v_auth_token,
    )

    print("Dataset downloaded successfully.")
    # Record the metadata we just built against, so the next call can skip.
    save_metadata(v_metadata_file, v_current_metadata)
    return v_dataset_path
|
28 |
-
|
29 |
-
def fetch_metadata():
    """Simulate fetching remote dataset metadata (e.g. a hash or timestamp)."""
    return {"dataset_version": "v1.0"}
|
32 |
-
|
33 |
-
def load_metadata(v_metadata_file):
    """Return the metadata dict stored at *v_metadata_file*, or {} if absent."""
    if not os.path.exists(v_metadata_file):
        return {}
    with open(v_metadata_file, "r") as fh:
        return json.load(fh)
|
38 |
-
|
39 |
-
def save_metadata(v_metadata_file, v_metadata):
    """Persist *v_metadata* as JSON at *v_metadata_file*, creating parent dirs.

    Fix: ``os.path.dirname`` yields "" for a bare filename, and
    ``os.makedirs("")`` raises FileNotFoundError — only create the parent
    directory when the path actually has one.
    """
    v_parent = os.path.dirname(v_metadata_file)
    if v_parent:  # bare filenames have no parent directory to create
        os.makedirs(v_parent, exist_ok=True)
    with open(v_metadata_file, "w") as fh:
        json.dump(v_metadata, fh)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/logger.py
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
# utils/logger.py
|
2 |
-
|
3 |
-
import logging
|
4 |
-
|
5 |
-
def setup_logger(v_log_file="app.log"):
    """Configure root logging to write to *v_log_file* and return the root logger.

    Note: ``logging.basicConfig`` is a no-op if the root logger was already
    configured elsewhere in the process.
    """
    v_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(filename=v_log_file, level=logging.INFO, format=v_format)
    return logging.getLogger()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/vector_utils.py
DELETED
@@ -1,52 +0,0 @@
|
|
1 |
-
# utils/vector_utils.py
|
2 |
-
|
3 |
-
import os
|
4 |
-
import faiss
|
5 |
-
import numpy as np
|
6 |
-
from huggingface_hub import Repository
|
7 |
-
from sentence_transformers import SentenceTransformer
|
8 |
-
|
9 |
-
def create_vector_db(v_dataset_path, v_vector_folder, v_auth_token):
    """Embed each line of the dataset, build a FAISS L2 index, save and upload it."""
    # Initialize the embedding model
    obj_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load and process data: one embedding per input line
    with open(v_dataset_path, 'r') as fh:
        v_lines = fh.readlines()
    v_embeddings = obj_model.encode(v_lines)

    # Save vectors locally
    os.makedirs(v_vector_folder, exist_ok=True)
    v_vector_file = os.path.join(v_vector_folder, "vector_index")
    v_index = faiss.IndexFlatL2(v_embeddings.shape[1])
    v_index.add(np.array(v_embeddings))
    faiss.write_index(v_index, v_vector_file)

    print(f"Vector database created and saved locally at {v_vector_file}")

    # Save vector file back to Hugging Face dataset repository
    upload_to_huggingface_repo(v_vector_file, v_auth_token)
    print("Vector file successfully uploaded to Hugging Face dataset repository.")
|
31 |
-
|
32 |
-
def upload_to_huggingface_repo(v_file_path, v_auth_token):
    """
    Upload the given file to the Hugging Face dataset repository.
    """
    # NOTE(review): `Repository` is deprecated in recent huggingface_hub
    # releases in favour of HfApi.upload_file — confirm the installed version.
    v_repo_id = "vishalsh13/Dataset1"  # Replace with your repository name
    v_repo = Repository(
        local_dir="temp_repo",
        clone_from=v_repo_id,
        use_auth_token=v_auth_token,
    )

    # Move the file into the clone (os.replace removes the original path).
    os.makedirs(v_repo.local_dir, exist_ok=True)
    v_dest_path = os.path.join(v_repo.local_dir, os.path.basename(v_file_path))
    os.replace(v_file_path, v_dest_path)

    # Commit and push the changes
    v_repo.git_add(v_dest_path)
    v_repo.git_commit("Upload updated vector file.")
    v_repo.git_push()
    print(f"Uploaded {os.path.basename(v_file_path)} to Hugging Face repository: {v_repo_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vector_db/metadata.json
DELETED
File without changes
|