vishalsh13 committed on
Commit
a018b00
·
1 Parent(s): 6e651a2

Fresh start

Browse files
Files changed (6) hide show
  1. app.py +0 -35
  2. config.py +0 -14
  3. utils/data_loader.py +0 -42
  4. utils/logger.py +0 -11
  5. utils/vector_utils.py +0 -52
  6. vector_db/metadata.json +0 -0
app.py DELETED
@@ -1,35 +0,0 @@
1
- # app.py
2
-
3
- import os
4
- from flask import Flask, jsonify
5
- from utils.data_loader import download_dataset, save_metadata
6
- from utils.vector_utils import create_vector_db
7
- from config import v_auth_token, v_vector_folder, v_metadata_file
8
-
9
- app = Flask(__name__)
10
-
11
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: report that the service is up."""
    v_payload = {"status": "healthy"}
    return jsonify(v_payload), 200
14
-
15
@app.route('/initialize', methods=['POST'])
def initialize():
    """Build (or refresh) the vector database.

    Responds 500 when no auth token is configured; otherwise 200 with a
    status message describing whether a rebuild happened.
    """
    if not v_auth_token:
        return jsonify({"error": "Authentication token not found"}), 500

    # Make sure the writable working area exists before anything else runs.
    os.makedirs(v_vector_folder, exist_ok=True)
    if not os.path.exists(v_metadata_file):
        save_metadata(v_metadata_file, {})

    print("Starting Vector Database Creation...")

    v_downloaded = download_dataset(v_auth_token, v_metadata_file, v_vector_folder)
    if not v_downloaded:
        # download_dataset signalled "no change" — nothing to rebuild.
        return jsonify({"message": "Vector database is up-to-date"}), 200

    create_vector_db(v_downloaded, v_vector_folder, v_auth_token)
    return jsonify({"message": "Vector database created successfully"}), 200
33
-
34
if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the conventional Hugging Face Spaces port.
    app.run(host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.py DELETED
@@ -1,14 +0,0 @@
1
# config.py
#
# Central configuration for the vector-database service.

import os

# Dataset configuration.
# v_repo_id centralizes the repository identifier that was previously
# hard-coded in utils/data_loader.py and utils/vector_utils.py.
v_repo_id = "vishalsh13/Dataset1"
v_dataset_url = "https://huggingface.co/datasets/vishalsh13/Dataset1/tree/main"

# Authentication token retrieved from Hugging Face secret
v_auth_token = os.getenv("hkey")  # The secret name is `hkey`

# Paths for vector database and metadata
v_base_path = "/tmp/vector_db"  # Writable directory
v_vector_folder = os.path.join(v_base_path, "vectors")
v_metadata_file = os.path.join(v_base_path, "metadata.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/data_loader.py DELETED
@@ -1,42 +0,0 @@
1
- # utils/data_loader.py
2
-
3
- import os
4
- import json
5
- from huggingface_hub import hf_hub_download
6
-
7
def download_dataset(v_auth_token, v_metadata_file, v_vector_folder,
                     v_repo_id="vishalsh13/Dataset1", v_filename="train.csv"):
    """Download the dataset file when its metadata changed since last run.

    Args:
        v_auth_token: Hugging Face access token.
        v_metadata_file: Path of the locally cached metadata JSON.
        v_vector_folder: Vector output folder (not used here; kept for
            interface compatibility with existing callers).
        v_repo_id: Dataset repository to pull from. Default preserves the
            previously hard-coded value.
        v_filename: File inside the repository to download. Default
            preserves the previously hard-coded value.

    Returns:
        The local path of the downloaded file, or False when the cached
        metadata matches the current metadata and no download is needed.
    """
    v_current_metadata = fetch_metadata()
    v_existing_metadata = load_metadata(v_metadata_file)

    if v_current_metadata == v_existing_metadata:
        print("No updates detected. Skipping vector creation.")
        return False

    v_dataset_path = hf_hub_download(
        repo_id=v_repo_id,
        repo_type="dataset",
        filename=v_filename,
        token=v_auth_token,
    )

    print("Dataset downloaded successfully.")
    # Persist metadata only after a successful download so a failed
    # download does not mark the dataset as up-to-date.
    save_metadata(v_metadata_file, v_current_metadata)
    return v_dataset_path
28
-
29
def fetch_metadata():
    """Return the remote dataset's metadata fingerprint.

    Currently a stub: it simulates the remote state with a fixed
    version marker instead of querying the Hub.
    """
    v_stub = {"dataset_version": "v1.0"}
    return v_stub
32
-
33
def load_metadata(v_metadata_file):
    """Read cached metadata from disk; empty dict when no cache exists."""
    if not os.path.exists(v_metadata_file):
        return {}
    with open(v_metadata_file, "r") as fh:
        return json.load(fh)
38
-
39
def save_metadata(v_metadata_file, v_metadata):
    """Write *v_metadata* as JSON, creating parent directories as needed.

    Fix: the original always called os.makedirs(os.path.dirname(path)),
    which raises FileNotFoundError when the path has no directory
    component (os.makedirs("") is invalid). The directory is now only
    created when one is actually present.
    """
    v_parent = os.path.dirname(v_metadata_file)
    if v_parent:
        os.makedirs(v_parent, exist_ok=True)
    with open(v_metadata_file, "w") as fh:
        json.dump(v_metadata, fh)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/logger.py DELETED
@@ -1,11 +0,0 @@
1
- # utils/logger.py
2
-
3
- import logging
4
-
5
def setup_logger(v_log_file="app.log"):
    """Configure root logging to write to *v_log_file* and return the root logger.

    Note: logging.basicConfig is a no-op if the root logger already has
    handlers, so only the first call takes effect.
    """
    v_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename=v_log_file,
        level=logging.INFO,
        format=v_format,
    )
    return logging.getLogger()
 
 
 
 
 
 
 
 
 
 
 
 
utils/vector_utils.py DELETED
@@ -1,52 +0,0 @@
1
- # utils/vector_utils.py
2
-
3
- import os
4
- import faiss
5
- import numpy as np
6
- from huggingface_hub import Repository
7
- from sentence_transformers import SentenceTransformer
8
-
9
def create_vector_db(v_dataset_path, v_vector_folder, v_auth_token):
    """Embed each line of the dataset file and persist a FAISS index.

    The finished index is written under *v_vector_folder* and then pushed
    back to the Hugging Face dataset repository.
    """
    obj_model = SentenceTransformer('all-MiniLM-L6-v2')

    # One embedding per raw line of the input file (newlines included).
    with open(v_dataset_path, 'r') as fh:
        v_lines = fh.readlines()
    v_embeddings = obj_model.encode(v_lines)

    # Build a flat L2 index sized to the embedding dimension and save it.
    os.makedirs(v_vector_folder, exist_ok=True)
    v_vector_file = os.path.join(v_vector_folder, "vector_index")
    v_index = faiss.IndexFlatL2(v_embeddings.shape[1])
    v_index.add(np.array(v_embeddings))
    faiss.write_index(v_index, v_vector_file)

    print(f"Vector database created and saved locally at {v_vector_file}")

    # Mirror the local artifact back to the dataset repository.
    upload_to_huggingface_repo(v_vector_file, v_auth_token)
    print("Vector file successfully uploaded to Hugging Face dataset repository.")
31
-
32
def upload_to_huggingface_repo(v_file_path, v_auth_token):
    """
    Upload the given file to the Hugging Face dataset repository.
    """
    # NOTE(review): huggingface_hub.Repository is the git-clone based API
    # and has been deprecated upstream — confirm whether HfApi.upload_file
    # should replace it.
    v_repo_id = "vishalsh13/Dataset1"  # Replace with your repository name
    v_repo = Repository(
        local_dir="temp_repo",
        clone_from=v_repo_id,
        use_auth_token=v_auth_token
    )

    # Move the artifact into the checkout before committing.
    v_basename = os.path.basename(v_file_path)
    os.makedirs(v_repo.local_dir, exist_ok=True)
    v_dest_path = os.path.join(v_repo.local_dir, v_basename)
    os.replace(v_file_path, v_dest_path)

    # Stage, commit, and push the new file.
    v_repo.git_add(v_dest_path)
    v_repo.git_commit("Upload updated vector file.")
    v_repo.git_push()
    print(f"Uploaded {v_basename} to Hugging Face repository: {v_repo_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vector_db/metadata.json DELETED
File without changes