m7n committed on
Commit
a076d8c
·
verified ·
1 Parent(s): ace3f3b

Delete data_setup.py

Browse files
Files changed (1) hide show
  1. data_setup.py +0 -117
data_setup.py DELETED
@@ -1,117 +0,0 @@
1
- import pickle
2
- import requests
3
- import umap
4
- from numba.typed import List
5
- import torch
6
- from sentence_transformers import SentenceTransformer
7
- import time
8
- from pathlib import Path
9
-
10
def check_resources(files_dict, basemap_path, mapper_params_path) -> bool:
    """
    Check if all required resources are present.

    Every missing resource is reported on stdout, downloaded files first,
    then the basemap, then the mapper parameters.

    Args:
        files_dict (dict): Dictionary mapping filenames to their download URLs
        basemap_path (str): Path to the basemap pickle file
        mapper_params_path (str): Path to the UMAP mapper parameters pickle file

    Returns:
        bool: True if all resources are present, False otherwise
    """
    # (label used in the message, path to check); iterating the dict
    # directly yields its keys, i.e. the filenames.
    required = [("file", filename) for filename in files_dict]
    required.append(("basemap file", basemap_path))
    required.append(("mapper params file", mapper_params_path))

    missing = [
        (label, path) for label, path in required if not Path(path).exists()
    ]
    for label, path in missing:
        print(f"Missing {label}: {path}")

    return not missing
41
-
42
def download_required_files(files_dict):
    """
    Download required files from URLs only if they don't exist.

    Each download is streamed to a temporary ``<filename>.part`` file and
    renamed onto the final name only after it completed successfully, so a
    failed or interrupted download never leaves a file that later runs would
    mistake for a valid one.

    Args:
        files_dict (dict): Dictionary mapping filenames to their download URLs

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    print(f"Checking required files: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    files_to_download = {
        filename: url
        for filename, url in files_dict.items()
        if not Path(filename).exists()
    }

    if not files_to_download:
        print("All files already present, skipping downloads")
        return

    print(f"Downloading missing files: {list(files_to_download.keys())}")
    for filename, url in files_to_download.items():
        print(f"Downloading {filename}...")
        target = Path(filename)
        # A leftover .part file from an earlier failed attempt is overwritten.
        partial = target.with_name(target.name + ".part")

        # (connect, read) timeouts: without them a stalled server blocks forever.
        with requests.get(url, stream=True, timeout=(10, 120)) as response:
            # Refuse to store an HTTP error page as if it were the resource.
            response.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        partial.replace(target)
67
-
68
def setup_basemap_data(basemap_path):
    """
    Load and setup the base map data.

    Args:
        basemap_path (str): Path to the basemap pickle file

    Returns:
        The object unpickled from ``basemap_path``.

    Raises:
        FileNotFoundError: If ``basemap_path`` does not exist.
    """
    print(f"Getting basemap data: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    # NOTE: unpickling executes arbitrary code; only load files from a
    # trusted source.
    with open(basemap_path, 'rb') as basemap_file:
        basedata_df = pickle.load(basemap_file)
    return basedata_df
78
-
79
def setup_mapper(mapper_params_path):
    """
    Setup and configure the UMAP mapper.

    The pickle is read as a dict with two optional keys: ``'umap_params'``
    (applied through ``set_params``, minus ``'target_backend'``) and
    ``'umap_attributes'`` (set directly as attributes on the mapper).

    Args:
        mapper_params_path (str): Path to the UMAP mapper parameters pickle file

    Returns:
        umap.UMAP: The mapper with the stored parameters and attributes applied.

    Raises:
        FileNotFoundError: If ``mapper_params_path`` does not exist.
    """
    print(f"Getting Mapper: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # NOTE: unpickling executes arbitrary code; only load files from a
    # trusted source.
    with open(mapper_params_path, 'rb') as params_file:
        params_new = pickle.load(params_file)
    print("setting up mapper...")
    mapper = umap.UMAP()

    # 'target_backend' is filtered out before set_params -- presumably it is
    # not a parameter umap.UMAP accepts; confirm against the producer of the file.
    umap_params = {
        k: v
        for k, v in params_new.get('umap_params', {}).items()
        if k != 'target_backend'
    }
    mapper.set_params(**umap_params)

    umap_attributes = params_new.get('umap_attributes', {})
    for attr, value in umap_attributes.items():
        if attr != 'embedding_':
            setattr(mapper, attr, value)

    # The stored embedding is the one attribute wrapped in a numba typed List
    # rather than assigned as-is.
    if 'embedding_' in umap_attributes:
        mapper.embedding_ = List(umap_attributes['embedding_'])

    return mapper
104
-
105
def setup_embedding_model(model_name):
    """
    Setup the SentenceTransformer model.

    Args:
        model_name (str): Name or path of the SentenceTransformer model

    Returns:
        SentenceTransformer: The model constructed from ``model_name``.
    """
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    print(f"Setting up language model: {timestamp}")

    # NOTE(review): the device is only reported here; it is not passed to
    # SentenceTransformer, which is left to choose its own device.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    return SentenceTransformer(model_name)