Namitg02 committed on
Commit
7921068
·
verified ·
1 Parent(s): 502bb36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -10
app.py CHANGED
@@ -19,15 +19,46 @@ tokenizer = AutoTokenizer.from_pretrained(llm_model)
19
 
20
  #import numpy as np
21
 
22
- datasetiter = load_dataset("Namitg02/Test", split='train', streaming=False)
23
- dataset = to_map_style_dataset(datasetiter)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
 
26
  #dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
27
  #dataset = load_dataset("epfl-llm/guidelines", split='train')
28
  #Returns a list of dictionaries, each representing a row in the dataset.
29
- print(dataset[1])
30
- length = len(dataset)
31
 
32
  #Itemdetails = dataset.items()
33
  #print(Itemdetails)
@@ -39,18 +70,18 @@ embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
39
  #doc_func = lambda x: x.text
40
  #dataset = list(map(doc_func, dataset))
41
 
42
- def embedder(dataset):
43
- embeddings = embedding_model.encode(dataset["text"])
44
- dataset = dataset.add_column('embeddings', embeddings)
45
- return dataset
46
- updated_dataset = dataset.map(embedder)
47
  dataset['text'][:length]
48
 
49
  #print(embeddings)
50
 
51
  print(updated_dataset[1])
52
  print(updated_dataset[2])
53
- print(dataset[1])
54
 
55
  embedding_dim = embedding_model.get_sentence_embedding_dimension()
56
  #data = FAISS.from_embeddings(embed, embedding_model)
 
19
 
20
  #import numpy as np
21
 
22
+ from torch.utils.data import Dataset, IterableDataset
23
+
24
class MyIterableDataset(IterableDataset):
    """Expose an arbitrary Python iterable as a torch ``IterableDataset``.

    The wrapped object is stored as-is; nothing is copied or consumed until
    the dataset is iterated.
    """

    def __init__(self, iterable) -> None:
        """Store *iterable* so it can be iterated on demand."""
        super().__init__()
        self.iterable = iterable

    def __iter__(self):
        """Return a new iterator over the wrapped iterable.

        Each call delegates to ``iter(self.iterable)``, so a re-iterable
        container (list, tuple, ...) can be traversed repeatedly, while a
        one-shot iterator is exhausted after the first pass.
        """
        return iter(self.iterable)
31
+
32
class MapStyleDataset(Dataset):
    """Map-style torch ``Dataset`` backed by an in-memory list.

    The iterable passed to the constructor is fully materialised with
    ``list(...)`` so that ``len()`` and integer indexing are available.
    """

    def __init__(self, iterable) -> None:
        """Consume *iterable* once and keep its items in ``self.data``."""
        super().__init__()
        # Materialise eagerly: map-style access needs random indexing.
        self.data = list(iterable)

    def __len__(self) -> int:
        """Return the number of stored items."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item at position *idx* (plain list indexing)."""
        return self.data[idx]
42
+
43
# Imported explicitly: only `Dataset` and `IterableDataset` are pulled from
# torch.utils.data above, so the bare name `torch` may not be bound here.
from torch.utils.data import DataLoader

# Load the actual rows of the Hub dataset. Passing the repo id string itself
# to MapStyleDataset would only store its individual characters.
# NOTE(review): relies on `load_dataset` already being imported earlier in
# app.py (the previous revision called it here) -- confirm.
iterable = load_dataset("Namitg02/Test", split='train', streaming=False)

# Convert the iterable to a MapStyle dataset
map_style_dataset = MapStyleDataset(iterable)

# Create a DataLoader for the MapStyle dataset
data_loader = DataLoader(map_style_dataset, batch_size=2)

#dataset = load_dataset("not-lain/wikipedia",revision = "embedded")
#dataset = load_dataset("epfl-llm/guidelines", split='train')
#Returns a list of dictionaries, each representing a row in the dataset.
print(map_style_dataset[1])
length = len(map_style_dataset)
62
 
63
  #Itemdetails = dataset.items()
64
  #print(Itemdetails)
 
70
  #doc_func = lambda x: x.text
71
  #dataset = list(map(doc_func, dataset))
72
 
73
def embedder(map_style_dataset):
    """Return the rows of *map_style_dataset* with an ``embeddings`` field.

    A torch map-style dataset offers no ``.map()`` or ``.add_column()``, so
    the rows are collected into a list, their ``"text"`` values are encoded
    in one batch with the module-level ``embedding_model``, and a new list
    of row dicts is built. The input dataset is not modified.

    Assumes every row is a mapping with a ``"text"`` key -- TODO confirm
    against the schema of the loaded dataset.
    """
    rows = list(map_style_dataset)
    texts = [row["text"] for row in rows]
    # One batched encode call instead of one call per row.
    embeddings = embedding_model.encode(texts)
    return [
        {**row, "embeddings": vector}
        for row, vector in zip(rows, embeddings)
    ]


updated_dataset = embedder(map_style_dataset)
# The former no-op expression `dataset['text'][:length]` was dropped: the
# name `dataset` is no longer defined in this revision.

#print(embeddings)

print(updated_dataset[1])
print(updated_dataset[2])
print(map_style_dataset[1])

embedding_dim = embedding_model.get_sentence_embedding_dimension()
87
  #data = FAISS.from_embeddings(embed, embedding_model)