warhawkmonk committed
Commit 629ef31 · verified · 1 Parent(s): 47b4b09

Update data_collector.py

Files changed (1)
  1. data_collector.py +399 -399
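
The substance of this commit is a swap inside consume_llm_api: the locally hosted Ollama call becomes the commented-out variant, and the remote HTTP endpoint becomes the active one. A minimal side-by-side sketch of the two variants as they appear in the file below (the ngrok URL and the "extension" payload field come from the committed code; giving the variants separate names is only for illustration, not something the commit does):

import requests

# Variant enabled by this commit: call the remote endpoint (URL and
# payload fields are the ones committed in data_collector.py).
def consume_llm_api_remote(prompt):
    url = "https://3c93-70-167-32-130.ngrok-free.app/api/llm-response"
    payload = {"prompt": prompt, "extension": "1"}
    response = requests.post(url, json=payload, verify=False)
    return response.json()["text"]

# Variant disabled by this commit: local llama3 via LangChain's Ollama wrapper.
def consume_llm_api_local(prompt):
    from langchain_community.llms import Ollama
    model = Ollama(model="llama3:latest", temperature=0.3)
    return model.invoke(prompt)
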
data_collector.py CHANGED
@@ -1,399 +1,399 @@
-
- import wikipedia
- import wikipediaapi
- import regex as re
- from sentence_transformers import SentenceTransformer,util
- from transformers import pipeline
- import requests
-
- # def consume_llm_api(prompt):
- #     """
- #     Sends a prompt to the LLM API and processes the streamed response.
- #     """
- #     url = "https://3c93-70-167-32-130.ngrok-free.app/api/llm-response"
- #     headers = {"Content-Type": "application/json"}
- #     payload = {"prompt": prompt,"extension":"1"}
-
-
- #     print("Sending prompt to the LLM API...")
- #     response_ = requests.post(url, json=payload,verify=False)
- #     response_data = response_.json()
- #     return response_data['text']
- def consume_llm_api(prompt):
-     model = Ollama(model="llama3:latest", temperature=0.3)
-     return model.invoke(prompt)
-
-
- def relevent_value(long_query,count=3):
-     results = wikipedia.search(long_query,results=count)
-
-     wiki_wiki = wikipediaapi.Wikipedia(user_agent='MyProjectName ([email protected])', language='en',extract_format=wikipediaapi.ExtractFormat.WIKI)
-     wiki_wiki_html = wikipediaapi.Wikipedia(user_agent='MyProjectName ([email protected])', language='en',extract_format=wikipediaapi.ExtractFormat.HTML)
-     values={}
-     html_values={}
-     for result in results:
-         page_py = wiki_wiki.page(result)
-         page_html = wiki_wiki_html.page(result)
-         html_values[result]=page_html.text
-
-         values[result]=page_py.text
-     return values,html_values
-
-
- from langchain_community.llms import Ollama
- model=Ollama(model="llama3:latest",temperature=0.3)
- agent_understanding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
- qa_model = pipeline('question-answering', model='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2')
-
- # textual_value
-
- def construction_edit(textual_value,schema):
-     construction_prompt= textual_value+"\n"
-     construction_prompt+="Above is the generated text from wikipedia and below is the rule that has to be filled in the data. \n"
-     construction_prompt+="The data should be in the form of a dictionary and it must follow the following schema: \n"
-     construction_prompt+=str(schema)+"\n"
-     construction_prompt+="The length of each list of each key must be same in the generated data(mandatory)."+"\n"
-     construction_prompt+="No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' ."+ "\n"
-     construction_prompt+="The output must be a dictionary"+"\n"
-     constructed_text=consume_llm_api(construction_prompt)
-     return constructed_text
-
- def dictionary_check(construction_edit):
-     for keys in construction_edit:
-         if len(construction_edit[keys])==0:
-             return False
-     return True
-
- def actual_value(textual_value,schema):
-     for j in textual_value:
-         formatted_result = str(textual_value[j])+ "\n"
-         formatted_result += "Please fill the following schema with the relevant data from the text above."+ "\n"
-         formatted_result += "Here is the schema"+"\n"
-         formatted_result += str(schema)
-         formatted_result += "Please generate data according to schema and fill this template with your answers.\n"
-         formatted_result += "You have to fill each key with the relevant data from the text above."+ "\n"
-         formatted_result += "Please return the exact key value pair as the schema above. "+ "\n"
-         formatted_result += "No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' ."+ "\n"
-         formatted_result += "Only fill the keys that are in the schema."+ "\n"
-         formatted_result += "If you are not sure about the data, you can add 'Na'."+ "\n"
-         formatted_result += "It's an order you can not add any other text(e.g Here is the filled-in JSON schema) or note ."+ "\n"
-         formatted_result += "The length of each list of each key must be same in the generated data(mandatory)."+"\n"
-         raw_output = consume_llm_api(formatted_result)
-         try:
-             data=construction_edit(raw_output,schema)
-             json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', data)
-             access_value=eval(json_object_match.group())
-             for schema_key in schema:
-                 if schema_key not in access_value:
-                     access_value[schema_key]=list(set())
-             for schema_key in access_value:
-                 access_value[schema_key]=list(set(access_value[schema_key]))
-                 access_value[schema_key]=list(set(access_value[schema_key])-set(["Na"]))
-             yield access_value
-
-         except:
-             access_value=None
-
-
-
-
- def context_data_relevancy(value,context):
-     researcher = "You are a professional reasearcher from data ."+ "\n"
-     researcher += "You have to check can we fill some of the missing values in the "+str(value) + ". \n"
-     researcher += "The possible part which available in the context has to be relevent with already present data"+ ". \n"
-     researcher += "from the context given below"+ ". \n"
-     researcher += context+ "\n"
-     researcher += "Be strict while thing of filling data"+ ". \n"
-     researcher += "Just return @yahoo@ if 90% possible else @NO@"+ ". \n"
-
-
-     result = consume_llm_api(researcher)
-     return result
-
- def agent_work_result(query,value):
-     agent_understanding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-     query_embedding = agent_understanding.encode(query)
-     score1 = util.cos_sim(query_embedding,agent_understanding.encode("extract data for"))
-     score2 = util.cos_sim(query_embedding,agent_understanding.encode("append data in "))
-     score3 = util.cos_sim(query_embedding,agent_understanding.encode("check data"))
-
-     if score1 > score2 and score1 > score3:
-         # print("Extracting query:", query)
-         question = "search word ?"
-         result = qa_model(question=question, context=query)
-         result = result['answer']
-         print("Extracting query:", result)
-         wikisearch = relevent_value(result,3)
-         html_pages = wikisearch[1]
-         wikisearch = wikisearch[0]
-
-         for searches in wikisearch:
-             if "@yahoo@" in context_data_relevancy(value,wikisearch[searches]):
-                 return wikisearch[searches]
-         return "No data found"
-     elif score2 > score1 and score2 > score3:
-         try:
-             print("Appending command:", query)
-             question1 = "which value we are adding to key ?"
-             result1 = qa_model(question=question1, context=query)
-             question2 = "In which key we are appending ?"
-             result2 = qa_model(question=question2, context=query)
-             result1 = result1['answer']
-             result2 = result2['answer']
-
-             if len(value[result2])==0:
-                 value[result2].append(result1)
-                 return "Now you can fill the remaining columns"
-             else:
-                 return "You are putting value in the same key column again not accepted."
-         except Exception as e:
-             return str(e)
-     else:
-         min_=0
-         max_=0
-         for keys in value:
-
-             if len(value[keys])<min_:
-                 min_=len(value[keys])
-             if len(value[keys])>max_:
-                 max_=len(value[keys])
-         if min_==max_:
-             return "You dia a great job"
-         else:
-             return "Please append the data correctly so that the length of each key is same and data is also relevant"
-
-
- def full_alignment(value):
-     for values in value:
-         if len(value[values])==0:
-             return False
-     return True
-
- def query_formatting(result):
-     values=result.split("\n")
-     if len(values)!=0:
-         values.pop(0)
-     return values
- def missing_value_completion(store,value):
-
-     filler_prompt = "Below is mentioned ajson data\n"
-     filler_prompt += str(value)+"\n"
-     filler_prompt += "you only need to find missing data from the mentioned context section."
-     filler_prompt += "You will return the results in below mentioned format.\n"
-     filler_prompt += "The output will be in json format."
-     filler_prompt += "context:\n"
-
-     for search_key in store:
-         try:
-             fill_text = store[search_key]
-             response = consume_llm_api(filler_prompt+fill_text)
-
-             json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', response)
-             access_value=eval(json_object_match.group())
-             for keys in value:
-                 if len(value[keys])==0 and keys in access_value:
-                     value[keys].append(access_value[keys].pop(0))
-             print(value)
-             if full_alignment(value):
-                 return value
-         except:
-             pass
-
-
-
-
- def verification(value):
-
-
-     validation_prompt = "Can you prepare a list of text(many as possible) that can be searched on google for filling(relevent data) the missing data below.\n"
-     validation_prompt += str(value)+"\n"
-     validation_prompt += "You need to prepare it by the following manner"
-     validation_prompt += "1. Mention it line by line.\n"
-     validation_prompt += "2. Please seperate it line by line.\n"
-     validation_prompt += "3. Headers are not required\n"
-     validation_prompt += "4. Please do not add any helper text example: Here is the required search queries , Here are the search queries .\n"
-     validation_prompt += "5. Please do not add any notes"
-     print("Searching for missing values")
-     result=query_formatting(consume_llm_api(validation_prompt))
-
-     for search_queries in result:
-         if len(search_queries)!=0:
-             print(search_queries)
-             store=relevent_value(search_queries)
-             html_pages = store[1]
-             store = store[0]
-             missing_value_completion(store,value)
-             if full_alignment(value):
-                 return value
-
-
-
-
-
-     return result
-
- def agent_data_prep(value,query):
-     end_result = ""
-     angent_earlier_income ="0"
-     pre_money_saving = "0"
-     mission = "First to fill most importent column \n"
-     while end_result!="You dia a great job":
-
-         if full_alignment(value):
-             return value
-
-
-         agent_instruction = mission
-         agent_instruction += "your previous income"+pre_money_saving+"\n"
-         agent_instruction += "your current income"+angent_earlier_income+"\n"
-         pre_money_saving = angent_earlier_income
-         if end_result=="You are putting value in the same key column again not accepted.":
-
-             mission = "Why you are always filling the"+[i for i in value][-1]+"only.\n"
-             mission += "We are removing $1000 from you account \n"
-             angent_earlier_income = str(int(angent_earlier_income)-1000)
-         agent_instruction += end_result + "\n" +"Above is the result of your previous command. Please give the next command to the agent."
-         agent_instruction += query + "\n"
-         agent_instruction += "Below is the data gathered upto now" + "\n"
-         agent_instruction += str(value) + "\n"
-         agent_instruction += "Please utilize the tool where you can command the agent to do any of the following tasks(one instruction at a time )"+ "\n"
-         agent_instruction += "You only have to fill one value for each key if its not present. \n"
-         agent_instruction += "From now onwards your each statement is understand as command which is categoried in any of the commands in mentioned below examples. \n"
-         agent_instruction += "1. Ask agent to extract data from the web about anything like search for lamp production ,smartphone parts etc .\n"
-         agent_instruction += "2. Give any specific value to append in current generated data . Please also mention the key in which the agent has to append the data .\n"
-         agent_instruction += "3. Ask the agent to put the generated data on check weather each column fills correctly or not .\n"
-         agent_instruction += "Here is the instruction to give commands to the agent. \n"
-         agent_instruction += "You can give commands to the agent ,few examples are mentioned below. \n"
-
-         agent_instruction += "1. Extract data about iron man suit or iron man suit mark1 \n"
-         agent_instruction += "(while thinking about extract data look into the data \n"
-         agent_instruction += "where data can be append and then search relevent query \n"
-         agent_instruction += "like green arrow from DC only if DC and green arraow is in different column key values )\n\n"
-
-         agent_instruction += "2. Append value 'bmw 4' to Car Model key \n"
-         agent_instruction += "(While appending the value you must have read the data from extract data command and remember, if you found anything relevent don't forget to append.\n"
-         agent_instruction += "The appending value has to be different not already present.) \n\n"
-
-         agent_instruction += "Any different grammatical version of the above commands. \n"
-         agent_instruction += "Command has to be given only for 'data filling' purpose. \n"
-
-         agent_instruction += "While command like search for or extract information about something it has to be relevent query search. \n"
-         agent_instruction += "The relevent the query the more accurate the data will be. \n"
-         agent_instruction += "Be cautious while filling the data It has to be correct. \n"
-         agent_instruction += "For each correct append you will get $1000. \n"
-
-         agent_instruction += "Give your command only no text . \n"
-
-         agent_instruction += "There will an audit after filling all the columns on data for its validity. \n"
-         agent_instruction += "Some mistakes are okay but But if we find you guilty there are some repercussion."
-
-         # instructionto give commands to the agent
-
-         judgement = Ollama(model = "llama3:latest")
-         command = judgement.invoke(agent_instruction)
-
-         end_result = agent_work_result(command,value)
-         if "Now you can fill the remaining columns" in end_result:
-             angent_earlier_income = str(int(angent_earlier_income)+1000)
-         print("--------------------")
-         print(value)
-         print("--------------------")
-     return value
-
- def dictionary_formatting(value):
-     new_dict={}
-     for data_keys in [i for i in value]:
-         key_values = data_keys.strip()
-         if key_values in value:
-             if key_values not in new_dict:
-                 new_dict[key_values] =[]
-             new_dict[key_values] = value.pop(key_values)
-         else:
-             new_dict[key_values] = value.pop(data_keys)
-     return new_dict
-
-
- def schema_formatter(output):
-     schema = {i:[] for i in output.split(",")}
-     return schema
- def schema_generator(query):
-
-     formatting = "The above statement is given by the user. Please create a single .csv-based schema by following the points below:\n"
-
-     formatting += "1. Only create the schema, no additional text or statement.\n"
-
-     formatting += "2. Keep the schema simple, avoid complex column names.\n"
-
-     formatting+= "3. please only generate 5 schema if not mentioned.\n"
-
-     formatting += "4. For example, if the user provides a statement like: 'Generate data for students getting placements from IIT Bombay,' the response should be:\n"
-
-     formatting += "Student Name, Student Roll Number, Student Branch, Student Year, Student Placement Status, Student Company Name, Student Package, Student Location, Student Role\n"
-
-     formatting += "Follow the above example but remember above is not actual schema you have to provide the schema depending on the user prompt.\n"
-
-     formatting+= "5. please only generate schema no notes or anything.\n"
-
-     output=consume_llm_api(query+"\n"+formatting)
-
-     return schema_formatter(output)
- def sorting(data_dict):
-     new_dict={str(i):0 for i in data_dict}
-
-     for i in data_dict:
-         for j in i:
-             if len(i[j])!=0:
-                 new_dict[str(i)] +=1
-     new_dict=[(new_dict[i],i) for i in new_dict]
-     new_dict.sort(reverse=True)
-     new_dict={i[-1]:i[0] for i in new_dict}
-     return new_dict
-
-
- def process_data(query):
-
-
-
-
-     formatting = "The above statement is given by the user. Please create a single .csv-based schema by following the points below:\n"
-     formatting += "1. Only create the schema, no additional text or statement.\n"
-     formatting += "2. Keep the schema simple, avoid complex column names.\n"
-     formatting+= "3. please only generate 5 schema if not mentioned.\n"
-     formatting += "4. For example, if the user provides a statement like: 'Generate data for students getting placements from IIT Bombay,' the response should be:\n"
-     formatting += "Student Name, Student Roll Number, Student Branch, Student Year, Student Placement Status, Student Company Name, Student Package, Student Location, Student Role\n"
-     formatting += "Follow the above example but remember above is not actual schema you have to provide the schema depending on the user prompt.\n"
-     formatting+= "5. please only generate schema no notes or anything.\n"
-     print("Query:",query)
-     output=consume_llm_api(query+"\n"+formatting)
-
-     schema = {i:[] for i in output.split(",")}
-     textual_value=relevent_value(str(schema).lower(),3)
-     html_pages = textual_value[1]
-     textual_value = textual_value[0]
-     data_dict =[j for j in actual_value(textual_value,schema)]
-     for j in sorting(data_dict):
-         try:
-             # Convert string to dictionary
-             dummy_value = eval(j)
-
-             # Process dictionary values
-             for key in dummy_value:
-                 while len(dummy_value[key]) >= 2:
-                     dummy_value[key].pop(0)
-
-             # Format dictionary
-             formatted = dictionary_formatting(dummy_value)
-             print(formatted)
-             # Verify and store result
-             verification_result = verification(formatted) if formatted else None
-
-             yield verification_result
-
-         except Exception as e:
-             print(f"Error processing dictionary {j}: {e}")
-
-
- # for j in process_data("Generate data for smart phones"):
- #     print(j)
-
-
 
+
+ import wikipedia
+ import wikipediaapi
+ import regex as re
+ from sentence_transformers import SentenceTransformer,util
+ from transformers import pipeline
+ import requests
+
+ def consume_llm_api(prompt):
+     """
+     Sends a prompt to the LLM API and processes the streamed response.
+     """
+     url = "https://3c93-70-167-32-130.ngrok-free.app/api/llm-response"
+     headers = {"Content-Type": "application/json"}
+     payload = {"prompt": prompt,"extension":"1"}
+
+
+     print("Sending prompt to the LLM API...")
+     response_ = requests.post(url, json=payload,verify=False)
+     response_data = response_.json()
+     return response_data['text']
+ # def consume_llm_api(prompt):
+ #     model = Ollama(model="llama3:latest", temperature=0.3)
+ #     return model.invoke(prompt)
+
+
+ def relevent_value(long_query,count=3):
+     results = wikipedia.search(long_query,results=count)
+
+     wiki_wiki = wikipediaapi.Wikipedia(user_agent='MyProjectName ([email protected])', language='en',extract_format=wikipediaapi.ExtractFormat.WIKI)
+     wiki_wiki_html = wikipediaapi.Wikipedia(user_agent='MyProjectName ([email protected])', language='en',extract_format=wikipediaapi.ExtractFormat.HTML)
+     values={}
+     html_values={}
+     for result in results:
+         page_py = wiki_wiki.page(result)
+         page_html = wiki_wiki_html.page(result)
+         html_values[result]=page_html.text
+
+         values[result]=page_py.text
+     return values,html_values
+
+
+ from langchain_community.llms import Ollama
+ model=Ollama(model="llama3:latest",temperature=0.3)
+ agent_understanding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ qa_model = pipeline('question-answering', model='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2')
+
+ # textual_value
+
+ def construction_edit(textual_value,schema):
+     construction_prompt= textual_value+"\n"
+     construction_prompt+="Above is the generated text from wikipedia and below is the rule that has to be filled in the data. \n"
+     construction_prompt+="The data should be in the form of a dictionary and it must follow the following schema: \n"
+     construction_prompt+=str(schema)+"\n"
+     construction_prompt+="The length of each list of each key must be same in the generated data(mandatory)."+"\n"
+     construction_prompt+="No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' ."+ "\n"
+     construction_prompt+="The output must be a dictionary"+"\n"
+     constructed_text=consume_llm_api(construction_prompt)
+     return constructed_text
+
+ def dictionary_check(construction_edit):
+     for keys in construction_edit:
+         if len(construction_edit[keys])==0:
+             return False
+     return True
+
+ def actual_value(textual_value,schema):
+     for j in textual_value:
+         formatted_result = str(textual_value[j])+ "\n"
+         formatted_result += "Please fill the following schema with the relevant data from the text above."+ "\n"
+         formatted_result += "Here is the schema"+"\n"
+         formatted_result += str(schema)
+         formatted_result += "Please generate data according to schema and fill this template with your answers.\n"
+         formatted_result += "You have to fill each key with the relevant data from the text above."+ "\n"
+         formatted_result += "Please return the exact key value pair as the schema above. "+ "\n"
+         formatted_result += "No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' ."+ "\n"
+         formatted_result += "Only fill the keys that are in the schema."+ "\n"
+         formatted_result += "If you are not sure about the data, you can add 'Na'."+ "\n"
+         formatted_result += "It's an order you can not add any other text(e.g Here is the filled-in JSON schema) or note ."+ "\n"
+         formatted_result += "The length of each list of each key must be same in the generated data(mandatory)."+"\n"
+         raw_output = consume_llm_api(formatted_result)
+         try:
+             data=construction_edit(raw_output,schema)
+             json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', data)
+             access_value=eval(json_object_match.group())
+             for schema_key in schema:
+                 if schema_key not in access_value:
+                     access_value[schema_key]=list(set())
+             for schema_key in access_value:
+                 access_value[schema_key]=list(set(access_value[schema_key]))
+                 access_value[schema_key]=list(set(access_value[schema_key])-set(["Na"]))
+             yield access_value
+
+         except:
+             access_value=None
+
+
+
+
+ def context_data_relevancy(value,context):
+     researcher = "You are a professional reasearcher from data ."+ "\n"
+     researcher += "You have to check can we fill some of the missing values in the "+str(value) + ". \n"
+     researcher += "The possible part which available in the context has to be relevent with already present data"+ ". \n"
+     researcher += "from the context given below"+ ". \n"
+     researcher += context+ "\n"
+     researcher += "Be strict while thing of filling data"+ ". \n"
+     researcher += "Just return @yahoo@ if 90% possible else @NO@"+ ". \n"
+
+
+     result = consume_llm_api(researcher)
+     return result
+
+ def agent_work_result(query,value):
+     agent_understanding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+     query_embedding = agent_understanding.encode(query)
+     score1 = util.cos_sim(query_embedding,agent_understanding.encode("extract data for"))
+     score2 = util.cos_sim(query_embedding,agent_understanding.encode("append data in "))
+     score3 = util.cos_sim(query_embedding,agent_understanding.encode("check data"))
+
+     if score1 > score2 and score1 > score3:
+         # print("Extracting query:", query)
+         question = "search word ?"
+         result = qa_model(question=question, context=query)
+         result = result['answer']
+         print("Extracting query:", result)
+         wikisearch = relevent_value(result,3)
+         html_pages = wikisearch[1]
+         wikisearch = wikisearch[0]
+
+         for searches in wikisearch:
+             if "@yahoo@" in context_data_relevancy(value,wikisearch[searches]):
+                 return wikisearch[searches]
+         return "No data found"
+     elif score2 > score1 and score2 > score3:
+         try:
+             print("Appending command:", query)
+             question1 = "which value we are adding to key ?"
+             result1 = qa_model(question=question1, context=query)
+             question2 = "In which key we are appending ?"
+             result2 = qa_model(question=question2, context=query)
+             result1 = result1['answer']
+             result2 = result2['answer']
+
+             if len(value[result2])==0:
+                 value[result2].append(result1)
+                 return "Now you can fill the remaining columns"
+             else:
+                 return "You are putting value in the same key column again not accepted."
+         except Exception as e:
+             return str(e)
+     else:
+         min_=0
+         max_=0
+         for keys in value:
+
+             if len(value[keys])<min_:
+                 min_=len(value[keys])
+             if len(value[keys])>max_:
+                 max_=len(value[keys])
+         if min_==max_:
+             return "You dia a great job"
+         else:
+             return "Please append the data correctly so that the length of each key is same and data is also relevant"
+
+
+ def full_alignment(value):
+     for values in value:
+         if len(value[values])==0:
+             return False
+     return True
+
+ def query_formatting(result):
+     values=result.split("\n")
+     if len(values)!=0:
+         values.pop(0)
+     return values
+ def missing_value_completion(store,value):
+
+     filler_prompt = "Below is mentioned ajson data\n"
+     filler_prompt += str(value)+"\n"
+     filler_prompt += "you only need to find missing data from the mentioned context section."
+     filler_prompt += "You will return the results in below mentioned format.\n"
+     filler_prompt += "The output will be in json format."
+     filler_prompt += "context:\n"
+
+     for search_key in store:
+         try:
+             fill_text = store[search_key]
+             response = consume_llm_api(filler_prompt+fill_text)
+
+             json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', response)
+             access_value=eval(json_object_match.group())
+             for keys in value:
+                 if len(value[keys])==0 and keys in access_value:
+                     value[keys].append(access_value[keys].pop(0))
+             print(value)
+             if full_alignment(value):
+                 return value
+         except:
+             pass
+
+
+
+
+ def verification(value):
+
+
+     validation_prompt = "Can you prepare a list of text(many as possible) that can be searched on google for filling(relevent data) the missing data below.\n"
+     validation_prompt += str(value)+"\n"
+     validation_prompt += "You need to prepare it by the following manner"
+     validation_prompt += "1. Mention it line by line.\n"
+     validation_prompt += "2. Please seperate it line by line.\n"
+     validation_prompt += "3. Headers are not required\n"
+     validation_prompt += "4. Please do not add any helper text example: Here is the required search queries , Here are the search queries .\n"
+     validation_prompt += "5. Please do not add any notes"
+     print("Searching for missing values")
+     result=query_formatting(consume_llm_api(validation_prompt))
+
+     for search_queries in result:
+         if len(search_queries)!=0:
+             print(search_queries)
+             store=relevent_value(search_queries)
+             html_pages = store[1]
+             store = store[0]
+             missing_value_completion(store,value)
+             if full_alignment(value):
+                 return value
+
+
+
+
+
+     return result
+
+ def agent_data_prep(value,query):
+     end_result = ""
+     angent_earlier_income ="0"
+     pre_money_saving = "0"
+     mission = "First to fill most importent column \n"
+     while end_result!="You dia a great job":
+
+         if full_alignment(value):
+             return value
+
+
+         agent_instruction = mission
+         agent_instruction += "your previous income"+pre_money_saving+"\n"
+         agent_instruction += "your current income"+angent_earlier_income+"\n"
+         pre_money_saving = angent_earlier_income
+         if end_result=="You are putting value in the same key column again not accepted.":
+
+             mission = "Why you are always filling the"+[i for i in value][-1]+"only.\n"
+             mission += "We are removing $1000 from you account \n"
+             angent_earlier_income = str(int(angent_earlier_income)-1000)
+         agent_instruction += end_result + "\n" +"Above is the result of your previous command. Please give the next command to the agent."
+         agent_instruction += query + "\n"
+         agent_instruction += "Below is the data gathered upto now" + "\n"
+         agent_instruction += str(value) + "\n"
+         agent_instruction += "Please utilize the tool where you can command the agent to do any of the following tasks(one instruction at a time )"+ "\n"
+         agent_instruction += "You only have to fill one value for each key if its not present. \n"
+         agent_instruction += "From now onwards your each statement is understand as command which is categoried in any of the commands in mentioned below examples. \n"
+         agent_instruction += "1. Ask agent to extract data from the web about anything like search for lamp production ,smartphone parts etc .\n"
+         agent_instruction += "2. Give any specific value to append in current generated data . Please also mention the key in which the agent has to append the data .\n"
+         agent_instruction += "3. Ask the agent to put the generated data on check weather each column fills correctly or not .\n"
+         agent_instruction += "Here is the instruction to give commands to the agent. \n"
+         agent_instruction += "You can give commands to the agent ,few examples are mentioned below. \n"
+
+         agent_instruction += "1. Extract data about iron man suit or iron man suit mark1 \n"
+         agent_instruction += "(while thinking about extract data look into the data \n"
+         agent_instruction += "where data can be append and then search relevent query \n"
+         agent_instruction += "like green arrow from DC only if DC and green arraow is in different column key values )\n\n"
+
+         agent_instruction += "2. Append value 'bmw 4' to Car Model key \n"
+         agent_instruction += "(While appending the value you must have read the data from extract data command and remember, if you found anything relevent don't forget to append.\n"
+         agent_instruction += "The appending value has to be different not already present.) \n\n"
+
+         agent_instruction += "Any different grammatical version of the above commands. \n"
+         agent_instruction += "Command has to be given only for 'data filling' purpose. \n"
+
+         agent_instruction += "While command like search for or extract information about something it has to be relevent query search. \n"
+         agent_instruction += "The relevent the query the more accurate the data will be. \n"
+         agent_instruction += "Be cautious while filling the data It has to be correct. \n"
+         agent_instruction += "For each correct append you will get $1000. \n"
+
+         agent_instruction += "Give your command only no text . \n"
+
+         agent_instruction += "There will an audit after filling all the columns on data for its validity. \n"
+         agent_instruction += "Some mistakes are okay but But if we find you guilty there are some repercussion."
+
+         # instructionto give commands to the agent
+
+         judgement = Ollama(model = "llama3:latest")
+         command = judgement.invoke(agent_instruction)
+
+         end_result = agent_work_result(command,value)
+         if "Now you can fill the remaining columns" in end_result:
+             angent_earlier_income = str(int(angent_earlier_income)+1000)
+         print("--------------------")
+         print(value)
+         print("--------------------")
+     return value
+
+ def dictionary_formatting(value):
+     new_dict={}
+     for data_keys in [i for i in value]:
+         key_values = data_keys.strip()
+         if key_values in value:
+             if key_values not in new_dict:
+                 new_dict[key_values] =[]
+             new_dict[key_values] = value.pop(key_values)
+         else:
+             new_dict[key_values] = value.pop(data_keys)
+     return new_dict
+
+
+ def schema_formatter(output):
+     schema = {i:[] for i in output.split(",")}
+     return schema
+ def schema_generator(query):
+
+     formatting = "The above statement is given by the user. Please create a single .csv-based schema by following the points below:\n"
+
+     formatting += "1. Only create the schema, no additional text or statement.\n"
+
+     formatting += "2. Keep the schema simple, avoid complex column names.\n"
+
+     formatting+= "3. please only generate 5 schema if not mentioned.\n"
+
+     formatting += "4. For example, if the user provides a statement like: 'Generate data for students getting placements from IIT Bombay,' the response should be:\n"
+
+     formatting += "Student Name, Student Roll Number, Student Branch, Student Year, Student Placement Status, Student Company Name, Student Package, Student Location, Student Role\n"
+
+     formatting += "Follow the above example but remember above is not actual schema you have to provide the schema depending on the user prompt.\n"
+
+     formatting+= "5. please only generate schema no notes or anything.\n"
+
+     output=consume_llm_api(query+"\n"+formatting)
+
+     return schema_formatter(output)
+ def sorting(data_dict):
+     new_dict={str(i):0 for i in data_dict}
+
+     for i in data_dict:
+         for j in i:
+             if len(i[j])!=0:
+                 new_dict[str(i)] +=1
+     new_dict=[(new_dict[i],i) for i in new_dict]
+     new_dict.sort(reverse=True)
+     new_dict={i[-1]:i[0] for i in new_dict}
+     return new_dict
+
+
+ def process_data(query):
+
+
+
+
+     formatting = "The above statement is given by the user. Please create a single .csv-based schema by following the points below:\n"
+     formatting += "1. Only create the schema, no additional text or statement.\n"
+     formatting += "2. Keep the schema simple, avoid complex column names.\n"
+     formatting+= "3. please only generate 5 schema if not mentioned.\n"
+     formatting += "4. For example, if the user provides a statement like: 'Generate data for students getting placements from IIT Bombay,' the response should be:\n"
+     formatting += "Student Name, Student Roll Number, Student Branch, Student Year, Student Placement Status, Student Company Name, Student Package, Student Location, Student Role\n"
+     formatting += "Follow the above example but remember above is not actual schema you have to provide the schema depending on the user prompt.\n"
+     formatting+= "5. please only generate schema no notes or anything.\n"
+     print("Query:",query)
+     output=consume_llm_api(query+"\n"+formatting)
+
+     schema = {i:[] for i in output.split(",")}
+     textual_value=relevent_value(str(schema).lower(),3)
+     html_pages = textual_value[1]
+     textual_value = textual_value[0]
+     data_dict =[j for j in actual_value(textual_value,schema)]
+     for j in sorting(data_dict):
+         try:
+             # Convert string to dictionary
+             dummy_value = eval(j)
+
+             # Process dictionary values
+             for key in dummy_value:
+                 while len(dummy_value[key]) >= 2:
+                     dummy_value[key].pop(0)
+
+             # Format dictionary
+             formatted = dictionary_formatting(dummy_value)
+             print(formatted)
+             # Verify and store result
+             verification_result = verification(formatted) if formatted else None
+
+             yield verification_result
+
+         except Exception as e:
+             print(f"Error processing dictionary {j}: {e}")
+
+
+ # for j in process_data("Generate data for smart phones"):
+ #     print(j)
+
+
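
A note on the dictionary-recovery step used in actual_value, missing_value_completion and process_data: the committed code finds the first balanced {...} block with a recursive regex and then runs eval on it. Below is a hedged sketch of the same step without eval, assuming the model returns a JSON-like object; safe_parse_llm_dict is a hypothetical helper name, not part of the committed file.

import ast
import json
import regex as re  # same third-party regex module the file imports

def safe_parse_llm_dict(raw):
    """Extract the first balanced {...} block and parse it without eval (illustrative)."""
    match = re.search(r'\{(?:[^{}]|(?R))*\}', raw)  # same recursive pattern as data_collector.py
    if match is None:
        return None
    block = match.group()
    try:
        return json.loads(block)  # strict JSON first
    except json.JSONDecodeError:
        try:
            return ast.literal_eval(block)  # fall back to a Python-style dict literal
        except (ValueError, SyntaxError):
            return None

In the surrounding functions this would stand in for the eval(json_object_match.group()) lines while leaving the rest of the control flow unchanged.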
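
relevent_value is the retrieval entry point: wikipedia.search proposes page titles and two wikipediaapi clients fetch plain-text and HTML extracts for each hit. A short usage sketch with the same libraries and parameters; the query string is only an example, and the user agent is copied verbatim from the file.

import wikipedia
import wikipediaapi

# Mirror relevent_value(): search for candidate titles, then pull plain-text extracts.
titles = wikipedia.search("smart phones", results=3)
wiki = wikipediaapi.Wikipedia(
    user_agent="MyProjectName ([email protected])",
    language="en",
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)
pages = {title: wiki.page(title).text for title in titles}
for title, text in pages.items():
    print(title, "->", len(text), "characters")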
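
agent_work_result decides whether a generated command means "extract", "append" or "check" by comparing a sentence-transformer embedding of the command against three anchor phrases with cosine similarity. A self-contained sketch of that routing step, using the same model name and anchor phrases; route_command is a hypothetical name and the example command is illustrative.

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
ANCHORS = {
    "extract": "extract data for",
    "append": "append data in ",
    "check": "check data",
}

def route_command(command):
    """Return the anchor label whose phrase is most similar to the command (illustrative)."""
    command_embedding = model.encode(command)
    scores = {
        label: float(util.cos_sim(command_embedding, model.encode(phrase)))
        for label, phrase in ANCHORS.items()
    }
    return max(scores, key=scores.get)

print(route_command("Extract data about iron man suit"))  # expected label: "extract"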