annikwag committed (verified)
Commit d5cb87e · 1 Parent(s): 89aa59d

Update appStore/prep_data.py

Files changed (1):
  1. appStore/prep_data.py  +49 -48
appStore/prep_data.py CHANGED
@@ -33,63 +33,64 @@ def process_iati():
     return projects_df
 
 
-# def process_giz_worldwide():
-#     """
-#     This will read the new giz_worldwide file and create the chunks.
-#     The following adjustments have been made:
-#       - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
-#       - Renames 'name.en' to 'project_name'
-#       - Uses the 'merged_text' column for creating chunks and computing text size
-#       - Creates an empty 'url' column (since the new dataset has an empty URL)
-#       - Renames 'country' to 'countries'
-#       - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
-#     """
-#     # Read the new JSON file
-#     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
+def process_giz_worldwide():
+    """
+    This will read the new giz_worldwide file and create the chunks.
+    The following adjustments have been made:
+      - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
+      - Renames 'name.en' to 'project_name'
+      - Uses the 'merged_text' column for creating chunks and computing text size
+      - Creates an empty 'url' column (since the new dataset has an empty URL)
+      - Renames 'country' to 'countries'
+      - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
+    """
+    # Read the new JSON file
+    giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
+    # Sample random rows for quick embeddings (seed set for reproducibility)
+    giz_df = giz_df.sample(n=5, random_state=42)
+    # Rename columns per new dataset requirements
+    giz_df = giz_df.rename(columns={
+        'name.en': 'project_name',
+        'country': 'countries',
+        'duration.project.start': 'start_year',
+        'duration.project.end': 'end_year'
+    })
 
-#     # Rename columns per new dataset requirements
-#     giz_df = giz_df.rename(columns={
-#         'name.en': 'project_name',
-#         'country': 'countries',
-#         'duration.project.start': 'start_year',
-#         'duration.project.end': 'end_year'
-#     })
+    # Create an empty 'url' column as the new dataset has an empty URL
+    giz_df['url'] = ''
 
-#     # Create an empty 'url' column as the new dataset has an empty URL
-#     giz_df['url'] = ''
+    # Create text_size based on merged_text and create chunks from merged_text
+    giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
+    giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
 
-#     # Create text_size based on merged_text and create chunks from merged_text
-#     giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
-#     giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
+    print("initial df length:", len(giz_df))
+    giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
+    print("new df length:", len(giz_df))
+    print(giz_df.columns)
 
-#     print("initial df length:", len(giz_df))
+    giz_df['source'] = 'GIZ_WORLDWIDE'
+    return giz_df
+
+
+# def process_giz_worldwide():
+#     """
+#     this will read the giz_worldwide files and create the chunks
+#     """
+#     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
+#     giz_df = giz_df.rename(columns={'content':'project_description'})
+#     # Sample random rows for quick embeddings (seed set for reproducibility)
+#     giz_df = giz_df.sample(n=5, random_state=42)
+#     giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
+#     giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
+#     print("initial df length:",len(giz_df))
 #     giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
-#     print("new df length:", len(giz_df))
+#     print("new df length:",len(giz_df))
 #     print(giz_df.columns)
-
+#     #giz_df.drop(columns = ['filename', 'url', 'name', 'mail',
+#     #            'language', 'start_year', 'end_year','poli_trager'], inplace=True)
 #     giz_df['source'] = 'GIZ_WORLDWIDE'
 #     return giz_df
 
-
-def process_giz_worldwide():
-    """
-    this will read the giz_worldwide files and create the chunks
-    """
-    giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
-    giz_df = giz_df.rename(columns={'content':'project_description'})
-    # Sample 10 random rows for quick embeddings (seed set for reproducibility)
-    giz_df = giz_df.sample(n=5, random_state=42)
-    giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
-    giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
-    print("initial df length:",len(giz_df))
-    giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
-    print("new df length:",len(giz_df))
-    print(giz_df.columns)
-    #giz_df.drop(columns = ['filename', 'url', 'name', 'mail',
-    #            'language', 'start_year', 'end_year','poli_trager'], inplace=True)
-    giz_df['source'] = 'GIZ_WORLDWIDE'
-    return giz_df
-
 def remove_duplicates(results_list):
     """
     Return a new list of results with duplicates removed,
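
The updated process_giz_worldwide builds a list-valued 'chunks' column and then explodes it, so one row per project becomes one row per text chunk before embeddings are computed. The sketch below reproduces just that step on invented toy data: create_chunks is defined elsewhere in the module and is untouched by this commit, so the word-window splitter here is only a hypothetical stand-in.

import pandas as pd

def create_chunks(text, chunk_size=200):
    # Hypothetical stand-in for the module's real create_chunks helper:
    # split the text into windows of `chunk_size` words.
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Toy frame mimicking the 'merged_text' column the new function consumes.
giz_df = pd.DataFrame({
    'project_name': ['Project A', 'Project B'],
    'merged_text': ['word ' * 450, None],   # one long text, one missing value
})

# Same guards as in the commit: non-string rows get text_size 0 and an empty chunk list.
giz_df['text_size'] = giz_df['merged_text'].apply(lambda t: len(t.split()) if isinstance(t, str) else 0)
giz_df['chunks'] = giz_df['merged_text'].apply(lambda t: create_chunks(t) if isinstance(t, str) else [])

print("initial df length:", len(giz_df))   # 2
giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
print("new df length:", len(giz_df))       # 4: three chunks for Project A, one NaN row for Project B

Rows whose merged_text is not a string end up with an empty list, which explode keeps as a single row with a NaN chunk rather than dropping the project entirely.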