Spaces:
Running
on
Zero
Running
on
Zero
Enhance data processing in app.py and openalex_utils.py: handle `referenced_works` entries that are already strings, lists/tuples, or missing when exporting, and fill missing `parsed_publication` values with a single space.
Browse files
- app.py +2 -1
- openalex_utils.py +4 -1
app.py
CHANGED
@@ -524,7 +524,8 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
524 |
# Export relevant column
|
525 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
526 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
527 |
-
export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
|
|
|
528 |
if locally_approximate_publication_date_checkbox and plot_time_checkbox:
|
529 |
export_df['approximate_publication_year'] = local_years
|
530 |
export_df.to_csv(csv_file_path, index=False)
|
|
|
524 |
# Export relevant column
|
525 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
526 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
527 |
+
export_df['referenced_works'] = [x if isinstance(x, str) else ', '.join(x) if isinstance(x, (list, tuple)) and not pd.isna(x) else '' for x in records_df['referenced_works']]
|
528 |
+
|
529 |
if locally_approximate_publication_date_checkbox and plot_time_checkbox:
|
530 |
export_df['approximate_publication_year'] = local_years
|
531 |
export_df.to_csv(csv_file_path, index=False)
|
openalex_utils.py
CHANGED
@@ -99,14 +99,17 @@ def process_records_to_df(records):
|
|
99 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
100 |
if 'primary_location' in records_df.columns:
|
101 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
|
|
|
|
102 |
else:
|
103 |
# Process raw records as before
|
104 |
records_df = pd.DataFrame(records)
|
105 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
106 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
|
|
107 |
|
108 |
# Fill missing values and deduplicate
|
109 |
-
|
110 |
records_df['abstract'] = records_df['abstract'].fillna(' ')
|
111 |
records_df['title'] = records_df['title'].fillna(' ')
|
112 |
records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)
|
|
|
99 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
100 |
if 'primary_location' in records_df.columns:
|
101 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
102 |
+
records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ') # fill missing values with space, only if we have them.
|
103 |
+
|
104 |
else:
|
105 |
# Process raw records as before
|
106 |
records_df = pd.DataFrame(records)
|
107 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
108 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
109 |
+
records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
|
110 |
|
111 |
# Fill missing values and deduplicate
|
112 |
+
|
113 |
records_df['abstract'] = records_df['abstract'].fillna(' ')
|
114 |
records_df['title'] = records_df['title'].fillna(' ')
|
115 |
records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)
|