m7n committed on
Commit
0a0d7fe
·
verified ·
1 Parent(s): 2b74275

Delete openalex_utils.py

Browse files
Files changed (1) hide show
  1. openalex_utils.py +0 -163
openalex_utils.py DELETED
@@ -1,163 +0,0 @@
1
- import numpy as np
2
- from urllib.parse import urlparse, parse_qs
3
- from pyalex import Works
4
- import pandas as pd
5
-
6
def openalex_url_to_pyalex_query(url):
    """
    Convert an OpenAlex search URL to a pyalex query.

    The ``filter`` and ``sort`` query-string parameters are replayed onto a
    fresh ``Works`` object; the paging/sampling parameters are returned
    separately, untouched, as strings.

    Args:
        url (str): The OpenAlex search URL.

    Returns:
        tuple: (Works object, dict of parameters). The dict contains only
            those of 'page', 'per-page', 'sample' and 'seed' that are
            present in the URL.
    """
    query_params = parse_qs(urlparse(url).query)

    # Initialize the Works object
    query = Works()

    # Handle filters: a single comma-separated list of "key:value" entries.
    if 'filter' in query_params:
        for entry in query_params['filter'][0].split(','):
            if ':' not in entry:
                # Entries without a key/value separator are ignored.
                continue
            # Split on the first colon only, so values may contain colons.
            key, value = entry.split(':', 1)
            if key == 'default.search':
                query = query.search(value)
            else:
                query = query.filter(**{key: value})

    # Handle sort: "field:direction", "-field" (descending) or "field".
    if 'sort' in query_params:
        for token in query_params['sort'][0].split(','):
            if not token:
                # A stray comma would otherwise sort on an empty field name.
                continue
            if ':' in token:
                # maxsplit=1 so an extra colon cannot raise on unpacking.
                field, direction = token.split(':', 1)
            elif token.startswith('-'):
                field, direction = token[1:], 'desc'
            else:
                field, direction = token, 'asc'
            query = query.sort(**{field: direction})

    # Handle other parameters: pass through the first value of each.
    params = {
        key: query_params[key][0]
        for key in ('page', 'per-page', 'sample', 'seed')
        if key in query_params
    }

    return query, params
52
-
53
def invert_abstract(inv_index):
    """Reconstruct abstract from inverted index.

    Args:
        inv_index: Mapping of word -> iterable of positions at which the
            word occurs, or a missing-value marker (``None``, or anything
            without an ``items`` method such as a float NaN).

    Returns:
        str: The words joined by single spaces in ascending position order.
            A single space ``' '`` is returned when the index is missing;
            an empty mapping yields ``''``.
    """
    # Missing abstracts can arrive as None or as NaN (a float); neither can
    # be iterated, so both are treated as "no abstract".
    if inv_index is None or not hasattr(inv_index, 'items'):
        return ' '

    # Flatten to (word, position) pairs, then order by position. The sort is
    # stable, so words sharing a position keep their mapping order.
    placed = [
        (word, position)
        for word, positions in inv_index.items()
        for position in positions
    ]
    placed.sort(key=lambda pair: pair[1])
    return " ".join(word for word, _ in placed)
60
-
61
def get_pub(x):
    """Extract publication name from record.

    Args:
        x: A subscriptable record expected to hold
            ``x['source']['display_name']``; may be ``None`` or otherwise
            malformed.

    Returns:
        The source display name, or a single space ``' '`` when the lookup
        fails or the name is one of the placeholder values
        'parsed_publication' / 'Deleted Journal'. A display name that is
        present but ``None`` is returned as ``None``.
    """
    # Only the lookup itself can fail; catch lookup errors specifically
    # instead of a bare except, which would also swallow KeyboardInterrupt
    # and SystemExit.
    try:
        source = x['source']['display_name']
    except (KeyError, TypeError, IndexError):
        return ' '

    if source in ('parsed_publication', 'Deleted Journal'):
        return ' '
    return source
71
-
72
def get_field(x):
    """Extract academic field from record.

    Args:
        x: A subscriptable record expected to hold
            ``x['primary_topic']['subfield']['display_name']``; may be
            ``None`` or otherwise malformed.

    Returns:
        The subfield display name, or ``np.nan`` when the lookup fails or
        the name is ``None``.
    """
    # Catch lookup errors specifically instead of a bare except, which would
    # also swallow KeyboardInterrupt and SystemExit.
    try:
        field = x['primary_topic']['subfield']['display_name']
    except (KeyError, TypeError, IndexError):
        return np.nan

    return np.nan if field is None else field
82
-
83
def process_records_to_df(records):
    """
    Build a pandas DataFrame from OpenAlex records and add derived columns.

    Two columns are added: 'abstract', produced by invert_abstract from
    'abstract_inverted_index', and 'parsed_publication', produced by get_pub
    from 'primary_location'. Missing values in 'parsed_publication',
    'abstract' and 'title' are then replaced with a single space.

    Args:
        records (list): List of OpenAlex record dictionaries

    Returns:
        pandas.DataFrame: The records with the added and cleaned columns
    """
    frame = pd.DataFrame(records)

    # Derived columns, computed element by element from the raw columns.
    frame['abstract'] = list(map(invert_abstract, frame['abstract_inverted_index']))
    frame['parsed_publication'] = list(map(get_pub, frame['primary_location']))

    # Text columns must not contain missing values downstream.
    for column in ('parsed_publication', 'abstract', 'title'):
        frame[column] = frame[column].fillna(' ')

    return frame
102
-
103
def openalex_url_to_filename(url):
    """
    Convert an OpenAlex URL to a filename-safe string with timestamp.

    The name is built from the URL's ``filter`` and ``sort`` parameters,
    joined by double underscores, followed by a ``YYYYMMDD_HHMMSS`` local
    timestamp. If there is nothing to describe, 'openalex_query' is used.

    Args:
        url (str): The OpenAlex search URL

    Returns:
        str: A filename-safe string with timestamp (without extension).
            When the name would exceed 255 characters, the descriptive part
            is shortened so the result is 251 characters long and still ends
            with the timestamp.
    """
    from datetime import datetime
    import re

    query_params = parse_qs(urlparse(url).query)

    # Create parts of the filename
    parts = []

    # Handle filters: a single comma-separated list of "key:value" entries.
    if 'filter' in query_params:
        for entry in query_params['filter'][0].split(','):
            if ':' not in entry:
                continue
            key, value = entry.split(':', 1)
            # Dots in keys become underscores.
            key = key.replace('.', '_')
            # Drop everything except word characters, whitespace and hyphens,
            # then collapse each whitespace run into one underscore.
            clean_value = re.sub(r'[^\w\s-]', '', value)
            clean_value = '_'.join(clean_value.split())

            if key == 'default_search':
                parts.append(f"search_{clean_value}")
            else:
                parts.append(f"{key}_{clean_value}")

    # Handle sort parameters: "field:direction", "-field" or "field".
    if 'sort' in query_params:
        for token in query_params['sort'][0].split(','):
            if not token:
                # A stray comma would otherwise yield "sort__asc".
                continue
            if ':' in token:
                # Same "field:direction" form the URL-to-query parser
                # accepts; without this branch the colon would leak into
                # the filename and the direction would be mislabelled.
                field, direction = token.split(':', 1)
                # Anything other than "desc" is labelled "asc".
                direction = 'desc' if direction.strip().lower() == 'desc' else 'asc'
            elif token.startswith('-'):
                field, direction = token[1:], 'desc'
            else:
                field, direction = token, 'asc'
            # Keep the field filename-safe as well.
            field = re.sub(r'[^\w-]', '', field.replace('.', '_'))
            parts.append(f"sort_{field}_{direction}")

    # Add timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    suffix = f"__{timestamp}"

    # Combine all parts
    stem = '__'.join(parts) if parts else 'openalex_query'
    filename = f"{stem}{suffix}"

    # Ensure filename is not too long (max 255 chars is common filesystem
    # limit). Shorten the descriptive stem rather than the whole name so the
    # timestamp is never cut off; 251 leaves room for a potential extension.
    if len(filename) > 255:
        filename = f"{stem[:251 - len(suffix)]}{suffix}"

    return filename