openfree committed on
Commit
d1bc4aa
·
verified ·
1 Parent(s): 7f8500d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -35
app.py CHANGED
@@ -999,29 +999,45 @@ target_datasets = {
999
 
1000
  def get_korea_datasets():
1001
  """Korea 관련 데이터셋 검색"""
1002
- params = {
1003
- "search": "korea",
1004
- "full": "True",
1005
- "limit": 1000
1006
- }
1007
 
1008
- try:
1009
- response = requests.get(
1010
- "https://huggingface.co/api/datasets",
1011
- headers={'Authorization': f'Bearer {HF_TOKEN}'},
1012
- params=params
1013
- )
1014
 
1015
- if response.status_code == 200:
1016
- return response.json()
1017
- else:
1018
- print(f"Failed to fetch Korea datasets: {response.status_code}")
1019
- return []
1020
- except Exception as e:
1021
- print(f"Error fetching Korea datasets: {str(e)}")
1022
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1023
 
1024
- def get_all_datasets(limit=3000):
1025
  """모든 데이터셋과 Korea 관련 데이터셋 가져오기"""
1026
  all_datasets = []
1027
  page_size = 1000
@@ -1033,27 +1049,37 @@ def get_all_datasets(limit=3000):
1033
  'offset': offset
1034
  }
1035
 
1036
- response = requests.get(
1037
- "https://huggingface.co/api/datasets",
1038
- headers={'Authorization': f'Bearer {HF_TOKEN}'},
1039
- params=params
1040
- )
1041
-
1042
- if response.status_code == 200:
1043
- all_datasets.extend(response.json())
1044
- print(f"Fetched datasets {offset+1} to {offset+len(response.json())}")
1045
- else:
1046
- print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
 
 
 
 
 
1047
  break
1048
 
1049
  # Korea 검색 결과 추가
1050
  korea_datasets = get_korea_datasets()
1051
  existing_ids = {dataset.get('id', '') for dataset in all_datasets}
1052
 
 
1053
  for korea_dataset in korea_datasets:
1054
  if korea_dataset.get('id', '') not in existing_ids:
1055
  all_datasets.append(korea_dataset)
1056
  existing_ids.add(korea_dataset.get('id', ''))
 
 
 
 
1057
 
1058
  return all_datasets[:limit]
1059
 
@@ -1115,7 +1141,7 @@ def get_datasets_data(progress=gr.Progress()):
1115
  else:
1116
  filtered_datasets.append({
1117
  'id': dataset_id,
1118
- 'global_rank': 'Not in top 3000',
1119
  'downloads': 0,
1120
  'likes': 0,
1121
  'title': 'No Title',
@@ -1154,9 +1180,9 @@ def get_datasets_data(progress=gr.Progress()):
1154
  xaxis_title="Dataset ID",
1155
  yaxis_title="Global Rank",
1156
  yaxis=dict(
1157
- ticktext=[f"#{i}" for i in range(1, 3001, 100)],
1158
- tickvals=[3001 - i for i in range(1, 3001, 100)],
1159
- range=[0, 3000]
1160
  ),
1161
  height=800,
1162
  showlegend=False,
 
999
 
def get_korea_datasets():
    """Search the Hugging Face dataset listing for Korea-related datasets.

    Issues one request per search term and merges the results, keeping the
    first occurrence of every dataset id. Failures are reported with
    ``print`` and the affected term is skipped, so a single bad request
    never aborts the whole search.

    Returns:
        list: The dataset entries returned by the API, one per unique,
        non-empty ``id``. Empty when every request fails.
    """
    search_terms = ['korea', 'korean', 'kor']  # broadened set of search terms
    seen_ids = set()
    unique_datasets = []

    for term in search_terms:
        params = {
            "search": term,
            "full": "True",
            "limit": 10000,  # widened search range
        }

        try:
            response = requests.get(
                "https://huggingface.co/api/datasets",
                headers={'Authorization': f'Bearer {HF_TOKEN}'},
                params=params,
                # Without a timeout requests waits indefinitely on a
                # stalled connection, which would hang the whole load.
                timeout=30,
            )
        except requests.RequestException as e:
            print(f"Error fetching datasets for term '{term}': {str(e)}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch datasets for term '{term}': {response.status_code}")
            continue

        try:
            datasets = response.json()
        except ValueError as e:
            # Body was not valid JSON (requests' JSON errors derive from
            # ValueError).
            print(f"Error fetching datasets for term '{term}': {str(e)}")
            continue

        if not isinstance(datasets, list):
            print(f"Error fetching datasets for term '{term}': unexpected response payload")
            continue

        print(f"Found {len(datasets)} datasets for search term '{term}'")

        # De-duplicate while collecting; entries without an id are dropped.
        for dataset in datasets:
            if not isinstance(dataset, dict):
                continue
            dataset_id = dataset.get('id', '')
            if dataset_id and dataset_id not in seen_ids:
                seen_ids.add(dataset_id)
                unique_datasets.append(dataset)

    print(f"Total unique Korea-related datasets found: {len(unique_datasets)}")
    return unique_datasets
1039
 
1040
+ def get_all_datasets(limit=10000): # 기본 limit 증가
1041
  """모든 데이터셋과 Korea 관련 데이터셋 가져오기"""
1042
  all_datasets = []
1043
  page_size = 1000
 
1049
  'offset': offset
1050
  }
1051
 
1052
+ try:
1053
+ response = requests.get(
1054
+ "https://huggingface.co/api/datasets",
1055
+ headers={'Authorization': f'Bearer {HF_TOKEN}'},
1056
+ params=params
1057
+ )
1058
+
1059
+ if response.status_code == 200:
1060
+ datasets = response.json()
1061
+ all_datasets.extend(datasets)
1062
+ print(f"Fetched datasets {offset+1} to {offset+len(datasets)}")
1063
+ else:
1064
+ print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
1065
+ break
1066
+ except Exception as e:
1067
+ print(f"Error fetching datasets at offset {offset}: {str(e)}")
1068
  break
1069
 
1070
  # Korea 검색 결과 추가
1071
  korea_datasets = get_korea_datasets()
1072
  existing_ids = {dataset.get('id', '') for dataset in all_datasets}
1073
 
1074
+ added_count = 0
1075
  for korea_dataset in korea_datasets:
1076
  if korea_dataset.get('id', '') not in existing_ids:
1077
  all_datasets.append(korea_dataset)
1078
  existing_ids.add(korea_dataset.get('id', ''))
1079
+ added_count += 1
1080
+
1081
+ print(f"Added {added_count} additional Korea-related datasets")
1082
+ print(f"Total datasets: {len(all_datasets)}")
1083
 
1084
  return all_datasets[:limit]
1085
 
 
1141
  else:
1142
  filtered_datasets.append({
1143
  'id': dataset_id,
1144
+ 'global_rank': 'Not in top 10000',
1145
  'downloads': 0,
1146
  'likes': 0,
1147
  'title': 'No Title',
 
1180
  xaxis_title="Dataset ID",
1181
  yaxis_title="Global Rank",
1182
  yaxis=dict(
1183
+ ticktext=[f"#{i}" for i in range(1, 10001, 100)],
1184
+ tickvals=[10001 - i for i in range(1, 10001, 100)],
1185
+ range=[0, 10000]
1186
  ),
1187
  height=800,
1188
  showlegend=False,