Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -999,29 +999,45 @@ target_datasets = {
|
|
999 |
|
1000 |
def get_korea_datasets():
|
1001 |
"""Korea 관련 데이터셋 검색"""
|
1002 |
-
|
1003 |
-
|
1004 |
-
"full": "True",
|
1005 |
-
"limit": 1000
|
1006 |
-
}
|
1007 |
|
1008 |
-
|
1009 |
-
|
1010 |
-
"
|
1011 |
-
|
1012 |
-
|
1013 |
-
|
1014 |
|
1015 |
-
|
1016 |
-
|
1017 |
-
|
1018 |
-
|
1019 |
-
|
1020 |
-
|
1021 |
-
|
1022 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1023 |
|
1024 |
-
def get_all_datasets(limit=
|
1025 |
"""모든 데이터셋과 Korea 관련 데이터셋 가져오기"""
|
1026 |
all_datasets = []
|
1027 |
page_size = 1000
|
@@ -1033,27 +1049,37 @@ def get_all_datasets(limit=3000):
|
|
1033 |
'offset': offset
|
1034 |
}
|
1035 |
|
1036 |
-
|
1037 |
-
|
1038 |
-
|
1039 |
-
|
1040 |
-
|
1041 |
-
|
1042 |
-
|
1043 |
-
|
1044 |
-
|
1045 |
-
|
1046 |
-
|
|
|
|
|
|
|
|
|
|
|
1047 |
break
|
1048 |
|
1049 |
# Korea 검색 결과 추가
|
1050 |
korea_datasets = get_korea_datasets()
|
1051 |
existing_ids = {dataset.get('id', '') for dataset in all_datasets}
|
1052 |
|
|
|
1053 |
for korea_dataset in korea_datasets:
|
1054 |
if korea_dataset.get('id', '') not in existing_ids:
|
1055 |
all_datasets.append(korea_dataset)
|
1056 |
existing_ids.add(korea_dataset.get('id', ''))
|
|
|
|
|
|
|
|
|
1057 |
|
1058 |
return all_datasets[:limit]
|
1059 |
|
@@ -1115,7 +1141,7 @@ def get_datasets_data(progress=gr.Progress()):
|
|
1115 |
else:
|
1116 |
filtered_datasets.append({
|
1117 |
'id': dataset_id,
|
1118 |
-
'global_rank': 'Not in top
|
1119 |
'downloads': 0,
|
1120 |
'likes': 0,
|
1121 |
'title': 'No Title',
|
@@ -1154,9 +1180,9 @@ def get_datasets_data(progress=gr.Progress()):
|
|
1154 |
xaxis_title="Dataset ID",
|
1155 |
yaxis_title="Global Rank",
|
1156 |
yaxis=dict(
|
1157 |
-
ticktext=[f"#{i}" for i in range(1,
|
1158 |
-
tickvals=[
|
1159 |
-
range=[0,
|
1160 |
),
|
1161 |
height=800,
|
1162 |
showlegend=False,
|
|
|
999 |
|
1000 |
def get_korea_datasets(search_terms=None):
    """Search the Hugging Face Hub for Korea-related datasets.

    Queries the public ``/api/datasets`` endpoint once per search term,
    merges all results, and de-duplicates them by dataset id while
    preserving first-seen order.

    Args:
        search_terms: Optional list of search keywords. Defaults to
            ``['korea', 'korean', 'kor']`` (the original hard-coded set).

    Returns:
        list: Unique dataset dicts as returned by the API. May be empty
        if every request fails — failures are logged, not raised.
    """
    if search_terms is None:
        search_terms = ['korea', 'korean', 'kor']  # expanded keyword set
    all_korea_datasets = []

    for term in search_terms:
        params = {
            "search": term,
            "full": "True",
            "limit": 10000  # widened search window
        }

        try:
            # timeout= guards against a stalled connection hanging the app
            # indefinitely (requests has no default timeout).
            response = requests.get(
                "https://huggingface.co/api/datasets",
                headers={'Authorization': f'Bearer {HF_TOKEN}'},
                params=params,
                timeout=30
            )

            if response.status_code == 200:
                datasets = response.json()
                all_korea_datasets.extend(datasets)
                print(f"Found {len(datasets)} datasets for search term '{term}'")
            else:
                print(f"Failed to fetch datasets for term '{term}': {response.status_code}")
        except Exception as e:
            # Best-effort: log and continue with the remaining terms.
            print(f"Error fetching datasets for term '{term}': {str(e)}")

    # De-duplicate by dataset id, keeping the first occurrence.
    seen_ids = set()
    unique_datasets = []
    for dataset in all_korea_datasets:
        dataset_id = dataset.get('id', '')
        if dataset_id and dataset_id not in seen_ids:
            seen_ids.add(dataset_id)
            unique_datasets.append(dataset)

    print(f"Total unique Korea-related datasets found: {len(unique_datasets)}")
    return unique_datasets
|
1039 |
|
1040 |
+
def get_all_datasets(limit=10000): # 기본 limit 증가
|
1041 |
"""모든 데이터셋과 Korea 관련 데이터셋 가져오기"""
|
1042 |
all_datasets = []
|
1043 |
page_size = 1000
|
|
|
1049 |
'offset': offset
|
1050 |
}
|
1051 |
|
1052 |
+
try:
|
1053 |
+
response = requests.get(
|
1054 |
+
"https://huggingface.co/api/datasets",
|
1055 |
+
headers={'Authorization': f'Bearer {HF_TOKEN}'},
|
1056 |
+
params=params
|
1057 |
+
)
|
1058 |
+
|
1059 |
+
if response.status_code == 200:
|
1060 |
+
datasets = response.json()
|
1061 |
+
all_datasets.extend(datasets)
|
1062 |
+
print(f"Fetched datasets {offset+1} to {offset+len(datasets)}")
|
1063 |
+
else:
|
1064 |
+
print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
|
1065 |
+
break
|
1066 |
+
except Exception as e:
|
1067 |
+
print(f"Error fetching datasets at offset {offset}: {str(e)}")
|
1068 |
break
|
1069 |
|
1070 |
# Korea 검색 결과 추가
|
1071 |
korea_datasets = get_korea_datasets()
|
1072 |
existing_ids = {dataset.get('id', '') for dataset in all_datasets}
|
1073 |
|
1074 |
+
added_count = 0
|
1075 |
for korea_dataset in korea_datasets:
|
1076 |
if korea_dataset.get('id', '') not in existing_ids:
|
1077 |
all_datasets.append(korea_dataset)
|
1078 |
existing_ids.add(korea_dataset.get('id', ''))
|
1079 |
+
added_count += 1
|
1080 |
+
|
1081 |
+
print(f"Added {added_count} additional Korea-related datasets")
|
1082 |
+
print(f"Total datasets: {len(all_datasets)}")
|
1083 |
|
1084 |
return all_datasets[:limit]
|
1085 |
|
|
|
1141 |
else:
|
1142 |
filtered_datasets.append({
|
1143 |
'id': dataset_id,
|
1144 |
+
'global_rank': 'Not in top 10000',
|
1145 |
'downloads': 0,
|
1146 |
'likes': 0,
|
1147 |
'title': 'No Title',
|
|
|
1180 |
xaxis_title="Dataset ID",
|
1181 |
yaxis_title="Global Rank",
|
1182 |
yaxis=dict(
|
1183 |
+
ticktext=[f"#{i}" for i in range(1, 10001, 100)],
|
1184 |
+
tickvals=[10001 - i for i in range(1, 10001, 100)],
|
1185 |
+
range=[0, 10000]
|
1186 |
),
|
1187 |
height=800,
|
1188 |
showlegend=False,
|