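"""Collect UniProt protein accessions for an InterPro entry.

Pages through the InterPro REST API (protein/UniProt/entry/InterPro),
retrying on HTTP 408 timeouts, then writes the de-duplicated accessions to
chunked text files (one accession per line) under the output directory.
"""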
import argparse
import json
import os
import shutil
import ssl
import sys
from time import sleep
from urllib import request
from urllib.error import HTTPError

from fake_useragent import UserAgent
from tqdm import tqdm

ua = UserAgent()


def output_list(args):
    # Build the paginated API URL; --filter_name optionally narrows the query
    # with a second entry accession in the path.
    if args.filter_name:
        BASE_URL = f"https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/{args.protein}/{args.filter_name}/?page_size={args.page_size}"
    else:
        BASE_URL = f"https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/{args.protein}/?page_size={args.page_size}"
    print(f"Processing {BASE_URL}")

    # --re_collect discards previously collected output before starting over.
    if args.re_collect:
        shutil.rmtree(args.output, ignore_errors=True)
    os.makedirs(args.output, exist_ok=True)

    # Disable SSL verification to avoid local certificate-configuration issues.
    context = ssl._create_unverified_context()

    next_url = BASE_URL
    attempts = 0
    cur_page = 0
    names = []
    while next_url:
        try:
            print(next_url)
            req = request.Request(
                next_url,
                headers={
                    "Accept": "application/json",
                    "User-Agent": ua.random,
                },
            )
            res = request.urlopen(req, context=context)
            # HTTP 204: no data for this query, so leave the loop.
            # (Error statuses such as 408 raise HTTPError and are handled below.)
            if res.status == 204:
                break
            payload = json.loads(res.read().decode())
            res.close()
            # The API supplies the URL of the next page, or None on the last page.
            next_url = payload["next"]
            attempts = 0
        except HTTPError as e:
            if e.code == 408:
                # The API timed out on a long-running query: wait just over a
                # minute, then retry the same URL.
                sleep(61)
                continue
            # For any other HTTP error, retry up to 3 times before failing.
            if attempts < 3:
                attempts += 1
                sleep(61)
                continue
            sys.stderr.write("LAST URL: " + next_url + "\n")
            raise

        cur_page += 1
        bar = tqdm(payload["results"])
        for item in bar:
            bar.set_postfix({"current": f"{(cur_page - 1) * args.page_size}-{cur_page * args.page_size}"})
            names.append(item["metadata"]["accession"])
    # Remove duplicates (sorted for deterministic output order).
    names = sorted(set(names))
    # Split the accessions into chunks of at most args.chunk_size and write
    # each chunk to its own file, one accession per line.
    n_chunks = (len(names) + args.chunk_size - 1) // args.chunk_size
    for i in range(n_chunks):
        chunk = names[i * args.chunk_size : (i + 1) * args.chunk_size]
        with open(os.path.join(args.output, f"af_raw_{args.protein_name}_{i}.txt"), "w") as f:
            for name in chunk:
                f.write(name + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Collect UniProt accessions for an InterPro entry."
    )
    parser.add_argument("--protein", type=str, default="IPR001557",
                        help="InterPro entry accession to query")
    parser.add_argument("--protein_name", type=str, default="MDH",
                        help="label used in the output file names")
    parser.add_argument("--chunk_size", type=int, default=5000,
                        help="maximum number of accessions per output file")
    parser.add_argument("--filter_name", type=str, default="",
                        help="optional second entry accession to filter the query")
    parser.add_argument("--page_size", type=int, default=200,
                        help="number of results per API page")
    parser.add_argument("--output", type=str, default="data/MDH",
                        help="output directory for the chunk files")
    parser.add_argument("--re_collect", action="store_true",
                        help="delete any existing output before collecting")
    args = parser.parse_args()
    output_list(args)
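
# Example invocation (a sketch: the script's file name is not shown on this
# page, so `collect_uniprot_ids.py` below is a placeholder; the flag values
# are simply the defaults defined above):
#
#   python collect_uniprot_ids.py --protein IPR001557 --protein_name MDH \
#       --page_size 200 --chunk_size 5000 --output data/MDH
#
# This writes data/MDH/af_raw_MDH_0.txt, af_raw_MDH_1.txt, ... with one
# UniProt accession per line.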