import argparse
import os
import sys, errno, re, json, ssl
import glob
import urllib
from urllib import request
from urllib.error import HTTPError
from time import sleep

from tqdm import tqdm
from fake_useragent import UserAgent

ua = UserAgent()


def _chunks(seq, size):
    """Yield successive slices of *seq* with at most *size* elements.

    Unlike the original ``len(seq)//size + 1`` loop, this never yields a
    trailing empty chunk when ``len(seq)`` is an exact multiple of *size*.
    """
    for start in range(0, len(seq), size):
        yield seq[start:start + size]


def output_list(args):
    """Collect all UniProt accessions for an InterPro entry and write them out.

    Pages through the InterPro REST API (following each payload's ``"next"``
    link), gathers every result's ``metadata.accession``, de-duplicates the
    list, and writes it in chunks of ``args.chunk_size`` names to
    ``{args.output}/af_raw_{args.protein_name}_{i}.txt`` (one name per line).

    Retry policy: HTTP 408 (timeout) sleeps 61 s and retries the same URL
    indefinitely; any other HTTPError is retried up to 3 times before being
    re-raised.
    """
    if args.filter_name:
        BASE_URL = f"https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/{args.protein}/{args.filter_name}/?page_size={args.page_size}"
    else:
        BASE_URL = f"https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/{args.protein}/?page_size={args.page_size}"
    print(f"Processing {BASE_URL}")

    # args.output is a directory (chunk files are joined onto it below), so
    # make sure it exists before writing.
    os.makedirs(args.output, exist_ok=True)
    if args.re_collect:
        # BUGFIX: the original called os.remove(args.output), which raises on
        # a directory. Re-collecting means discarding previously written
        # chunk files, so remove exactly those.
        for old in glob.glob(os.path.join(args.output, f"af_raw_{args.protein_name}_*.txt")):
            os.remove(old)

    # Disable SSL verification to avoid config issues.
    context = ssl._create_unverified_context()
    # context.check_hostname = False
    # context.verify_mode = ssl.CERT_NONE

    next_url = BASE_URL  # renamed from `next` to avoid shadowing the builtin
    attempts = 0
    cur_page = 0
    names = []
    while next_url:
        try:
            print(next_url)
            req = request.Request(next_url, headers={
                "Accept": "application/json",
                "user-agent": ua.random,
            })
            res = request.urlopen(req, context=context)
            # If the API times out due to a long running query
            if res.status == 408:
                # wait just over a minute, then retry the same URL
                sleep(61)
                continue
            elif res.status == 204:
                # no data so leave loop
                break
            payload = json.loads(res.read().decode())
            res.close()
            next_url = payload["next"]
            attempts = 0
        except HTTPError as e:
            if e.code == 408:
                sleep(61)
                continue
            # For any other HTTP error, retry 3 times before failing.
            if attempts < 3:
                attempts += 1
                sleep(61)
                continue
            sys.stderr.write("LAST URL: " + next_url)
            raise e

        cur_page += 1
        bar = tqdm(payload["results"])
        for item in bar:
            bar.set_postfix({"current": f"{(cur_page - 1)*args.page_size}-{cur_page*args.page_size}"})
            names.append(item["metadata"]["accession"])

    # Remove duplicates. BUGFIX: the original assigned the de-duplicated list
    # to a typo'd variable (`nemas`) and then kept chunking the duplicated
    # `names`. Sorting makes the file contents deterministic across runs.
    names = sorted(set(names))

    for i, chunk in enumerate(_chunks(names, args.chunk_size)):
        with open(os.path.join(args.output,
                               f"af_raw_{args.protein_name}_{i}.txt"), "w") as f:
            for name in chunk:
                f.write(name + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--protein", type=str, default="IPR001557", required=False)
    parser.add_argument("--protein_name", type=str, default="MDH", required=False)
    parser.add_argument("--chunk_size", type=int, default=5000, required=False)
    parser.add_argument("--filter_name", type=str, default="", required=False)
    parser.add_argument("--page_size", type=int, default=200, required=False)
    parser.add_argument("--output", type=str, default="data/MDH", required=False)
    parser.add_argument("--re_collect", action="store_true", default=False, required=False)
    args = parser.parse_args()
    output_list(args)