"""Extract NCBI BLAST job IDs from a saved EBI results page, download the
matching accession lists from the EBI REST API, and write the de-duplicated
accessions out in fixed-size chunks."""
import argparse
import math
import os
import ssl
from urllib import request

from lxml import etree
from tqdm import tqdm
from fake_useragent import UserAgent

ua = UserAgent()


def process(args):
    # Instantiate an HTML parser that tolerates malformed markup.
    tree = etree.parse(args.html, parser=etree.HTMLParser(recover=True))
    # Collect the BLAST job identifiers linked from the saved results page.
    blast_items = tree.xpath(
        '//*[@id="root"]/div/div/div/main/div[2]/div[2]/section/div/div/span[6]/a/text()'
    )

    # Skip TLS certificate verification (note: this uses a private ssl helper).
    context = ssl._create_unverified_context()
    base_url = "https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/result/"

    bar = tqdm(blast_items)
    names = []
    for item in bar:
        bar.set_postfix({"current": item})
        # Fetch the accession list for this BLAST job.
        trg_url = base_url + item + "/accs"
        req = request.Request(trg_url, headers={
            "Accept": "application/json",
            "user-agent": ua.random,  # randomize the user agent per request
        })
        res = request.urlopen(req, context=context)
        # Strip the first five characters of each line (the source prefix) and
        # drop the empty final element produced by the trailing newline.
        payload = [p[5:] for p in res.read().decode().split("\n")[:-1]]
        names.extend(payload)

    # Remove duplicates across all BLAST jobs.
    names = list(set(names))

    # Write the accessions out in chunks of at most chunk_size names each.
    # math.ceil avoids the off-by-one of length // chunk_size + 1, which
    # produced an extra empty file when length was a multiple of chunk_size.
    os.makedirs(args.output, exist_ok=True)
    n_chunks = math.ceil(len(names) / args.chunk_size)
    for i in range(n_chunks):
        chunk = names[i * args.chunk_size:(i + 1) * args.chunk_size]
        out_path = os.path.join(args.output, f"af_raw_{args.protein_name}_{i}.txt")
        with open(out_path, "w") as f:
            for name in chunk:
                f.write(name + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--protein_name", type=str, default="CM", required=False)
    parser.add_argument("--html", type=str, default="data/CM/CM.html", required=False)
    parser.add_argument("--output", type=str, default="data/CM", required=False)
    parser.add_argument("--chunk_size", type=int, default=5000, required=False)
    args = parser.parse_args()
    process(args)
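
# Example invocation, shown for illustration; the script file name below is
# hypothetical, and the paths simply mirror the argparse defaults above:
#
#   python fetch_blast_accessions.py \
#       --protein_name CM \
#       --html data/CM/CM.html \
#       --output data/CM \
#       --chunk_size 5000
#
# With the defaults this reads data/CM/CM.html and writes files named
# af_raw_CM_0.txt, af_raw_CM_1.txt, ... into data/CM.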