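"""Collect UniProt protein accessions for an InterPro entry.

Pages through the InterPro REST API (optionally narrowed by --filter_name),
retrying on timeouts, then deduplicates the accessions and writes them to
chunked text files named af_raw_<protein_name>_<i>.txt in the output directory.
"""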
import argparse
import os
import sys
import json
import ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep
from tqdm import tqdm
from fake_useragent import UserAgent

ua = UserAgent()

def output_list(args):
    if args.filter_name:
        BASE_URL = f"https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/{args.protein}/{args.filter_name}/?page_size={args.page_size}"
    else:
        BASE_URL = f"https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/{args.protein}/?page_size={args.page_size}"
    print(f"Processing {BASE_URL}")

    if args.re_collect and os.path.isdir(args.output):
        # remove previously collected chunk files so the list is rebuilt from scratch
        for fname in os.listdir(args.output):
            if fname.startswith(f"af_raw_{args.protein_name}_"):
                os.remove(os.path.join(args.output, fname))
        
    # disable SSL certificate verification to avoid local configuration issues
    context = ssl._create_unverified_context()
    
    next_url = BASE_URL
    attempts = 0
    cur_page = 0
    names = []
    while next_url:
        try:
            print(next_url)
            req = request.Request(next_url,
                                  headers={
                                      "Accept": "application/json",
                                      "user-agent": ua.random
                                  })
            res = request.urlopen(req, context=context)
            # If the API times out due to a long-running query
            if res.status == 408:
                # wait just over a minute
                sleep(61)
                # then continue this loop with the same URL
                continue
            elif res.status == 204:
                #no data so leave loop
                break
            payload = json.loads(res.read().decode())
            res.close()
            next_url = payload["next"]
            attempts = 0
        except HTTPError as e:
            if e.code == 408:
                sleep(61)
                continue
            else:
                # Any other HTTP error is retried up to 3 times before failing
                if attempts < 3:
                    attempts += 1
                    sleep(61)
                    continue
                else:
                    sys.stderr.write("LAST URL: " + next_url + "\n")
                    raise e
        cur_page += 1
        bar = tqdm(payload["results"])
        for item in bar:
            bar.set_postfix({"current": f"{(cur_page - 1)*args.page_size}-{cur_page*args.page_size}"})
            names.append(item["metadata"]["accession"])
    # remove duplicates before chunking
    names = list(set(names))
    length = len(names)
    # number of chunk files needed (ceiling division)
    max_i = (length + args.chunk_size - 1) // args.chunk_size
    os.makedirs(args.output, exist_ok=True)
    for i in range(max_i):
        names_ = names[i*args.chunk_size: (i+1)*args.chunk_size]
        with open(os.path.join(args.output, f"af_raw_{args.protein_name}_{i}.txt"), "w") as f:
            for name in names_:
                f.write(name + "\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--protein", type=str, default="IPR001557", required=False)
    parser.add_argument("--protein_name", type=str, default="MDH", required=False)
    parser.add_argument("--chunk_size", type=int, default=5000, required=False)
    parser.add_argument("--filter_name", type=str, default="", required=False)
    parser.add_argument("--page_size", type=int, default=200, required=False)
    parser.add_argument("--output", type=str, default="data/MDH", required=False)
    parser.add_argument("--re_collect", action="store_true", default=False, required=False)
    args = parser.parse_args()
    
    output_list(args)
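
# Example invocation (the script filename below is illustrative, not part of the source):
#   python collect_interpro_accessions.py --protein IPR001557 --protein_name MDH \
#       --page_size 200 --chunk_size 5000 --output data/MDH
# Chunked accession lists are written to data/MDH/af_raw_MDH_<i>.txt.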