Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +6 -0
- .gitignore +177 -0
- LICENSE +201 -0
- README.md +742 -12
- README_CN.md +728 -0
- Scripts_notebook.ipynb +0 -0
- WebUI_demo.md +84 -0
- WebUI_demo_CN.md +83 -0
- app.py +74 -0
- ckpt/demo/demo.json +1 -0
- ckpt/demo/demo.pt +3 -0
- ckpt/demo/demo_provided.json +1 -0
- ckpt/demo/demo_provided.pt +3 -0
- data/DeepET_Topt/DeepET_Topt_AlphaFold2_HF.json +10 -0
- data/DeepET_Topt/DeepET_Topt_ESMFold_HF.json +10 -0
- data/DeepET_Topt/DeepET_Topt_HF.json +9 -0
- data/DeepLoc2Multi/DeepLoc2Multi_AlphaFold2_HF.json +9 -0
- data/DeepLoc2Multi/DeepLoc2Multi_HF.json +8 -0
- data/DeepLocBinary/DeepLocBinary_AlphaFold2_HF.json +9 -0
- data/DeepLocBinary/DeepLocBinary_ESMFold_HF.json +9 -0
- data/DeepLocBinary/DeepLocBinary_HF.json +8 -0
- data/DeepLocMulti/DeepLocMulti_AlphaFold2_HF.json +9 -0
- data/DeepLocMulti/DeepLocMulti_ESMFold_HF.json +9 -0
- data/DeepLocMulti/DeepLocMulti_HF.json +8 -0
- data/DeepSol/DeepSol_ESMFold_HF.json +9 -0
- data/DeepSol/DeepSol_HF.json +8 -0
- data/DeepSoluE/DeepSoluE_ESMFold_HF.json +9 -0
- data/DeepSoluE/DeepSoluE_HF.json +8 -0
- data/Demo/Demo_Solubility_HF.json +8 -0
- data/EC/EC_AlphaFold2_HF.json +9 -0
- data/EC/EC_ESMFold_HF.json +9 -0
- data/EC/EC_HF.json +8 -0
- data/FLIP_AAV/FLIP_AAV_des-mut_HF.json +9 -0
- data/FLIP_AAV/FLIP_AAV_low-vs-high_HF.json +9 -0
- data/FLIP_AAV/FLIP_AAV_mut-des_HF.json +9 -0
- data/FLIP_AAV/FLIP_AAV_one-vs-rest_HF.json +9 -0
- data/FLIP_AAV/FLIP_AAV_sampled_HF.json +9 -0
- data/FLIP_AAV/FLIP_AAV_seven-vs-rest_HF.json +9 -0
- data/FLIP_AAV/FLIP_AAV_two-vs-rest_HF.json +9 -0
- data/FLIP_GB1/FLIP_GB1_low-vs-high_HF.json +9 -0
- data/FLIP_GB1/FLIP_GB1_one-vs-rest_HF.json +9 -0
- data/FLIP_GB1/FLIP_GB1_sampled_HF.json +9 -0
- data/FLIP_GB1/FLIP_GB1_three-vs-rest_HF.json +9 -0
- data/FLIP_GB1/FLIP_GB1_two-vs-rest_HF.json +9 -0
- data/GO_BP/GO_BP_AlphaFold2_HF.json +9 -0
- data/GO_BP/GO_BP_ESMFold_HF.json +9 -0
- data/GO_BP/GO_BP_HF.json +8 -0
- data/GO_CC/GO_CC_AlphaFold2_HF.json +9 -0
- data/GO_CC/GO_CC_ESMFold_HF.json +9 -0
- data/GO_CC/GO_CC_HF.json +8 -0
.gitattributes
CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
img/Eval/Model_Dataset_Config.png filter=lfs diff=lfs merge=lfs -text
|
37 |
+
img/HuggingFace/HF1.png filter=lfs diff=lfs merge=lfs -text
|
38 |
+
img/HuggingFace/HF2.png filter=lfs diff=lfs merge=lfs -text
|
39 |
+
img/HuggingFace/HF3.png filter=lfs diff=lfs merge=lfs -text
|
40 |
+
img/Predict/Predict_Tab.png filter=lfs diff=lfs merge=lfs -text
|
41 |
+
img/Train/Monitor_Figs.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
#.idea/
|
161 |
+
|
162 |
+
dataset/
|
163 |
+
data/*.ipynb
|
164 |
+
wandb/
|
165 |
+
ckpt/ckpt
|
166 |
+
ckpt/dev_models
|
167 |
+
script_dev/
|
168 |
+
.gradio/
|
169 |
+
configs/
|
170 |
+
result/
|
171 |
+
|
172 |
+
# ignore all files in src/data/weight except .keep
|
173 |
+
src/data/weight/
|
174 |
+
!src/data/weight/.keep
|
175 |
+
|
176 |
+
tmp_db/
|
177 |
+
log/
|
LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
README.md
CHANGED
@@ -1,12 +1,742 @@
|
|
1 |
-
---
|
2 |
-
title: VenusFactory
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: VenusFactory
|
3 |
+
app_file: app.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 5.24.0
|
6 |
+
---
|
7 |
+
<div align="right">
|
8 |
+
<a href="README.md">English</a> | <a href="README_CN.md">简体中文</a>
|
9 |
+
</div>
|
10 |
+
|
11 |
+
<p align="center">
|
12 |
+
<img src="img/banner_2503.png" width="70%" alt="VenusFactory Banner">
|
13 |
+
</p>
|
14 |
+
|
15 |
+
<div align="center">
|
16 |
+
|
17 |
+
[](https://github.com/tyang816/VenusFactory/stargazers) [](https://github.com/tyang816/VenusFactory/network/members) [](https://github.com/tyang816/VenusFactory/issues) [](https://github.com/tyang816/VenusFactory/blob/main/LICENSE)
|
18 |
+
[](https://www.python.org/) [](https://venusfactory.readthedocs.io/) [](https://github.com/tyang816/VenusFactory/releases)
|
19 |
+
|
20 |
+
</div>
|
21 |
+
|
22 |
+
Recent News:
|
23 |
+
|
24 |
+
- Welcome to VenusFactory! This project is developed by [**Liang's Lab**](https://lianglab.sjtu.edu.cn/) at [**Shanghai Jiao Tong University**](https://www.sjtu.edu.cn/).
|
25 |
+
- [2025-03-26] Add [VenusPLM-300M](https://huggingface.co/AI4Protein/VenusPLM-300M) model, trained based on **VenusPod**, is a protein language model independently developed by Hong Liang's research group at Shanghai Jiao Tong University.
|
26 |
+
- [2025-03-17] Add [Venus-PETA, Venus-ProPrime, Venus-ProSST models](https://huggingface.co/AI4Protein), for more details, please refer to [Supported Models](#-supported-models)
|
27 |
+
- [2025-03-05] 🎉 Congratulations! 🎉
|
28 |
+
|
29 |
+
🚀 Our latest research achievement, **VenusMutHub**, has been officially accepted by [**Acta Pharmaceutica Sinica B**](https://www.sciencedirect.com/science/article/pii/S2211383525001650) and is now featured in a series of [**leaderboards**](https://lianglab.sjtu.edu.cn/muthub/)!
|
30 |
+
💡 In this study, we built **900+ high-quality benchmark** [**datasets**](https://huggingface.co/datasets/AI4Protein/VenusMutHub) covering **500+ protein functional properties**. VenusMutHub not only offers a new collection of small-sample datasets for **real-world protein mutation engineering**, but also fills the gap in **diversity** within existing benchmarks, laying a stronger foundation for AI-driven protein mutation effect prediction.
|
31 |
+
|
32 |
+
|
33 |
+
## ✏️ Table of Contents
|
34 |
+
|
35 |
+
- [Features](#-features)
|
36 |
+
- [Supported Models](#-supported-models)
|
37 |
+
- [Supported Training Approaches](#-supported-training-approaches)
|
38 |
+
- [Supported Datasets](#-supported-datasets)
|
39 |
+
- [Supported Metrics](#-supported-metrics)
|
40 |
+
- [Requirements](#-requirements)
|
41 |
+
- [Installation Guide](#-installation-guide)
|
42 |
+
- [Quick Start with Venus Web UI](#-quick-start-with-venus-web-ui)
|
43 |
+
- [Code-line Usage](#-code-line-usage)
|
44 |
+
- [Citation](#-citation)
|
45 |
+
- [Acknowledgement](#-acknowledgement)
|
46 |
+
|
47 |
+
## 📑 Features
|
48 |
+
|
49 |
+
- **Various protein language models**: Venus series, ESM series, ProtTrans series, Ankh series, etc
|
50 |
+
- **Comprehensive supervised datasets**: Localization, Fitness, Solubility, Stability, etc
|
51 |
+
- **Easy and quick data collector**: AlphaFold2 Database, RCSB, InterPro, Uniprot, etc
|
52 |
+
- **Experiment monitors**: Wandb, Local
|
53 |
+
- **Friendly interface**: Gradio UI
|
54 |
+
|
55 |
+
## 🤖 Supported Models
|
56 |
+
|
57 |
+
### Pre-training Protein Language Models
|
58 |
+
|
59 |
+
<details>
|
60 |
+
<summary>Venus Series Models (Published by Liang's Lab)</summary>
|
61 |
+
|
62 |
+
| Model | Size | Parameters | GPU Memory | Features | Template |
|
63 |
+
|-------|------|------------|------------|----------|----------|
|
64 |
+
| ProSST-20 | 20 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-20](https://huggingface.co/AI4Protein/ProSST-20) |
|
65 |
+
| ProSST-128 | 128 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-128](https://huggingface.co/AI4Protein/ProSST-128) |
|
66 |
+
| ProSST-512 | 512 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-512](https://huggingface.co/AI4Protein/ProSST-512) |
|
67 |
+
| ProSST-2048 | 2048 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-2048](https://huggingface.co/AI4Protein/ProSST-2048) |
|
68 |
+
| ProSST-4096 | 4096 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-4096](https://huggingface.co/AI4Protein/ProSST-4096) |
|
69 |
+
| ProPrime-690M | 690M | 690M | 16GB+ | OGT-prediction | [AI4Protein/Prime_690M](https://huggingface.co/AI4Protein/Prime_690M) |
|
70 |
+
| VenusPLM-300M | 300M | 300M | 12GB+ | Protein-language | [AI4Protein/VenusPLM-300M](https://huggingface.co/AI4Protein/VenusPLM-300M) |
|
71 |
+
|
72 |
+
> 💡 These models often excel in specific tasks or offer unique architectural benefits
|
73 |
+
</details>
|
74 |
+
|
75 |
+
<details>
|
76 |
+
<summary>Venus-PETA Models: Tokenization variants</summary>
|
77 |
+
|
78 |
+
#### BPE Tokenization Series
|
79 |
+
| Model | Vocab Size | Parameters | GPU Memory | Template |
|
80 |
+
|-------|------------|------------|------------|----------|
|
81 |
+
| PETA-base | base | 80M | 4GB+ | [AI4Protein/deep_base](https://huggingface.co/AI4Protein/deep_base) |
|
82 |
+
| PETA-bpe-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_bpe_50](https://huggingface.co/AI4Protein/deep_bpe_50) |
|
83 |
+
| PETA-bpe-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_bpe_200](https://huggingface.co/AI4Protein/deep_bpe_200) |
|
84 |
+
| PETA-bpe-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_bpe_400](https://huggingface.co/AI4Protein/deep_bpe_400) |
|
85 |
+
| PETA-bpe-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_bpe_800](https://huggingface.co/AI4Protein/deep_bpe_800) |
|
86 |
+
| PETA-bpe-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_bpe_1600](https://huggingface.co/AI4Protein/deep_bpe_1600) |
|
87 |
+
| PETA-bpe-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_bpe_3200](https://huggingface.co/AI4Protein/deep_bpe_3200) |
|
88 |
+
|
89 |
+
#### Unigram Tokenization Series
|
90 |
+
| Model | Vocab Size | Parameters | GPU Memory | Template |
|
91 |
+
|-------|------------|------------|------------|----------|
|
92 |
+
| PETA-unigram-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_unigram_50](https://huggingface.co/AI4Protein/deep_unigram_50) |
|
93 |
+
| PETA-unigram-100 | 100 | 80M | 4GB+ | [AI4Protein/deep_unigram_100](https://huggingface.co/AI4Protein/deep_unigram_100) |
|
94 |
+
| PETA-unigram-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_unigram_200](https://huggingface.co/AI4Protein/deep_unigram_200) |
|
95 |
+
| PETA-unigram-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_unigram_400](https://huggingface.co/AI4Protein/deep_unigram_400) |
|
96 |
+
| PETA-unigram-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_unigram_800](https://huggingface.co/AI4Protein/deep_unigram_800) |
|
97 |
+
| PETA-unigram-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_unigram_1600](https://huggingface.co/AI4Protein/deep_unigram_1600) |
|
98 |
+
| PETA-unigram-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_unigram_3200](https://huggingface.co/AI4Protein/deep_unigram_3200) |
|
99 |
+
|
100 |
+
> 💡 Different tokenization strategies may be better suited for specific tasks
|
101 |
+
</details>
|
102 |
+
|
103 |
+
<details>
|
104 |
+
<summary>ESM Series Models: Meta AI's protein language models</summary>
|
105 |
+
|
106 |
+
| Model | Size | Parameters | GPU Memory | Training Data | Template |
|
107 |
+
|-------|------|------------|------------|---------------|----------|
|
108 |
+
| ESM2-8M | 8M | 8M | 2GB+ | UR50/D | [facebook/esm2_t6_8M_UR50D](https://huggingface.co/facebook/esm2_t6_8M_UR50D) |
|
109 |
+
| ESM2-35M | 35M | 35M | 4GB+ | UR50/D | [facebook/esm2_t12_35M_UR50D](https://huggingface.co/facebook/esm2_t12_35M_UR50D) |
|
110 |
+
| ESM2-150M | 150M | 150M | 8GB+ | UR50/D | [facebook/esm2_t30_150M_UR50D](https://huggingface.co/facebook/esm2_t30_150M_UR50D) |
|
111 |
+
| ESM2-650M | 650M | 650M | 16GB+ | UR50/D | [facebook/esm2_t33_650M_UR50D](https://huggingface.co/facebook/esm2_t33_650M_UR50D) |
|
112 |
+
| ESM2-3B | 3B | 3B | 24GB+ | UR50/D | [facebook/esm2_t36_3B_UR50D](https://huggingface.co/facebook/esm2_t36_3B_UR50D) |
|
113 |
+
| ESM2-15B | 15B | 15B | 40GB+ | UR50/D | [facebook/esm2_t48_15B_UR50D](https://huggingface.co/facebook/esm2_t48_15B_UR50D) |
|
114 |
+
| ESM-1b | 650M | 650M | 16GB+ | UR50/S | [facebook/esm1b_t33_650M_UR50S](https://huggingface.co/facebook/esm1b_t33_650M_UR50S) |
|
115 |
+
| ESM-1v-1 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_1](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_1) |
|
116 |
+
| ESM-1v-2 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_2](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_2) |
|
117 |
+
| ESM-1v-3 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_3](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_3) |
|
118 |
+
| ESM-1v-4 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_4](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_4) |
|
119 |
+
| ESM-1v-5 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_5](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_5) |
|
120 |
+
|
121 |
+
> 💡 ESM2 models are the latest generation, offering better performance than ESM-1b/1v
|
122 |
+
</details>
|
123 |
+
|
124 |
+
<details>
|
125 |
+
<summary>BERT-based Models: Transformer encoder architecture</summary>
|
126 |
+
|
127 |
+
| Model | Size | Parameters | GPU Memory | Training Data | Template |
|
128 |
+
|-------|------|------------|------------|---------------|----------|
|
129 |
+
| ProtBert-Uniref100 | 420M | 420M | 12GB+ | UniRef100 | [Rostlab/prot_bert](https://huggingface.co/Rostlab/prot_bert) |
|
130 |
+
| ProtBert-BFD | 420M | 420M | 12GB+ | BFD100 | [Rostlab/prot_bert_bfd](https://huggingface.co/Rostlab/prot_bert_bfd) |
|
131 |
+
| IgBert | 420M | 420M | 12GB+ | Antibody | [Exscientia/IgBert](https://huggingface.co/Exscientia/IgBert) |
|
132 |
+
| IgBert-unpaired | 420M | 420M | 12GB+ | Antibody | [Exscientia/IgBert_unpaired](https://huggingface.co/Exscientia/IgBert_unpaired) |
|
133 |
+
|
134 |
+
> 💡 BFD-trained models generally show better performance on structure-related tasks
|
135 |
+
</details>
|
136 |
+
|
137 |
+
<details>
|
138 |
+
<summary>T5-based Models: Encoder-decoder architecture</summary>
|
139 |
+
|
140 |
+
| Model | Size | Parameters | GPU Memory | Training Data | Template |
|
141 |
+
|-------|------|------------|------------|---------------|----------|
|
142 |
+
| ProtT5-XL-UniRef50 | 3B | 3B | 24GB+ | UniRef50 | [Rostlab/prot_t5_xl_uniref50](https://huggingface.co/Rostlab/prot_t5_xl_uniref50) |
|
143 |
+
| ProtT5-XXL-UniRef50 | 11B | 11B | 40GB+ | UniRef50 | [Rostlab/prot_t5_xxl_uniref50](https://huggingface.co/Rostlab/prot_t5_xxl_uniref50) |
|
144 |
+
| ProtT5-XL-BFD | 3B | 3B | 24GB+ | BFD100 | [Rostlab/prot_t5_xl_bfd](https://huggingface.co/Rostlab/prot_t5_xl_bfd) |
|
145 |
+
| ProtT5-XXL-BFD | 11B | 11B | 40GB+ | BFD100 | [Rostlab/prot_t5_xxl_bfd](https://huggingface.co/Rostlab/prot_t5_xxl_bfd) |
|
146 |
+
| IgT5 | 3B | 3B | 24GB+ | Antibody | [Exscientia/IgT5](https://huggingface.co/Exscientia/IgT5) |
|
147 |
+
| IgT5-unpaired | 3B | 3B | 24GB+ | Antibody | [Exscientia/IgT5_unpaired](https://huggingface.co/Exscientia/IgT5_unpaired) |
|
148 |
+
| Ankh-base | 450M | 450M | 12GB+ | Encoder-decoder | [ElnaggarLab/ankh-base](https://huggingface.co/ElnaggarLab/ankh-base) |
|
149 |
+
| Ankh-large | 1.2B | 1.2B | 20GB+ | Encoder-decoder | [ElnaggarLab/ankh-large](https://huggingface.co/ElnaggarLab/ankh-large) |
|
150 |
+
|
151 |
+
> 💡 T5 models can be used for both encoding and generation tasks
|
152 |
+
</details>
|
153 |
+
|
154 |
+
### Model Selection Guide
|
155 |
+
|
156 |
+
<details>
|
157 |
+
<summary>How to choose the right model?</summary>
|
158 |
+
|
159 |
+
1. **Based on Hardware Constraints:**
|
160 |
+
- Limited GPU (<8GB): ESM2-8M, ESM2-35M, ProSST
|
161 |
+
- Medium GPU (8-16GB): ESM2-150M, ESM2-650M, ProtBert series
|
162 |
+
- High-end GPU (24GB+): ESM2-3B, ProtT5-XL, Ankh-large
|
163 |
+
- Multiple GPUs: ESM2-15B, ProtT5-XXL
|
164 |
+
|
165 |
+
2. **Based on Task Type:**
|
166 |
+
- Sequence classification: ESM2, ProtBert
|
167 |
+
- Structure prediction: ESM2, Ankh
|
168 |
+
- Generation tasks: ProtT5
|
169 |
+
- Antibody design: IgBert, IgT5
|
170 |
+
- Lightweight deployment: ProSST, PETA-base
|
171 |
+
|
172 |
+
3. **Based on Training Data:**
|
173 |
+
- General protein tasks: ESM2, ProtBert
|
174 |
+
- Structure-aware tasks: Ankh
|
175 |
+
- Antibody-specific: IgBert, IgT5
|
176 |
+
- Custom tokenization needs: PETA series
|
177 |
+
|
178 |
+
</details>
|
179 |
+
|
180 |
+
> 🔍 All models are available through the Hugging Face Hub and can be easily loaded using their templates.
|
181 |
+
|
182 |
+
## 🔬 Supported Training Approaches
|
183 |
+
|
184 |
+
<details>
|
185 |
+
<summary>Supported Training Approaches</summary>
|
186 |
+
|
187 |
+
| Approach | Full-tuning | Freeze-tuning | SES-Adapter | AdaLoRA | QLoRA | LoRA | DoRA | IA3 |
|
188 |
+
| ---------------------- | ----------- | ------------------ | ------------------ | ------------------ |----------- | ------------------ | -----------------| -----------------|
|
189 |
+
| Supervised Fine-Tuning | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |
|
190 |
+
|
191 |
+
</details>
|
192 |
+
|
193 |
+
## 📚 Supported Datasets
|
194 |
+
|
195 |
+
<details><summary>Pre-training datasets</summary>
|
196 |
+
|
197 |
+
| dataset | data level | link |
|
198 |
+
|------------|------|------|
|
199 |
+
| CATH_V43_S40 | structures | [CATH_V43_S40](https://huggingface.co/datasets/tyang816/cath) |
|
200 |
+
| AGO_family | structures | [AGO_family](https://huggingface.co/datasets/tyang816/Ago_database_PDB) |
|
201 |
+
|
202 |
+
</details>
|
203 |
+
|
204 |
+
<details><summary>Zero-shot datasets</summary>
|
205 |
+
|
206 |
+
| dataset | task | link |
|
207 |
+
|------------|------|------|
|
208 |
+
| VenusMutHub | mutation effects prediction | [VenusMutHub](https://huggingface.co/datasets/AI4Protein/VenusMutHub) |
|
209 |
+
| ProteinGym | mutation effects prediction | [ProteinGym](https://proteingym.org/) |
|
210 |
+
|
211 |
+
</details>
|
212 |
+
|
213 |
+
<details><summary>Supervised fine-tuning datasets (amino acid sequences/ foldseek sequences/ ss8 sequences)</summary>
|
214 |
+
|
215 |
+
| dataset | task | data level | problem type | link |
|
216 |
+
|------------|------|----------|----------|------|
|
217 |
+
| DeepLocBinary | localization | protein-wise | single_label_classification | [DeepLocBinary_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocBinary_AlphaFold2), [DeepLocBinary_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocBinary_ESMFold) |
|
218 |
+
| DeepLocMulti | localization | protein-wise | multi_label_classification | [DeepLocMulti_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocMulti_AlphaFold2), [DeepLocMulti_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocMulti_ESMFold) |
|
219 |
+
| DeepLoc2Multi | localization | protein-wise | single_label_classification | [DeepLoc2Multi_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_AlphaFold2), [DeepLoc2Multi_ESMFold](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_ESMFold) |
|
220 |
+
| DeepSol | solubility | protein-wise | single_label_classification | [DeepSol_ESMFold](https://huggingface.co/datasets/tyang816/DeepSol_ESMFold) |
|
221 |
+
| DeepSoluE | solubility | protein-wise | single_label_classification | [DeepSoluE_ESMFold](https://huggingface.co/datasets/tyang816/DeepSoluE_ESMFold) |
|
222 |
+
| ProtSolM | solubility | protein-wise | single_label_classification | [ProtSolM_ESMFold](https://huggingface.co/datasets/tyang816/ProtSolM_ESMFold) |
|
223 |
+
| eSOL | solubility | protein-wise | regression | [eSOL_AlphaFold2](https://huggingface.co/datasets/tyang816/eSOL_AlphaFold2), [eSOL_ESMFold](https://huggingface.co/datasets/tyang816/eSOL_ESMFold) |
|
224 |
+
| DeepET_Topt | optimum temperature | protein-wise | regression | [DeepET_Topt_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepET_Topt_AlphaFold2), [DeepET_Topt_ESMFold](https://huggingface.co/datasets/tyang816/DeepET_Topt_ESMFold) |
|
225 |
+
| EC | function | protein-wise | multi_label_classification | [EC_AlphaFold2](https://huggingface.co/datasets/tyang816/EC_AlphaFold2), [EC_ESMFold](https://huggingface.co/datasets/tyang816/EC_ESMFold) |
|
226 |
+
| GO_BP | function | protein-wise | multi_label_classification | [GO_BP_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_BP_AlphaFold2), [GO_BP_ESMFold](https://huggingface.co/datasets/tyang816/GO_BP_ESMFold) |
|
227 |
+
| GO_CC | function | protein-wise | multi_label_classification | [GO_CC_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_CC_AlphaFold2), [GO_CC_ESMFold](https://huggingface.co/datasets/tyang816/GO_CC_ESMFold) |
|
228 |
+
| GO_MF | function | protein-wise | multi_label_classification | [GO_MF_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_MF_AlphaFold2), [GO_MF_ESMFold](https://huggingface.co/datasets/tyang816/GO_MF_ESMFold) |
|
229 |
+
| MetalIonBinding | binding | protein-wise | single_label_classification | [MetalIonBinding_AlphaFold2](https://huggingface.co/datasets/tyang816/MetalIonBinding_AlphaFold2), [MetalIonBinding_ESMFold](https://huggingface.co/datasets/tyang816/MetalIonBinding_ESMFold) |
|
230 |
+
| Thermostability | stability | protein-wise | regression | [Thermostability_AlphaFold2](https://huggingface.co/datasets/tyang816/Thermostability_AlphaFold2), [Thermostability_ESMFold](https://huggingface.co/datasets/tyang816/Thermostability_ESMFold) |
|
231 |
+
|
232 |
+
> ✨ Only the structural sequences differ between versions of the same dataset. For example, ``DeepLocBinary_ESMFold`` and ``DeepLocBinary_AlphaFold2`` share the same amino acid sequences, so if you only need the ``aa_seqs``, either one works!
|
233 |
+
|
234 |
+
</details>
|
235 |
+
|
236 |
+
<details><summary>Supervised fine-tuning datasets (amino acid sequences)</summary>
|
237 |
+
|
238 |
+
| dataset | task | data level | problem type | link |
|
239 |
+
|------------|------|----------|----------|------|
|
240 |
+
| Demo_Solubility | solubility | protein-wise | single_label_classification | [Demo_Solubility](https://huggingface.co/datasets/tyang816/Demo_Solubility) |
|
241 |
+
| DeepLocBinary | localization | protein-wise | single_label_classification | [DeepLocBinary](https://huggingface.co/datasets/tyang816/DeepLocBinary) |
|
242 |
+
| DeepLocMulti | localization | protein-wise | multi_label_classification | [DeepLocMulti](https://huggingface.co/datasets/tyang816/DeepLocMulti) |
|
243 |
+
| DeepLoc2Multi | localization | protein-wise | single_label_classification | [DeepLoc2Multi](https://huggingface.co/datasets/tyang816/DeepLoc2Multi) |
|
244 |
+
| DeepSol | solubility | protein-wise | single_label_classification | [DeepSol](https://huggingface.co/datasets/tyang816/DeepSol) |
|
245 |
+
| DeepSoluE | solubility | protein-wise | single_label_classification | [DeepSoluE](https://huggingface.co/datasets/tyang816/DeepSoluE) |
|
246 |
+
| ProtSolM | solubility | protein-wise | single_label_classification | [ProtSolM](https://huggingface.co/datasets/tyang816/ProtSolM) |
|
247 |
+
| eSOL | solubility | protein-wise | regression | [eSOL](https://huggingface.co/datasets/tyang816/eSOL) |
|
248 |
+
| DeepET_Topt | optimum temperature | protein-wise | regression | [DeepET_Topt](https://huggingface.co/datasets/tyang816/DeepET_Topt) |
|
249 |
+
| EC | function | protein-wise | multi_label_classification | [EC](https://huggingface.co/datasets/tyang816/EC) |
|
250 |
+
| GO_BP | function | protein-wise | multi_label_classification | [GO_BP](https://huggingface.co/datasets/tyang816/GO_BP) |
|
251 |
+
| GO_CC | function | protein-wise | multi_label_classification | [GO_CC](https://huggingface.co/datasets/tyang816/GO_CC) |
|
252 |
+
| GO_MF | function | protein-wise | multi_label_classification | [GO_MF](https://huggingface.co/datasets/tyang816/GO_MF) |
|
253 |
+
| MetalIonBinding | binding | protein-wise | single_label_classification | [MetalIonBinding](https://huggingface.co/datasets/tyang816/MetalIonBinding) |
|
254 |
+
| Thermostability | stability | protein-wise | regression | [Thermostability](https://huggingface.co/datasets/tyang816/Thermostability) |
|
255 |
+
| PaCRISPR | CRISPR | protein-wise | single_label_classification | [PaCRISPR](https://huggingface.co/datasets/tyang816/PaCRISPR) |
|
256 |
+
| PETA_CHS_Sol | solubility | protein-wise | single_label_classification | [PETA_CHS_Sol](https://huggingface.co/datasets/tyang816/PETA_CHS_Sol) |
|
257 |
+
| PETA_LGK_Sol | solubility | protein-wise | single_label_classification | [PETA_LGK_Sol](https://huggingface.co/datasets/tyang816/PETA_LGK_Sol) |
|
258 |
+
| PETA_TEM_Sol | solubility | protein-wise | single_label_classification | [PETA_TEM_Sol](https://huggingface.co/datasets/tyang816/PETA_TEM_Sol) |
|
259 |
+
| SortingSignal | sorting signal | protein-wise | single_label_classification | [SortingSignal](https://huggingface.co/datasets/tyang816/SortingSignal) |
|
260 |
+
| FLIP_AAV | mutation | protein-site | regression | - |
|
261 |
+
| FLIP_AAV_one-vs-rest | mutation | protein-site | single_label_classification | [FLIP_AAV_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_one-vs-rest) |
|
262 |
+
| FLIP_AAV_two-vs-rest | mutation | protein-site | single_label_classification | [FLIP_AAV_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_two-vs-rest) |
|
263 |
+
| FLIP_AAV_mut-des | mutation | protein-site | single_label_classification | [FLIP_AAV_mut-des](https://huggingface.co/datasets/tyang816/FLIP_AAV_mut-des) |
|
264 |
+
| FLIP_AAV_des-mut | mutation | protein-site | single_label_classification | [FLIP_AAV_des-mut](https://huggingface.co/datasets/tyang816/FLIP_AAV_des-mut) |
|
265 |
+
| FLIP_AAV_seven-vs-rest | mutation | protein-site | single_label_classification | [FLIP_AAV_seven-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_seven-vs-rest) |
|
266 |
+
| FLIP_AAV_low-vs-high | mutation | protein-site | single_label_classification | [FLIP_AAV_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_AAV_low-vs-high) |
|
267 |
+
| FLIP_AAV_sampled | mutation | protein-site | single_label_classification | [FLIP_AAV_sampled](https://huggingface.co/datasets/tyang816/FLIP_AAV_sampled) |
|
268 |
+
| FLIP_GB1 | mutation | protein-site | regression | - |
|
269 |
+
| FLIP_GB1_one-vs-rest | mutation | protein-site | single_label_classification | [FLIP_GB1_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_one-vs-rest) |
|
270 |
+
| FLIP_GB1_two-vs-rest | mutation | protein-site | single_label_classification | [FLIP_GB1_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_two-vs-rest) |
|
271 |
+
| FLIP_GB1_three-vs-rest | mutation | protein-site | single_label_classification | [FLIP_GB1_three-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_three-vs-rest) |
|
272 |
+
| FLIP_GB1_low-vs-high | mutation | protein-site | single_label_classification | [FLIP_GB1_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_GB1_low-vs-high) |
|
273 |
+
| FLIP_GB1_sampled | mutation | protein-site | single_label_classification | [FLIP_GB1_sampled](https://huggingface.co/datasets/tyang816/FLIP_GB1_sampled) |
|
274 |
+
| TAPE_Fluorescence | fluorescence | protein-site | regression | [TAPE_Fluorescence](https://huggingface.co/datasets/tyang816/TAPE_Fluorescence) |
|
275 |
+
| TAPE_Stability | stability | protein-site | regression | [TAPE_Stability](https://huggingface.co/datasets/tyang816/TAPE_Stability) |
|
276 |
+
|
277 |
+
</details>
|
278 |
+
|
279 |
+
## 📈 Supported Metrics
|
280 |
+
|
281 |
+
<details>
|
282 |
+
<summary>Supported Metrics</summary>
|
283 |
+
|
284 |
+
| Name | Torchmetrics | Problem Type |
|
285 |
+
| ------------- | ---------------- | ------------------------------------------------------- |
|
286 |
+
| accuracy | Accuracy | single_label_classification/ multi_label_classification |
|
287 |
+
| recall | Recall | single_label_classification/ multi_label_classification |
|
288 |
+
| precision | Precision | single_label_classification/ multi_label_classification |
|
289 |
+
| f1 | F1Score | single_label_classification/ multi_label_classification |
|
290 |
+
| mcc | MatthewsCorrCoef | single_label_classification/ multi_label_classification |
|
291 |
+
| auc | AUROC | single_label_classification/ multi_label_classification |
|
292 |
+
| f1_max | F1ScoreMax | multi_label_classification |
|
293 |
+
| spearman_corr | SpearmanCorrCoef | regression |
|
294 |
+
| mse | MeanSquaredError | regression |
|
295 |
+
|
296 |
+
</details>
|
297 |
+
|
298 |
+
## ✈️ Requirements
|
299 |
+
|
300 |
+
### Hardware Requirements
|
301 |
+
- Recommended: NVIDIA RTX 3090 (24GB) or better
|
302 |
+
- Actual requirements depend on your chosen protein language model
|
303 |
+
|
304 |
+
### Software Requirements
|
305 |
+
- [Anaconda3](https://www.anaconda.com/download) or [Miniconda3](https://docs.conda.io/projects/miniconda/en/latest/)
|
306 |
+
- Python 3.10
|
307 |
+
|
308 |
+
## 📦 Installation Guide
|
309 |
+
<details><summary> Git start with macOS</summary>
|
310 |
+
|
311 |
+
## To achieve the best performance and experience, we recommend using Mac devices with M-series chips (such as M1, M2, M3, etc.).
|
312 |
+
|
313 |
+
## 1️⃣ Clone the repository
|
314 |
+
|
315 |
+
First, get the VenusFactory code:
|
316 |
+
|
317 |
+
```bash
|
318 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
319 |
+
cd VenusFactory
|
320 |
+
```
|
321 |
+
|
322 |
+
## 2️⃣ Create a Conda environment
|
323 |
+
|
324 |
+
Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
|
325 |
+
|
326 |
+
```bash
|
327 |
+
conda create -n venus python=3.10
|
328 |
+
conda activate venus
|
329 |
+
```
|
330 |
+
|
331 |
+
## 3️⃣ Install Pytorch and PyG dependencies
|
332 |
+
|
333 |
+
```bash
|
334 |
+
# Install PyTorch
|
335 |
+
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
336 |
+
|
337 |
+
# Install PyG dependencies
|
338 |
+
pip install torch_scatter torch-sparse torch-geometric -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
|
339 |
+
```
|
340 |
+
|
341 |
+
## 4️⃣ Install remaining dependencies
|
342 |
+
|
343 |
+
Install the remaining dependencies using `requirements_for_macOS.txt`:
|
344 |
+
```bash
|
345 |
+
pip install -r requirements_for_macOS.txt
|
346 |
+
```
|
347 |
+
</details>
|
348 |
+
|
349 |
+
<details><summary> Git start with Windows or Linux on CUDA 12.x</summary>
|
350 |
+
|
351 |
+
## We recommend using CUDA 12.2
|
352 |
+
|
353 |
+
|
354 |
+
## 1️⃣ Clone the repository
|
355 |
+
|
356 |
+
First, get the VenusFactory code:
|
357 |
+
|
358 |
+
```bash
|
359 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
360 |
+
cd VenusFactory
|
361 |
+
```
|
362 |
+
|
363 |
+
## 2️⃣ Create a Conda environment
|
364 |
+
|
365 |
+
Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
|
366 |
+
|
367 |
+
```bash
|
368 |
+
conda create -n venus python=3.10
|
369 |
+
conda activate venus
|
370 |
+
```
|
371 |
+
|
372 |
+
## 3️⃣ Install Pytorch and PyG dependencies
|
373 |
+
|
374 |
+
```bash
|
375 |
+
# Install PyTorch
|
376 |
+
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
|
377 |
+
|
378 |
+
# Install PyG dependencies
|
379 |
+
pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
|
380 |
+
pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
|
381 |
+
```
|
382 |
+
|
383 |
+
## 4️⃣ Install remaining dependencies
|
384 |
+
|
385 |
+
Install the remaining dependencies using `requirements.txt`:
|
386 |
+
```bash
|
387 |
+
pip install -r requirements.txt
|
388 |
+
```
|
389 |
+
</details>
|
390 |
+
|
391 |
+
<details><summary> Git start with Windows or Linux on CUDA 11.x</summary>
|
392 |
+
|
393 |
+
## We recommend using CUDA 11.8 or later versions, as they support higher versions of PyTorch, providing a better experience.
|
394 |
+
|
395 |
+
|
396 |
+
## 1️⃣ Clone the repository
|
397 |
+
|
398 |
+
First, get the VenusFactory code:
|
399 |
+
|
400 |
+
```bash
|
401 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
402 |
+
cd VenusFactory
|
403 |
+
```
|
404 |
+
|
405 |
+
## 2️⃣ Create a Conda environment
|
406 |
+
|
407 |
+
Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
|
408 |
+
|
409 |
+
```bash
|
410 |
+
conda create -n venus python=3.10
|
411 |
+
conda activate venus
|
412 |
+
```
|
413 |
+
|
414 |
+
## 3️⃣ Install Pytorch and PyG dependencies
|
415 |
+
|
416 |
+
```bash
|
417 |
+
# Install PyTorch
|
418 |
+
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
|
419 |
+
|
420 |
+
# Install PyG dependencies
|
421 |
+
pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
|
422 |
+
pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
|
423 |
+
```
|
424 |
+
|
425 |
+
## 4️⃣ Install remaining dependencies
|
426 |
+
|
427 |
+
Install the remaining dependencies using `requirements.txt`:
|
428 |
+
```bash
|
429 |
+
pip install -r requirements.txt
|
430 |
+
```
|
431 |
+
</details>
|
432 |
+
|
433 |
+
<details><summary> Git start with Windows or Linux on CPU</summary>
|
434 |
+
|
435 |
+
## 1️⃣ Clone the repository
|
436 |
+
|
437 |
+
First, get the VenusFactory code:
|
438 |
+
|
439 |
+
```bash
|
440 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
441 |
+
cd VenusFactory
|
442 |
+
```
|
443 |
+
|
444 |
+
## 2️⃣ Create a Conda environment
|
445 |
+
|
446 |
+
Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
|
447 |
+
|
448 |
+
```bash
|
449 |
+
conda create -n venus python=3.10
|
450 |
+
conda activate venus
|
451 |
+
```
|
452 |
+
|
453 |
+
## 3️⃣ Install Pytorch and PyG dependencies
|
454 |
+
|
455 |
+
```bash
|
456 |
+
# Install PyTorch
|
457 |
+
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu
|
458 |
+
|
459 |
+
# Install PyG dependencies
|
460 |
+
pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
|
461 |
+
pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
|
462 |
+
```
|
463 |
+
|
464 |
+
## 4️⃣ Install remaining dependencies
|
465 |
+
|
466 |
+
Install the remaining dependencies using `requirements.txt`:
|
467 |
+
```bash
|
468 |
+
pip install -r requirements.txt
|
469 |
+
```
|
470 |
+
</details>
|
471 |
+
|
472 |
+
## 🚀 Quick Start with Venus Web UI
|
473 |
+
|
474 |
+
### Start Venus Web UI
|
475 |
+
|
476 |
+
Get started quickly with our intuitive graphical interface powered by [Gradio](https://github.com/gradio-app/gradio):
|
477 |
+
|
478 |
+
```bash
|
479 |
+
python ./src/webui.py
|
480 |
+
```
|
481 |
+
|
482 |
+
This will launch the Venus Web UI where you can:
|
483 |
+
- Configure and run fine-tuning experiments
|
484 |
+
- Monitor training progress
|
485 |
+
- Evaluate models
|
486 |
+
- Visualize results
|
487 |
+
|
488 |
+
### Using Each Tab
|
489 |
+
|
490 |
+
We provide a detailed guide to help you navigate through each tab of the Venus Web UI.
|
491 |
+
|
492 |
+
<details>
|
493 |
+
<summary>1. Training Tab: Train your own protein language model</summary>
|
494 |
+
|
495 |
+

|
496 |
+
|
497 |
+
Select a protein language model from the dropdown menu. Upload your dataset or select from available datasets and choose metrics appropriate for your problem type.
|
498 |
+
|
499 |
+

|
500 |
+
Choose a training method (Freeze, SES-Adapter, LoRA, QLoRA etc.) and configure training parameters (batch size, learning rate, etc.).
|
501 |
+
|
502 |
+

|
503 |
+

|
504 |
+

|
505 |
+

|
506 |
+
Click "Start Training" and monitor progress in real-time.
|
507 |
+
|
508 |
+
<p align="center">
|
509 |
+
<img src="img/Train/Metric_Results.png" width="60%" alt="Metric_Results">
|
510 |
+
</p>
|
511 |
+
|
512 |
+
Click "Download CSV" to download the test metrics results.
|
513 |
+
</details>
|
514 |
+
|
515 |
+
<details>
|
516 |
+
<summary>2. Evaluation Tab: Evaluate your trained model within a benchmark</summary>
|
517 |
+
|
518 |
+

|
519 |
+
|
520 |
+
Load your trained model by specifying the model path. Select the same protein language model and model configs used during training. Select a test dataset and configure batch size. Choose evaluation metrics appropriate for your problem type. Finally, click "Start Evaluation" to view performance metrics.
|
521 |
+
</details>
|
522 |
+
|
523 |
+
<details>
|
524 |
+
<summary>3. Prediction Tab: Use your trained model to predict samples</summary>
|
525 |
+
|
526 |
+

|
527 |
+
|
528 |
+
Load your trained model by specifying the model path. Select the same protein language model and model configs used during training.
|
529 |
+
|
530 |
+
For single sequence: Enter a protein sequence in the text box.
|
531 |
+
|
532 |
+
For batch prediction: Upload a CSV file with sequences.
|
533 |
+
|
534 |
+

|
535 |
+
|
536 |
+
Click "Predict" to generate and view results.
|
537 |
+
</details>
|
538 |
+
|
539 |
+
<details>
|
540 |
+
<summary>4. Download Tab: Collect data from different sources with high efficiency</summary>
|
541 |
+
|
542 |
+
- **AlphaFold2 Structures**: Enter UniProt IDs to download protein structures
|
543 |
+
- **UniProt**: Search for protein information using keywords or IDs
|
544 |
+
- **InterPro**: Retrieve protein family and domain information
|
545 |
+
- **RCSB PDB**: Download experimental protein structures
|
546 |
+
</details>
|
547 |
+
|
548 |
+
<details>
|
549 |
+
<summary>5. Manual Tab: Detailed documentation and guides</summary>
|
550 |
+
|
551 |
+
Select a language (English/Chinese).
|
552 |
+
|
553 |
+
Navigate through the documentation using the table of contents and find step-by-step guides.
|
554 |
+
</details>
|
555 |
+
|
556 |
+
## 🧬 Code-line Usage
|
557 |
+
|
558 |
+
For users who prefer a command-line interface, we provide comprehensive script solutions for different scenarios.
|
559 |
+
|
560 |
+
<details>
|
561 |
+
<summary>Training Methods: Various fine-tuning approaches for different needs</summary>
|
562 |
+
|
563 |
+
### Full Model Fine-tuning
|
564 |
+
```bash
|
565 |
+
# Freeze-tuning: Train only specific layers while freezing others
|
566 |
+
bash ./script/train/train_plm_vanilla.sh
|
567 |
+
```
|
568 |
+
|
569 |
+
### Parameter-Efficient Fine-tuning (PEFT)
|
570 |
+
```bash
|
571 |
+
# SES-Adapter: Selective and Efficient adapter fine-tuning
|
572 |
+
bash ./script/train/train_plm_ses-adapter.sh
|
573 |
+
|
574 |
+
# AdaLoRA: Adaptive Low-Rank Adaptation
|
575 |
+
bash ./script/train/train_plm_adalora.sh
|
576 |
+
|
577 |
+
# QLoRA: Quantized Low-Rank Adaptation
|
578 |
+
bash ./script/train/train_plm_qlora.sh
|
579 |
+
|
580 |
+
# LoRA: Low-Rank Adaptation
|
581 |
+
bash ./script/train/train_plm_lora.sh
|
582 |
+
|
583 |
+
# DoRA: Weight-Decomposed Low-Rank Adaptation
|
584 |
+
bash ./script/train/train_plm_dora.sh
|
585 |
+
|
586 |
+
# IA3: Infused Adapter by Inhibiting and Amplifying Inner Activations
|
587 |
+
bash ./script/train/train_plm_ia3.sh
|
588 |
+
```
|
589 |
+
|
590 |
+
#### Training Method Comparison
|
591 |
+
| Method | Memory Usage | Training Speed | Performance |
|
592 |
+
|--------|--------------|----------------|-------------|
|
593 |
+
| Freeze | Low | Fast | Good |
|
594 |
+
| SES-Adapter | Medium | Medium | Better |
|
595 |
+
| AdaLoRA | Low | Medium | Better |
|
596 |
+
| QLoRA | Very Low | Slower | Good |
|
597 |
+
| LoRA | Low | Fast | Good |
|
598 |
+
| DoRA | Low | Medium | Better |
|
599 |
+
| IA3 | Very Low | Fast | Good |
|
600 |
+
|
601 |
+
</details>
|
602 |
+
|
603 |
+
<details>
|
604 |
+
<summary>Model Evaluation: Comprehensive evaluation tools</summary>
|
605 |
+
|
606 |
+
### Basic Evaluation
|
607 |
+
```bash
|
608 |
+
# Evaluate model performance on test sets
|
609 |
+
bash ./script/eval/eval.sh
|
610 |
+
```
|
611 |
+
|
612 |
+
### Available Metrics
|
613 |
+
- Classification: accuracy, precision, recall, F1, MCC, AUC
|
614 |
+
- Regression: MSE, Spearman correlation
|
615 |
+
- Multi-label: F1-max
|
616 |
+
|
617 |
+
### Visualization Tools
|
618 |
+
- Training curves
|
619 |
+
- Confusion matrices
|
620 |
+
- ROC curves
|
621 |
+
- Performance comparison plots
|
622 |
+
|
623 |
+
</details>
|
624 |
+
|
625 |
+
<details>
|
626 |
+
<summary>Structure Sequence Tools: Process protein structure information</summary>
|
627 |
+
|
628 |
+
### ESM Structure Sequence
|
629 |
+
```bash
|
630 |
+
# Generate structure sequences using ESM-3
|
631 |
+
bash ./script/get_get_structure_seq/get_esm3_structure_seq.sh
|
632 |
+
```
|
633 |
+
|
634 |
+
### Secondary Structure
|
635 |
+
```bash
|
636 |
+
# Predict protein secondary structure
|
637 |
+
bash ./script/get_get_structure_seq/get_secondary_structure_seq.sh
|
638 |
+
```
|
639 |
+
|
640 |
+
Features:
|
641 |
+
- Support for multiple sequence formats
|
642 |
+
- Batch processing capability
|
643 |
+
- Integration with popular structure prediction tools
|
644 |
+
|
645 |
+
</details>
|
646 |
+
|
647 |
+
<details>
|
648 |
+
<summary>Data Collection Tools: Multi-source protein data acquisition</summary>
|
649 |
+
|
650 |
+
### Format Conversion
|
651 |
+
```bash
|
652 |
+
# Convert CIF format to PDB
|
653 |
+
bash ./crawler/convert/maxit.sh
|
654 |
+
```
|
655 |
+
|
656 |
+
### Metadata Collection
|
657 |
+
```bash
|
658 |
+
# Download metadata from RCSB PDB
|
659 |
+
bash ./crawler/metadata/download_rcsb.sh
|
660 |
+
```
|
661 |
+
|
662 |
+
### Sequence Data
|
663 |
+
```bash
|
664 |
+
# Download protein sequences from UniProt
|
665 |
+
bash ./crawler/sequence/download_uniprot_seq.sh
|
666 |
+
```
|
667 |
+
|
668 |
+
### Structure Data
|
669 |
+
```bash
|
670 |
+
# Download from AlphaFold2 Database
|
671 |
+
bash ./crawler/structure/download_alphafold.sh
|
672 |
+
|
673 |
+
# Download from RCSB PDB
|
674 |
+
bash ./crawler/structure/download_rcsb.sh
|
675 |
+
```
|
676 |
+
|
677 |
+
Features:
|
678 |
+
- Automated batch downloading
|
679 |
+
- Resume interrupted downloads
|
680 |
+
- Data integrity verification
|
681 |
+
- Multiple source support
|
682 |
+
- Customizable search criteria
|
683 |
+
|
684 |
+
#### Supported Databases
|
685 |
+
| Database | Data Type | Access Method | Rate Limit |
|
686 |
+
|----------|-----------|---------------|------------|
|
687 |
+
| AlphaFold2 | Structures | REST API | Yes |
|
688 |
+
| RCSB PDB | Structures | FTP/HTTP | No |
|
689 |
+
| UniProt | Sequences | REST API | Yes |
|
690 |
+
| InterPro | Domains | REST API | Yes |
|
691 |
+
|
692 |
+
</details>
|
693 |
+
|
694 |
+
<details>
|
695 |
+
<summary>Usage Examples: Common scenarios and solutions</summary>
|
696 |
+
|
697 |
+
### Training Example
|
698 |
+
```bash
|
699 |
+
# Train a protein solubility predictor using ESM2
|
700 |
+
bash ./script/train/train_plm_lora.sh \
|
701 |
+
--model "facebook/esm2_t33_650M_UR50D" \
|
702 |
+
--dataset "DeepSol" \
|
703 |
+
--batch_size 32 \
|
704 |
+
--learning_rate 1e-4
|
705 |
+
```
|
706 |
+
|
707 |
+
### Evaluation Example
|
708 |
+
```bash
|
709 |
+
# Evaluate the trained model
|
710 |
+
bash ./script/eval/eval.sh \
|
711 |
+
--model_path "path/to/your/model" \
|
712 |
+
--test_dataset "DeepSol_test"
|
713 |
+
```
|
714 |
+
|
715 |
+
### Data Collection Example
|
716 |
+
```bash
|
717 |
+
# Download structures for a list of UniProt IDs
|
718 |
+
bash ./crawler/structure/download_alphafold.sh \
|
719 |
+
--input uniprot_ids.txt \
|
720 |
+
--output ./structures
|
721 |
+
```
|
722 |
+
|
723 |
+
</details>
|
724 |
+
|
725 |
+
> 💡 All scripts support additional command-line arguments for customization. Use `--help` with any script to see available options.
|
726 |
+
|
727 |
+
## 🙌 Citation
|
728 |
+
|
729 |
+
Please cite our work if you have used our code or data.
|
730 |
+
|
731 |
+
```bibtex
|
732 |
+
@article{tan2025venusfactory,
|
733 |
+
title={VenusFactory: A Unified Platform for Protein Engineering Data Retrieval and Language Model Fine-Tuning},
|
734 |
+
author={Tan, Yang and Liu, Chen and Gao, Jingyuan and Wu, Banghao and Li, Mingchen and Wang, Ruilin and Zhang, Lingrong and Yu, Huiqun and Fan, Guisheng and Hong, Liang and Zhou, Bingxin},
|
735 |
+
journal={arXiv preprint arXiv:2503.15438},
|
736 |
+
year={2025}
|
737 |
+
}
|
738 |
+
```
|
739 |
+
|
740 |
+
## 🎊 Acknowledgement
|
741 |
+
|
742 |
+
Thanks the support of [Liang's Lab](https://ins.sjtu.edu.cn/people/lhong/index.html).
|
README_CN.md
ADDED
@@ -0,0 +1,728 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<div align="right">
|
2 |
+
<a href="README.md">English</a> | <a href="README_CN.md">简体中文</a>
|
3 |
+
</div>
|
4 |
+
|
5 |
+
<p align="center">
|
6 |
+
<img src="img/banner_2503.png" width="70%" alt="VenusFactory Banner">
|
7 |
+
</p>
|
8 |
+
|
9 |
+
<div align="center">
|
10 |
+
|
11 |
+
[](https://github.com/tyang816/VenusFactory/stargazers) [](https://github.com/tyang816/VenusFactory/network/members) [](https://github.com/tyang816/VenusFactory/issues) [](https://github.com/tyang816/VenusFactory/blob/main/LICENSE)
|
12 |
+
[](https://www.python.org/) [](https://venusfactory.readthedocs.io/) [](https://github.com/tyang816/VenusFactory/releases)
|
13 |
+
|
14 |
+
</div>
|
15 |
+
|
16 |
+
最新消息:
|
17 |
+
|
18 |
+
- 欢迎使用 VenusFactory!本项目由[**Liang's Lab**](https://lianglab.sjtu.edu.cn/)开发,由[**Shanghai Jiao Tong University**](https://www.sjtu.edu.cn/)维护。
|
19 |
+
- [2025-03-26] 新增 [VenusPLM-300M](https://huggingface.co/AI4Protein/VenusPLM-300M) 模型,基于**VenusPod**独立开发,由[**Hong Liang**](https://lianglab.sjtu.edu.cn/)课题组开发。
|
20 |
+
- [2025-03-17] 新增 [Venus-PETA、Venus-ProPrime、Venus-ProSST 模型](https://huggingface.co/AI4Protein),更多详情请参考[支持的模型](#-支持的模型)
|
21 |
+
- [2025-03-05] 🎉 **祝贺!** 🎉
|
22 |
+
|
23 |
+
🚀 我们课题组最新的研究成果**VenusMutHub**被[**Acta Pharmaceutica Sinica B**](https://www.sciencedirect.com/science/article/pii/S2211383525001650) 正式接收,并发布了系列[**排行榜**](https://lianglab.sjtu.edu.cn/muthub/)!
|
24 |
+
💡 在本研究中,我们构建了**900+ 高质量基准**[**数据集**](https://huggingface.co/datasets/AI4Protein/VenusMutHub),涵盖 **500+ 不同功能特性的蛋白质**。VenusMutHub不仅为**蛋白质突变工程的真实应用场景**提供了全新的小样本数据集,还弥补了现有基准数据集在**多样性**方面的空白,为AI驱动的蛋白质突变效应预测奠定了更坚实的基础。
|
25 |
+
|
26 |
+
## ✏️ 目录
|
27 |
+
|
28 |
+
- [功能特点](#-功能特点)
|
29 |
+
- [支持的模型](#-支持的模型)
|
30 |
+
- [支持的训练方法](#-支持的训练方法)
|
31 |
+
- [支持的数据集](#-支持的数据集)
|
32 |
+
- [支持的评估指标](#-支持的评估指标)
|
33 |
+
- [环境要求](#-环境要求)
|
34 |
+
- [安装指南](#-安装指南)
|
35 |
+
- [快速开始](#-快速开始)
|
36 |
+
- [命令行使用](#-命令行使用)
|
37 |
+
- [引用](#-引用)
|
38 |
+
- [致谢](#-致谢)
|
39 |
+
|
40 |
+
## 📑 功能特点
|
41 |
+
|
42 |
+
- **丰富的蛋白质语言模型**:Venus系列、ESM系列、ProtTrans系列、Ankh 系列等
|
43 |
+
- **全面的监督数据集**:定位、适应度、溶解度、稳定性等
|
44 |
+
- **便捷的数据收集工具**:AlphaFold2 数据库、RCSB、InterPro、Uniprot 等
|
45 |
+
- **实验监控**:Wandb、本地监控
|
46 |
+
- **友好的界面**:Gradio UI
|
47 |
+
|
48 |
+
## 🤖 支持的模型
|
49 |
+
|
50 |
+
### 预训练蛋白质语言模型
|
51 |
+
|
52 |
+
<details>
|
53 |
+
<summary>Venus系列模型:特定任务架构</summary>
|
54 |
+
|
55 |
+
| 模型 | 大小 | 参数量 | GPU内存 | 特点 | 模板 |
|
56 |
+
|-------|------|------------|------------|----------|----------|
|
57 |
+
| ProSST-20 | 20 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-20](https://huggingface.co/AI4Protein/ProSST-20) |
|
58 |
+
| ProSST-128 | 128 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-128](https://huggingface.co/AI4Protein/ProSST-128) |
|
59 |
+
| ProSST-512 | 512 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-512](https://huggingface.co/AI4Protein/ProSST-512) |
|
60 |
+
| ProSST-2048 | 2048 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-2048](https://huggingface.co/AI4Protein/ProSST-2048) |
|
61 |
+
| ProSST-4096 | 4096 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-4096](https://huggingface.co/AI4Protein/ProSST-4096) |
|
62 |
+
| ProPrime-690M | 690M | 690M | 16GB+ | OGT预测 | [AI4Protein/Prime_690M](https://huggingface.co/AI4Protein/Prime_690M) |
|
63 |
+
|
64 |
+
> 💡 这些模型在特定任务上表现出色或提供独特的架构优势
|
65 |
+
</details>
|
66 |
+
|
67 |
+
<details>
|
68 |
+
<summary>Venus-PETA 模型:分词变体</summary>
|
69 |
+
|
70 |
+
#### BPE 分词系列
|
71 |
+
| 模型 | 词表大小 | 参数量 | GPU内存 | 模板 |
|
72 |
+
|-------|------------|------------|------------|----------|
|
73 |
+
| PETA-base | base | 80M | 4GB+ | [AI4Protein/deep_base](https://huggingface.co/AI4Protein/deep_base) |
|
74 |
+
| PETA-bpe-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_bpe_50](https://huggingface.co/AI4Protein/deep_bpe_50) |
|
75 |
+
| PETA-bpe-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_bpe_200](https://huggingface.co/AI4Protein/deep_bpe_200) |
|
76 |
+
| PETA-bpe-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_bpe_400](https://huggingface.co/AI4Protein/deep_bpe_400) |
|
77 |
+
| PETA-bpe-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_bpe_800](https://huggingface.co/AI4Protein/deep_bpe_800) |
|
78 |
+
| PETA-bpe-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_bpe_1600](https://huggingface.co/AI4Protein/deep_bpe_1600) |
|
79 |
+
| PETA-bpe-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_bpe_3200](https://huggingface.co/AI4Protein/deep_bpe_3200) |
|
80 |
+
|
81 |
+
#### Unigram 分词系列
|
82 |
+
| 模型 | 词表大小 | 参数量 | GPU内存 | 模板 |
|
83 |
+
|-------|------------|------------|------------|----------|
|
84 |
+
| PETA-unigram-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_unigram_50](https://huggingface.co/AI4Protein/deep_unigram_50) |
|
85 |
+
| PETA-unigram-100 | 100 | 80M | 4GB+ | [AI4Protein/deep_unigram_100](https://huggingface.co/AI4Protein/deep_unigram_100) |
|
86 |
+
| PETA-unigram-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_unigram_200](https://huggingface.co/AI4Protein/deep_unigram_200) |
|
87 |
+
| PETA-unigram-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_unigram_400](https://huggingface.co/AI4Protein/deep_unigram_400) |
|
88 |
+
| PETA-unigram-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_unigram_800](https://huggingface.co/AI4Protein/deep_unigram_800) |
|
89 |
+
| PETA-unigram-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_unigram_1600](https://huggingface.co/AI4Protein/deep_unigram_1600) |
|
90 |
+
| PETA-unigram-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_unigram_3200](https://huggingface.co/AI4Protein/deep_unigram_3200) |
|
91 |
+
|
92 |
+
> 💡 不同的分词策略可能更适合特定任务
|
93 |
+
</details>
|
94 |
+
|
95 |
+
<details>
|
96 |
+
<summary>ESM 系列模型:Meta AI 的蛋白质语言模型</summary>
|
97 |
+
|
98 |
+
| 模型 | 大小 | 参数量 | GPU内存 | 训练数据 | 模板 |
|
99 |
+
|-------|------|------------|------------|---------------|----------|
|
100 |
+
| ESM2-8M | 8M | 8M | 2GB+ | UR50/D | [facebook/esm2_t6_8M_UR50D](https://huggingface.co/facebook/esm2_t6_8M_UR50D) |
|
101 |
+
| ESM2-35M | 35M | 35M | 4GB+ | UR50/D | [facebook/esm2_t12_35M_UR50D](https://huggingface.co/facebook/esm2_t12_35M_UR50D) |
|
102 |
+
| ESM2-150M | 150M | 150M | 8GB+ | UR50/D | [facebook/esm2_t30_150M_UR50D](https://huggingface.co/facebook/esm2_t30_150M_UR50D) |
|
103 |
+
| ESM2-650M | 650M | 650M | 16GB+ | UR50/D | [facebook/esm2_t33_650M_UR50D](https://huggingface.co/facebook/esm2_t33_650M_UR50D) |
|
104 |
+
| ESM2-3B | 3B | 3B | 24GB+ | UR50/D | [facebook/esm2_t36_3B_UR50D](https://huggingface.co/facebook/esm2_t36_3B_UR50D) |
|
105 |
+
| ESM2-15B | 15B | 15B | 40GB+ | UR50/D | [facebook/esm2_t48_15B_UR50D](https://huggingface.co/facebook/esm2_t48_15B_UR50D) |
|
106 |
+
|
107 |
+
> 💡 ESM2 模型是最新一代,性能优于 ESM-1b/1v
|
108 |
+
</details>
|
109 |
+
|
110 |
+
<details>
|
111 |
+
<summary>BERT 系列模型:基于 Transformer 编码器架构</summary>
|
112 |
+
|
113 |
+
| 模型 | 大小 | 参数量 | GPU内存 | 训练数据 | 模板 |
|
114 |
+
|-------|------|------------|------------|---------------|----------|
|
115 |
+
| ProtBert-Uniref100 | 420M | 420M | 12GB+ | UniRef100 | [Rostlab/prot_bert](https://huggingface.co/Rostlab/prot_bert) |
|
116 |
+
| ProtBert-BFD | 420M | 420M | 12GB+ | BFD100 | [Rostlab/prot_bert_bfd](https://huggingface.co/Rostlab/prot_bert_bfd) |
|
117 |
+
| IgBert | 420M | 420M | 12GB+ | 抗体 | [Exscientia/IgBert](https://huggingface.co/Exscientia/IgBert) |
|
118 |
+
| IgBert-unpaired | 420M | 420M | 12GB+ | 抗体 | [Exscientia/IgBert_unpaired](https://huggingface.co/Exscientia/IgBert_unpaired) |
|
119 |
+
|
120 |
+
> 💡 BFD 训练的模型在结构相关任务上表现更好
|
121 |
+
</details>
|
122 |
+
|
123 |
+
<details>
|
124 |
+
<summary>T5 系列模型:编码器-解码器架构</summary>
|
125 |
+
|
126 |
+
| 模型 | 大小 | 参数量 | GPU内存 | 训练数据 | 模板 |
|
127 |
+
|-------|------|------------|------------|---------------|----------|
|
128 |
+
| ProtT5-XL-UniRef50 | 3B | 3B | 24GB+ | UniRef50 | [Rostlab/prot_t5_xl_uniref50](https://huggingface.co/Rostlab/prot_t5_xl_uniref50) |
|
129 |
+
| ProtT5-XXL-UniRef50 | 11B | 11B | 40GB+ | UniRef50 | [Rostlab/prot_t5_xxl_uniref50](https://huggingface.co/Rostlab/prot_t5_xxl_uniref50) |
|
130 |
+
| ProtT5-XL-BFD | 3B | 3B | 24GB+ | BFD100 | [Rostlab/prot_t5_xl_bfd](https://huggingface.co/Rostlab/prot_t5_xl_bfd) |
|
131 |
+
| ProtT5-XXL-BFD | 11B | 11B | 40GB+ | BFD100 | [Rostlab/prot_t5_xxl_bfd](https://huggingface.co/Rostlab/prot_t5_xxl_bfd) |
|
132 |
+
| Ankh-base | 450M | 450M | 12GB+ | 编码器-解码器 | [ElnaggarLab/ankh-base](https://huggingface.co/ElnaggarLab/ankh-base) |
|
133 |
+
| Ankh-large | 1.2B | 1.2B | 20GB+ | 编码器-解码器 | [ElnaggarLab/ankh-large](https://huggingface.co/ElnaggarLab/ankh-large) |
|
134 |
+
|
135 |
+
> 💡 T5 模型可用于编码和生成任务
|
136 |
+
</details>
|
137 |
+
|
138 |
+
### 模型选择指南
|
139 |
+
|
140 |
+
<details>
|
141 |
+
<summary>如何选择合适的模型?</summary>
|
142 |
+
|
143 |
+
1. **基于硬件限制:**
|
144 |
+
- 低配GPU (<8GB):ESM2-8M、ESM2-35M、ProSST
|
145 |
+
- 中配GPU (8-16GB):ESM2-150M、ESM2-650M、ProtBert系列
|
146 |
+
- 高配GPU (24GB+):ESM2-3B、ProtT5-XL、Ankh-large
|
147 |
+
- 多GPU:ESM2-15B、ProtT5-XXL
|
148 |
+
|
149 |
+
2. **基于任务类型:**
|
150 |
+
- 序列分类:ESM2、ProtBert
|
151 |
+
- 结构预测:ESM2、Ankh
|
152 |
+
- 生成任务:ProtT5
|
153 |
+
- 抗体设计:IgBert、IgT5
|
154 |
+
- 轻量级部署:ProSST、PETA-base
|
155 |
+
|
156 |
+
3. **基于训练数据:**
|
157 |
+
- 通用蛋白质任务:ESM2、ProtBert
|
158 |
+
- 结构感知任务:Ankh
|
159 |
+
- 抗体特异性:IgBert、IgT5
|
160 |
+
- 自定义分词需求:PETA系列
|
161 |
+
|
162 |
+
</details>
|
163 |
+
|
164 |
+
> 🔍 所有模型都可通过Hugging Face Hub获取,使用其模板可轻松加载。
|
165 |
+
|
166 |
+
## 🔬 支持的训练方法
|
167 |
+
|
168 |
+
<details>
|
169 |
+
<summary>支持的训练方法</summary>
|
170 |
+
|
171 |
+
| 方法 | 全量微调 | 冻结微调 | SES-Adapter | AdaLoRA | QLoRA | LoRA | DoRA | IA3 |
|
172 |
+
|------|---------|----------|-------------|----------|--------|------|------|-----|
|
173 |
+
| 监督微调 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
174 |
+
</details>
|
175 |
+
|
176 |
+
## 📚 支持的数据集
|
177 |
+
|
178 |
+
<details><summary>预训练数据集</summary>
|
179 |
+
|
180 |
+
| 数据集 | 数据来源 |
|
181 |
+
|-------|----------|
|
182 |
+
| [CATH_V43_S40](https://huggingface.co/datasets/tyang816/cath) | 结构数据集
|
183 |
+
| [AGO_family](https://huggingface.co/datasets/tyang816/Ago_database_PDB) | 结构数据集
|
184 |
+
|
185 |
+
</details>
|
186 |
+
|
187 |
+
<details><summary>零样本数据集</summary>
|
188 |
+
|
189 |
+
| 数据集 | 任务 | 数据来源 |
|
190 |
+
|-------|----------|----------|
|
191 |
+
| [VenusMutHub](https://huggingface.co/datasets/AI4Protein/VenusMutHub) | 突变 | 蛋白质序列
|
192 |
+
| [ProteinGym](https://proteingym.org/) | 突变 | 蛋白质序列
|
193 |
+
</details>
|
194 |
+
|
195 |
+
<details><summary>监督微调数据集(氨基酸序列/foldseek序列/二级结构序列)</summary>
|
196 |
+
|
197 |
+
| 数据集 | 任务 | 数据层次 | 问题类型 | 数据来源 |
|
198 |
+
|-------|------|------------|----------|----------|
|
199 |
+
| DeepLocBinary | 定位 | 蛋白质级别 | 单标签分类 | [DeepLocBinary_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocBinary_AlphaFold2), [DeepLocBinary_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocBinary_ESMFold) |
|
200 |
+
| DeepLocMulti | 定位 | 蛋白质级别 | 多标签分类 | [DeepLocMulti_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocMulti_AlphaFold2), [DeepLocMulti_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocMulti_ESMFold) |
|
201 |
+
| DeepLoc2Multi | 定位 | 蛋白质级别 | 多标签分类 | [DeepLoc2Multi_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_AlphaFold2), [DeepLoc2Multi_ESMFold](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_ESMFold) |
|
202 |
+
| DeepSol | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSol_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepSol_AlphaFold2), [DeepSol_ESMFold](https://huggingface.co/datasets/tyang816/DeepSol_ESMFold) |
|
203 |
+
| DeepSoluE | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSoluE_ESMFold](https://huggingface.co/datasets/tyang816/DeepSoluE_ESMFold) |
|
204 |
+
| ProtSolM | 溶解度 | 蛋白质级别 | 单标签分类 | [ProtSolM_ESMFold](https://huggingface.co/datasets/tyang816/ProtSolM_ESMFold) |
|
205 |
+
| eSOL | 溶解度 | 蛋白质级别 | 回归 | [eSOL_AlphaFold2](https://huggingface.co/datasets/tyang816/eSOL_AlphaFold2), [eSOL_ESMFold](https://huggingface.co/datasets/tyang816/eSOL_ESMFold) |
|
206 |
+
| DeepET_Topt | 最适酶活 | 蛋白质级别 | 回归 | [DeepET_Topt_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepET_Topt_AlphaFold2), [DeepET_Topt_ESMFold](https://huggingface.co/datasets/tyang816/DeepET_Topt_ESMFold) |
|
207 |
+
| EC | 功能 | 蛋白质级别 | 多标签分类 | [EC_AlphaFold2](https://huggingface.co/datasets/tyang816/EC_AlphaFold2), [EC_ESMFold](https://huggingface.co/datasets/tyang816/EC_ESMFold) |
|
208 |
+
| GO_BP | 功能 | 蛋白质级别 | 多标签分类 | [GO_BP_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_BP_AlphaFold2), [GO_BP_ESMFold](https://huggingface.co/datasets/tyang816/GO_BP_ESMFold) |
|
209 |
+
| GO_CC | 功能 | 蛋白质级别 | 多标签分类 | [GO_CC_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_CC_AlphaFold2), [GO_CC_ESMFold](https://huggingface.co/datasets/tyang816/GO_CC_ESMFold) |
|
210 |
+
| GO_MF | 功能 | 蛋白质级别 | 多标签分类 | [GO_MF_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_MF_AlphaFold2), [GO_MF_ESMFold](https://huggingface.co/datasets/tyang816/GO_MF_ESMFold) |
|
211 |
+
| MetalIonBinding | 结合 | 蛋白质级别 | 单标签分类 | [MetalIonBinding_AlphaFold2](https://huggingface.co/datasets/tyang816/MetalIonBinding_AlphaFold2), [MetalIonBinding_ESMFold](https://huggingface.co/datasets/tyang816/MetalIonBinding_ESMFold) |
|
212 |
+
| Thermostability | 稳定性 | 蛋白质级别 | 回归 | [Thermostability_AlphaFold2](https://huggingface.co/datasets/tyang816/Thermostability_AlphaFold2), [Thermostability_ESMFold](https://huggingface.co/datasets/tyang816/Thermostability_ESMFold) |
|
213 |
+
|
214 |
+
> 💡 每个数据集都提供了使用 AlphaFold2 和 ESMFold 生成的结构序列版本
|
215 |
+
</details>
|
216 |
+
|
217 |
+
<details><summary>监督微调数据集(氨基酸序列)</summary>
|
218 |
+
|
219 |
+
| 数据集 | 任务 | 数据层次 | 问题类型 | 数据来源 |
|
220 |
+
|-------|------|------------|----------|----------|
|
221 |
+
| Demo_Solubility | 溶解度 | 蛋白质级别 | 单标签分类 | [Demo_Solubility](https://huggingface.co/datasets/tyang816/Demo_Solubility) |
|
222 |
+
| DeepLocBinary | 定位 | 蛋白质级别 | 单标签分类 | [DeepLocBinary](https://huggingface.co/datasets/tyang816/DeepLocBinary) |
|
223 |
+
| DeepLocMulti | 定位 | 蛋白质级别 | 单标签分类 | [DeepLocMulti](https://huggingface.co/datasets/tyang816/DeepLocMulti) |
|
224 |
+
| DeepLoc2Multi | 定位 | 蛋白质级别 | 多标签分类 | [DeepLoc2Multi](https://huggingface.co/datasets/tyang816/DeepLoc2Multi) |
|
225 |
+
| DeepSol | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSol](https://huggingface.co/datasets/tyang816/DeepSol) |
|
226 |
+
| DeepSoluE | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSoluE](https://huggingface.co/datasets/tyang816/DeepSoluE) |
|
227 |
+
| ProtSolM | 溶解度 | 蛋白质级别 | 单标签分类 | [ProtSolM](https://huggingface.co/datasets/tyang816/ProtSolM) |
|
228 |
+
| eSOL | 溶解度 | 蛋白质级别 | 回归 | [eSOL](https://huggingface.co/datasets/tyang816/eSOL) |
|
229 |
+
| DeepET_Topt | 最适酶活 | 蛋白质级别 | 回归 | [DeepET_Topt](https://huggingface.co/datasets/tyang816/DeepET_Topt) |
|
230 |
+
| EC | 功能 | 蛋白质级别 | 多标签分类 | [EC](https://huggingface.co/datasets/tyang816/EC) |
|
231 |
+
| GO_BP | 功能 | 蛋白质级别 | 多标签分类 | [GO_BP](https://huggingface.co/datasets/tyang816/GO_BP) |
|
232 |
+
| GO_CC | 功能 | 蛋白质级别 | 多标签分类 | [GO_CC](https://huggingface.co/datasets/tyang816/GO_CC) |
|
233 |
+
| GO_MF | 功能 | 蛋白质级别 | 多标签分类 | [GO_MF](https://huggingface.co/datasets/tyang816/GO_MF) |
|
234 |
+
| MetalIonBinding | 结合 | 蛋白质级别 | 单标签分类 | [MetalIonBinding](https://huggingface.co/datasets/tyang816/MetalIonBinding) |
|
235 |
+
| Thermostability | 稳定性 | 蛋白质级别 | 回归 | [Thermostability](https://huggingface.co/datasets/tyang816/Thermostability) |
|
236 |
+
| PaCRISPR | CRISPR | 蛋白质级别 | 回归 | [PaCRISPR](https://huggingface.co/datasets/tyang816/PaCRISPR) |
|
237 |
+
| PETA_CHS_Sol | 溶解度 | 蛋白质级别 | 回归 | [PETA_CHS_Sol](https://huggingface.co/datasets/tyang816/PETA_CHS_Sol) |
|
238 |
+
| PETA_LGK_Sol | 溶解度 | 蛋白质级别 | 回归 | [PETA_LGK_Sol](https://huggingface.co/datasets/tyang816/PETA_LGK_Sol) |
|
239 |
+
| PETA_TEM_Sol | 溶解度 | 蛋白质级别 | 回归 | [PETA_TEM_Sol](https://huggingface.co/datasets/tyang816/PETA_TEM_Sol) |
|
240 |
+
| SortingSignal | 信号肽 | 蛋白质级别 | 回归 | [SortingSignal](https://huggingface.co/datasets/tyang816/SortingSignal) |
|
241 |
+
| FLIP_AAV | 突变 | 蛋白质点位 | 回归 | |
|
242 |
+
| FLIP_AAV_one-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_one-vs-rest) |
|
243 |
+
| FLIP_AAV_two-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_two-vs-rest) |
|
244 |
+
| FLIP_AAV_mut-des | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_mut-des](https://huggingface.co/datasets/tyang816/FLIP_AAV_mut-des) |
|
245 |
+
| FLIP_AAV_des-mut | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_des-mut](https://huggingface.co/datasets/tyang816/FLIP_AAV_des-mut) |
|
246 |
+
| FLIP_AAV_seven-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_seven-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_seven-vs-rest) |
|
247 |
+
| FLIP_AAV_low-vs-high | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_AAV_low-vs-high) |
|
248 |
+
| FLIP_AAV_sampled | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_sampled](https://huggingface.co/datasets/tyang816/FLIP_AAV_sampled) |
|
249 |
+
| FLIP_GB1 | 突变 | 蛋白质点位 | 回归 | |
|
250 |
+
| FLIP_GB1_one-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_one-vs-rest) |
|
251 |
+
| FLIP_GB1_two-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_two-vs-rest) |
|
252 |
+
| FLIP_GB1_three-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_three-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_three-vs-rest) |
|
253 |
+
| FLIP_GB1_low-vs-high | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_GB1_low-vs-high) |
|
254 |
+
| FLIP_GB1_sampled | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_sampled](https://huggingface.co/datasets/tyang816/FLIP_GB1_sampled) |
|
255 |
+
| TAPE_Fluorescence | 突变 | 蛋白质点位 | 回归 | [TAPE_Fluorescence](https://huggingface.co/datasets/tyang816/TAPE_Fluorescence) |
|
256 |
+
| TAPE_Stability | 突变 | 蛋白质点位 | 回归 | [TAPE_Stability](https://huggingface.co/datasets/tyang816/TAPE_Stability) |
|
257 |
+
|
258 |
+
|
259 |
+
> 💡 不同数据集的序列结构不同,例如 ``DeepLocBinary_ESMFold`` 和 ``DeepLocBinary_AlphaFold2`` 共享相同的氨基酸序列,因此如果您只想使用 ``aa_seqs``,两者都可以使用!
|
260 |
+
|
261 |
+
</details>
|
262 |
+
|
263 |
+
|
264 |
+
## 📈 支持的评估指标
|
265 |
+
<details>
|
266 |
+
<summary>支持的评估指标</summary>
|
267 |
+
|
268 |
+
| 名称 | Torchmetrics | 问题类型 |
|
269 |
+
|------|--------------|----------|
|
270 |
+
| accuracy | Accuracy | 单标签分类/多标签分类 |
|
271 |
+
| recall | Recall | 单标签分类/多标签分类 |
|
272 |
+
| precision | Precision | 单标签分类/多标签分类 |
|
273 |
+
| f1 | F1Score | 单标签分类/多标签分类 |
|
274 |
+
| mcc | MatthewsCorrCoef | 单标签分类/多标签分类 |
|
275 |
+
| auc | AUROC | 单标签分类/多标签分类 |
|
276 |
+
| f1_max | F1ScoreMax | 多标签分类 |
|
277 |
+
| spearman_corr | SpearmanCorrCoef | 回归 |
|
278 |
+
| mse | MeanSquaredError | 回归 |
|
279 |
+
</details>
|
280 |
+
## ✈️ 环境要求
|
281 |
+
|
282 |
+
### 硬件要求
|
283 |
+
- 推荐:NVIDIA RTX 3090 (24GB) 或更好
|
284 |
+
- 实际要求取决于您选择的蛋白质语言模型
|
285 |
+
|
286 |
+
### 软件要求
|
287 |
+
- [Anaconda3](https://www.anaconda.com/download) 或 [Miniconda3](https://docs.conda.io/projects/miniconda/en/latest/)
|
288 |
+
- Python 3.10
|
289 |
+
|
290 |
+
## 📦 安装指南
|
291 |
+
<details><summary> 在macOS上开始</summary>
|
292 |
+
|
293 |
+
## 为了获得最佳性能和体验,我们推荐使用带有M系列芯片的Mac设备(如 M1、M2、M3 等)
|
294 |
+
|
295 |
+
## 1️⃣ 克隆仓库
|
296 |
+
|
297 |
+
首先,从Github获取VenusFactory的代码:
|
298 |
+
|
299 |
+
```bash
|
300 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
301 |
+
cd VenusFactory
|
302 |
+
```
|
303 |
+
|
304 |
+
## 2️⃣ 创建Conda环境
|
305 |
+
|
306 |
+
确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
|
307 |
+
|
308 |
+
```bash
|
309 |
+
conda create -n venus python=3.10
|
310 |
+
conda activate venus
|
311 |
+
```
|
312 |
+
|
313 |
+
## 3️⃣ 安装PyTorch和PyG依赖项
|
314 |
+
|
315 |
+
```bash
|
316 |
+
# 安装PyTorch
|
317 |
+
pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
|
318 |
+
|
319 |
+
# 安装PyG依赖项
|
320 |
+
pip install torch_scatter torch-sparse torch-cluster torch-geometric -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
|
321 |
+
```
|
322 |
+
|
323 |
+
## 4️⃣ 安装其他依赖项
|
324 |
+
|
325 |
+
使用`requirements_for_macOS.txt`安装剩余依赖项:
|
326 |
+
```bash
|
327 |
+
pip install -r requirements_for_macOS.txt
|
328 |
+
```
|
329 |
+
</details>
|
330 |
+
|
331 |
+
<details><summary> 在Windows或Linux上开始(使用CUDA 12.X)</summary>
|
332 |
+
|
333 |
+
## 我们推荐使用CUDA 12.2
|
334 |
+
|
335 |
+
|
336 |
+
## 1️⃣ 克隆仓库
|
337 |
+
|
338 |
+
首先,从Github获取VenusFactory的代码:
|
339 |
+
|
340 |
+
```bash
|
341 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
342 |
+
cd VenusFactory
|
343 |
+
```
|
344 |
+
|
345 |
+
## 2️⃣ 创建Conda环境
|
346 |
+
|
347 |
+
确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
|
348 |
+
|
349 |
+
|
350 |
+
```bash
|
351 |
+
conda create -n venus python=3.10
|
352 |
+
conda activate venus
|
353 |
+
```
|
354 |
+
|
355 |
+
## 3️⃣ 安装PyTorch和PyG依赖项
|
356 |
+
|
357 |
+
```bash
|
358 |
+
# 安装PyTorch
|
359 |
+
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
|
360 |
+
|
361 |
+
# 安装PyG依赖项
|
362 |
+
pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
|
363 |
+
pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
|
364 |
+
```
|
365 |
+
|
366 |
+
## 4️⃣ 安装其他依赖项
|
367 |
+
|
368 |
+
使用`requirements.txt`安装剩余依赖项:
|
369 |
+
```bash
|
370 |
+
pip install -r requirements.txt
|
371 |
+
```
|
372 |
+
</details>
|
373 |
+
|
374 |
+
<details><summary> 在Windows或Linux上开始(使用CUDA 11.X)</summary>
|
375 |
+
|
376 |
+
## 我们推荐使用CUDA 11.8或更高版本,因为它们支持更高版本的PyTorch,提供更好的体验。
|
377 |
+
|
378 |
+
|
379 |
+
## 1️⃣ 克隆仓库
|
380 |
+
|
381 |
+
首先,从Github获取VenusFactory的代码:
|
382 |
+
|
383 |
+
```bash
|
384 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
385 |
+
cd VenusFactory
|
386 |
+
```
|
387 |
+
|
388 |
+
## 2️⃣ 创建Conda环境
|
389 |
+
|
390 |
+
确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
|
391 |
+
|
392 |
+
|
393 |
+
```bash
|
394 |
+
conda create -n venus python=3.10
|
395 |
+
conda activate venus
|
396 |
+
```
|
397 |
+
|
398 |
+
## 3️⃣ 安装PyTorch和PyG依赖项
|
399 |
+
|
400 |
+
```bash
|
401 |
+
# 安装PyTorch
|
402 |
+
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
|
403 |
+
|
404 |
+
# 安装PyG依赖项
|
405 |
+
pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
|
406 |
+
pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
|
407 |
+
```
|
408 |
+
|
409 |
+
## 4️⃣ 安装其他依赖项
|
410 |
+
|
411 |
+
使用`requirements.txt`安装剩余依赖项:
|
412 |
+
```bash
|
413 |
+
pip install -r requirements.txt
|
414 |
+
```
|
415 |
+
</details>
|
416 |
+
|
417 |
+
<details><summary> 在Windows或Linux上开始(使用CPU)</summary>
|
418 |
+
|
419 |
+
## 1️⃣ 克隆仓库
|
420 |
+
|
421 |
+
首先,从Github获取VenusFactory的代码:
|
422 |
+
|
423 |
+
```bash
|
424 |
+
git clone https://github.com/tyang816/VenusFactory.git
|
425 |
+
cd VenusFactory
|
426 |
+
```
|
427 |
+
|
428 |
+
## 2️⃣ 创建Conda环境
|
429 |
+
|
430 |
+
确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
|
431 |
+
|
432 |
+
|
433 |
+
```bash
|
434 |
+
conda create -n venus python=3.10
|
435 |
+
conda activate venus
|
436 |
+
```
|
437 |
+
|
438 |
+
## 3️⃣ 安装PyTorch和PyG依赖项
|
439 |
+
|
440 |
+
```bash
|
441 |
+
# 安装PyTorch
|
442 |
+
pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu
|
443 |
+
|
444 |
+
# 安装PyG依赖项
|
445 |
+
pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
|
446 |
+
pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
|
447 |
+
```
|
448 |
+
|
449 |
+
## 4️⃣ 安装其他依赖项
|
450 |
+
|
451 |
+
使用`requirements.txt`安装剩余依赖项:
|
452 |
+
```bash
|
453 |
+
pip install -r requirements.txt
|
454 |
+
```
|
455 |
+
</details>
|
456 |
+
|
457 |
+
|
458 |
+
## 🚀 快速开始
|
459 |
+
|
460 |
+
### 启动 Venus Web UI
|
461 |
+
|
462 |
+
使用我们基于 [Gradio](https://github.com/gradio-app/gradio) 的直观图形界面快速开始:
|
463 |
+
|
464 |
+
```bash
|
465 |
+
python ./src/webui.py
|
466 |
+
```
|
467 |
+
|
468 |
+
您可以:
|
469 |
+
- 配置并运行微调实验
|
470 |
+
- 监控训练进度
|
471 |
+
- 评估模型
|
472 |
+
- 可视化结果
|
473 |
+
|
474 |
+
### 使用各个标签页
|
475 |
+
|
476 |
+
我们提供详细的指南帮助您浏览每个标签页。
|
477 |
+
|
478 |
+
<details>
|
479 |
+
<summary>1. 训练标签页:训练您自己的蛋白质语言模型</summary>
|
480 |
+
|
481 |
+

|
482 |
+
|
483 |
+
从下拉菜单中选择蛋白质语言模型。上传您的数据集或选择可用数据集,并选择适合您问题类型的评估指标。
|
484 |
+
|
485 |
+

|
486 |
+
选择训练方法(Freeze、SES-Adapter、LoRA、QLoRA等)并配置训练参数(批量大小、学习率等)。
|
487 |
+
|
488 |
+

|
489 |
+

|
490 |
+

|
491 |
+

|
492 |
+
点击"开始训练"并实时监控进度。
|
493 |
+
|
494 |
+
<p align="center">
|
495 |
+
<img src="img/Train/Metric_Results.png" width="60%" alt="Metric_Results">
|
496 |
+
</p>
|
497 |
+
|
498 |
+
点击"下载CSV"下载测试指标结果。
|
499 |
+
</details>
|
500 |
+
|
501 |
+
<details>
|
502 |
+
<summary>2. 评估标签页:在基准测试中评估您的训练模型</summary>
|
503 |
+
|
504 |
+

|
505 |
+
|
506 |
+
通过指定模型路径加载您的训练模型。选择训练时使用的相同蛋白质语言模型和模型配置。选择测试数据集并配置批量大小。选择适合您问题类型的评估指标。最后,点击"开始评估"查看性能指标。
|
507 |
+
</details>
|
508 |
+
|
509 |
+
<details>
|
510 |
+
<summary>3. 预测标签页:使用您的训练模型进行样本预测</summary>
|
511 |
+
|
512 |
+

|
513 |
+
|
514 |
+
通过指定模型路径加载您的训练模型。选择训练时使用的相同蛋白质语言模型和模型配置。
|
515 |
+
|
516 |
+
单序列预测:在文本框中输入蛋白质序列。
|
517 |
+
|
518 |
+
批量预测:上传包含序列的CSV文件。
|
519 |
+
|
520 |
+

|
521 |
+
|
522 |
+
点击"预测"生成并查看结果。
|
523 |
+
</details>
|
524 |
+
|
525 |
+
<details>
|
526 |
+
<summary>4. 下载标签页:高效收集来自不同来源的数据</summary>
|
527 |
+
|
528 |
+
- **AlphaFold2结构**:输入UniProt ID下载蛋白质结构
|
529 |
+
- **UniProt**:使用关键词或ID搜索蛋白质信息
|
530 |
+
- **InterPro**:获取蛋白质家族和结构域信息
|
531 |
+
- **RCSB PDB**:下载实验蛋白质结构
|
532 |
+
</details>
|
533 |
+
|
534 |
+
<details>
|
535 |
+
<summary>5. 手册标签页:详细文档和指南</summary>
|
536 |
+
|
537 |
+
选择语言(英文/中文)。
|
538 |
+
|
539 |
+
使用目录导航文档并找到分步指南。
|
540 |
+
</details>
|
541 |
+
|
542 |
+
## 🧬 命令行使用
|
543 |
+
|
544 |
+
对于偏好命令行界面的用户,我们提供全面的脚本解决方案。
|
545 |
+
|
546 |
+
<details>
|
547 |
+
<summary>训练方法:适应不同需求的各种微调方法</summary>
|
548 |
+
|
549 |
+
### 全模型微调
|
550 |
+
```bash
|
551 |
+
# 冻结微调:训练特定层同时冻结其他层
|
552 |
+
bash ./script/train/train_plm_vanilla.sh
|
553 |
+
```
|
554 |
+
|
555 |
+
### 参数高效微调 (PEFT)
|
556 |
+
```bash
|
557 |
+
# SES-Adapter:选择性和高效的适配器微调
|
558 |
+
bash ./script/train/train_plm_ses-adapter.sh
|
559 |
+
|
560 |
+
# AdaLoRA:自适应低秩适配
|
561 |
+
bash ./script/train/train_plm_adalora.sh
|
562 |
+
|
563 |
+
# QLoRA:量化低秩适配
|
564 |
+
bash ./script/train/train_plm_qlora.sh
|
565 |
+
|
566 |
+
# LoRA:低秩适配
|
567 |
+
bash ./script/train/train_plm_lora.sh
|
568 |
+
|
569 |
+
# DoRA:双低秩适配
|
570 |
+
bash ./script/train/train_plm_dora.sh
|
571 |
+
|
572 |
+
# IA3:通过抑制和放大内部激活的注入适配器
|
573 |
+
bash ./script/train/train_plm_ia3.sh
|
574 |
+
```
|
575 |
+
|
576 |
+
#### 训练方法比较
|
577 |
+
| 方法 | 内存使用 | 训练速度 | 性能 |
|
578 |
+
|------|----------|----------|------|
|
579 |
+
| Freeze | 低 | 快 | 良好 |
|
580 |
+
| SES-Adapter | 中等 | 中等 | 更好 |
|
581 |
+
| AdaLoRA | 低 | 中等 | 更好 |
|
582 |
+
| QLoRA | 非常低 | 较慢 | 良好 |
|
583 |
+
| LoRA | 低 | 快 | 良好 |
|
584 |
+
| DoRA | 低 | 中等 | 更好 |
|
585 |
+
| IA3 | 非常低 | 快 | 良好 |
|
586 |
+
|
587 |
+
</details>
|
588 |
+
|
589 |
+
<details>
|
590 |
+
<summary>模型评估:全面的评估工具</summary>
|
591 |
+
|
592 |
+
### 基础评估
|
593 |
+
```bash
|
594 |
+
# 在测试集上评估模型性能
|
595 |
+
bash ./script/eval/eval.sh
|
596 |
+
```
|
597 |
+
|
598 |
+
### 可用指标
|
599 |
+
- 分类:准确率、精确率、召回率、F1、MCC、AUC
|
600 |
+
- 回归:MSE、Spearman相关系数
|
601 |
+
- 多标签:F1-max
|
602 |
+
|
603 |
+
### 可视化工具
|
604 |
+
- 训练曲线
|
605 |
+
- 混淆矩阵
|
606 |
+
- ROC曲线
|
607 |
+
- 性能比较图
|
608 |
+
|
609 |
+
</details>
|
610 |
+
|
611 |
+
<details>
|
612 |
+
<summary>结构序列工具:处理蛋白质结构信息</summary>
|
613 |
+
|
614 |
+
### ESM结构序列
|
615 |
+
```bash
|
616 |
+
# 使用ESM-3生成结构序列
|
617 |
+
bash ./script/get_get_structure_seq/get_esm3_structure_seq.sh
|
618 |
+
```
|
619 |
+
|
620 |
+
### 二级结构
|
621 |
+
```bash
|
622 |
+
# 预测蛋白质二级结构
|
623 |
+
bash ./script/get_get_structure_seq/get_secondary_structure_seq.sh
|
624 |
+
```
|
625 |
+
|
626 |
+
特点:
|
627 |
+
- 支持多种序列格式
|
628 |
+
- 批处理能力
|
629 |
+
- 与流行的结构预测工具集成
|
630 |
+
|
631 |
+
</details>
|
632 |
+
|
633 |
+
<details>
|
634 |
+
<summary>数据收集工具:多源蛋白质数据获取</summary>
|
635 |
+
|
636 |
+
### 格式转换
|
637 |
+
```bash
|
638 |
+
# 将CIF格式转换为PDB
|
639 |
+
bash ./crawler/convert/maxit.sh
|
640 |
+
```
|
641 |
+
|
642 |
+
### 元数据收集
|
643 |
+
```bash
|
644 |
+
# 从RCSB PDB下载元数据
|
645 |
+
bash ./crawler/metadata/download_rcsb.sh
|
646 |
+
```
|
647 |
+
|
648 |
+
### 序列数据
|
649 |
+
```bash
|
650 |
+
# 从UniProt下载蛋白质序列
|
651 |
+
bash ./crawler/sequence/download_uniprot_seq.sh
|
652 |
+
```
|
653 |
+
|
654 |
+
### 结构数据
|
655 |
+
```bash
|
656 |
+
# 从AlphaFold2数据库下载
|
657 |
+
bash ./crawler/structure/download_alphafold.sh
|
658 |
+
|
659 |
+
# 从RCSB PDB下载
|
660 |
+
bash ./crawler/structure/download_rcsb.sh
|
661 |
+
```
|
662 |
+
|
663 |
+
特点:
|
664 |
+
- 自动批量下载
|
665 |
+
- 断点续传
|
666 |
+
- 数据完整性验证
|
667 |
+
- 多源支持
|
668 |
+
- 自定义搜索条件
|
669 |
+
|
670 |
+
#### 支持的数据库
|
671 |
+
| 数据库 | 数据类型 | 访问方式 | 速率限制 |
|
672 |
+
|--------|----------|----------|----------|
|
673 |
+
| AlphaFold2 | 结构 | REST API | 是 |
|
674 |
+
| RCSB PDB | 结构 | FTP/HTTP | 否 |
|
675 |
+
| UniProt | 序列 | REST API | 是 |
|
676 |
+
| InterPro | 结构域 | REST API | 是 |
|
677 |
+
|
678 |
+
</details>
|
679 |
+
|
680 |
+
<details>
|
681 |
+
<summary>使用示例:常见场景和解决方案</summary>
|
682 |
+
|
683 |
+
### 训练示例
|
684 |
+
```bash
|
685 |
+
# 使用ESM2训练蛋白质溶解度预测器
|
686 |
+
bash ./script/train/train_plm_lora.sh \
|
687 |
+
--model "facebook/esm2_t33_650M_UR50D" \
|
688 |
+
--dataset "DeepSol" \
|
689 |
+
--batch_size 32 \
|
690 |
+
--learning_rate 1e-4
|
691 |
+
```
|
692 |
+
|
693 |
+
### 评估示例
|
694 |
+
```bash
|
695 |
+
# 评估训练好的模型
|
696 |
+
bash ./script/eval/eval.sh \
|
697 |
+
--model_path "path/to/your/model" \
|
698 |
+
--test_dataset "DeepSol_test"
|
699 |
+
```
|
700 |
+
|
701 |
+
### 数据收集示例
|
702 |
+
```bash
|
703 |
+
# 下载UniProt ID列表对应的结构
|
704 |
+
bash ./crawler/structure/download_alphafold.sh \
|
705 |
+
--input uniprot_ids.txt \
|
706 |
+
--output ./structures
|
707 |
+
```
|
708 |
+
|
709 |
+
</details>
|
710 |
+
|
711 |
+
> 💡 所有脚本都支持额外的命令行参数进行自定义。使用任何脚本的 `--help` 选项查看可用选项。
|
712 |
+
|
713 |
+
## 🙌 引用
|
714 |
+
|
715 |
+
如果您使用了我们的代码或数据,请引用我们的工作:
|
716 |
+
|
717 |
+
```bibtex
|
718 |
+
@article{tan2025venusfactory,
|
719 |
+
title={VenusFactory: A Unified Platform for Protein Engineering Data Retrieval and Language Model Fine-Tuning},
|
720 |
+
author={Tan, Yang and Liu, Chen and Gao, Jingyuan and Wu, Banghao and Li, Mingchen and Wang, Ruilin and Zhang, Lingrong and Yu, Huiqun and Fan, Guisheng and Hong, Liang and Zhou, Bingxin},
|
721 |
+
journal={arXiv preprint arXiv:2503.15438},
|
722 |
+
year={2025}
|
723 |
+
}
|
724 |
+
```
|
725 |
+
|
726 |
+
## 🎊 致谢
|
727 |
+
|
728 |
+
感谢 [Liang's Lab](https://ins.sjtu.edu.cn/people/lhong/index.html) 的支持。
|
Scripts_notebook.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
WebUI_demo.md
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Quick Demo Guide
|
2 |
+
|
3 |
+
This document provides a comprehensive guide to help you quickly understand the main features of VenusFactory and perform fine-tuning, evaluation, and prediction on a demo dataset for protein solubility prediction.
|
4 |
+
|
5 |
+
## 1. Environment Preparation
|
6 |
+
|
7 |
+
Before starting, please ensure that you have successfully installed **VenusFactory** and correctly configured the corresponding environment and Python dependencies. If not yet installed, please refer to the **✈️ Requirements** section in [README.md](README.md) for installation instructions.
|
8 |
+
|
9 |
+
## 2. Launch Web Interface
|
10 |
+
|
11 |
+
Enter the following command in the command line to launch the Web UI:
|
12 |
+
|
13 |
+
```bash
|
14 |
+
python src/webui.py
|
15 |
+
```
|
16 |
+
|
17 |
+
## 3. Training (Training Tab)
|
18 |
+
|
19 |
+
### 3.1 Select Pre-trained Model
|
20 |
+
|
21 |
+
Choose a suitable pre-trained model from the Protein Language Model dropdown. It is recommended to start with ESM2-8M, which has lower computational cost and is suitable for beginners.
|
22 |
+
|
23 |
+
### 3.2 Select Dataset
|
24 |
+
|
25 |
+
In the Dataset Configuration section, select the Demo_Solubility dataset (default option). Click the Preview Dataset button to preview the dataset content.
|
26 |
+
|
27 |
+
### 3.3 Set Task Parameters
|
28 |
+
|
29 |
+
- Problem Type, Number of Labels, and Metrics options will be automatically filled when selecting a Pre-defined Dataset.
|
30 |
+
|
31 |
+
- For Batch Processing Mode, it is recommended to select Batch Token Mode to avoid uneven batch processing due to high variance in protein sequence lengths.
|
32 |
+
|
33 |
+
- Batch Token is recommended to be set to 4000. If you encounter CUDA memory errors, you can reduce this value accordingly.
|
34 |
+
|
35 |
+
### 3.4 Choose Training Method
|
36 |
+
|
37 |
+
In the Training Parameters section:
|
38 |
+
|
39 |
+
- Training Method is a key selection. This Demo dataset does not currently support the SES-Adapter method (due to lack of structural sequence information).
|
40 |
+
|
41 |
+
- You can choose the Freeze method to only fine-tune the classification head, or use the LoRA method for efficient parameter fine-tuning.
|
42 |
+
|
43 |
+
### 3.5 Start Training
|
44 |
+
|
45 |
+
- Click Preview Command to preview the command line script.
|
46 |
+
|
47 |
+
- Click Start to begin training. The Web interface will display model statistics and real-time training monitoring.
|
48 |
+
|
49 |
+
- After training is complete, the interface will show the model's Metrics on the test set to evaluate model performance.
|
50 |
+
|
51 |
+
## 4. Evaluation (Evaluation Tab)
|
52 |
+
|
53 |
+
### 4.1 Select Model Path
|
54 |
+
|
55 |
+
In the **Model Path** option, enter the path of the trained model (under the `ckpt` root directory). Ensure that the selected **PLM** and **method** are consistent with those used during training.
|
56 |
+
|
57 |
+
### 4.2 Evaluation Dataset Loading Rules
|
58 |
+
|
59 |
+
- The evaluation system will automatically load the test set of the corresponding dataset.
|
60 |
+
- If the test set cannot be found, data will be loaded in the order of **validation set → training set**.
|
61 |
+
- For custom datasets uploaded to Hugging Face:
|
62 |
+
- **If only a single CSV file is uploaded**, the evaluation system will automatically load that file, regardless of naming.
|
63 |
+
- **If training, validation, and test sets are uploaded**, please ensure accurate file naming.
|
64 |
+
|
65 |
+
### 4.3 Start Evaluation
|
66 |
+
|
67 |
+
Click **Start Evaluation** to begin the evaluation.
|
68 |
+
|
69 |
+
> **Example Model**
|
70 |
+
> This project provides a model **demo_provided.pt** that has already been trained on the **Demo_Solubility** dataset using the **Freeze** method, which can be used directly for evaluation.
|
71 |
+
|
72 |
+
## 5. Prediction (Prediction Tab)
|
73 |
+
|
74 |
+
### 5.1 Single Sequence Prediction
|
75 |
+
|
76 |
+
Enter a single amino acid sequence to directly predict its solubility.
|
77 |
+
|
78 |
+
### 5.2 Batch Prediction
|
79 |
+
|
80 |
+
- By uploading a CSV file, you can predict the solubility of proteins in batch and download the results (in CSV format).
|
81 |
+
|
82 |
+
## 6. Download (Download Tab)
|
83 |
+
|
84 |
+
For detailed instructions and examples regarding the **Download Tab**, please refer to the **Download** section in the **Manual Tab**.
|
WebUI_demo_CN.md
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 快速Demo指南
|
2 |
+
|
3 |
+
本文档提供了一个全面的指南,帮助您快速了解VenusFactory的主要功能,并在一个蛋白质可溶性预测的Demo数据集上进行微调训练、评估和预测。
|
4 |
+
|
5 |
+
## 1. 环境准备
|
6 |
+
|
7 |
+
在开始之前,请确保您已成功安装 **VenusFactory** 并正确配置了相应的环境和 Python 依赖包。如果尚未安装,请参考 [README_CN.md](README_CN.md) 中的 **✈️ Requirements** 章节进行安装。
|
8 |
+
|
9 |
+
## 2. 启动 Web 界面
|
10 |
+
|
11 |
+
在命令行中输入以下命令,启动 Web UI:
|
12 |
+
```bash
|
13 |
+
python src/webui.py
|
14 |
+
```
|
15 |
+
|
16 |
+
## 3. 训练(Training Tab)
|
17 |
+
|
18 |
+
### 3.1 选择预训练模型
|
19 |
+
|
20 |
+
在 Protein Language Model 选项中选择合适的预训练模型。建议从 ESM2-8M 开始,该模型计算成本较低,便于快速上手。
|
21 |
+
|
22 |
+
### 3.2 选择数据集
|
23 |
+
|
24 |
+
在 Dataset Configuration 选项中,选择 Demo_Solubility 数据集(默认选项)。点击 Preview Dataset 按钮可预览数据集内容。
|
25 |
+
|
26 |
+
### 3.3 设定任务参数
|
27 |
+
|
28 |
+
- Problem Type、Number of Labels 和 Metrics 选项会在选择 Pre-defined Dataset 时自动填充。
|
29 |
+
|
30 |
+
- Batch Processing Mode 建议选择 Batch Token Mode,以避免蛋白质序列长度方差过大导致批处理不均。
|
31 |
+
|
32 |
+
- Batch Token 推荐设为 4000,若出现 CUDA 内存不足错误,可适当减小该值。
|
33 |
+
|
34 |
+
### 3.4 选择训练方法
|
35 |
+
|
36 |
+
在 Training Parameters 选项中:
|
37 |
+
|
38 |
+
- Training Method 为关键选择项。本 Demo 数据集暂不支持 SES-Adapter 方法(因缺乏结构序列信息)。
|
39 |
+
|
40 |
+
- 可选择 Freeze 方法,仅微调分类头,或采用 LoRA 方法进行高效参数微调。
|
41 |
+
|
42 |
+
### 3.5 开始训练
|
43 |
+
|
44 |
+
- 点击 Preview Command 预览命令行脚本。
|
45 |
+
|
46 |
+
- 点击 Start 启动训练,Web 界面会显示模型的统计信息和实时训练监控。
|
47 |
+
|
48 |
+
- 训练完成后,界面会展示模型在测试集上的 Metrics,用于评估模型效果。
|
49 |
+
|
50 |
+
## 4. 评估(Evaluation Tab)
|
51 |
+
|
52 |
+
### 4.1 选择模型路径
|
53 |
+
|
54 |
+
在 **Model Path** 选项中,输入训练完成的模型路径(`ckpt` 根目录下)。确保选择的 **PLM** 和 **method** 与训练时一致。
|
55 |
+
|
56 |
+
### 4.2 评估数据集加载规则
|
57 |
+
|
58 |
+
- 评估系统会自动加载相应数据集的测试集。
|
59 |
+
- 若找不到测试集,则按照 **验证集 → 训练集** 的顺序加载数据。
|
60 |
+
- 上传到 Hugging Face 的自定义数据集:
|
61 |
+
- **若仅上传单个 CSV 文件**,评估系统会自动加载该文件,不受命名影响。
|
62 |
+
- **若上传训练集、验证集和测试集**,请确保文件命名准确。
|
63 |
+
|
64 |
+
### 4.3 启动评估
|
65 |
+
|
66 |
+
点击 **Start Evaluation** 进行评估。
|
67 |
+
|
68 |
+
> **示例模型**
|
69 |
+
> 本项目提供了一个已经在 **Demo_Solubility** 数据集上使用 **Freeze** 方法训练的模型 **demo_provided.pt**,可直接用于评估。
|
70 |
+
|
71 |
+
## 5. 预测(Prediction Tab)
|
72 |
+
|
73 |
+
### 5.1 单序列预测(Sequence Prediction)
|
74 |
+
|
75 |
+
输入单个氨基酸序列,即可直接进行可溶性预测。
|
76 |
+
|
77 |
+
### 5.2 批量预测(Batch Prediction)
|
78 |
+
|
79 |
+
- 通过上传 CSV 文件,可批量预测蛋白质的可溶性,并下载结果(CSV 格式)。
|
80 |
+
|
81 |
+
## 6. 下载(Download Tab)
|
82 |
+
|
83 |
+
有关 **Download Tab** 的详细使用说明和示例,请参考 **Manual Tab** 中的 **Download** 章节。
|
app.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import time
|
3 |
+
import gradio as gr
|
4 |
+
from web.utils.monitor import TrainingMonitor
|
5 |
+
from web.train_tab import create_train_tab
|
6 |
+
from web.eval_tab import create_eval_tab
|
7 |
+
from web.download_tab import create_download_tab
|
8 |
+
from web.predict_tab import create_predict_tab
|
9 |
+
from web.manual_tab import create_manual_tab
|
10 |
+
|
11 |
+
def load_constant():
    """Load constant values from ``src/constant.json``.

    Returns:
        dict: the parsed JSON contents on success; on any failure
        (missing file, malformed JSON, ...) a dict with a single
        ``"error"`` key describing the problem, so callers can still
        treat the result as a dict.
    """
    try:
        # Context manager ensures the file handle is closed even if
        # JSON parsing raises (the original `json.load(open(...))`
        # leaked the handle on error).
        with open("src/constant.json") as f:
            return json.load(f)
    except Exception as e:
        return {"error": f"Failed to load constant.json: {str(e)}"}
|
17 |
+
|
18 |
+
def create_ui():
    """Assemble the VenusFactory Gradio interface with all feature tabs.

    Returns:
        gr.Blocks: the fully constructed demo, ready for ``launch()``.
    """
    monitor = TrainingMonitor()
    constant = load_constant()

    def update_output():
        """Poll the training monitor and refresh the training-tab widgets.

        Returns a (message, loss_plot, metrics_plot) triple; the plots are
        None whenever training is not actively running.
        """
        try:
            if monitor.is_training:
                messages = monitor.get_messages()
                loss_plot = monitor.get_loss_plot()
                metrics_plot = monitor.get_metrics_plot()
                return messages, loss_plot, metrics_plot
            else:
                if monitor.error_message:
                    return f"Training stopped with error:\n{monitor.error_message}", None, None
                return "Click Start to begin training!", None, None
        except Exception as e:
            return f"Error in UI update: {str(e)}", None, None

    with gr.Blocks() as demo:
        gr.Markdown("# VenusFactory")

        # Create tabs
        with gr.Tabs():
            try:
                train_components = {"output_text": None, "loss_plot": None, "metrics_plot": None}
                train_tab = create_train_tab(constant)
                # BUG FIX: the original guarded these assignments on
                # `train_components`, which was just initialized to all-None,
                # so the condition was always False and the live-monitoring
                # outputs were never captured (and demo.load below never
                # fired). Pull the components from the train tab's return
                # value instead.
                if isinstance(train_tab, dict):
                    for key in train_components:
                        train_components[key] = train_tab.get(key)
                eval_components = create_eval_tab(constant)
                predict_components = create_predict_tab(constant)
                download_components = create_download_tab(constant)
                manual_components = create_manual_tab(constant)
            except Exception as e:
                gr.Markdown(f"Error creating UI components: {str(e)}")
                train_components = {"output_text": None, "loss_plot": None, "metrics_plot": None}

        # Wire the periodic monitor refresh only when the train tab exposed
        # all three output components.
        if train_components["output_text"] is not None and train_components["loss_plot"] is not None and train_components["metrics_plot"] is not None:
            demo.load(
                fn=update_output,
                inputs=None,
                outputs=[
                    train_components["output_text"],
                    train_components["loss_plot"],
                    train_components["metrics_plot"]
                ]
            )

    return demo
|
68 |
+
|
69 |
+
if __name__ == "__main__":
    # Script entry point: build the interface and serve it publicly.
    try:
        app = create_ui()
        # Bind on all interfaces, create a public share link, and allow
        # Gradio to serve static assets from the "img" directory.
        app.launch(server_name="0.0.0.0", share=True, allowed_paths=["img"])
    except Exception as err:
        print(f"Failed to launch UI: {str(err)}")
|
ckpt/demo/demo.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"hidden_size": 1280, "num_attention_head": 8, "attention_probs_dropout": 0.1, "plm_model": "facebook/esm2_t33_650M_UR50D", "pooling_method": "mean", "pooling_dropout": 0.1, "dataset": "tyang816/FLIP_AAV_des-mut", "dataset_config": "data/FLIP_AAV/FLIP_AAV_des-mut_HF.json", "normalize": "min_max", "num_labels": 1, "problem_type": "regression", "pdb_type": null, "train_file": null, "valid_file": null, "test_file": null, "metrics": ["spearman_corr"], "seed": 3407, "learning_rate": 0.0005, "scheduler": null, "warmup_steps": 0, "num_workers": 4, "batch_size": null, "batch_token": 4000, "num_epochs": 5, "max_seq_len": -1, "gradient_accumulation_steps": 1, "max_grad_norm": -1.0, "patience": 10, "monitor": "spearman_corr", "monitor_strategy": "max", "training_method": "freeze", "lora_r": 8, "lora_alpha": 32, "lora_dropout": 0.1, "feedforward_modules": "w0", "lora_target_modules": ["query", "key", "value"], "structure_seq": [], "output_model_name": "demo.pt", "output_root": "ckpt", "output_dir": "ckpt\\demo", "wandb": false, "wandb_entity": null, "wandb_project": "VenusFactory", "wandb_run_name": null}
|
ckpt/demo/demo.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:85a61422d6f469c4dc94823bbdfcba090377c4235bca1f9e5768d1c89f853113
|
3 |
+
size 6576362
|
ckpt/demo/demo_provided.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"hidden_size": 320, "num_attention_head": 8, "attention_probs_dropout": 0.1, "plm_model": "facebook/esm2_t6_8M_UR50D", "pooling_method": "mean", "pooling_dropout": 0.1, "dataset": "tyang816/Demo_Solubility", "dataset_config": "data/Demo/Demo_Solubility_HF.json", "normalize": null, "num_labels": 2, "problem_type": "single_label_classification", "pdb_type": null, "train_file": null, "valid_file": null, "test_file": null, "metrics": ["accuracy", "mcc", "f1", "precision", "recall", "auroc"], "seed": 3407, "learning_rate": 0.0005, "scheduler": null, "warmup_steps": 0, "num_workers": 4, "batch_size": null, "batch_token": 4000, "num_epochs": 20, "max_seq_len": -1, "gradient_accumulation_steps": 1, "max_grad_norm": -1.0, "patience": 10, "monitor": "accuracy", "monitor_strategy": "max", "training_method": "freeze", "lora_r": 8, "lora_alpha": 32, "lora_dropout": 0.1, "feedforward_modules": "w0", "lora_target_modules": ["query", "key", "value"], "structure_seq": [], "output_model_name": "demo_provided.pt", "output_root": "ckpt", "output_dir": "ckpt/demo", "wandb": false, "wandb_entity": null, "wandb_project": "VenusFactory", "wandb_run_name": null}
|
ckpt/demo/demo_provided.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b46ee577312579dee0906b3cdbd23d30d40bbb8a8ce873cba85abbf694c125e
|
3 |
+
size 418692
|
data/DeepET_Topt/DeepET_Topt_AlphaFold2_HF.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepET_Topt_AlphaFold2",
|
3 |
+
"pdb_type": "AlphaFold2",
|
4 |
+
"num_labels": 1,
|
5 |
+
"problem_type": "regression",
|
6 |
+
"metrics": "mse,spearman_corr",
|
7 |
+
"monitor": "mse",
|
8 |
+
"monitor_strategy": "min",
|
9 |
+
"normalize": "standard"
|
10 |
+
}
|
data/DeepET_Topt/DeepET_Topt_ESMFold_HF.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepET_Topt_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 1,
|
5 |
+
"problem_type": "regression",
|
6 |
+
"metrics": "mse,spearman_corr",
|
7 |
+
"monitor": "mse",
|
8 |
+
"monitor_strategy": "min",
|
9 |
+
"normalize": "standard"
|
10 |
+
}
|
data/DeepET_Topt/DeepET_Topt_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepET_Topt",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "mse,spearman_corr",
|
6 |
+
"monitor": "mse",
|
7 |
+
"monitor_strategy": "min",
|
8 |
+
"normalize": "standard"
|
9 |
+
}
|
data/DeepLoc2Multi/DeepLoc2Multi_AlphaFold2_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLoc2Multi_AlphaFold2",
|
3 |
+
"pdb_type": "AlphaFold2",
|
4 |
+
"num_labels": 10,
|
5 |
+
"problem_type": "multi_label_classification",
|
6 |
+
"metrics": "f1_max",
|
7 |
+
"monitor": "f1_max",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/DeepLoc2Multi/DeepLoc2Multi_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLoc2Multi",
|
3 |
+
"num_labels": 10,
|
4 |
+
"problem_type": "multi_label_classification",
|
5 |
+
"metrics": "f1_max",
|
6 |
+
"monitor": "f1_max",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/DeepLocBinary/DeepLocBinary_AlphaFold2_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLocBinary_AlphaFold2",
|
3 |
+
"pdb_type": "AlphaFold2",
|
4 |
+
"num_labels": 2,
|
5 |
+
"problem_type": "single_label_classification",
|
6 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
7 |
+
"monitor": "accuracy",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/DeepLocBinary/DeepLocBinary_ESMFold_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLocBinary_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 2,
|
5 |
+
"problem_type": "single_label_classification",
|
6 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
7 |
+
"monitor": "accuracy",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/DeepLocBinary/DeepLocBinary_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLocBinary",
|
3 |
+
"num_labels": 2,
|
4 |
+
"problem_type": "single_label_classification",
|
5 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
6 |
+
"monitor": "accuracy",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/DeepLocMulti/DeepLocMulti_AlphaFold2_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLocMulti_AlphaFold2",
|
3 |
+
"pdb_type": "AlphaFold2",
|
4 |
+
"num_labels": 10,
|
5 |
+
"problem_type": "single_label_classification",
|
6 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
7 |
+
"monitor": "accuracy",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/DeepLocMulti/DeepLocMulti_ESMFold_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLocMulti_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 10,
|
5 |
+
"problem_type": "single_label_classification",
|
6 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
7 |
+
"monitor": "accuracy",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/DeepLocMulti/DeepLocMulti_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepLocMulti",
|
3 |
+
"num_labels": 10,
|
4 |
+
"problem_type": "single_label_classification",
|
5 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
6 |
+
"monitor": "accuracy",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/DeepSol/DeepSol_ESMFold_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepSol_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 2,
|
5 |
+
"problem_type": "single_label_classification",
|
6 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
7 |
+
"monitor": "accuracy",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/DeepSol/DeepSol_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepSol",
|
3 |
+
"num_labels": 2,
|
4 |
+
"problem_type": "single_label_classification",
|
5 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
6 |
+
"monitor": "accuracy",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/DeepSoluE/DeepSoluE_ESMFold_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepSoluE_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 2,
|
5 |
+
"problem_type": "single_label_classification",
|
6 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
7 |
+
"monitor": "accuracy",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/DeepSoluE/DeepSoluE_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/DeepSoluE",
|
3 |
+
"num_labels": 2,
|
4 |
+
"problem_type": "single_label_classification",
|
5 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
6 |
+
"monitor": "accuracy",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/Demo/Demo_Solubility_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/Demo_Solubility",
|
3 |
+
"num_labels": 2,
|
4 |
+
"problem_type": "single_label_classification",
|
5 |
+
"metrics": "accuracy,mcc,f1,precision,recall,auroc",
|
6 |
+
"monitor": "accuracy",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/EC/EC_AlphaFold2_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/EC_AlphaFold2",
|
3 |
+
"pdb_type": "AlphaFold2",
|
4 |
+
"num_labels": 585,
|
5 |
+
"problem_type": "multi_label_classification",
|
6 |
+
"metrics": "f1_max",
|
7 |
+
"monitor": "f1_max",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/EC/EC_ESMFold_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/EC_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 585,
|
5 |
+
"problem_type": "multi_label_classification",
|
6 |
+
"metrics": "f1_max",
|
7 |
+
"monitor": "f1_max",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/EC/EC_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/EC",
|
3 |
+
"num_labels": 585,
|
4 |
+
"problem_type": "multi_label_classification",
|
5 |
+
"metrics": "f1_max",
|
6 |
+
"monitor": "f1_max",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/FLIP_AAV/FLIP_AAV_des-mut_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_AAV_des-mut",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_AAV/FLIP_AAV_low-vs-high_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_AAV_low-vs-high",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_AAV/FLIP_AAV_mut-des_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_AAV_mut-des",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_AAV/FLIP_AAV_one-vs-rest_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_AAV_one-vs-rest",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_AAV/FLIP_AAV_sampled_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_AAV_sampled",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_AAV/FLIP_AAV_seven-vs-rest_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_AAV_seven-vs-rest",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_AAV/FLIP_AAV_two-vs-rest_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_AAV_two-vs-rest",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_GB1/FLIP_GB1_low-vs-high_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_GB1_low-vs-high",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_GB1/FLIP_GB1_one-vs-rest_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_GB1_one-vs-rest",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_GB1/FLIP_GB1_sampled_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_GB1_sampled",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_GB1/FLIP_GB1_three-vs-rest_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_GB1_three-vs-rest",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/FLIP_GB1/FLIP_GB1_two-vs-rest_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/FLIP_GB1_two-vs-rest",
|
3 |
+
"num_labels": 1,
|
4 |
+
"problem_type": "regression",
|
5 |
+
"metrics": "spearman_corr",
|
6 |
+
"monitor": "spearman_corr",
|
7 |
+
"monitor_strategy": "max",
|
8 |
+
"normalize": "min_max"
|
9 |
+
}
|
data/GO_BP/GO_BP_AlphaFold2_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/GO_BP_AlphaFold2",
|
3 |
+
"pdb_type": "AlphaFold2",
|
4 |
+
"num_labels": 1943,
|
5 |
+
"problem_type": "multi_label_classification",
|
6 |
+
"metrics": "f1_max",
|
7 |
+
"monitor": "f1_max",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/GO_BP/GO_BP_ESMFold_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/GO_BP_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 1943,
|
5 |
+
"problem_type": "multi_label_classification",
|
6 |
+
"metrics": "f1_max",
|
7 |
+
"monitor": "f1_max",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/GO_BP/GO_BP_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/GO_BP",
|
3 |
+
"num_labels": 1943,
|
4 |
+
"problem_type": "multi_label_classification",
|
5 |
+
"metrics": "f1_max",
|
6 |
+
"monitor": "f1_max",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|
data/GO_CC/GO_CC_AlphaFold2_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/GO_CC_AlphaFold2",
|
3 |
+
"pdb_type": "AlphaFold2",
|
4 |
+
"num_labels": 320,
|
5 |
+
"problem_type": "multi_label_classification",
|
6 |
+
"metrics": "f1_max",
|
7 |
+
"monitor": "f1_max",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/GO_CC/GO_CC_ESMFold_HF.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/GO_CC_ESMFold",
|
3 |
+
"pdb_type": "ESMFold",
|
4 |
+
"num_labels": 320,
|
5 |
+
"problem_type": "multi_label_classification",
|
6 |
+
"metrics": "f1_max",
|
7 |
+
"monitor": "f1_max",
|
8 |
+
"monitor_strategy": "max"
|
9 |
+
}
|
data/GO_CC/GO_CC_HF.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"dataset": "tyang816/GO_CC",
|
3 |
+
"num_labels": 320,
|
4 |
+
"problem_type": "multi_label_classification",
|
5 |
+
"metrics": "f1_max",
|
6 |
+
"monitor": "f1_max",
|
7 |
+
"monitor_strategy": "max"
|
8 |
+
}
|