2dogey commited on
Commit
8918ac7
·
verified ·
1 Parent(s): bdf39d1

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. .gitignore +177 -0
  3. LICENSE +201 -0
  4. README.md +742 -12
  5. README_CN.md +728 -0
  6. Scripts_notebook.ipynb +0 -0
  7. WebUI_demo.md +84 -0
  8. WebUI_demo_CN.md +83 -0
  9. app.py +74 -0
  10. ckpt/demo/demo.json +1 -0
  11. ckpt/demo/demo.pt +3 -0
  12. ckpt/demo/demo_provided.json +1 -0
  13. ckpt/demo/demo_provided.pt +3 -0
  14. data/DeepET_Topt/DeepET_Topt_AlphaFold2_HF.json +10 -0
  15. data/DeepET_Topt/DeepET_Topt_ESMFold_HF.json +10 -0
  16. data/DeepET_Topt/DeepET_Topt_HF.json +9 -0
  17. data/DeepLoc2Multi/DeepLoc2Multi_AlphaFold2_HF.json +9 -0
  18. data/DeepLoc2Multi/DeepLoc2Multi_HF.json +8 -0
  19. data/DeepLocBinary/DeepLocBinary_AlphaFold2_HF.json +9 -0
  20. data/DeepLocBinary/DeepLocBinary_ESMFold_HF.json +9 -0
  21. data/DeepLocBinary/DeepLocBinary_HF.json +8 -0
  22. data/DeepLocMulti/DeepLocMulti_AlphaFold2_HF.json +9 -0
  23. data/DeepLocMulti/DeepLocMulti_ESMFold_HF.json +9 -0
  24. data/DeepLocMulti/DeepLocMulti_HF.json +8 -0
  25. data/DeepSol/DeepSol_ESMFold_HF.json +9 -0
  26. data/DeepSol/DeepSol_HF.json +8 -0
  27. data/DeepSoluE/DeepSoluE_ESMFold_HF.json +9 -0
  28. data/DeepSoluE/DeepSoluE_HF.json +8 -0
  29. data/Demo/Demo_Solubility_HF.json +8 -0
  30. data/EC/EC_AlphaFold2_HF.json +9 -0
  31. data/EC/EC_ESMFold_HF.json +9 -0
  32. data/EC/EC_HF.json +8 -0
  33. data/FLIP_AAV/FLIP_AAV_des-mut_HF.json +9 -0
  34. data/FLIP_AAV/FLIP_AAV_low-vs-high_HF.json +9 -0
  35. data/FLIP_AAV/FLIP_AAV_mut-des_HF.json +9 -0
  36. data/FLIP_AAV/FLIP_AAV_one-vs-rest_HF.json +9 -0
  37. data/FLIP_AAV/FLIP_AAV_sampled_HF.json +9 -0
  38. data/FLIP_AAV/FLIP_AAV_seven-vs-rest_HF.json +9 -0
  39. data/FLIP_AAV/FLIP_AAV_two-vs-rest_HF.json +9 -0
  40. data/FLIP_GB1/FLIP_GB1_low-vs-high_HF.json +9 -0
  41. data/FLIP_GB1/FLIP_GB1_one-vs-rest_HF.json +9 -0
  42. data/FLIP_GB1/FLIP_GB1_sampled_HF.json +9 -0
  43. data/FLIP_GB1/FLIP_GB1_three-vs-rest_HF.json +9 -0
  44. data/FLIP_GB1/FLIP_GB1_two-vs-rest_HF.json +9 -0
  45. data/GO_BP/GO_BP_AlphaFold2_HF.json +9 -0
  46. data/GO_BP/GO_BP_ESMFold_HF.json +9 -0
  47. data/GO_BP/GO_BP_HF.json +8 -0
  48. data/GO_CC/GO_CC_AlphaFold2_HF.json +9 -0
  49. data/GO_CC/GO_CC_ESMFold_HF.json +9 -0
  50. data/GO_CC/GO_CC_HF.json +8 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ img/Eval/Model_Dataset_Config.png filter=lfs diff=lfs merge=lfs -text
37
+ img/HuggingFace/HF1.png filter=lfs diff=lfs merge=lfs -text
38
+ img/HuggingFace/HF2.png filter=lfs diff=lfs merge=lfs -text
39
+ img/HuggingFace/HF3.png filter=lfs diff=lfs merge=lfs -text
40
+ img/Predict/Predict_Tab.png filter=lfs diff=lfs merge=lfs -text
41
+ img/Train/Monitor_Figs.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ dataset/
163
+ data/*.ipynb
164
+ wandb/
165
+ ckpt/ckpt
166
+ ckpt/dev_models
167
+ script_dev/
168
+ .gradio/
169
+ configs/
170
+ result/
171
+
172
+ # ignore all files in src/data/weight except .keep
173
+ src/data/weight/
174
+ !src/data/weight/.keep
175
+
176
+ tmp_db/
177
+ log/
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,742 @@
1
- ---
2
- title: VenusFactory
3
- emoji: 📚
4
- colorFrom: green
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.24.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: VenusFactory
3
+ app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.24.0
6
+ ---
7
+ <div align="right">
8
+ <a href="README.md">English</a> | <a href="README_CN.md">简体中文</a>
9
+ </div>
10
+
11
+ <p align="center">
12
+ <img src="img/banner_2503.png" width="70%" alt="VenusFactory Banner">
13
+ </p>
14
+
15
+ <div align="center">
16
+
17
+ [![GitHub stars](https://img.shields.io/github/stars/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/stargazers) [![GitHub forks](https://img.shields.io/github/forks/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/network/members) [![GitHub issues](https://img.shields.io/github/issues/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/issues) [![GitHub license](https://img.shields.io/github/license/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/blob/main/LICENSE)
18
+ [![Python Version](https://img.shields.io/badge/Python-3.10-blue?style=flat-square&logo=python)](https://www.python.org/) [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen?style=flat-square)](https://venusfactory.readthedocs.io/) [![Downloads](https://img.shields.io/github/downloads/tyang816/VenusFactory/total?style=flat-square)](https://github.com/tyang816/VenusFactory/releases)
19
+
20
+ </div>
21
+
22
+ Recent News:
23
+
24
+ - Welcome to VenusFactory! This project is developed by [**Liang's Lab**](https://lianglab.sjtu.edu.cn/) at [**Shanghai Jiao Tong University**](https://www.sjtu.edu.cn/).
25
+ - [2025-03-26] Added the [VenusPLM-300M](https://huggingface.co/AI4Protein/VenusPLM-300M) model. Trained based on **VenusPod**, it is a protein language model independently developed by Hong Liang's research group at Shanghai Jiao Tong University.
26
+ - [2025-03-17] Add [Venus-PETA, Venus-ProPrime, Venus-ProSST models](https://huggingface.co/AI4Protein), for more details, please refer to [Supported Models](#-supported-models)
27
+ - [2025-03-05] 🎉 Congratulations! 🎉
28
+
29
+ 🚀 Our latest research achievement, **VenusMutHub**, has been officially accepted by [**Acta Pharmaceutica Sinica B**](https://www.sciencedirect.com/science/article/pii/S2211383525001650) and is now featured in a series of [**leaderboards**](https://lianglab.sjtu.edu.cn/muthub/)!
30
+ 💡 In this study, we built **900+ high-quality benchmark** [**datasets**](https://huggingface.co/datasets/AI4Protein/VenusMutHub) covering **500+ protein functional properties**. VenusMutHub not only offers a new collection of small-sample datasets for **real-world protein mutation engineering**, but also fills the gap in **diversity** within existing benchmarks, laying a stronger foundation for AI-driven protein mutation effect prediction.
31
+
32
+
33
+ ## ✏️ Table of Contents
34
+
35
+ - [Features](#-features)
36
+ - [Supported Models](#-supported-models)
37
+ - [Supported Training Approaches](#-supported-training-approaches)
38
+ - [Supported Datasets](#-supported-datasets)
39
+ - [Supported Metrics](#-supported-metrics)
40
+ - [Requirements](#-requirements)
41
+ - [Installation Guide](#-installation-guide)
42
+ - [Quick Start with Venus Web UI](#-quick-start-with-venus-web-ui)
43
+ - [Code-line Usage](#-code-line-usage)
44
+ - [Citation](#-citation)
45
+ - [Acknowledgement](#-acknowledgement)
46
+
47
+ ## 📑 Features
48
+
49
+ - **Various protein language models**: Venus series, ESM series, ProtTrans series, Ankh series, etc
50
+ - **Comprehensive supervised datasets**: Localization, Fitness, Solubility, Stability, etc
51
+ - **Easy and quick data collector**: AlphaFold2 Database, RCSB, InterPro, Uniprot, etc
52
+ - **Experiment monitors**: Wandb, Local
53
+ - **Friendly interface**: Gradio UI
54
+
55
+ ## 🤖 Supported Models
56
+
57
+ ### Pre-training Protein Language Models
58
+
59
+ <details>
60
+ <summary>Venus Series Models (Published by Liang's Lab)</summary>
61
+
62
+ | Model | Size | Parameters | GPU Memory | Features | Template |
63
+ |-------|------|------------|------------|----------|----------|
64
+ | ProSST-20 | 20 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-20](https://huggingface.co/AI4Protein/ProSST-20) |
65
+ | ProSST-128 | 128 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-128](https://huggingface.co/AI4Protein/ProSST-128) |
66
+ | ProSST-512 | 512 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-512](https://huggingface.co/AI4Protein/ProSST-512) |
67
+ | ProSST-2048 | 2048 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-2048](https://huggingface.co/AI4Protein/ProSST-2048) |
68
+ | ProSST-4096 | 4096 | 110M | 4GB+ | Mutation | [AI4Protein/ProSST-4096](https://huggingface.co/AI4Protein/ProSST-4096) |
69
+ | ProPrime-690M | 690M | 690M | 16GB+ | OGT-prediction | [AI4Protein/Prime_690M](https://huggingface.co/AI4Protein/Prime_690M) |
70
+ | VenusPLM-300M | 300M | 300M | 12GB+ | Protein-language | [AI4Protein/VenusPLM-300M](https://huggingface.co/AI4Protein/VenusPLM-300M) |
71
+
72
+ > 💡 These models often excel in specific tasks or offer unique architectural benefits
73
+ </details>
74
+
75
+ <details>
76
+ <summary>Venus-PETA Models: Tokenization variants</summary>
77
+
78
+ #### BPE Tokenization Series
79
+ | Model | Vocab Size | Parameters | GPU Memory | Template |
80
+ |-------|------------|------------|------------|----------|
81
+ | PETA-base | base | 80M | 4GB+ | [AI4Protein/deep_base](https://huggingface.co/AI4Protein/deep_base) |
82
+ | PETA-bpe-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_bpe_50](https://huggingface.co/AI4Protein/deep_bpe_50) |
83
+ | PETA-bpe-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_bpe_200](https://huggingface.co/AI4Protein/deep_bpe_200) |
84
+ | PETA-bpe-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_bpe_400](https://huggingface.co/AI4Protein/deep_bpe_400) |
85
+ | PETA-bpe-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_bpe_800](https://huggingface.co/AI4Protein/deep_bpe_800) |
86
+ | PETA-bpe-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_bpe_1600](https://huggingface.co/AI4Protein/deep_bpe_1600) |
87
+ | PETA-bpe-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_bpe_3200](https://huggingface.co/AI4Protein/deep_bpe_3200) |
88
+
89
+ #### Unigram Tokenization Series
90
+ | Model | Vocab Size | Parameters | GPU Memory | Template |
91
+ |-------|------------|------------|------------|----------|
92
+ | PETA-unigram-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_unigram_50](https://huggingface.co/AI4Protein/deep_unigram_50) |
93
+ | PETA-unigram-100 | 100 | 80M | 4GB+ | [AI4Protein/deep_unigram_100](https://huggingface.co/AI4Protein/deep_unigram_100) |
94
+ | PETA-unigram-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_unigram_200](https://huggingface.co/AI4Protein/deep_unigram_200) |
95
+ | PETA-unigram-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_unigram_400](https://huggingface.co/AI4Protein/deep_unigram_400) |
96
+ | PETA-unigram-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_unigram_800](https://huggingface.co/AI4Protein/deep_unigram_800) |
97
+ | PETA-unigram-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_unigram_1600](https://huggingface.co/AI4Protein/deep_unigram_1600) |
98
+ | PETA-unigram-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_unigram_3200](https://huggingface.co/AI4Protein/deep_unigram_3200) |
99
+
100
+ > 💡 Different tokenization strategies may be better suited for specific tasks
101
+ </details>
102
+
103
+ <details>
104
+ <summary>ESM Series Models: Meta AI's protein language models</summary>
105
+
106
+ | Model | Size | Parameters | GPU Memory | Training Data | Template |
107
+ |-------|------|------------|------------|---------------|----------|
108
+ | ESM2-8M | 8M | 8M | 2GB+ | UR50/D | [facebook/esm2_t6_8M_UR50D](https://huggingface.co/facebook/esm2_t6_8M_UR50D) |
109
+ | ESM2-35M | 35M | 35M | 4GB+ | UR50/D | [facebook/esm2_t12_35M_UR50D](https://huggingface.co/facebook/esm2_t12_35M_UR50D) |
110
+ | ESM2-150M | 150M | 150M | 8GB+ | UR50/D | [facebook/esm2_t30_150M_UR50D](https://huggingface.co/facebook/esm2_t30_150M_UR50D) |
111
+ | ESM2-650M | 650M | 650M | 16GB+ | UR50/D | [facebook/esm2_t33_650M_UR50D](https://huggingface.co/facebook/esm2_t33_650M_UR50D) |
112
+ | ESM2-3B | 3B | 3B | 24GB+ | UR50/D | [facebook/esm2_t36_3B_UR50D](https://huggingface.co/facebook/esm2_t36_3B_UR50D) |
113
+ | ESM2-15B | 15B | 15B | 40GB+ | UR50/D | [facebook/esm2_t48_15B_UR50D](https://huggingface.co/facebook/esm2_t48_15B_UR50D) |
114
+ | ESM-1b | 650M | 650M | 16GB+ | UR50/S | [facebook/esm1b_t33_650M_UR50S](https://huggingface.co/facebook/esm1b_t33_650M_UR50S) |
115
+ | ESM-1v-1 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_1](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_1) |
116
+ | ESM-1v-2 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_2](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_2) |
117
+ | ESM-1v-3 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_3](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_3) |
118
+ | ESM-1v-4 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_4](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_4) |
119
+ | ESM-1v-5 | 650M | 650M | 16GB+ | UR90/S | [facebook/esm1v_t33_650M_UR90S_5](https://huggingface.co/facebook/esm1v_t33_650M_UR90S_5) |
120
+
121
+ > 💡 ESM2 models are the latest generation, offering better performance than ESM-1b/1v
122
+ </details>
123
+
124
+ <details>
125
+ <summary>BERT-based Models: Transformer encoder architecture</summary>
126
+
127
+ | Model | Size | Parameters | GPU Memory | Training Data | Template |
128
+ |-------|------|------------|------------|---------------|----------|
129
+ | ProtBert-Uniref100 | 420M | 420M | 12GB+ | UniRef100 | [Rostlab/prot_bert](https://huggingface.co/Rostlab/prot_bert) |
130
+ | ProtBert-BFD | 420M | 420M | 12GB+ | BFD100 | [Rostlab/prot_bert_bfd](https://huggingface.co/Rostlab/prot_bert_bfd) |
131
+ | IgBert | 420M | 420M | 12GB+ | Antibody | [Exscientia/IgBert](https://huggingface.co/Exscientia/IgBert) |
132
+ | IgBert-unpaired | 420M | 420M | 12GB+ | Antibody | [Exscientia/IgBert_unpaired](https://huggingface.co/Exscientia/IgBert_unpaired) |
133
+
134
+ > 💡 BFD-trained models generally show better performance on structure-related tasks
135
+ </details>
136
+
137
+ <details>
138
+ <summary>T5-based Models: Encoder-decoder architecture</summary>
139
+
140
+ | Model | Size | Parameters | GPU Memory | Training Data | Template |
141
+ |-------|------|------------|------------|---------------|----------|
142
+ | ProtT5-XL-UniRef50 | 3B | 3B | 24GB+ | UniRef50 | [Rostlab/prot_t5_xl_uniref50](https://huggingface.co/Rostlab/prot_t5_xl_uniref50) |
143
+ | ProtT5-XXL-UniRef50 | 11B | 11B | 40GB+ | UniRef50 | [Rostlab/prot_t5_xxl_uniref50](https://huggingface.co/Rostlab/prot_t5_xxl_uniref50) |
144
+ | ProtT5-XL-BFD | 3B | 3B | 24GB+ | BFD100 | [Rostlab/prot_t5_xl_bfd](https://huggingface.co/Rostlab/prot_t5_xl_bfd) |
145
+ | ProtT5-XXL-BFD | 11B | 11B | 40GB+ | BFD100 | [Rostlab/prot_t5_xxl_bfd](https://huggingface.co/Rostlab/prot_t5_xxl_bfd) |
146
+ | IgT5 | 3B | 3B | 24GB+ | Antibody | [Exscientia/IgT5](https://huggingface.co/Exscientia/IgT5) |
147
+ | IgT5-unpaired | 3B | 3B | 24GB+ | Antibody | [Exscientia/IgT5_unpaired](https://huggingface.co/Exscientia/IgT5_unpaired) |
148
+ | Ankh-base | 450M | 450M | 12GB+ | Encoder-decoder | [ElnaggarLab/ankh-base](https://huggingface.co/ElnaggarLab/ankh-base) |
149
+ | Ankh-large | 1.2B | 1.2B | 20GB+ | Encoder-decoder | [ElnaggarLab/ankh-large](https://huggingface.co/ElnaggarLab/ankh-large) |
150
+
151
+ > 💡 T5 models can be used for both encoding and generation tasks
152
+ </details>
153
+
154
+ ### Model Selection Guide
155
+
156
+ <details>
157
+ <summary>How to choose the right model?</summary>
158
+
159
+ 1. **Based on Hardware Constraints:**
160
+ - Limited GPU (<8GB): ESM2-8M, ESM2-35M, ProSST
161
+ - Medium GPU (8-16GB): ESM2-150M, ESM2-650M, ProtBert series
162
+ - High-end GPU (24GB+): ESM2-3B, ProtT5-XL, Ankh-large
163
+ - Multiple GPUs: ESM2-15B, ProtT5-XXL
164
+
165
+ 2. **Based on Task Type:**
166
+ - Sequence classification: ESM2, ProtBert
167
+ - Structure prediction: ESM2, Ankh
168
+ - Generation tasks: ProtT5
169
+ - Antibody design: IgBert, IgT5
170
+ - Lightweight deployment: ProSST, PETA-base
171
+
172
+ 3. **Based on Training Data:**
173
+ - General protein tasks: ESM2, ProtBert
174
+ - Structure-aware tasks: Ankh
175
+ - Antibody-specific: IgBert, IgT5
176
+ - Custom tokenization needs: PETA series
177
+
178
+ </details>
179
+
180
+ > 🔍 All models are available through the Hugging Face Hub and can be easily loaded using their templates.
181
+
182
+ ## 🔬 Supported Training Approaches
183
+
184
+ <details>
185
+ <summary>Supported Training Approaches</summary>
186
+
187
+ | Approach | Full-tuning | Freeze-tuning | SES-Adapter | AdaLoRA | QLoRA | LoRA | DoRA | IA3 |
188
+ | ---------------------- | ----------- | ------------------ | ------------------ | ------------------ |----------- | ------------------ | -----------------| -----------------|
189
+ | Supervised Fine-Tuning | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |
190
+
191
+ </details>
192
+
193
+ ## 📚 Supported Datasets
194
+
195
+ <details><summary>Pre-training datasets</summary>
196
+
197
+ | dataset | data level | link |
198
+ |------------|------|------|
199
+ | CATH_V43_S40 | structures | [CATH_V43_S40](https://huggingface.co/datasets/tyang816/cath) |
200
+ | AGO_family | structures | [AGO_family](https://huggingface.co/datasets/tyang816/Ago_database_PDB) |
201
+
202
+ </details>
203
+
204
+ <details><summary>Zero-shot datasets</summary>
205
+
206
+ | dataset | task | link |
207
+ |------------|------|------|
208
+ | VenusMutHub | mutation effects prediction | [VenusMutHub](https://huggingface.co/datasets/AI4Protein/VenusMutHub) |
209
+ | ProteinGym | mutation effects prediction | [ProteinGym](https://proteingym.org/) |
210
+
211
+ </details>
212
+
213
+ <details><summary>Supervised fine-tuning datasets (amino acid sequences/ foldseek sequences/ ss8 sequences)</summary>
214
+
215
+ | dataset | task | data level | problem type | link |
216
+ |------------|------|----------|----------|------|
217
+ | DeepLocBinary | localization | protein-wise | single_label_classification | [DeepLocBinary_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocBinary_AlphaFold2), [DeepLocBinary_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocBinary_ESMFold) |
218
+ | DeepLocMulti | localization | protein-wise | multi_label_classification | [DeepLocMulti_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocMulti_AlphaFold2), [DeepLocMulti_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocMulti_ESMFold) |
219
+ | DeepLoc2Multi | localization | protein-wise | single_label_classification | [DeepLoc2Multi_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_AlphaFold2), [DeepLoc2Multi_ESMFold](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_ESMFold) |
220
+ | DeepSol | solubility | protein-wise | single_label_classification | [DeepSol_ESMFold](https://huggingface.co/datasets/tyang816/DeepSol_ESMFold) |
221
+ | DeepSoluE | solubility | protein-wise | single_label_classification | [DeepSoluE_ESMFold](https://huggingface.co/datasets/tyang816/DeepSoluE_ESMFold) |
222
+ | ProtSolM | solubility | protein-wise | single_label_classification | [ProtSolM_ESMFold](https://huggingface.co/datasets/tyang816/ProtSolM_ESMFold) |
223
+ | eSOL | solubility | protein-wise | regression | [eSOL_AlphaFold2](https://huggingface.co/datasets/tyang816/eSOL_AlphaFold2), [eSOL_ESMFold](https://huggingface.co/datasets/tyang816/eSOL_ESMFold) |
224
+ | DeepET_Topt | optimum temperature | protein-wise | regression | [DeepET_Topt_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepET_Topt_AlphaFold2), [DeepET_Topt_ESMFold](https://huggingface.co/datasets/tyang816/DeepET_Topt_ESMFold) |
225
+ | EC | function | protein-wise | multi_label_classification | [EC_AlphaFold2](https://huggingface.co/datasets/tyang816/EC_AlphaFold2), [EC_ESMFold](https://huggingface.co/datasets/tyang816/EC_ESMFold) |
226
+ | GO_BP | function | protein-wise | multi_label_classification | [GO_BP_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_BP_AlphaFold2), [GO_BP_ESMFold](https://huggingface.co/datasets/tyang816/GO_BP_ESMFold) |
227
+ | GO_CC | function | protein-wise | multi_label_classification | [GO_CC_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_CC_AlphaFold2), [GO_CC_ESMFold](https://huggingface.co/datasets/tyang816/GO_CC_ESMFold) |
228
+ | GO_MF | function | protein-wise | multi_label_classification | [GO_MF_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_MF_AlphaFold2), [GO_MF_ESMFold](https://huggingface.co/datasets/tyang816/GO_MF_ESMFold) |
229
+ | MetalIonBinding | binding | protein-wise | single_label_classification | [MetalIonBinding_AlphaFold2](https://huggingface.co/datasets/tyang816/MetalIonBinding_AlphaFold2), [MetalIonBinding_ESMFold](https://huggingface.co/datasets/tyang816/MetalIonBinding_ESMFold) |
230
+ | Thermostability | stability | protein-wise | regression | [Thermostability_AlphaFold2](https://huggingface.co/datasets/tyang816/Thermostability_AlphaFold2), [Thermostability_ESMFold](https://huggingface.co/datasets/tyang816/Thermostability_ESMFold) |
231
+
232
+ > ✨ Only the structural sequences differ for the same dataset; for example, ``DeepLocBinary_ESMFold`` and ``DeepLocBinary_AlphaFold2`` share the same amino acid sequences. This means that if you only want to use the ``aa_seqs``, either one is fine!
233
+
234
+ </details>
235
+
236
+ <details><summary>Supervised fine-tuning datasets (amino acid sequences)</summary>
237
+
238
+ | dataset | task | data level | problem type | link |
239
+ |------------|------|----------|----------|------|
240
+ | Demo_Solubility | solubility | protein-wise | single_label_classification | [Demo_Solubility](https://huggingface.co/datasets/tyang816/Demo_Solubility) |
241
+ | DeepLocBinary | localization | protein-wise | single_label_classification | [DeepLocBinary](https://huggingface.co/datasets/tyang816/DeepLocBinary) |
242
+ | DeepLocMulti | localization | protein-wise | multi_label_classification | [DeepLocMulti](https://huggingface.co/datasets/tyang816/DeepLocMulti) |
243
+ | DeepLoc2Multi | localization | protein-wise | single_label_classification | [DeepLoc2Multi](https://huggingface.co/datasets/tyang816/DeepLoc2Multi) |
244
+ | DeepSol | solubility | protein-wise | single_label_classification | [DeepSol](https://huggingface.co/datasets/tyang816/DeepSol) |
245
+ | DeepSoluE | solubility | protein-wise | single_label_classification | [DeepSoluE](https://huggingface.co/datasets/tyang816/DeepSoluE) |
246
+ | ProtSolM | solubility | protein-wise | single_label_classification | [ProtSolM](https://huggingface.co/datasets/tyang816/ProtSolM) |
247
+ | eSOL | solubility | protein-wise | regression | [eSOL](https://huggingface.co/datasets/tyang816/eSOL) |
248
+ | DeepET_Topt | optimum temperature | protein-wise | regression | [DeepET_Topt](https://huggingface.co/datasets/tyang816/DeepET_Topt) |
249
+ | EC | function | protein-wise | multi_label_classification | [EC](https://huggingface.co/datasets/tyang816/EC) |
250
+ | GO_BP | function | protein-wise | multi_label_classification | [GO_BP](https://huggingface.co/datasets/tyang816/GO_BP) |
251
+ | GO_CC | function | protein-wise | multi_label_classification | [GO_CC](https://huggingface.co/datasets/tyang816/GO_CC) |
252
+ | GO_MF | function | protein-wise | multi_label_classification | [GO_MF](https://huggingface.co/datasets/tyang816/GO_MF) |
253
+ | MetalIonBinding | binding | protein-wise | single_label_classification | [MetalIonBinding](https://huggingface.co/datasets/tyang816/MetalIonBinding) |
254
+ | Thermostability | stability | protein-wise | regression | [Thermostability](https://huggingface.co/datasets/tyang816/Thermostability) |
255
+ | PaCRISPR | CRISPR | protein-wise | single_label_classification | [PaCRISPR](https://huggingface.co/datasets/tyang816/PaCRISPR) |
256
+ | PETA_CHS_Sol | solubility | protein-wise | single_label_classification | [PETA_CHS_Sol](https://huggingface.co/datasets/tyang816/PETA_CHS_Sol) |
257
+ | PETA_LGK_Sol | solubility | protein-wise | single_label_classification | [PETA_LGK_Sol](https://huggingface.co/datasets/tyang816/PETA_LGK_Sol) |
258
+ | PETA_TEM_Sol | solubility | protein-wise | single_label_classification | [PETA_TEM_Sol](https://huggingface.co/datasets/tyang816/PETA_TEM_Sol) |
259
+ | SortingSignal | sorting signal | protein-wise | single_label_classification | [SortingSignal](https://huggingface.co/datasets/tyang816/SortingSignal) |
260
+ | FLIP_AAV | mutation | protein-site | regression | |
261
+ | FLIP_AAV_one-vs-rest | mutation | protein-site | single_label_classification | [FLIP_AAV_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_one-vs-rest) |
262
+ | FLIP_AAV_two-vs-rest | mutation | protein-site | single_label_classification | [FLIP_AAV_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_two-vs-rest) |
263
+ | FLIP_AAV_mut-des | mutation | protein-site | single_label_classification | [FLIP_AAV_mut-des](https://huggingface.co/datasets/tyang816/FLIP_AAV_mut-des) |
264
+ | FLIP_AAV_des-mut | mutation | protein-site | single_label_classification | [FLIP_AAV_des-mut](https://huggingface.co/datasets/tyang816/FLIP_AAV_des-mut) |
265
+ | FLIP_AAV_seven-vs-rest | mutation | protein-site | single_label_classification | [FLIP_AAV_seven-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_seven-vs-rest) |
266
+ | FLIP_AAV_low-vs-high | mutation | protein-site | single_label_classification | [FLIP_AAV_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_AAV_low-vs-high) |
267
+ | FLIP_AAV_sampled | mutation | protein-site | single_label_classification | [FLIP_AAV_sampled](https://huggingface.co/datasets/tyang816/FLIP_AAV_sampled) |
268
+ | FLIP_GB1 | mutation | protein-site | regression | |
269
+ | FLIP_GB1_one-vs-rest | mutation | protein-site | single_label_classification | [FLIP_GB1_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_one-vs-rest) |
270
+ | FLIP_GB1_two-vs-rest | mutation | protein-site | single_label_classification | [FLIP_GB1_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_two-vs-rest) |
271
+ | FLIP_GB1_three-vs-rest | mutation | protein-site | single_label_classification | [FLIP_GB1_three-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_three-vs-rest) |
272
+ | FLIP_GB1_low-vs-high | mutation | protein-site | single_label_classification | [FLIP_GB1_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_GB1_low-vs-high) |
273
+ | FLIP_GB1_sampled | mutation | protein-site | single_label_classification | [FLIP_GB1_sampled](https://huggingface.co/datasets/tyang816/FLIP_GB1_sampled) |
274
+ | TAPE_Fluorescence | fluorescence | protein-site | regression | [TAPE_Fluorescence](https://huggingface.co/datasets/tyang816/TAPE_Fluorescence) |
275
+ | TAPE_Stability | stability | protein-site | regression | [TAPE_Stability](https://huggingface.co/datasets/tyang816/TAPE_Stability) |
276
+
277
+ </details>
278
+
279
+ ## 📈 Supported Metrics
280
+
281
+ <details>
282
+ <summary>Supported Metrics</summary>
283
+
284
+ | Name | Torchmetrics | Problem Type |
285
+ | ------------- | ---------------- | ------------------------------------------------------- |
286
+ | accuracy | Accuracy | single_label_classification/ multi_label_classification |
287
+ | recall | Recall | single_label_classification/ multi_label_classification |
288
+ | precision | Precision | single_label_classification/ multi_label_classification |
289
+ | f1 | F1Score | single_label_classification/ multi_label_classification |
290
+ | mcc | MatthewsCorrCoef | single_label_classification/ multi_label_classification |
291
+ | auc | AUROC | single_label_classification/ multi_label_classification |
292
+ | f1_max | F1ScoreMax | multi_label_classification |
293
+ | spearman_corr | SpearmanCorrCoef | regression |
294
+ | mse | MeanSquaredError | regression |
295
+
296
+ </details>
297
+
298
+ ## ✈️ Requirements
299
+
300
+ ### Hardware Requirements
301
+ - Recommended: NVIDIA RTX 3090 (24GB) or better
302
+ - Actual requirements depend on your chosen protein language model
303
+
304
+ ### Software Requirements
305
+ - [Anaconda3](https://www.anaconda.com/download) or [Miniconda3](https://docs.conda.io/projects/miniconda/en/latest/)
306
+ - Python 3.10
307
+
308
+ ## 📦 Installation Guide
309
+ <details><summary> Git start with macOS</summary>
310
+
311
+ ## To achieve the best performance and experience, we recommend using Mac devices with M-series chips (such as M1, M2, M3, etc.).
312
+
313
+ ## 1️⃣ Clone the repository
314
+
315
+ First, get the VenusFactory code:
316
+
317
+ ```bash
318
+ git clone https://github.com/tyang816/VenusFactory.git
319
+ cd VenusFactory
320
+ ```
321
+
322
+ ## 2️⃣ Create a Conda environment
323
+
324
+ Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
325
+
326
+ ```bash
327
+ conda create -n venus python=3.10
328
+ conda activate venus
329
+ ```
330
+
331
+ ## 3️⃣ Install Pytorch and PyG dependencies
332
+
333
+ ```bash
334
+ # Install PyTorch
335
+ pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
336
+
337
+ # Install PyG dependencies
338
+ pip install torch_scatter torch-sparse torch-geometric -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
339
+ ```
340
+
341
+ ## 4️⃣ Install remaining dependencies
342
+
343
+ Install the remaining dependencies using `requirements_for_macOS.txt`:
344
+ ```bash
345
+ pip install -r requirements_for_macOS.txt
346
+ ```
347
+ </details>
348
+
349
+ <details><summary> Git start with Windows or Linux on CUDA 12.x</summary>
350
+
351
+ ## We recommend using CUDA 12.2
352
+
353
+
354
+ ## 1️⃣ Clone the repository
355
+
356
+ First, get the VenusFactory code:
357
+
358
+ ```bash
359
+ git clone https://github.com/tyang816/VenusFactory.git
360
+ cd VenusFactory
361
+ ```
362
+
363
+ ## 2️⃣ Create a Conda environment
364
+
365
+ Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
366
+
367
+ ```bash
368
+ conda create -n venus python=3.10
369
+ conda activate venus
370
+ ```
371
+
372
+ ## 3️⃣ Install Pytorch and PyG dependencies
373
+
374
+ ```bash
375
+ # Install PyTorch
376
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
377
+
378
+ # Install PyG dependencies
379
+ pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
380
+ pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
381
+ ```
382
+
383
+ ## 4️⃣ Install remaining dependencies
384
+
385
+ Install the remaining dependencies using `requirements.txt`:
386
+ ```bash
387
+ pip install -r requirements.txt
388
+ ```
389
+ </details>
390
+
391
+ <details><summary> Git start with Windows or Linux on CUDA 11.x</summary>
392
+
393
+ ## We recommend using CUDA 11.8 or later versions, as they support higher versions of PyTorch, providing a better experience.
394
+
395
+
396
+ ## 1️⃣ Clone the repository
397
+
398
+ First, get the VenusFactory code:
399
+
400
+ ```bash
401
+ git clone https://github.com/tyang816/VenusFactory.git
402
+ cd VenusFactory
403
+ ```
404
+
405
+ ## 2️⃣ Create a Conda environment
406
+
407
+ Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
408
+
409
+ ```bash
410
+ conda create -n venus python=3.10
411
+ conda activate venus
412
+ ```
413
+
414
+ ## 3️⃣ Install Pytorch and PyG dependencies
415
+
416
+ ```bash
417
+ # Install PyTorch
418
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
419
+
420
+ # Install PyG dependencies
421
+ pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
422
+ pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
423
+ ```
424
+
425
+ ## 4️⃣ Install remaining dependencies
426
+
427
+ Install the remaining dependencies using `requirements.txt`:
428
+ ```bash
429
+ pip install -r requirements.txt
430
+ ```
431
+ </details>
432
+
433
+ <details><summary> Git start with Windows or Linux on CPU</summary>
434
+
435
+ ## 1️⃣ Clone the repository
436
+
437
+ First, get the VenusFactory code:
438
+
439
+ ```bash
440
+ git clone https://github.com/tyang816/VenusFactory.git
441
+ cd VenusFactory
442
+ ```
443
+
444
+ ## 2️⃣ Create a Conda environment
445
+
446
+ Ensure you have Anaconda or Miniconda installed. Then, create a new environment named `venus` with Python 3.10:
447
+
448
+ ```bash
449
+ conda create -n venus python=3.10
450
+ conda activate venus
451
+ ```
452
+
453
+ ## 3️⃣ Install Pytorch and PyG dependencies
454
+
455
+ ```bash
456
+ # Install PyTorch
457
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu
458
+
459
+ # Install PyG dependencies
460
+ pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
461
+ pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
462
+ ```
463
+
464
+ ## 4️⃣ Install remaining dependencies
465
+
466
+ Install the remaining dependencies using `requirements.txt`:
467
+ ```bash
468
+ pip install -r requirements.txt
469
+ ```
470
+ </details>
471
+
472
+ ## 🚀 Quick Start with Venus Web UI
473
+
474
+ ### Start Venus Web UI
475
+
476
+ Get started quickly with our intuitive graphical interface powered by [Gradio](https://github.com/gradio-app/gradio):
477
+
478
+ ```bash
479
+ python ./src/webui.py
480
+ ```
481
+
482
+ This will launch the Venus Web UI where you can:
483
+ - Configure and run fine-tuning experiments
484
+ - Monitor training progress
485
+ - Evaluate models
486
+ - Visualize results
487
+
488
+ ### Using Each Tab
489
+
490
+ We provide a detailed guide to help you navigate through each tab of the Venus Web UI.
491
+
492
+ <details>
493
+ <summary>1. Training Tab: Train your own protein language model</summary>
494
+
495
+ ![Model_Dataset_Config](img/Train/Model_Dataset_Config.png)
496
+
497
+ Select a protein language model from the dropdown menu. Upload your dataset or select from available datasets and choose metrics appropriate for your problem type.
498
+
499
+ ![Training_Parameters](img/Train/Training_Parameters.png)
500
+ Choose a training method (Freeze, SES-Adapter, LoRA, QLoRA etc.) and configure training parameters (batch size, learning rate, etc.).
501
+
502
+ ![Preview_Command](img/Train/Preview_Command.png)
503
+ ![Training_Progress](img/Train/Training_Progress.png)
504
+ ![Best_Model](img/Train/Best_Model.png)
505
+ ![Monitor_Figs](img/Train/Monitor_Figs.png)
506
+ Click "Start Training" and monitor progress in real-time.
507
+
508
+ <p align="center">
509
+ <img src="img/Train/Metric_Results.png" width="60%" alt="Metric_Results">
510
+ </p>
511
+
512
+ Click "Download CSV" to download the test metrics results.
513
+ </details>
514
+
515
+ <details>
516
+ <summary>2. Evaluation Tab: Evaluate your trained model within a benchmark</summary>
517
+
518
+ ![Model_Dataset_Config](img/Eval/Model_Dataset_Config.png)
519
+
520
+ Load your trained model by specifying the model path. Select the same protein language model and model configs used during training. Select a test dataset and configure batch size. Choose evaluation metrics appropriate for your problem type. Finally, click "Start Evaluation" to view performance metrics.
521
+ </details>
522
+
523
+ <details>
524
+ <summary>3. Prediction Tab: Use your trained model to predict samples</summary>
525
+
526
+ ![Predict_Tab](img/Predict/Predict_Tab.png)
527
+
528
+ Load your trained model by specifying the model path. Select the same protein language model and model configs used during training.
529
+
530
+ For single sequence: Enter a protein sequence in the text box.
531
+
532
+ For batch prediction: Upload a CSV file with sequences.
533
+
534
+ ![Batch](img/Predict/Batch.png)
535
+
536
+ Click "Predict" to generate and view results.
537
+ </details>
538
+
539
+ <details>
540
+ <summary>4. Download Tab: Collect data from different sources with high efficiency</summary>
541
+
542
+ - **AlphaFold2 Structures**: Enter UniProt IDs to download protein structures
543
+ - **UniProt**: Search for protein information using keywords or IDs
544
+ - **InterPro**: Retrieve protein family and domain information
545
+ - **RCSB PDB**: Download experimental protein structures
546
+ </details>
547
+
548
+ <details>
549
+ <summary>5. Manual Tab: Detailed documentation and guides</summary>
550
+
551
+ Select a language (English/Chinese).
552
+
553
+ Navigate through the documentation using the table of contents and find step-by-step guides.
554
+ </details>
555
+
556
+ ## 🧬 Code-line Usage
557
+
558
+ For users who prefer command-line interface, we provide comprehensive script solutions for different scenarios.
559
+
560
+ <details>
561
+ <summary>Training Methods: Various fine-tuning approaches for different needs</summary>
562
+
563
+ ### Full Model Fine-tuning
564
+ ```bash
565
+ # Vanilla fine-tuning: train the full model end-to-end
566
+ bash ./script/train/train_plm_vanilla.sh
567
+ ```
568
+
569
+ ### Parameter-Efficient Fine-tuning (PEFT)
570
+ ```bash
571
+ # SES-Adapter: Selective and Efficient adapter fine-tuning
572
+ bash ./script/train/train_plm_ses-adapter.sh
573
+
574
+ # AdaLoRA: Adaptive Low-Rank Adaptation
575
+ bash ./script/train/train_plm_adalora.sh
576
+
577
+ # QLoRA: Quantized Low-Rank Adaptation
578
+ bash ./script/train/train_plm_qlora.sh
579
+
580
+ # LoRA: Low-Rank Adaptation
581
+ bash ./script/train/train_plm_lora.sh
582
+
583
+ # DoRA: Weight-Decomposed Low-Rank Adaptation
584
+ bash ./script/train/train_plm_dora.sh
585
+
586
+ # IA3: Infused Adapter by Inhibiting and Amplifying Inner Activations
587
+ bash ./script/train/train_plm_ia3.sh
588
+ ```
589
+
590
+ #### Training Method Comparison
591
+ | Method | Memory Usage | Training Speed | Performance |
592
+ |--------|--------------|----------------|-------------|
593
+ | Freeze | Low | Fast | Good |
594
+ | SES-Adapter | Medium | Medium | Better |
595
+ | AdaLoRA | Low | Medium | Better |
596
+ | QLoRA | Very Low | Slower | Good |
597
+ | LoRA | Low | Fast | Good |
598
+ | DoRA | Low | Medium | Better |
599
+ | IA3 | Very Low | Fast | Good |
600
+
601
+ </details>
602
+
603
+ <details>
604
+ <summary>Model Evaluation: Comprehensive evaluation tools</summary>
605
+
606
+ ### Basic Evaluation
607
+ ```bash
608
+ # Evaluate model performance on test sets
609
+ bash ./script/eval/eval.sh
610
+ ```
611
+
612
+ ### Available Metrics
613
+ - Classification: accuracy, precision, recall, F1, MCC, AUC
614
+ - Regression: MSE, Spearman correlation
615
+ - Multi-label: F1-max
616
+
617
+ ### Visualization Tools
618
+ - Training curves
619
+ - Confusion matrices
620
+ - ROC curves
621
+ - Performance comparison plots
622
+
623
+ </details>
624
+
625
+ <details>
626
+ <summary>Structure Sequence Tools: Process protein structure information</summary>
627
+
628
+ ### ESM Structure Sequence
629
+ ```bash
630
+ # Generate structure sequences using ESM-3
631
+ bash ./script/get_get_structure_seq/get_esm3_structure_seq.sh
632
+ ```
633
+
634
+ ### Secondary Structure
635
+ ```bash
636
+ # Predict protein secondary structure
637
+ bash ./script/get_get_structure_seq/get_secondary_structure_seq.sh
638
+ ```
639
+
640
+ Features:
641
+ - Support for multiple sequence formats
642
+ - Batch processing capability
643
+ - Integration with popular structure prediction tools
644
+
645
+ </details>
646
+
647
+ <details>
648
+ <summary>Data Collection Tools: Multi-source protein data acquisition</summary>
649
+
650
+ ### Format Conversion
651
+ ```bash
652
+ # Convert CIF format to PDB
653
+ bash ./crawler/convert/maxit.sh
654
+ ```
655
+
656
+ ### Metadata Collection
657
+ ```bash
658
+ # Download metadata from RCSB PDB
659
+ bash ./crawler/metadata/download_rcsb.sh
660
+ ```
661
+
662
+ ### Sequence Data
663
+ ```bash
664
+ # Download protein sequences from UniProt
665
+ bash ./crawler/sequence/download_uniprot_seq.sh
666
+ ```
667
+
668
+ ### Structure Data
669
+ ```bash
670
+ # Download from AlphaFold2 Database
671
+ bash ./crawler/structure/download_alphafold.sh
672
+
673
+ # Download from RCSB PDB
674
+ bash ./crawler/structure/download_rcsb.sh
675
+ ```
676
+
677
+ Features:
678
+ - Automated batch downloading
679
+ - Resume interrupted downloads
680
+ - Data integrity verification
681
+ - Multiple source support
682
+ - Customizable search criteria
683
+
684
+ #### Supported Databases
685
+ | Database | Data Type | Access Method | Rate Limit |
686
+ |----------|-----------|---------------|------------|
687
+ | AlphaFold2 | Structures | REST API | Yes |
688
+ | RCSB PDB | Structures | FTP/HTTP | No |
689
+ | UniProt | Sequences | REST API | Yes |
690
+ | InterPro | Domains | REST API | Yes |
691
+
692
+ </details>
693
+
694
+ <details>
695
+ <summary>Usage Examples: Common scenarios and solutions</summary>
696
+
697
+ ### Training Example
698
+ ```bash
699
+ # Train a protein solubility predictor using ESM2
700
+ bash ./script/train/train_plm_lora.sh \
701
+ --model "facebook/esm2_t33_650M_UR50D" \
702
+ --dataset "DeepSol" \
703
+ --batch_size 32 \
704
+ --learning_rate 1e-4
705
+ ```
706
+
707
+ ### Evaluation Example
708
+ ```bash
709
+ # Evaluate the trained model
710
+ bash ./script/eval/eval.sh \
711
+ --model_path "path/to/your/model" \
712
+ --test_dataset "DeepSol_test"
713
+ ```
714
+
715
+ ### Data Collection Example
716
+ ```bash
717
+ # Download structures for a list of UniProt IDs
718
+ bash ./crawler/structure/download_alphafold.sh \
719
+ --input uniprot_ids.txt \
720
+ --output ./structures
721
+ ```
722
+
723
+ </details>
724
+
725
+ > 💡 All scripts support additional command-line arguments for customization. Use `--help` with any script to see available options.
726
+
727
+ ## 🙌 Citation
728
+
729
+ Please cite our work if you have used our code or data.
730
+
731
+ ```bibtex
732
+ @article{tan2025venusfactory,
733
+ title={VenusFactory: A Unified Platform for Protein Engineering Data Retrieval and Language Model Fine-Tuning},
734
+ author={Tan, Yang and Liu, Chen and Gao, Jingyuan and Wu, Banghao and Li, Mingchen and Wang, Ruilin and Zhang, Lingrong and Yu, Huiqun and Fan, Guisheng and Hong, Liang and Zhou, Bingxin},
735
+ journal={arXiv preprint arXiv:2503.15438},
736
+ year={2025}
737
+ }
738
+ ```
739
+
740
+ ## 🎊 Acknowledgement
741
+
742
+ Thanks the support of [Liang's Lab](https://ins.sjtu.edu.cn/people/lhong/index.html).
README_CN.md ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="right">
2
+ <a href="README.md">English</a> | <a href="README_CN.md">简体中文</a>
3
+ </div>
4
+
5
+ <p align="center">
6
+ <img src="img/banner_2503.png" width="70%" alt="VenusFactory Banner">
7
+ </p>
8
+
9
+ <div align="center">
10
+
11
+ [![GitHub stars](https://img.shields.io/github/stars/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/stargazers) [![GitHub forks](https://img.shields.io/github/forks/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/network/members) [![GitHub issues](https://img.shields.io/github/issues/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/issues) [![GitHub license](https://img.shields.io/github/license/tyang816/VenusFactory?style=flat-square)](https://github.com/tyang816/VenusFactory/blob/main/LICENSE)
12
+ [![Python Version](https://img.shields.io/badge/Python-3.10-blue?style=flat-square&logo=python)](https://www.python.org/) [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen?style=flat-square)](https://venusfactory.readthedocs.io/) [![Downloads](https://img.shields.io/github/downloads/tyang816/VenusFactory/total?style=flat-square)](https://github.com/tyang816/VenusFactory/releases)
13
+
14
+ </div>
15
+
16
+ 最新消息:
17
+
18
+ - 欢迎使用 VenusFactory!本项目由[**Liang's Lab**](https://lianglab.sjtu.edu.cn/)开发,由[**Shanghai Jiao Tong University**](https://www.sjtu.edu.cn/)维护。
19
+ - [2025-03-26] 新增 [VenusPLM-300M](https://huggingface.co/AI4Protein/VenusPLM-300M) 模型,基于**VenusPod**独立开发,由[**Hong Liang**](https://lianglab.sjtu.edu.cn/)课题组开发。
20
+ - [2025-03-17] 新增 [Venus-PETA、Venus-ProPrime、Venus-ProSST 模型](https://huggingface.co/AI4Protein),更多详情请参考[支持的模型](#-支持的模型)
21
+ - [2025-03-05] 🎉 **祝贺!** 🎉
22
+
23
+ 🚀 我们课题组最新的研究成果**VenusMutHub**被[**Acta Pharmaceutica Sinica B**](https://www.sciencedirect.com/science/article/pii/S2211383525001650) 正式接收,并发布了系列[**排行榜**](https://lianglab.sjtu.edu.cn/muthub/)!
24
+ 💡 在本研究中,我们构建了**900+ 高质量基准**[**数据集**](https://huggingface.co/datasets/AI4Protein/VenusMutHub),涵盖 **500+ 不同功能特性的蛋白质**. VenusMutHub不仅为**蛋白质突变工程的真实应用场景**提供了全新的小样本数据集,还弥补了现有基准数据集在**多样性**方面的空白,为AI驱动的蛋白质突变效应预测奠定了更坚实的基础。
25
+
26
+ ## ✏️ 目录
27
+
28
+ - [功能特点](#-功能特点)
29
+ - [支持的模型](#-支持的模型)
30
+ - [支持的训练方法](#-支持的训练方法)
31
+ - [支持的数据集](#-支持的数据集)
32
+ - [支持的评估指标](#-支持的评估指标)
33
+ - [环境要求](#-环境要求)
34
+ - [安装指南](#-安装指南)
35
+ - [快速开始](#-快速开始)
36
+ - [命令行使用](#-命令行使用)
37
+ - [引用](#-引用)
38
+ - [致谢](#-致谢)
39
+
40
+ ## 📑 功能特点
41
+
42
+ - **丰富的蛋白质语言模型**:Venus系列、ESM系列、ProtTrans系列、Ankh 系列等
43
+ - **全面的监督数据集**:定位、适应度、溶解度、稳定性等
44
+ - **便捷的数据收集工具**:AlphaFold2 数据库、RCSB、InterPro、Uniprot 等
45
+ - **实验监控**:Wandb、本地监控
46
+ - **友好的界面**:Gradio UI
47
+
48
+ ## 🤖 支持的模型
49
+
50
+ ### 预训练蛋白质语言模型
51
+
52
+ <details>
53
+ <summary>Venus系列模型:特定任务架构</summary>
54
+
55
+ | 模型 | 大小 | 参数量 | GPU内存 | 特点 | 模板 |
56
+ |-------|------|------------|------------|----------|----------|
57
+ | ProSST-20 | 20 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-20](https://huggingface.co/AI4Protein/ProSST-20) |
58
+ | ProSST-128 | 128 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-128](https://huggingface.co/AI4Protein/ProSST-128) |
59
+ | ProSST-512 | 512 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-512](https://huggingface.co/AI4Protein/ProSST-512) |
60
+ | ProSST-2048 | 2048 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-2048](https://huggingface.co/AI4Protein/ProSST-2048) |
61
+ | ProSST-4096 | 4096 | 110M | 4GB+ | 突变预测 | [AI4Protein/ProSST-4096](https://huggingface.co/AI4Protein/ProSST-4096) |
62
+ | ProPrime-690M | 690M | 690M | 16GB+ | OGT预测 | [AI4Protein/Prime_690M](https://huggingface.co/AI4Protein/Prime_690M) |
63
+
64
+ > 💡 这些模型在特定任务上表现出色或提供独特的架构优势
65
+ </details>
66
+
67
+ <details>
68
+ <summary>Venus-PETA 模型:分词变体</summary>
69
+
70
+ #### BPE 分词系列
71
+ | 模型 | 词表大小 | 参数量 | GPU内存 | 模板 |
72
+ |-------|------------|------------|------------|----------|
73
+ | PETA-base | base | 80M | 4GB+ | [AI4Protein/deep_base](https://huggingface.co/AI4Protein/deep_base) |
74
+ | PETA-bpe-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_bpe_50](https://huggingface.co/AI4Protein/deep_bpe_50) |
75
+ | PETA-bpe-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_bpe_200](https://huggingface.co/AI4Protein/deep_bpe_200) |
76
+ | PETA-bpe-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_bpe_400](https://huggingface.co/AI4Protein/deep_bpe_400) |
77
+ | PETA-bpe-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_bpe_800](https://huggingface.co/AI4Protein/deep_bpe_800) |
78
+ | PETA-bpe-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_bpe_1600](https://huggingface.co/AI4Protein/deep_bpe_1600) |
79
+ | PETA-bpe-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_bpe_3200](https://huggingface.co/AI4Protein/deep_bpe_3200) |
80
+
81
+ #### Unigram 分词系列
82
+ | 模型 | 词表大小 | 参数量 | GPU内存 | 模板 |
83
+ |-------|------------|------------|------------|----------|
84
+ | PETA-unigram-50 | 50 | 80M | 4GB+ | [AI4Protein/deep_unigram_50](https://huggingface.co/AI4Protein/deep_unigram_50) |
85
+ | PETA-unigram-100 | 100 | 80M | 4GB+ | [AI4Protein/deep_unigram_100](https://huggingface.co/AI4Protein/deep_unigram_100) |
86
+ | PETA-unigram-200 | 200 | 80M | 4GB+ | [AI4Protein/deep_unigram_200](https://huggingface.co/AI4Protein/deep_unigram_200) |
87
+ | PETA-unigram-400 | 400 | 80M | 4GB+ | [AI4Protein/deep_unigram_400](https://huggingface.co/AI4Protein/deep_unigram_400) |
88
+ | PETA-unigram-800 | 800 | 80M | 4GB+ | [AI4Protein/deep_unigram_800](https://huggingface.co/AI4Protein/deep_unigram_800) |
89
+ | PETA-unigram-1600 | 1600 | 80M | 4GB+ | [AI4Protein/deep_unigram_1600](https://huggingface.co/AI4Protein/deep_unigram_1600) |
90
+ | PETA-unigram-3200 | 3200 | 80M | 4GB+ | [AI4Protein/deep_unigram_3200](https://huggingface.co/AI4Protein/deep_unigram_3200) |
91
+
92
+ > 💡 不同的分词策略可能更适合特定任务
93
+ </details>
94
+
95
+ <details>
96
+ <summary>ESM 系列模型:Meta AI 的蛋白质语言模型</summary>
97
+
98
+ | 模型 | 大小 | 参数量 | GPU内存 | 训练数据 | 模板 |
99
+ |-------|------|------------|------------|---------------|----------|
100
+ | ESM2-8M | 8M | 8M | 2GB+ | UR50/D | [facebook/esm2_t6_8M_UR50D](https://huggingface.co/facebook/esm2_t6_8M_UR50D) |
101
+ | ESM2-35M | 35M | 35M | 4GB+ | UR50/D | [facebook/esm2_t12_35M_UR50D](https://huggingface.co/facebook/esm2_t12_35M_UR50D) |
102
+ | ESM2-150M | 150M | 150M | 8GB+ | UR50/D | [facebook/esm2_t30_150M_UR50D](https://huggingface.co/facebook/esm2_t30_150M_UR50D) |
103
+ | ESM2-650M | 650M | 650M | 16GB+ | UR50/D | [facebook/esm2_t33_650M_UR50D](https://huggingface.co/facebook/esm2_t33_650M_UR50D) |
104
+ | ESM2-3B | 3B | 3B | 24GB+ | UR50/D | [facebook/esm2_t36_3B_UR50D](https://huggingface.co/facebook/esm2_t36_3B_UR50D) |
105
+ | ESM2-15B | 15B | 15B | 40GB+ | UR50/D | [facebook/esm2_t48_15B_UR50D](https://huggingface.co/facebook/esm2_t48_15B_UR50D) |
106
+
107
+ > 💡 ESM2 模型是最新一代,性能优于 ESM-1b/1v
108
+ </details>
109
+
110
+ <details>
111
+ <summary>BERT 系列模型:基于 Transformer 编码器架构</summary>
112
+
113
+ | 模型 | 大小 | 参数量 | GPU内存 | 训练数据 | 模板 |
114
+ |-------|------|------------|------------|---------------|----------|
115
+ | ProtBert-Uniref100 | 420M | 420M | 12GB+ | UniRef100 | [Rostlab/prot_bert](https://huggingface.co/Rostlab/prot_bert) |
116
+ | ProtBert-BFD | 420M | 420M | 12GB+ | BFD100 | [Rostlab/prot_bert_bfd](https://huggingface.co/Rostlab/prot_bert_bfd) |
117
+ | IgBert | 420M | 420M | 12GB+ | 抗体 | [Exscientia/IgBert](https://huggingface.co/Exscientia/IgBert) |
118
+ | IgBert-unpaired | 420M | 420M | 12GB+ | 抗体 | [Exscientia/IgBert_unpaired](https://huggingface.co/Exscientia/IgBert_unpaired) |
119
+
120
+ > 💡 BFD 训练的模型在结构相关任务上表现更好
121
+ </details>
122
+
123
+ <details>
124
+ <summary>T5 系列模型:编码器-解码器架构</summary>
125
+
126
+ | 模型 | 大小 | 参数量 | GPU内存 | 训练数据 | 模板 |
127
+ |-------|------|------------|------------|---------------|----------|
128
+ | ProtT5-XL-UniRef50 | 3B | 3B | 24GB+ | UniRef50 | [Rostlab/prot_t5_xl_uniref50](https://huggingface.co/Rostlab/prot_t5_xl_uniref50) |
129
+ | ProtT5-XXL-UniRef50 | 11B | 11B | 40GB+ | UniRef50 | [Rostlab/prot_t5_xxl_uniref50](https://huggingface.co/Rostlab/prot_t5_xxl_uniref50) |
130
+ | ProtT5-XL-BFD | 3B | 3B | 24GB+ | BFD100 | [Rostlab/prot_t5_xl_bfd](https://huggingface.co/Rostlab/prot_t5_xl_bfd) |
131
+ | ProtT5-XXL-BFD | 11B | 11B | 40GB+ | BFD100 | [Rostlab/prot_t5_xxl_bfd](https://huggingface.co/Rostlab/prot_t5_xxl_bfd) |
132
+ | Ankh-base | 450M | 450M | 12GB+ | 编码器-解码器 | [ElnaggarLab/ankh-base](https://huggingface.co/ElnaggarLab/ankh-base) |
133
+ | Ankh-large | 1.2B | 1.2B | 20GB+ | 编码器-解码器 | [ElnaggarLab/ankh-large](https://huggingface.co/ElnaggarLab/ankh-large) |
134
+
135
+ > 💡 T5 模型可用于编码和生成任务
136
+ </details>
137
+
138
+ ### 模型选择指南
139
+
140
+ <details>
141
+ <summary>如何选择合适的模型?</summary>
142
+
143
+ 1. **基于硬件限制:**
144
+ - 低配GPU (<8GB):ESM2-8M、ESM2-35M、ProSST
145
+ - 中配GPU (8-16GB):ESM2-150M、ESM2-650M、ProtBert系列
146
+ - 高配GPU (24GB+):ESM2-3B、ProtT5-XL、Ankh-large
147
+ - 多GPU:ESM2-15B、ProtT5-XXL
148
+
149
+ 2. **基于任务类型:**
150
+ - 序列分类:ESM2、ProtBert
151
+ - 结构预测:ESM2、Ankh
152
+ - 生成任务:ProtT5
153
+ - 抗体设计:IgBert、IgT5
154
+ - 轻量级部署:ProSST、PETA-base
155
+
156
+ 3. **基于训练数据:**
157
+ - 通用蛋白质任务:ESM2、ProtBert
158
+ - 结构感知任务:Ankh
159
+ - 抗体特异性:IgBert、IgT5
160
+ - 自定义分词需求:PETA系列
161
+
162
+ </details>
163
+
164
+ > 🔍 所有模型都可通过Hugging Face Hub获取,使用其模板可轻松加载。
165
+
166
+ ## 🔬 支持的训练方法
167
+
168
+ <details>
169
+ <summary>支持的训练方法</summary>
170
+
171
+ | 方法 | 全量微调 | 冻结微调 | SES-Adapter | AdaLoRA | QLoRA | LoRA | DoRA | IA3 |
172
+ |------|---------|----------|-------------|----------|--------|------|------|-----|
173
+ | 监督微调 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
174
+ </details>
175
+
176
+ ## 📚 支持的数据集
177
+
178
+ <details><summary>预训练数据集</summary>
179
+
180
+ | 数据集 | 数据来源 |
181
+ |-------|----------|
182
+ | [CATH_V43_S40](https://huggingface.co/datasets/tyang816/cath) | 结构数据集
183
+ | [AGO_family](https://huggingface.co/datasets/tyang816/Ago_database_PDB) | 结构数据集
184
+
185
+ </details>
186
+
187
+ <details><summary>零样本数据集</summary>
188
+
189
+ | 数据集 | 任务 | 数据来源 |
190
+ |-------|----------|----------|
191
+ | [VenusMutHub](https://huggingface.co/datasets/AI4Protein/VenusMutHub) | 突变 | 蛋白质序列
192
+ | [ProteinGym](https://proteingym.org/) | 突变 | 蛋白质序列
193
+ </details>
194
+
195
+ <details><summary>监督微调数据集(氨基酸序列/foldseek序列/二级结构序列)</summary>
196
+
197
+ | 数据集 | 任务 | 数据层次 | 问题类型 | 数据来源 |
198
+ |-------|------|------------|----------|----------|
199
+ | DeepLocBinary | 定位 | 蛋白质级别 | 单标签分类 | [DeepLocBinary_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocBinary_AlphaFold2), [DeepLocBinary_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocBinary_ESMFold) |
200
+ | DeepLocMulti | 定位 | 蛋白质级别 | 单标签分类 | [DeepLocMulti_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLocMulti_AlphaFold2), [DeepLocMulti_ESMFold](https://huggingface.co/datasets/tyang816/DeepLocMulti_ESMFold) |
201
+ | DeepLoc2Multi | 定位 | 蛋白质级别 | 多标签分类 | [DeepLoc2Multi_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_AlphaFold2), [DeepLoc2Multi_ESMFold](https://huggingface.co/datasets/tyang816/DeepLoc2Multi_ESMFold) |
202
+ | DeepSol | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSol_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepSol_AlphaFold2), [DeepSol_ESMFold](https://huggingface.co/datasets/tyang816/DeepSol_ESMFold) |
203
+ | DeepSoluE | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSoluE_ESMFold](https://huggingface.co/datasets/tyang816/DeepSoluE_ESMFold) |
204
+ | ProtSolM | 溶解度 | 蛋白质级别 | 单标签分类 | [ProtSolM_ESMFold](https://huggingface.co/datasets/tyang816/ProtSolM_ESMFold) |
205
+ | eSOL | 溶解度 | 蛋白质级别 | 回归 | [eSOL_AlphaFold2](https://huggingface.co/datasets/tyang816/eSOL_AlphaFold2), [eSOL_ESMFold](https://huggingface.co/datasets/tyang816/eSOL_ESMFold) |
206
+ | DeepET_Topt | 最适酶活 | 蛋白质级别 | 回归 | [DeepET_Topt_AlphaFold2](https://huggingface.co/datasets/tyang816/DeepET_Topt_AlphaFold2), [DeepET_Topt_ESMFold](https://huggingface.co/datasets/tyang816/DeepET_Topt_ESMFold) |
207
+ | EC | 功能 | 蛋白质级别 | 多标签分类 | [EC_AlphaFold2](https://huggingface.co/datasets/tyang816/EC_AlphaFold2), [EC_ESMFold](https://huggingface.co/datasets/tyang816/EC_ESMFold) |
208
+ | GO_BP | 功能 | 蛋白质级别 | 多标签分类 | [GO_BP_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_BP_AlphaFold2), [GO_BP_ESMFold](https://huggingface.co/datasets/tyang816/GO_BP_ESMFold) |
209
+ | GO_CC | 功能 | 蛋白质级别 | 多标签分类 | [GO_CC_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_CC_AlphaFold2), [GO_CC_ESMFold](https://huggingface.co/datasets/tyang816/GO_CC_ESMFold) |
210
+ | GO_MF | 功能 | 蛋白质级别 | 多标签分类 | [GO_MF_AlphaFold2](https://huggingface.co/datasets/tyang816/GO_MF_AlphaFold2), [GO_MF_ESMFold](https://huggingface.co/datasets/tyang816/GO_MF_ESMFold) |
211
+ | MetalIonBinding | 结合 | 蛋白质级别 | 单标签分类 | [MetalIonBinding_AlphaFold2](https://huggingface.co/datasets/tyang816/MetalIonBinding_AlphaFold2), [MetalIonBinding_ESMFold](https://huggingface.co/datasets/tyang816/MetalIonBinding_ESMFold) |
212
+ | Thermostability | 稳定性 | 蛋白质级别 | 回归 | [Thermostability_AlphaFold2](https://huggingface.co/datasets/tyang816/Thermostability_AlphaFold2), [Thermostability_ESMFold](https://huggingface.co/datasets/tyang816/Thermostability_ESMFold) |
213
+
214
+ > 💡 每个数据集都提供了使用 AlphaFold2 和 ESMFold 生成的结构序列版本
215
+ </details>
216
+
217
+ <details><summary>监督微调数据集(氨基酸序列)</summary>
218
+
219
+ | 数据集 | 任务 | 数据层次 | 问题类型 | 数据来源 |
220
+ |-------|------|------------|----------|----------|
221
+ | Demo_Solubility | 溶解度 | 蛋白质级别 | 单标签分类 | [Demo_Solubility](https://huggingface.co/datasets/tyang816/Demo_Solubility) |
222
+ | DeepLocBinary | 定位 | 蛋白质级别 | 单标签分类 | [DeepLocBinary](https://huggingface.co/datasets/tyang816/DeepLocBinary) |
223
+ | DeepLocMulti | 定位 | 蛋白质级别 | 单标签分类 | [DeepLocMulti](https://huggingface.co/datasets/tyang816/DeepLocMulti) |
224
+ | DeepLoc2Multi | 定位 | 蛋白质级别 | 多标签分类 | [DeepLoc2Multi](https://huggingface.co/datasets/tyang816/DeepLoc2Multi) |
225
+ | DeepSol | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSol](https://huggingface.co/datasets/tyang816/DeepSol) |
226
+ | DeepSoluE | 溶解度 | 蛋白质级别 | 单标签分类 | [DeepSoluE](https://huggingface.co/datasets/tyang816/DeepSoluE) |
227
+ | ProtSolM | 溶解度 | 蛋白质级别 | 单标签分类 | [ProtSolM](https://huggingface.co/datasets/tyang816/ProtSolM) |
228
+ | eSOL | 溶解度 | 蛋白质级别 | 回归 | [eSOL](https://huggingface.co/datasets/tyang816/eSOL) |
229
+ | DeepET_Topt | 最适酶活 | 蛋白质级别 | 回归 | [DeepET_Topt](https://huggingface.co/datasets/tyang816/DeepET_Topt) |
230
+ | EC | 功能 | 蛋白质级别 | 多标签分类 | [EC](https://huggingface.co/datasets/tyang816/EC) |
231
+ | GO_BP | 功能 | 蛋白质级别 | 多标签分类 | [GO_BP](https://huggingface.co/datasets/tyang816/GO_BP) |
232
+ | GO_CC | 功能 | 蛋白质级别 | 多标签分类 | [GO_CC](https://huggingface.co/datasets/tyang816/GO_CC) |
233
+ | GO_MF | 功能 | 蛋白质级别 | 多标签分类 | [GO_MF](https://huggingface.co/datasets/tyang816/GO_MF) |
234
+ | MetalIonBinding | 结合 | 蛋白质级别 | 单标签分类 | [MetalIonBinding](https://huggingface.co/datasets/tyang816/MetalIonBinding) |
235
+ | Thermostability | 稳定性 | 蛋白质级别 | 回归 | [Thermostability](https://huggingface.co/datasets/tyang816/Thermostability) |
236
+ | PaCRISPR | CRISPR | 蛋白质级别 | 回归 | [PaCRISPR](https://huggingface.co/datasets/tyang816/PaCRISPR) |
237
+ | PETA_CHS_Sol | 溶解度 | 蛋白质级别 | 回归 | [PETA_CHS_Sol](https://huggingface.co/datasets/tyang816/PETA_CHS_Sol) |
238
+ | PETA_LGK_Sol | 溶解度 | 蛋白质级别 | 回归 | [PETA_LGK_Sol](https://huggingface.co/datasets/tyang816/PETA_LGK_Sol) |
239
+ | PETA_TEM_Sol | 溶解度 | 蛋白质级别 | 回归 | [PETA_TEM_Sol](https://huggingface.co/datasets/tyang816/PETA_TEM_Sol) |
240
+ | SortingSignal | 信号肽 | 蛋白质级别 | 回归 | [SortingSignal](https://huggingface.co/datasets/tyang816/SortingSignal) |
241
+ | FLIP_AAV | 突变 | 蛋白质点位 | 回归 | |
242
+ | FLIP_AAV_one-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_one-vs-rest) |
243
+ | FLIP_AAV_two-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_two-vs-rest) |
244
+ | FLIP_AAV_mut-des | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_mut-des](https://huggingface.co/datasets/tyang816/FLIP_AAV_mut-des) |
245
+ | FLIP_AAV_des-mut | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_des-mut](https://huggingface.co/datasets/tyang816/FLIP_AAV_des-mut) |
246
+ | FLIP_AAV_seven-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_seven-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_AAV_seven-vs-rest) |
247
+ | FLIP_AAV_low-vs-high | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_AAV_low-vs-high) |
248
+ | FLIP_AAV_sampled | 突变 | 蛋白质点位 | 回归 | [FLIP_AAV_sampled](https://huggingface.co/datasets/tyang816/FLIP_AAV_sampled) |
249
+ | FLIP_GB1 | 突变 | 蛋白质点位 | 回归 | |
250
+ | FLIP_GB1_one-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_one-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_one-vs-rest) |
251
+ | FLIP_GB1_two-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_two-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_two-vs-rest) |
252
+ | FLIP_GB1_three-vs-rest | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_three-vs-rest](https://huggingface.co/datasets/tyang816/FLIP_GB1_three-vs-rest) |
253
+ | FLIP_GB1_low-vs-high | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_low-vs-high](https://huggingface.co/datasets/tyang816/FLIP_GB1_low-vs-high) |
254
+ | FLIP_GB1_sampled | 突变 | 蛋白质点位 | 回归 | [FLIP_GB1_sampled](https://huggingface.co/datasets/tyang816/FLIP_GB1_sampled) |
255
+ | TAPE_Fluorescence | 突变 | 蛋白质点位 | 回归 | [TAPE_Fluorescence](https://huggingface.co/datasets/tyang816/TAPE_Fluorescence) |
256
+ | TAPE_Stability | 突变 | 蛋白质点位 | 回归 | [TAPE_Stability](https://huggingface.co/datasets/tyang816/TAPE_Stability) |
257
+
258
+
259
+ > 💡 不同数据集的序列结构不同,例如 ``DeepLocBinary_ESMFold`` 和 ``DeepLocBinary_AlphaFold2`` 共享相同的氨基酸序列,因此如果您只想使用 ``aa_seqs``,两者都可以使用!
260
+
261
+ </details>
262
+
263
+
264
+ ## 📈 支持的评估指标
265
+ <details>
266
+ <summary>支持的评估指标</summary>
267
+
268
+ | 名称 | Torchmetrics | 问题类型 |
269
+ |------|--------------|----------|
270
+ | accuracy | Accuracy | 单标签分类/多标签分类 |
271
+ | recall | Recall | 单标签分类/多标签分类 |
272
+ | precision | Precision | 单标签分类/多标签分类 |
273
+ | f1 | F1Score | 单标签分类/多标签分类 |
274
+ | mcc | MatthewsCorrCoef | 单标签分类/多标签分类 |
275
+ | auc | AUROC | 单标签分类/多标签分类 |
276
+ | f1_max | F1ScoreMax | 多标签分类 |
277
+ | spearman_corr | SpearmanCorrCoef | 回归 |
278
+ | mse | MeanSquaredError | 回归 |
279
+ </details>
280
+ ## ✈️ 环境要求
281
+
282
+ ### 硬件要求
283
+ - 推荐:NVIDIA RTX 3090 (24GB) 或更好
284
+ - 实际要求取决于您选择的蛋白质语言模型
285
+
286
+ ### 软件要求
287
+ - [Anaconda3](https://www.anaconda.com/download) 或 [Miniconda3](https://docs.conda.io/projects/miniconda/en/latest/)
288
+ - Python 3.10
289
+
290
+ ## 📦 安装指南
291
+ <details><summary> 在macOS上开始</summary>
292
+
293
+ ## 为了获得最佳性能和体验,我们推荐使用带有M系列芯片的Mac设备(如 M1、M2、M3 等)
294
+
295
+ ## 1️⃣ 克隆仓库
296
+
297
+ 首先,从Github获取VenusFactory的代码:
298
+
299
+ ```bash
300
+ git clone https://github.com/tyang816/VenusFactory.git
301
+ cd VenusFactory
302
+ ```
303
+
304
+ ## 2️⃣ 创建Conda环境
305
+
306
+ 确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
307
+
308
+ ```bash
309
+ conda create -n venus python=3.10
310
+ conda activate venus
311
+ ```
312
+
313
+ ## 3️⃣ 安装PyTorch和PyG依赖项
314
+
315
+ ```bash
316
+ # 安装PyTorch
317
+ pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu
318
+
319
+ # 安装PyG依赖项
320
+ pip install torch_scatter torch-sparse torch-cluster torch-geometric -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
321
+ ```
322
+
323
+ ## 4️⃣ 安装其他依赖项
324
+
325
+ 使用`requirements_for_macOS.txt`安装剩余依赖项:
326
+ ```bash
327
+ pip install -r requirements_for_macOS.txt
328
+ ```
329
+ </details>
330
+
331
+ <details><summary> 在Windows或Linux上开始(使用CUDA 12.X)</summary>
332
+
333
+ ## 我们推荐使用CUDA 12.2
334
+
335
+
336
+ ## 1️⃣ 克隆仓库
337
+
338
+ 首先,从Github获取VenusFactory的代码:
339
+
340
+ ```bash
341
+ git clone https://github.com/tyang816/VenusFactory.git
342
+ cd VenusFactory
343
+ ```
344
+
345
+ ## 2️⃣ 创建Conda环境
346
+
347
+ 确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
348
+
349
+
350
+ ```bash
351
+ conda create -n venus python=3.10
352
+ conda activate venus
353
+ ```
354
+
355
+ ## 3️⃣ 安装PyTorch和PyG依赖项
356
+
357
+ ```bash
358
+ # 安装PyTorch
359
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu121
360
+
361
+ # 安装PyG依赖项
362
+ pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
363
+ pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
364
+ ```
365
+
366
+ ## 4️⃣ 安装其他依赖项
367
+
368
+ 使用`requirements.txt`安装剩余依赖项:
369
+ ```bash
370
+ pip install -r requirements.txt
371
+ ```
372
+ </details>
373
+
374
+ <details><summary> 在Windows或Linux上开始(使用CUDA 11.X)</summary>
375
+
376
+ ## 我们推荐使用CUDA 11.8或更高版本,因为它们支持更高版本的PyTorch,提供更好的体验。
377
+
378
+
379
+ ## 1️⃣ 克隆仓库
380
+
381
+ 首先,从Github获取VenusFactory的代码:
382
+
383
+ ```bash
384
+ git clone https://github.com/tyang816/VenusFactory.git
385
+ cd VenusFactory
386
+ ```
387
+
388
+ ## 2️⃣ 创建Conda环境
389
+
390
+ 确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
391
+
392
+
393
+ ```bash
394
+ conda create -n venus python=3.10
395
+ conda activate venus
396
+ ```
397
+
398
+ ## 3️⃣ 安装PyTorch和PyG依赖项
399
+
400
+ ```bash
401
+ # 安装PyTorch
402
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu118
403
+
404
+ # 安装PyG依赖项
405
+ pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
406
+ pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cu118.html
407
+ ```
408
+
409
+ ## 4️⃣ 安装其他依赖项
410
+
411
+ 使用`requirements.txt`安装剩余依赖项:
412
+ ```bash
413
+ pip install -r requirements.txt
414
+ ```
415
+ </details>
416
+
417
+ <details><summary> 在Windows或Linux上开始(使用CPU)</summary>
418
+
419
+ ## 1️⃣ 克隆仓库
420
+
421
+ 首先,从Github获取VenusFactory的代码:
422
+
423
+ ```bash
424
+ git clone https://github.com/tyang816/VenusFactory.git
425
+ cd VenusFactory
426
+ ```
427
+
428
+ ## 2️⃣ 创建Conda环境
429
+
430
+ 确保已安装Anaconda或Miniconda。然后,创建一个名为`venus`的新环境,使用Python 3.10:
431
+
432
+
433
+ ```bash
434
+ conda create -n venus python=3.10
435
+ conda activate venus
436
+ ```
437
+
438
+ ## 3️⃣ 安装PyTorch和PyG依赖项
439
+
440
+ ```bash
441
+ # 安装PyTorch
442
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cpu
443
+
444
+ # 安装PyG依赖项
445
+ pip install torch_geometric==2.6.1 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
446
+ pip install --no-index torch_scatter==2.1.2 -f https://pytorch-geometric.com/whl/torch-2.5.1+cpu.html
447
+ ```
448
+
449
+ ## 4️⃣ 安装其他依赖项
450
+
451
+ 使用`requirements.txt`安装剩余依赖项:
452
+ ```bash
453
+ pip install -r requirements.txt
454
+ ```
455
+ </details>
456
+
457
+
458
+ ## 🚀 快速开始
459
+
460
+ ### 启动 Venus Web UI
461
+
462
+ 使用我们基于 [Gradio](https://github.com/gradio-app/gradio) 的直观图形界面快速开始:
463
+
464
+ ```bash
465
+ python ./src/webui.py
466
+ ```
467
+
468
+ 您可以:
469
+ - 配置并运行微调实验
470
+ - 监控训练进度
471
+ - 评估模型
472
+ - 可视化结果
473
+
474
+ ### 使用各个标签页
475
+
476
+ 我们提供详细的指南帮助您浏览每个标签页。
477
+
478
+ <details>
479
+ <summary>1. 训练标签页:训练您自己的蛋白质语言模型</summary>
480
+
481
+ ![Model_Dataset_Config](img/Train/Model_Dataset_Config.png)
482
+
483
+ 从下拉菜单中选择蛋白质语言模型。上传您的数据集或选择可用数据集,并选择适合您问题类型的评估指标。
484
+
485
+ ![Training_Parameters](img/Train/Training_Parameters.png)
486
+ 选择训练方法(Freeze、SES-Adapter、LoRA、QLoRA等)并配置训练参数(批量大小、学习率等)。
487
+
488
+ ![Preview_Command](img/Train/Preview_Command.png)
489
+ ![Training_Progress](img/Train/Training_Progress.png)
490
+ ![Best_Model](img/Train/Best_Model.png)
491
+ ![Monitor_Figs](img/Train/Monitor_Figs.png)
492
+ 点击"开始训练"并实时监控进度。
493
+
494
+ <p align="center">
495
+ <img src="img/Train/Metric_Results.png" width="60%" alt="Metric_Results">
496
+ </p>
497
+
498
+ 点击"下载CSV"下载测试指标结果。
499
+ </details>
500
+
501
+ <details>
502
+ <summary>2. 评估标签页:在基准测试中评估您的训练模型</summary>
503
+
504
+ ![Model_Dataset_Config](img/Eval/Model_Dataset_Config.png)
505
+
506
+ 通过指定模型路径加载您的训练模型。选择训练时使用的相同蛋白质语言模型和模型配置。选择测试数据集并配置批量大小。选择适合您问题类型的评估指标。最后,点击"开始评估"查看性能指标。
507
+ </details>
508
+
509
+ <details>
510
+ <summary>3. 预测标签页:使用您的训练模型进行样本预测</summary>
511
+
512
+ ![Predict_Tab](img/Predict/Predict_Tab.png)
513
+
514
+ 通过指定模型路径加载您的训练模型。选择训练时使用的相同蛋白质语言模型和模型配置。
515
+
516
+ 单序列预测:在文本框中输入蛋白质序列。
517
+
518
+ 批量预测:上传包含序列的CSV文件。
519
+
520
+ ![Batch](img/Predict/Batch.png)
521
+
522
+ 点击"预测"生成并查看结果。
523
+ </details>
524
+
525
+ <details>
526
+ <summary>4. 下载标签页:高效收集来自不同来源的数据</summary>
527
+
528
+ - **AlphaFold2结构**:输入UniProt ID下载蛋白质结构
529
+ - **UniProt**:使用关键词或ID搜索蛋白质信息
530
+ - **InterPro**:获取蛋白质家族和结构域信息
531
+ - **RCSB PDB**:下载实验蛋白质结构
532
+ </details>
533
+
534
+ <details>
535
+ <summary>5. 手册标签页:详细文档和指南</summary>
536
+
537
+ 选择语言(英文/中文)。
538
+
539
+ 使用目录导航文档并找到分步指南。
540
+ </details>
541
+
542
+ ## 🧬 命令行使用
543
+
544
+ 对于偏好命令行界面的用户,我们提供全面的脚本解决方案。
545
+
546
+ <details>
547
+ <summary>训练方法:适应不同需求的各种微调方法</summary>
548
+
549
+ ### 全模型微调
550
+ ```bash
551
+ # 冻结微调:训练特定层同时冻结其他层
552
+ bash ./script/train/train_plm_vanilla.sh
553
+ ```
554
+
555
+ ### 参数高效微调 (PEFT)
556
+ ```bash
557
+ # SES-Adapter:选择性和高效的适配器微调
558
+ bash ./script/train/train_plm_ses-adapter.sh
559
+
560
+ # AdaLoRA:自适应低秩适配
561
+ bash ./script/train/train_plm_adalora.sh
562
+
563
+ # QLoRA:量化低秩适配
564
+ bash ./script/train/train_plm_qlora.sh
565
+
566
+ # LoRA:低秩适配
567
+ bash ./script/train/train_plm_lora.sh
568
+
569
+ # DoRA:双低秩适配
570
+ bash ./script/train/train_plm_dora.sh
571
+
572
+ # IA3:通过抑制和放大内部激活的注入适配器
573
+ bash ./script/train/train_plm_ia3.sh
574
+ ```
575
+
576
+ #### 训练方法比较
577
+ | 方法 | 内存使用 | 训练速度 | 性能 |
578
+ |------|----------|----------|------|
579
+ | Freeze | 低 | 快 | 良好 |
580
+ | SES-Adapter | 中等 | 中等 | 更好 |
581
+ | AdaLoRA | 低 | 中等 | 更好 |
582
+ | QLoRA | 非常低 | 较慢 | 良好 |
583
+ | LoRA | 低 | 快 | 良好 |
584
+ | DoRA | 低 | 中等 | 更好 |
585
+ | IA3 | 非常低 | 快 | 良好 |
586
+
587
+ </details>
588
+
589
+ <details>
590
+ <summary>模型评估:全面的评估工具</summary>
591
+
592
+ ### 基础评估
593
+ ```bash
594
+ # 在测试集上评估模型性能
595
+ bash ./script/eval/eval.sh
596
+ ```
597
+
598
+ ### 可用指标
599
+ - 分类:准确率、精确率、召回率、F1、MCC、AUC
600
+ - 回归:MSE、Spearman相关系数
601
+ - 多标签:F1-max
602
+
603
+ ### 可视化工具
604
+ - 训练曲线
605
+ - 混淆矩阵
606
+ - ROC曲线
607
+ - 性能比较图
608
+
609
+ </details>
610
+
611
+ <details>
612
+ <summary>结构序列工具:处理蛋白质结构信息</summary>
613
+
614
+ ### ESM结构序列
615
+ ```bash
616
+ # 使用ESM-3生成结构序列
617
+ bash ./script/get_get_structure_seq/get_esm3_structure_seq.sh
618
+ ```
619
+
620
+ ### 二级结构
621
+ ```bash
622
+ # 预测蛋白质二级结构
623
+ bash ./script/get_get_structure_seq/get_secondary_structure_seq.sh
624
+ ```
625
+
626
+ 特点:
627
+ - 支持多种序列格式
628
+ - 批处理能力
629
+ - 与流行的结构预测工具集成
630
+
631
+ </details>
632
+
633
+ <details>
634
+ <summary>数据收集工具:多源蛋白质数据获取</summary>
635
+
636
+ ### 格式转换
637
+ ```bash
638
+ # 将CIF格式转换为PDB
639
+ bash ./crawler/convert/maxit.sh
640
+ ```
641
+
642
+ ### 元数据收集
643
+ ```bash
644
+ # 从RCSB PDB下载元数据
645
+ bash ./crawler/metadata/download_rcsb.sh
646
+ ```
647
+
648
+ ### 序列数据
649
+ ```bash
650
+ # 从UniProt下载蛋白质序列
651
+ bash ./crawler/sequence/download_uniprot_seq.sh
652
+ ```
653
+
654
+ ### 结构数据
655
+ ```bash
656
+ # 从AlphaFold2数据库下载
657
+ bash ./crawler/structure/download_alphafold.sh
658
+
659
+ # 从RCSB PDB下载
660
+ bash ./crawler/structure/download_rcsb.sh
661
+ ```
662
+
663
+ 特点:
664
+ - 自动批量下载
665
+ - 断点续传
666
+ - 数据完整性验证
667
+ - 多源支持
668
+ - 自定义搜索条件
669
+
670
+ #### 支持的数据库
671
+ | 数据库 | 数据类型 | 访问方式 | 速率限制 |
672
+ |--------|----------|----------|----------|
673
+ | AlphaFold2 | 结构 | REST API | 是 |
674
+ | RCSB PDB | 结构 | FTP/HTTP | 否 |
675
+ | UniProt | 序列 | REST API | 是 |
676
+ | InterPro | 结构域 | REST API | 是 |
677
+
678
+ </details>
679
+
680
+ <details>
681
+ <summary>使用示例:常见场景和解决方案</summary>
682
+
683
+ ### 训练示例
684
+ ```bash
685
+ # 使用ESM2训练蛋白质溶解度预测器
686
+ bash ./script/train/train_plm_lora.sh \
687
+ --model "facebook/esm2_t33_650M_UR50D" \
688
+ --dataset "DeepSol" \
689
+ --batch_size 32 \
690
+ --learning_rate 1e-4
691
+ ```
692
+
693
+ ### 评估示例
694
+ ```bash
695
+ # 评估训练好的模型
696
+ bash ./script/eval/eval.sh \
697
+ --model_path "path/to/your/model" \
698
+ --test_dataset "DeepSol_test"
699
+ ```
700
+
701
+ ### 数据收集示例
702
+ ```bash
703
+ # 下载UniProt ID列表对应的结构
704
+ bash ./crawler/structure/download_alphafold.sh \
705
+ --input uniprot_ids.txt \
706
+ --output ./structures
707
+ ```
708
+
709
+ </details>
710
+
711
+ > 💡 所有脚本都支持额外的命令行参数进行自定义。使用任何脚本的 `--help` 选项查看可用选项。
712
+
713
+ ## 🙌 引用
714
+
715
+ 如果您使用了我们的代码或数据,请引用我们的工作:
716
+
717
+ ```bibtex
718
+ @article{tan2025venusfactory,
719
+ title={VenusFactory: A Unified Platform for Protein Engineering Data Retrieval and Language Model Fine-Tuning},
720
+ author={Tan, Yang and Liu, Chen and Gao, Jingyuan and Wu, Banghao and Li, Mingchen and Wang, Ruilin and Zhang, Lingrong and Yu, Huiqun and Fan, Guisheng and Hong, Liang and Zhou, Bingxin},
721
+ journal={arXiv preprint arXiv:2503.15438},
722
+ year={2025}
723
+ }
724
+ ```
725
+
726
+ ## 🎊 致谢
727
+
728
+ 感谢 [Liang's Lab](https://ins.sjtu.edu.cn/people/lhong/index.html) 的支持。
Scripts_notebook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
WebUI_demo.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quick Demo Guide
2
+
3
+ This document provides a comprehensive guide to help you quickly understand the main features of VenusFactory and perform fine-tuning, evaluation, and prediction on a demo dataset for protein solubility prediction.
4
+
5
+ ## 1. Environment Preparation
6
+
7
+ Before starting, please ensure that you have successfully installed **VenusFactory** and correctly configured the corresponding environment and Python dependencies. If not yet installed, please refer to the **✈️ Requirements** section in [README.md](README.md) for installation instructions.
8
+
9
+ ## 2. Launch Web Interface
10
+
11
+ Enter the following command in the command line to launch the Web UI:
12
+
13
+ ```bash
14
+ python src/webui.py
15
+ ```
16
+
17
+ ## 3. Training (Training Tab)
18
+
19
+ ### 3.1 Select Pre-trained Model
20
+
21
+ Choose a suitable pre-trained model from the Protein Language Model dropdown. It is recommended to start with ESM2-8M, which has lower computational cost and is suitable for beginners.
22
+
23
+ ### 3.2 Select Dataset
24
+
25
+ In the Dataset Configuration section, select the Demo_Solubility dataset (default option). Click the Preview Dataset button to preview the dataset content.
26
+
27
+ ### 3.3 Set Task Parameters
28
+
29
+ - Problem Type, Number of Labels, and Metrics options will be automatically filled when selecting a Pre-defined Dataset.
30
+
31
+ - For Batch Processing Mode, it is recommended to select Batch Token Mode to avoid uneven batch processing due to high variance in protein sequence lengths.
32
+
33
+ - Batch Token is recommended to be set to 4000. If you encounter CUDA memory errors, you can reduce this value accordingly.
34
+
35
+ ### 3.4 Choose Training Method
36
+
37
+ In the Training Parameters section:
38
+
39
+ - Training Method is a key selection. This Demo dataset does not currently support the SES-Adapter method (due to lack of structural sequence information).
40
+
41
+ - You can choose the Freeze method to only fine-tune the classification head, or use the LoRA method for efficient parameter fine-tuning.
42
+
43
+ ### 3.5 Start Training
44
+
45
+ - Click Preview Command to preview the command line script.
46
+
47
+ - Click Start to begin training. The Web interface will display model statistics and real-time training monitoring.
48
+
49
+ - After training is complete, the interface will show the model's Metrics on the test set to evaluate model performance.
50
+
51
+ ## 4. Evaluation (Evaluation Tab)
52
+
53
+ ### 4.1 Select Model Path
54
+
55
+ In the **Model Path** option, enter the path of the trained model (under the `ckpt` root directory). Ensure that the selected **PLM** and **method** are consistent with those used during training.
56
+
57
+ ### 4.2 Evaluation Dataset Loading Rules
58
+
59
+ - The evaluation system will automatically load the test set of the corresponding dataset.
60
+ - If the test set cannot be found, data will be loaded in the order of **validation set → training set**.
61
+ - For custom datasets uploaded to Hugging Face:
62
+ - **If only a single CSV file is uploaded**, the evaluation system will automatically load that file, regardless of naming.
63
+ - **If training, validation, and test sets are uploaded**, please ensure accurate file naming.
64
+
65
+ ### 4.3 Start Evaluation
66
+
67
+ Click **Start Evaluation** to begin the evaluation.
68
+
69
+ > **Example Model**
70
+ > This project provides a model **demo_provided.pt** that has already been trained on the **Demo_Solubility** dataset using the **Freeze** method, which can be used directly for evaluation.
71
+
72
+ ## 5. Prediction (Prediction Tab)
73
+
74
+ ### 5.1 Single Sequence Prediction
75
+
76
+ Enter a single amino acid sequence to directly predict its solubility.
77
+
78
+ ### 5.2 Batch Prediction
79
+
80
+ - By uploading a CSV file, you can predict the solubility of proteins in batch and download the results (in CSV format).
81
+
82
+ ## 6. Download (Download Tab)
83
+
84
+ For detailed instructions and examples regarding the **Download Tab**, please refer to the **Download** section in the **Manual Tab**.
WebUI_demo_CN.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 快速Demo指南
2
+
3
+ 本文档提供了一个全面的指南,帮助您快速了解VenusFactory的主要功能,并在一个蛋白质可溶性预测的Demo数据集上进行微调训练、评估和预测。
4
+
5
+ ## 1. 环境准备
6
+
7
+ 在开始之前,请确保您已成功安装 **VenusFactory** 并正确配置了相应的环境和 Python 依赖包。如果尚未安装,请参考 [README.md](README_CN.md) 中的 **✈️ Requirements** 章节进行安装。
8
+
9
+ ## 2. 启动 Web 界面
10
+
11
+ 在命令行中输入以下命令,启动 Web UI:
12
+ ```bash
13
+ python src/webui.py
14
+ ```
15
+
16
+ ## 3. 训练(Training Tab)
17
+
18
+ ### 3.1 选择预训练模型
19
+
20
+ 在 Protein Language Model 选项中选择合适的预训练模型。建议从 ESM2-8M 开始,该模型计算成本较低,便于快速上手。
21
+
22
+ ### 3.2 选择数据集
23
+
24
+ 在 Dataset Configuration 选项中,选择 Demo_Solubility 数据集(默认选项)。点击 Preview Dataset 按钮可预览数据集内容。
25
+
26
+ ### 3.3 设定任务参数
27
+
28
+ - Problem Type、Number of Labels 和 Metrics 选项会在选择 Pre-defined Dataset 时自动填充。
29
+
30
+ - Batch Processing Mode 建议选择 Batch Token Mode,以避免蛋白质序列长度方差过大导致批处理不均。
31
+
32
+ - Batch Token 推荐设为 4000,若出现 CUDA 内存不足错误,可适当减小该值。
33
+
34
+ ### 3.4 选择训练方法
35
+
36
+ 在 Training Parameters 选项中:
37
+
38
+ - Training Method 为关键选择项。本 Demo 数据集暂不支持 SES-Adapter 方法(因缺乏结构序列信息)。
39
+
40
+ - 可选择 Freeze 方法,仅微调分类头,或采用 LoRA 方法进行高效参数微调。
41
+
42
+ ### 3.5 开始训练
43
+
44
+ - 点击 Preview Command 预览命令行脚本。
45
+
46
+ - 点击 Start 启动训练,Web 界面会显示模型的统计信息和实时训练监控。
47
+
48
+ - 训练完成后,界面会展示模型在测试集上的 Metrics,用于评估模型效果。
49
+
50
+ ## 4. 评估(Evaluation Tab)
51
+
52
+ ### 4.1 选择模型路径
53
+
54
+ 在 **Model Path** 选项中,输入训练完成的模型路径(`ckpt` 根目录下)。确保选择的 **PLM** 和 **method** 与训练时一致。
55
+
56
+ ### 4.2 评估数据集加载规则
57
+
58
+ - 评估系统会自动加载相应数据集的测试集。
59
+ - 若找不到测试集,则按照 **验证集 → 训练集** 的顺序加载数据。
60
+ - 上传到 Hugging Face 的自定义数据集:
61
+ - **若仅上传单个 CSV 文件**,评估系统会自动加载该文件,不受命名影响。
62
+ - **若上传训练集、验证集和测试集**,请确保文件命名准确。
63
+
64
+ ### 4.3 启动评估
65
+
66
+ 点击 **Start Evaluation** 进行评估。
67
+
68
+ > **示例模型**
69
+ > 本项目提供了一个已经在 **Demo_Solubility** 数据集上使用 **Freeze** 方法训练的模型 **demo_provided.pt**,可直接用于评估。
70
+
71
+ ## 5. 预测(Prediction Tab)
72
+
73
+ ### 5.1 单序列预测(Sequence Prediction)
74
+
75
+ 输入单个氨基酸序列,即可直接进行可溶性预测。
76
+
77
+ ### 5.2 批量预测(Batch Prediction)
78
+
79
+ - 通过上传 CSV 文件,可批量预测蛋白质的可溶性,并下载结果(CSV 格式)。
80
+
81
+ ## 6. 下载(Download Tab)
82
+
83
+ 有关 **Download Tab** 的详细使用说明和示例,请参考 **Manual Tab** 中的 **Download** 章节。
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import time
3
+ import gradio as gr
4
+ from web.utils.monitor import TrainingMonitor
5
+ from web.train_tab import create_train_tab
6
+ from web.eval_tab import create_eval_tab
7
+ from web.download_tab import create_download_tab
8
+ from web.predict_tab import create_predict_tab
9
+ from web.manual_tab import create_manual_tab
10
+
11
def load_constant():
    """Load shared UI constants from ``src/constant.json``.

    Returns:
        dict: The parsed JSON configuration on success, or a dict with a
        single ``"error"`` key describing the failure (missing file,
        malformed JSON, ...). Callers can therefore always treat the
        result as a dict.
    """
    try:
        # Use a context manager so the file handle is closed promptly
        # (the previous bare open() leaked the descriptor), and pin the
        # encoding since JSON config files are UTF-8.
        with open("src/constant.json", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        return {"error": f"Failed to load constant.json: {str(e)}"}
17
+
18
def create_ui():
    """Build the VenusFactory Gradio application.

    Creates the Train / Eval / Predict / Download / Manual tabs and, when
    the training tab exposes its output widgets, wires a ``demo.load``
    callback that polls the TrainingMonitor for live status and plots.

    Returns:
        gr.Blocks: the assembled (not yet launched) Gradio app.
    """
    monitor = TrainingMonitor()
    constant = load_constant()

    def update_output():
        """Poll the training monitor and return (message, loss_plot, metrics_plot)."""
        try:
            if monitor.is_training:
                messages = monitor.get_messages()
                loss_plot = monitor.get_loss_plot()
                metrics_plot = monitor.get_metrics_plot()
                return messages, loss_plot, metrics_plot
            else:
                if monitor.error_message:
                    return f"Training stopped with error:\n{monitor.error_message}", None, None
                return "Click Start to begin training!", None, None
        except Exception as e:
            return f"Error in UI update: {str(e)}", None, None

    with gr.Blocks() as demo:
        gr.Markdown("# VenusFactory")

        # Create tabs
        with gr.Tabs():
            try:
                train_components = {"output_text": None, "loss_plot": None, "metrics_plot": None}
                train_tab = create_train_tab(constant)
                # BUG FIX: the original guarded these assignments with
                # `if train_components[...] is not None`, but the dict was
                # just initialized to all-None, so the components were never
                # copied and the monitor polling below was never wired up.
                # Copy whatever the train tab actually provides instead.
                # assumes create_train_tab returns a dict-like with these
                # keys — TODO confirm against web/train_tab.py
                for key in ("output_text", "loss_plot", "metrics_plot"):
                    train_components[key] = train_tab.get(key)
                eval_components = create_eval_tab(constant)
                predict_components = create_predict_tab(constant)
                download_components = create_download_tab(constant)
                manual_components = create_manual_tab(constant)
            except Exception as e:
                # Show the failure in the UI rather than crashing, and keep
                # train_components in a consistent all-None state.
                gr.Markdown(f"Error creating UI components: {str(e)}")
                train_components = {"output_text": None, "loss_plot": None, "metrics_plot": None}

        # Only attach the live-monitor callback when all three widgets exist.
        if train_components["output_text"] is not None and train_components["loss_plot"] is not None and train_components["metrics_plot"] is not None:
            demo.load(
                fn=update_output,
                inputs=None,
                outputs=[
                    train_components["output_text"],
                    train_components["loss_plot"],
                    train_components["metrics_plot"]
                ]
            )

    return demo
68
+
69
if __name__ == "__main__":
    # Entry point: build the Gradio app and serve it on all interfaces with a
    # public share link; report startup failures as a plain message instead of
    # a raw traceback.
    try:
        app = create_ui()
        app.launch(server_name="0.0.0.0", share=True, allowed_paths=["img"])
    except Exception as e:
        print(f"Failed to launch UI: {str(e)}")
ckpt/demo/demo.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hidden_size": 1280, "num_attention_head": 8, "attention_probs_dropout": 0.1, "plm_model": "facebook/esm2_t33_650M_UR50D", "pooling_method": "mean", "pooling_dropout": 0.1, "dataset": "tyang816/FLIP_AAV_des-mut", "dataset_config": "data/FLIP_AAV/FLIP_AAV_des-mut_HF.json", "normalize": "min_max", "num_labels": 1, "problem_type": "regression", "pdb_type": null, "train_file": null, "valid_file": null, "test_file": null, "metrics": ["spearman_corr"], "seed": 3407, "learning_rate": 0.0005, "scheduler": null, "warmup_steps": 0, "num_workers": 4, "batch_size": null, "batch_token": 4000, "num_epochs": 5, "max_seq_len": -1, "gradient_accumulation_steps": 1, "max_grad_norm": -1.0, "patience": 10, "monitor": "spearman_corr", "monitor_strategy": "max", "training_method": "freeze", "lora_r": 8, "lora_alpha": 32, "lora_dropout": 0.1, "feedforward_modules": "w0", "lora_target_modules": ["query", "key", "value"], "structure_seq": [], "output_model_name": "demo.pt", "output_root": "ckpt", "output_dir": "ckpt\\demo", "wandb": false, "wandb_entity": null, "wandb_project": "VenusFactory", "wandb_run_name": null}
ckpt/demo/demo.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85a61422d6f469c4dc94823bbdfcba090377c4235bca1f9e5768d1c89f853113
3
+ size 6576362
ckpt/demo/demo_provided.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hidden_size": 320, "num_attention_head": 8, "attention_probs_dropout": 0.1, "plm_model": "facebook/esm2_t6_8M_UR50D", "pooling_method": "mean", "pooling_dropout": 0.1, "dataset": "tyang816/Demo_Solubility", "dataset_config": "data/Demo/Demo_Solubility_HF.json", "normalize": null, "num_labels": 2, "problem_type": "single_label_classification", "pdb_type": null, "train_file": null, "valid_file": null, "test_file": null, "metrics": ["accuracy", "mcc", "f1", "precision", "recall", "auroc"], "seed": 3407, "learning_rate": 0.0005, "scheduler": null, "warmup_steps": 0, "num_workers": 4, "batch_size": null, "batch_token": 4000, "num_epochs": 20, "max_seq_len": -1, "gradient_accumulation_steps": 1, "max_grad_norm": -1.0, "patience": 10, "monitor": "accuracy", "monitor_strategy": "max", "training_method": "freeze", "lora_r": 8, "lora_alpha": 32, "lora_dropout": 0.1, "feedforward_modules": "w0", "lora_target_modules": ["query", "key", "value"], "structure_seq": [], "output_model_name": "demo_provided.pt", "output_root": "ckpt", "output_dir": "ckpt/demo", "wandb": false, "wandb_entity": null, "wandb_project": "VenusFactory", "wandb_run_name": null}
ckpt/demo/demo_provided.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b46ee577312579dee0906b3cdbd23d30d40bbb8a8ce873cba85abbf694c125e
3
+ size 418692
data/DeepET_Topt/DeepET_Topt_AlphaFold2_HF.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepET_Topt_AlphaFold2",
3
+ "pdb_type": "AlphaFold2",
4
+ "num_labels": 1,
5
+ "problem_type": "regression",
6
+ "metrics": "mse,spearman_corr",
7
+ "monitor": "mse",
8
+ "monitor_strategy": "min",
9
+ "normalize": "standard"
10
+ }
data/DeepET_Topt/DeepET_Topt_ESMFold_HF.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepET_Topt_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 1,
5
+ "problem_type": "regression",
6
+ "metrics": "mse,spearman_corr",
7
+ "monitor": "mse",
8
+ "monitor_strategy": "min",
9
+ "normalize": "standard"
10
+ }
data/DeepET_Topt/DeepET_Topt_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepET_Topt",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "mse,spearman_corr",
6
+ "monitor": "mse",
7
+ "monitor_strategy": "min",
8
+ "normalize": "standard"
9
+ }
data/DeepLoc2Multi/DeepLoc2Multi_AlphaFold2_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLoc2Multi_AlphaFold2",
3
+ "pdb_type": "AlphaFold2",
4
+ "num_labels": 10,
5
+ "problem_type": "multi_label_classification",
6
+ "metrics": "f1_max",
7
+ "monitor": "f1_max",
8
+ "monitor_strategy": "max"
9
+ }
data/DeepLoc2Multi/DeepLoc2Multi_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLoc2Multi",
3
+ "num_labels": 10,
4
+ "problem_type": "multi_label_classification",
5
+ "metrics": "f1_max",
6
+ "monitor": "f1_max",
7
+ "monitor_strategy": "max"
8
+ }
data/DeepLocBinary/DeepLocBinary_AlphaFold2_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLocBinary_AlphaFold2",
3
+ "pdb_type": "AlphaFold2",
4
+ "num_labels": 2,
5
+ "problem_type": "single_label_classification",
6
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
7
+ "monitor": "accuracy",
8
+ "monitor_strategy": "max"
9
+ }
data/DeepLocBinary/DeepLocBinary_ESMFold_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLocBinary_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 2,
5
+ "problem_type": "single_label_classification",
6
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
7
+ "monitor": "accuracy",
8
+ "monitor_strategy": "max"
9
+ }
data/DeepLocBinary/DeepLocBinary_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLocBinary",
3
+ "num_labels": 2,
4
+ "problem_type": "single_label_classification",
5
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
6
+ "monitor": "accuracy",
7
+ "monitor_strategy": "max"
8
+ }
data/DeepLocMulti/DeepLocMulti_AlphaFold2_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLocMulti_AlphaFold2",
3
+ "pdb_type": "AlphaFold2",
4
+ "num_labels": 10,
5
+ "problem_type": "single_label_classification",
6
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
7
+ "monitor": "accuracy",
8
+ "monitor_strategy": "max"
9
+ }
data/DeepLocMulti/DeepLocMulti_ESMFold_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLocMulti_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 10,
5
+ "problem_type": "single_label_classification",
6
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
7
+ "monitor": "accuracy",
8
+ "monitor_strategy": "max"
9
+ }
data/DeepLocMulti/DeepLocMulti_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepLocMulti",
3
+ "num_labels": 10,
4
+ "problem_type": "single_label_classification",
5
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
6
+ "monitor": "accuracy",
7
+ "monitor_strategy": "max"
8
+ }
data/DeepSol/DeepSol_ESMFold_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepSol_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 2,
5
+ "problem_type": "single_label_classification",
6
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
7
+ "monitor": "accuracy",
8
+ "monitor_strategy": "max"
9
+ }
data/DeepSol/DeepSol_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepSol",
3
+ "num_labels": 2,
4
+ "problem_type": "single_label_classification",
5
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
6
+ "monitor": "accuracy",
7
+ "monitor_strategy": "max"
8
+ }
data/DeepSoluE/DeepSoluE_ESMFold_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepSoluE_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 2,
5
+ "problem_type": "single_label_classification",
6
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
7
+ "monitor": "accuracy",
8
+ "monitor_strategy": "max"
9
+ }
data/DeepSoluE/DeepSoluE_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/DeepSoluE",
3
+ "num_labels": 2,
4
+ "problem_type": "single_label_classification",
5
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
6
+ "monitor": "accuracy",
7
+ "monitor_strategy": "max"
8
+ }
data/Demo/Demo_Solubility_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/Demo_Solubility",
3
+ "num_labels": 2,
4
+ "problem_type": "single_label_classification",
5
+ "metrics": "accuracy,mcc,f1,precision,recall,auroc",
6
+ "monitor": "accuracy",
7
+ "monitor_strategy": "max"
8
+ }
data/EC/EC_AlphaFold2_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/EC_AlphaFold2",
3
+ "pdb_type": "AlphaFold2",
4
+ "num_labels": 585,
5
+ "problem_type": "multi_label_classification",
6
+ "metrics": "f1_max",
7
+ "monitor": "f1_max",
8
+ "monitor_strategy": "max"
9
+ }
data/EC/EC_ESMFold_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/EC_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 585,
5
+ "problem_type": "multi_label_classification",
6
+ "metrics": "f1_max",
7
+ "monitor": "f1_max",
8
+ "monitor_strategy": "max"
9
+ }
data/EC/EC_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/EC",
3
+ "num_labels": 585,
4
+ "problem_type": "multi_label_classification",
5
+ "metrics": "f1_max",
6
+ "monitor": "f1_max",
7
+ "monitor_strategy": "max"
8
+ }
data/FLIP_AAV/FLIP_AAV_des-mut_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_AAV_des-mut",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_AAV/FLIP_AAV_low-vs-high_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_AAV_low-vs-high",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_AAV/FLIP_AAV_mut-des_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_AAV_mut-des",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_AAV/FLIP_AAV_one-vs-rest_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_AAV_one-vs-rest",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_AAV/FLIP_AAV_sampled_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_AAV_sampled",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_AAV/FLIP_AAV_seven-vs-rest_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_AAV_seven-vs-rest",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_AAV/FLIP_AAV_two-vs-rest_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_AAV_two-vs-rest",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_GB1/FLIP_GB1_low-vs-high_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_GB1_low-vs-high",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_GB1/FLIP_GB1_one-vs-rest_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_GB1_one-vs-rest",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_GB1/FLIP_GB1_sampled_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_GB1_sampled",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_GB1/FLIP_GB1_three-vs-rest_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_GB1_three-vs-rest",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/FLIP_GB1/FLIP_GB1_two-vs-rest_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/FLIP_GB1_two-vs-rest",
3
+ "num_labels": 1,
4
+ "problem_type": "regression",
5
+ "metrics": "spearman_corr",
6
+ "monitor": "spearman_corr",
7
+ "monitor_strategy": "max",
8
+ "normalize": "min_max"
9
+ }
data/GO_BP/GO_BP_AlphaFold2_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/GO_BP_AlphaFold2",
3
+ "pdb_type": "AlphaFold2",
4
+ "num_labels": 1943,
5
+ "problem_type": "multi_label_classification",
6
+ "metrics": "f1_max",
7
+ "monitor": "f1_max",
8
+ "monitor_strategy": "max"
9
+ }
data/GO_BP/GO_BP_ESMFold_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/GO_BP_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 1943,
5
+ "problem_type": "multi_label_classification",
6
+ "metrics": "f1_max",
7
+ "monitor": "f1_max",
8
+ "monitor_strategy": "max"
9
+ }
data/GO_BP/GO_BP_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/GO_BP",
3
+ "num_labels": 1943,
4
+ "problem_type": "multi_label_classification",
5
+ "metrics": "f1_max",
6
+ "monitor": "f1_max",
7
+ "monitor_strategy": "max"
8
+ }
data/GO_CC/GO_CC_AlphaFold2_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/GO_CC_AlphaFold2",
3
+ "pdb_type": "AlphaFold2",
4
+ "num_labels": 320,
5
+ "problem_type": "multi_label_classification",
6
+ "metrics": "f1_max",
7
+ "monitor": "f1_max",
8
+ "monitor_strategy": "max"
9
+ }
data/GO_CC/GO_CC_ESMFold_HF.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/GO_CC_ESMFold",
3
+ "pdb_type": "ESMFold",
4
+ "num_labels": 320,
5
+ "problem_type": "multi_label_classification",
6
+ "metrics": "f1_max",
7
+ "monitor": "f1_max",
8
+ "monitor_strategy": "max"
9
+ }
data/GO_CC/GO_CC_HF.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "tyang816/GO_CC",
3
+ "num_labels": 320,
4
+ "problem_type": "multi_label_classification",
5
+ "metrics": "f1_max",
6
+ "monitor": "f1_max",
7
+ "monitor_strategy": "max"
8
+ }