sakshi7502 committed on
Commit
6376749
·
verified ·
1 Parent(s): 5ee7b54

Upload 64 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. SVFT-main/LLM-Adapters/DATA_LICENSE +183 -0
  2. SVFT-main/LLM-Adapters/LICENSE +201 -0
  3. SVFT-main/LLM-Adapters/README.md +267 -0
  4. SVFT-main/LLM-Adapters/commonsense_evaluate.py +300 -0
  5. SVFT-main/LLM-Adapters/evaluate.py +302 -0
  6. SVFT-main/LLM-Adapters/export_hf_checkpoint.py +57 -0
  7. SVFT-main/LLM-Adapters/export_state_dict_checkpoint.py +125 -0
  8. SVFT-main/LLM-Adapters/finetune.py +438 -0
  9. SVFT-main/LLM-Adapters/ft-training_set/commonsense_15k.json +0 -0
  10. SVFT-main/LLM-Adapters/generate.py +191 -0
  11. SVFT-main/LLM-Adapters/lengths.ipynb +204 -0
  12. SVFT-main/LLM-Adapters/mathqa.py +27 -0
  13. SVFT-main/LLM-Adapters/multi_dataset_eval.py +49 -0
  14. SVFT-main/LLM-Adapters/peft/LICENSE +201 -0
  15. SVFT-main/LLM-Adapters/peft/Makefile +20 -0
  16. SVFT-main/LLM-Adapters/peft/pyproject.toml +36 -0
  17. SVFT-main/LLM-Adapters/peft/setup.py +76 -0
  18. SVFT-main/LLM-Adapters/peft/src/peft/__init__.py +55 -0
  19. SVFT-main/LLM-Adapters/peft/src/peft/mapping.py +202 -0
  20. SVFT-main/LLM-Adapters/peft/src/peft/peft_model.py +974 -0
  21. SVFT-main/LLM-Adapters/peft/src/peft/tuners/__init__.py +24 -0
  22. SVFT-main/LLM-Adapters/peft/src/peft/tuners/bottleneck.py +532 -0
  23. SVFT-main/LLM-Adapters/peft/src/peft/tuners/lora.py +624 -0
  24. SVFT-main/LLM-Adapters/peft/src/peft/tuners/p_tuning.py +159 -0
  25. SVFT-main/LLM-Adapters/peft/src/peft/tuners/prefix_tuning.py +101 -0
  26. SVFT-main/LLM-Adapters/peft/src/peft/tuners/prompt_tuning.py +120 -0
  27. SVFT-main/LLM-Adapters/peft/src/peft/utils/__init__.py +30 -0
  28. SVFT-main/LLM-Adapters/peft/src/peft/utils/adapters_utils.py +18 -0
  29. SVFT-main/LLM-Adapters/peft/src/peft/utils/config.py +169 -0
  30. SVFT-main/LLM-Adapters/peft/src/peft/utils/other.py +159 -0
  31. SVFT-main/LLM-Adapters/peft/src/peft/utils/save_and_load.py +96 -0
  32. SVFT-main/LLM-Adapters/peft/tests/__init__.py +0 -0
  33. SVFT-main/LLM-Adapters/peft/tests/test_config.py +96 -0
  34. SVFT-main/LLM-Adapters/peft/tests/test_peft_model.py +156 -0
  35. SVFT-main/LLM-Adapters/peft/tests/testing_common.py +103 -0
  36. SVFT-main/LLM-Adapters/peft/tests/testing_utils.py +49 -0
  37. SVFT-main/LLM-Adapters/picture.jpg +0 -0
  38. SVFT-main/LLM-Adapters/pyproject.toml +8 -0
  39. SVFT-main/LLM-Adapters/requirements.txt +9 -0
  40. SVFT-main/LLM-Adapters/run_commonsense.sh +33 -0
  41. SVFT-main/MetaMath/LICENSE +201 -0
  42. SVFT-main/MetaMath/README.MD +172 -0
  43. SVFT-main/MetaMath/data/README.md +7 -0
  44. SVFT-main/MetaMath/data/test/GSM8K_Backward.jsonl +0 -0
  45. SVFT-main/MetaMath/data/test/GSM8K_test.jsonl +0 -0
  46. SVFT-main/MetaMath/data/test/MATH_test.jsonl +0 -0
  47. SVFT-main/MetaMath/data/train/README.md +3 -0
  48. SVFT-main/MetaMath/eval_gsm8k.py +134 -0
  49. SVFT-main/MetaMath/eval_math.py +115 -0
  50. SVFT-main/MetaMath/requirements.txt +16 -0
SVFT-main/LLM-Adapters/DATA_LICENSE ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution License (ODC-By)
2
+ PREAMBLE
3
+ The Open Data Commons Attribution License is a license agreement intended to allow users to freely share, modify, and use this Database subject only to the attribution requirements set out in Section 4.
4
+
5
+ Databases can contain a wide variety of types of content (images, audiovisual material, and sounds all in the same database, for example), and so this license only governs the rights over the Database, and not the contents of the Database individually. Licensors may therefore wish to use this license together with another license for the contents.
6
+
7
+ Sometimes the contents of a database, or the database itself, can be covered by other rights not addressed here (such as private contracts, trademark over the name, or privacy rights / data protection rights over information in the contents), and so you are advised that you may have to consult other documents or clear other rights before doing activities not covered by this License.
8
+
9
+ The Licensor (as defined below)
10
+
11
+ and
12
+
13
+ You (as defined below)
14
+
15
+ agree as follows:
16
+
17
+ 1.0 DEFINITIONS OF CAPITALISED WORDS
18
+ “Collective Database” – Means this Database in unmodified form as part of a collection of independent databases in themselves that together are assembled into a collective whole. A work that constitutes a Collective Database will not be considered a Derivative Database.
19
+
20
+ “Convey” – As a verb, means Using the Database, a Derivative Database, or the Database as part of a Collective Database in any way that enables a Person to make or receive copies of the Database or a Derivative Database. Conveying does not include interaction with a user through a computer network, or creating and Using a Produced Work, where no transfer of a copy of the Database or a Derivative Database occurs.
21
+
22
+ “Contents” – The contents of this Database, which includes the information, independent works, or other material collected into the Database. For example, the contents of the Database could be factual data or works such as images, audiovisual material, text, or sounds.
23
+
24
+ “Database” – A collection of material (the Contents) arranged in a systematic or methodical way and individually accessible by electronic or other means offered under the terms of this License.
25
+
26
+ “Database Directive” – Means Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended or succeeded.
27
+
28
+ “Database Right” – Means rights resulting from the Chapter III (“sui generis”) rights in the Database Directive (as amended and as transposed by member states), which includes the Extraction and Re-utilisation of the whole or a Substantial part of the Contents, as well as any similar rights available in the relevant jurisdiction under Section 10.4.
29
+
30
+ “Derivative Database” – Means a database based upon the Database, and includes any translation, adaptation, arrangement, modification, or any other alteration of the Database or of a Substantial part of the Contents. This includes, but is not limited to, Extracting or Re-utilising the whole or a Substantial part of the Contents in a new Database.
31
+
32
+ “Extraction” – Means the permanent or temporary transfer of all or a Substantial part of the Contents to another medium by any means or in any form.
33
+
34
+ “License” – Means this license agreement and is both a license of rights such as copyright and Database Rights and an agreement in contract.
35
+
36
+ “Licensor” – Means the Person that offers the Database under the terms of this License.
37
+
38
+ “Person” – Means a natural or legal person or a body of persons corporate or incorporate.
39
+
40
+ “Produced Work” – a work (such as an image, audiovisual material, text, or sounds) resulting from using the whole or a Substantial part of the Contents (via a search or other query) from this Database, a Derivative Database, or this Database as part of a Collective Database.
41
+
42
+ “Publicly” – means to Persons other than You or under Your control by either more than 50% ownership or by the power to direct their activities (such as contracting with an independent consultant).
43
+
44
+ “Re-utilisation” – means any form of making available to the public all or a Substantial part of the Contents by the distribution of copies, by renting, by online or other forms of transmission.
45
+
46
+ “Substantial” – Means substantial in terms of quantity or quality or a combination of both. The repeated and systematic Extraction or Re-utilisation of insubstantial parts of the Contents may amount to the Extraction or Re-utilisation of a Substantial part of the Contents.
47
+
48
+ “Use” – As a verb, means doing any act that is restricted by copyright or Database Rights whether in the original medium or any other; and includes without limitation distributing, copying, publicly performing, publicly displaying, and preparing derivative works of the Database, as well as modifying the Database as may be technically necessary to use it in a different mode or format.
49
+
50
+ “You” – Means a Person exercising rights under this License who has not previously violated the terms of this License with respect to the Database, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.
51
+
52
+ Words in the singular include the plural and vice versa.
53
+
54
+ 2.0 WHAT THIS LICENSE COVERS
55
+ 2.1. Legal effect of this document. This License is:
56
+
57
+ a. A license of applicable copyright and neighbouring rights;
58
+
59
+ b. A license of the Database Right; and
60
+
61
+ c. An agreement in contract between You and the Licensor.
62
+
63
+ 2.2 Legal rights covered. This License covers the legal rights in the Database, including:
64
+
65
+ a. Copyright. Any copyright or neighbouring rights in the Database. The copyright licensed includes any individual elements of the Database, but does not cover the copyright over the Contents independent of this Database. See Section 2.4 for details. Copyright law varies between jurisdictions, but is likely to cover: the Database model or schema, which is the structure, arrangement, and organisation of the Database, and can also include the Database tables and table indexes; the data entry and output sheets; and the Field names of Contents stored in the Database;
66
+
67
+ b. Database Rights. Database Rights only extend to the Extraction and Re-utilisation of the whole or a Substantial part of the Contents. Database Rights can apply even when there is no copyright over the Database. Database Rights can also apply when the Contents are removed from the Database and are selected and arranged in a way that would not infringe any applicable copyright; and
68
+
69
+ c. Contract. This is an agreement between You and the Licensor for access to the Database. In return you agree to certain conditions of use on this access as outlined in this License.
70
+
71
+ 2.3 Rights not covered.
72
+
73
+ a. This License does not apply to computer programs used in the making or operation of the Database;
74
+
75
+ b. This License does not cover any patents over the Contents or the Database; and
76
+
77
+ c. This License does not cover any trademarks associated with the Database.
78
+
79
+ 2.4 Relationship to Contents in the Database. The individual items of the Contents contained in this Database may be covered by other rights, including copyright, patent, data protection, privacy, or personality rights, and this License does not cover any rights (other than Database Rights or in contract) in individual Contents contained in the Database.
80
+
81
+ For example, if used on a Database of images (the Contents), this License would not apply to copyright over individual images, which could have their own separate licenses, or one single license covering all of the rights over the images.
82
+
83
+ 3.0 RIGHTS GRANTED
84
+ 3.1 Subject to the terms and conditions of this License, the Licensor grants to You a worldwide, royalty-free, non-exclusive, terminable (but only under Section 9) license to Use the Database for the duration of any applicable copyright and Database Rights. These rights explicitly include commercial use, and do not exclude any field of endeavour. To the extent possible in the relevant jurisdiction, these rights may be exercised in all media and formats whether now known or created in the future.
85
+
86
+ The rights granted cover, for example:
87
+
88
+ a. Extraction and Re-utilisation of the whole or a Substantial part of the Contents;
89
+
90
+ b. Creation of Derivative Databases;
91
+
92
+ c. Creation of Collective Databases;
93
+
94
+ d. Creation of temporary or permanent reproductions by any means and in any form, in whole or in part, including of any Derivative Databases or as a part of Collective Databases; and
95
+
96
+ e. Distribution, communication, display, lending, making available, or performance to the public by any means and in any form, in whole or in part, including of any Derivative Database or as a part of Collective Databases.
97
+
98
+ 3.2 Compulsory license schemes. For the avoidance of doubt:
99
+
100
+ a. Non-waivable compulsory license schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License;
101
+
102
+ b. Waivable compulsory license schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor waives the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; and,
103
+
104
+ c. Voluntary license schemes. The Licensor waives the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License.
105
+
106
+ 3.3 The right to release the Database under different terms, or to stop distributing or making available the Database, is reserved. Note that this Database may be multiple-licensed, and so You may have the choice of using alternative licenses for this Database. Subject to Section 10.4, all other rights not expressly granted by Licensor are reserved.
107
+
108
+ 4.0 CONDITIONS OF USE
109
+ 4.1 The rights granted in Section 3 above are expressly made subject to Your complying with the following conditions of use. These are important conditions of this License, and if You fail to follow them, You will be in material breach of its terms.
110
+
111
+ 4.2 Notices. If You Publicly Convey this Database, any Derivative Database, or the Database as part of a Collective Database, then You must:
112
+
113
+ a. Do so only under the terms of this License;
114
+
115
+ b. Include a copy of this License or its Uniform Resource Identifier (URI) with the Database or Derivative Database, including both in the Database or Derivative Database and in any relevant documentation;
116
+
117
+ c. Keep intact any copyright or Database Right notices and notices that refer to this License; and
118
+
119
+ d. If it is not possible to put the required notices in a particular file due to its structure, then You must include the notices in a location (such as a relevant directory) where users would be likely to look for it.
120
+
121
+ 4.3 Notice for using output (Contents). Creating and Using a Produced Work does not require the notice in Section 4.2. However, if you Publicly Use a Produced Work, You must include a notice associated with the Produced Work reasonably calculated to make any Person that uses, views, accesses, interacts with, or is otherwise exposed to the Produced Work aware that Content was obtained from the Database, Derivative Database, or the Database as part of a Collective Database, and that it is available under this License.
122
+
123
+ a. Example notice. The following text will satisfy notice under Section 4.3:
124
+
125
+ Contains information from DATABASE NAME which is made available
126
+ under the ODC Attribution License.
127
+ DATABASE NAME should be replaced with the name of the Database and a hyperlink to the location of the Database. “ODC Attribution License” should contain a hyperlink to the URI of the text of this License. If hyperlinks are not possible, You should include the plain text of the required URI’s with the above notice.
128
+
129
+ 4.4 Licensing of others. You may not sublicense the Database. Each time You communicate the Database, the whole or Substantial part of the Contents, or any Derivative Database to anyone else in any way, the Licensor offers to the recipient a license to the Database on the same terms and conditions as this License. You are not responsible for enforcing compliance by third parties with this License, but You may enforce any rights that You have over a Derivative Database. You are solely responsible for any modifications of a Derivative Database made by You or another Person at Your direction. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License.
130
+
131
+ 5.0 MORAL RIGHTS
132
+ 5.1 Moral rights. This section covers moral rights, including any rights to be identified as the author of the Database or to object to treatment that would otherwise prejudice the author’s honour and reputation, or any other derogatory treatment:
133
+
134
+ a. For jurisdictions allowing waiver of moral rights, Licensor waives all moral rights that Licensor may have in the Database to the fullest extent possible by the law of the relevant jurisdiction under Section 10.4;
135
+
136
+ b. If waiver of moral rights under Section 5.1 a in the relevant jurisdiction is not possible, Licensor agrees not to assert any moral rights over the Database and waives all claims in moral rights to the fullest extent possible by the law of the relevant jurisdiction under Section 10.4; and
137
+
138
+ c. For jurisdictions not allowing waiver or an agreement not to assert moral rights under Section 5.1 a and b, the author may retain their moral rights over certain aspects of the Database.
139
+
140
+ Please note that some jurisdictions do not allow for the waiver of moral rights, and so moral rights may still subsist over the Database in some jurisdictions.
141
+
142
+ 6.0 FAIR DEALING, DATABASE EXCEPTIONS, AND OTHER RIGHTS NOT AFFECTED
143
+ 6.1 This License does not affect any rights that You or anyone else may independently have under any applicable law to make any use of this Database, including without limitation:
144
+
145
+ a. Exceptions to the Database Right including: Extraction of Contents from non-electronic Databases for private purposes, Extraction for purposes of illustration for teaching or scientific research, and Extraction or Re-utilisation for public security or an administrative or judicial procedure.
146
+
147
+ b. Fair dealing, fair use, or any other legally recognised limitation or exception to infringement of copyright or other applicable laws.
148
+
149
+ 6.2 This License does not affect any rights of lawful users to Extract and Re-utilise insubstantial parts of the Contents, evaluated quantitatively or qualitatively, for any purposes whatsoever, including creating a Derivative Database (subject to other rights over the Contents, see Section 2.4). The repeated and systematic Extraction or Re-utilisation of insubstantial parts of the Contents may however amount to the Extraction or Re-utilisation of a Substantial part of the Contents.
150
+
151
+ 7.0 WARRANTIES AND DISCLAIMER
152
+ 7.1 The Database is licensed by the Licensor “as is” and without any warranty of any kind, either express, implied, or arising by statute, custom, course of dealing, or trade usage. Licensor specifically disclaims any and all implied warranties or conditions of title, non-infringement, accuracy or completeness, the presence or absence of errors, fitness for a particular purpose, merchantability, or otherwise. Some jurisdictions do not allow the exclusion of implied warranties, so this exclusion may not apply to You.
153
+
154
+ 8.0 LIMITATION OF LIABILITY
155
+ 8.1 Subject to any liability that may not be excluded or limited by law, the Licensor is not liable for, and expressly excludes, all liability for loss or damage however and whenever caused to anyone by any use under this License, whether by You or by anyone else, and whether caused by any fault on the part of the Licensor or not. This exclusion of liability includes, but is not limited to, any special, incidental, consequential, punitive, or exemplary damages such as loss of revenue, data, anticipated profits, and lost business. This exclusion applies even if the Licensor has been advised of the possibility of such damages.
156
+
157
+ 8.2 If liability may not be excluded by law, it is limited to actual and direct financial loss to the extent it is caused by proved negligence on the part of the Licensor.
158
+
159
+ 9.0 TERMINATION OF YOUR RIGHTS UNDER THIS LICENSE
160
+ 9.1 Any breach by You of the terms and conditions of this License automatically terminates this License with immediate effect and without notice to You. For the avoidance of doubt, Persons who have received the Database, the whole or a Substantial part of the Contents, Derivative Databases, or the Database as part of a Collective Database from You under this License will not have their licenses terminated provided their use is in full compliance with this License or a license granted under Section 4.8 of this License. Sections 1, 2, 7, 8, 9 and 10 will survive any termination of this License.
161
+
162
+ 9.2 If You are not in breach of the terms of this License, the Licensor will not terminate Your rights under it.
163
+
164
+ 9.3 Unless terminated under Section 9.1, this License is granted to You for the duration of applicable rights in the Database.
165
+
166
+ 9.4 Reinstatement of rights. If you cease any breach of the terms and conditions of this License, then your full rights under this License will be reinstated:
167
+
168
+ a. Provisionally and subject to permanent termination until the 60th day after cessation of breach;
169
+
170
+ b. Permanently on the 60th day after cessation of breach unless otherwise reasonably notified by the Licensor; or
171
+
172
+ c. Permanently if reasonably notified by the Licensor of the violation, this is the first time You have received notice of violation of this License from the Licensor, and You cure the violation prior to 30 days after your receipt of the notice.
173
+
174
+ 9.5 Notwithstanding the above, Licensor reserves the right to release the Database under different license terms or to stop distributing or making available the Database. Releasing the Database under different license terms or stopping the distribution of the Database will not withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.
175
+
176
+ 10.0 GENERAL
177
+ 10.1 If any provision of this License is held to be invalid or unenforceable, that must not affect the validity or enforceability of the remainder of the terms and conditions of this License and each remaining provision of this License shall be valid and enforced to the fullest extent permitted by law.
178
+
179
+ 10.2 This License is the entire agreement between the parties with respect to the rights granted here over the Database. It replaces any earlier understandings, agreements or representations with respect to the Database.
180
+
181
+ 10.3 If You are in breach of the terms of this License, You will not be entitled to rely on the terms of this License or to complain of any breach by the Licensor.
182
+
183
+ 10.4 Choice of law. This License takes effect in and will be governed by the laws of the relevant jurisdiction in which the License terms are sought to be enforced. If the standard suite of rights granted under applicable copyright law and Database Rights in the relevant jurisdiction includes additional rights not granted under this License, these additional rights are granted in this License in order to meet the terms of this License.
SVFT-main/LLM-Adapters/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
SVFT-main/LLM-Adapters/README.md ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!---
2
+ Copyright 2023 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ <h1 align="center">
18
+ <img src="picture.jpg" width="73" height="114">
19
+ <p> LLM-Adapters</p>
20
+ </h1>
21
+
22
+ <h3 align="center">
23
+ <p>LLM-Adapters: An Adapter Family for Parameter-Efficient Fine-Tuning of Large Language Models </p>
24
+ </h3>
25
+ LLM-Adapters is an easy-to-use framework that integrates various adapters into LLMs and can execute adapter-based PEFT methods of LLMs for different tasks. LLM-Adapters is an extension of HuggingFace's PEFT library; many thanks for their amazing work! Please find our paper at this link: https://arxiv.org/abs/2304.01933.
26
+
27
+ The framework includes state-of-the-art open-access LLMs: LLaMa, OPT, BLOOM, and GPT-J, as well as widely used adapters such as Bottleneck adapters, Parallel adapters, and LoRA.
28
+
29
+ Supported Adapters:
30
+
31
+ 1. LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/pdf/2106.09685.pdf)
32
+ 2. AdapterH: [Parameter-Efficient Transfer Learning for NLP](https://arxiv.org/pdf/1902.00751.pdf)
33
+ 3. AdapterP: [MAD-X: An Adapter-Based Framework for Multi-Task Cross-Lingual Transfer](https://arxiv.org/pdf/2005.00052.pdf)
34
+ 4. Parallel: [TOWARDS A UNIFIED VIEW OF PARAMETER-EFFICIENT TRANSFER LEARNING](https://arxiv.org/pdf/2110.04366.pdf)
35
+ 5. Prefix Tuning: [Prefix-Tuning: Optimizing Continuous Prompts for Generation](https://aclanthology.org/2021.acl-long.353/), [P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks](https://arxiv.org/pdf/2110.07602.pdf)
36
+ 6. P-Tuning: [GPT Understands, Too](https://arxiv.org/pdf/2103.10385.pdf)
37
+ 7. Prompt Tuning: [The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/pdf/2104.08691.pdf)
38
+
39
+ ## Latest News 🔥🔥
40
+
41
+ * [2023-08-10] LLM-Adapters has been accepted by EMNLP 2023.
42
+ * [2023-07-16] We released the commonsense170k dataset, and the LLaMA-13B-Parallel model outperforms ChatGPT on 8 commonsense benchmarks.
43
+ * [2023-04-21] We released math10k dataset and the [LLaMA-13B adapter checkpoints](https://drive.google.com/file/d/1NqUv-Hn_mAkGXsUOqpJKmPKW5Gp8mRlO/view?usp=sharing). The LLaMA-13B-Parallel model achieves **91%** of GPT-3.5 performance!
44
+ * [2023-04-10] We can support GPT-Neo and ChatGLM now!
45
+ * [2023-04-04] [Release code and dataset](https://github.com/AGI-Edgerunners/LLM-Adapters)
46
+
47
+ ## Special Announcement
48
+ The `math_10k.json` data is collected from the training sets of GSM8K, MAWPS, and AQuA (1000 examples). However, MAWPS consists of AddSub, MultiArith, SingleOp, SingleEq, SimulEq-S, and SimulEq-L. Thus, we can't use MultiArith, AddSub, and SingleEq as evaluation benchmarks for models trained with `math_10k.json`. We evaluate the PEFT methods on the MAWPS test set instead, and the result table has been updated (the findings in the paper remain consistent). Furthermore, two variations of `math_10k.json` have been uploaded: `math_7K.json`, where the MAWPS samples have been deleted, and `math_14k.json`, where the MAWPS samples have been deleted as well and ChatGPT and GPT-4 rationales are combined. We sincerely apologize for any inconvenience!
49
+
50
+ ## Setup
51
+
52
+ 1. Install dependencies
53
+ ```bash
54
+ pip install -r requirements.txt
55
+ ```
56
+
57
+ 2. Set environment variables, or modify the files referencing `BASE_MODEL`:
58
+
59
+ ```bash
60
+ # Files referencing `BASE_MODEL`
61
+ # export_hf_checkpoint.py
62
+ # export_state_dict_checkpoint.py
63
+
64
+ export BASE_MODEL=yahma/llama-7b-hf
65
+ ```
66
+
67
+ Both `finetune.py` and `generate.py` use `--base_model` flag as shown further below.
68
+
69
+ 3. If bitsandbytes doesn't work, [install it from source.](https://github.com/TimDettmers/bitsandbytes/blob/main/compile_from_source.md) Windows users can follow [these instructions](https://github.com/tloen/alpaca-lora/issues/17).
70
+
71
+ ## Training(finetune.py)
72
+
73
+ This file contains code related to prompt construction and tokenization. In this file, you can specify different adapters and different datasets, so that different models can be trained.
74
+
75
+ Example usage for multiple GPUs:
76
+
77
+ ```bash
78
+ WORLD_SIZE=2 CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 --master_port=3192 finetune.py \
79
+ --base_model 'yahma/llama-7b-hf' \
80
+ --data_path 'math_10k.json' \
81
+ --output_dir './trained_models/llama-lora' \
82
+ --batch_size 16 \
83
+ --micro_batch_size 4 \
84
+ --num_epochs 3 \
85
+ --learning_rate 3e-4 \
86
+ --cutoff_len 256 \
87
+ --val_set_size 120 \
88
+ --adapter_name lora
89
+ ```
90
+
91
+ The `math_10k.json` data is collected from the training sets of GSM8K, MAWPS, and AQuA (1000 examples). `yahma/llama-7b-hf` is the base model, LLaMA-7B. The `--adapter_name lora` argument adds a `lora` adapter to this model.
92
+
93
+ Example usage for a single GPU:
94
+
95
+ ```bash
96
+ CUDA_VISIBLE_DEVICES=0 python finetune.py \
97
+ --base_model 'yahma/llama-7b-hf' \
98
+ --data_path 'math_10k.json' \
99
+ --output_dir './trained_models/llama-lora' \
100
+ --batch_size 16 \
101
+ --micro_batch_size 4 \
102
+ --num_epochs 3 \
103
+ --learning_rate 3e-4 \
104
+ --cutoff_len 256 \
105
+ --val_set_size 120 \
106
+ --adapter_name lora
107
+ ```
108
+
109
+ Moreover, you can use `--use_gradient_checkpointing` to save more GPU memory, but it will increase the training time.
110
+
111
+ To use the AdapterH, just add the following arguments:
112
+
113
+ ```bash
114
+ --adapter_name bottleneck # use the bottleneck adapter, refers to AdapterH in the result table
115
+ ```
116
+
117
+ To use the AdapterP, just add the following arguments:
118
+
119
+ ```bash
120
+ --adapter_name bottleneck
121
+ --use_adapterp # use the AdapterP, refers to AdapterP in the result table
122
+ ```
123
+
124
+ To use parallel adapter, just add the following arguments:
125
+
126
+ ```bash
127
+ --adapter_name bottleneck
128
+ --use_parallel_adapter
129
+ ```
130
+
131
+ Note that, in order to facilitate INT8 training of large models with parallel adapters, we have adopted a technique whereby the parallel adapter layers are incorporated into multi-head attention layers and MLP layers, in parallel with Linear layers. It is different from [Hu et al. (2021)](https://arxiv.org/pdf/2106.09685.pdf).
132
+
133
+ ## Inference (generate.py)
134
+
135
+ This file reads the foundation model from the Hugging Face model hub and the LoRA weights from `'./trained_models/llama-lora'` , and runs a Gradio interface for inference on a specified input. Users should treat this as example code for the use of the model, and modify it as needed.
136
+ Example usage:
137
+
138
+ ```bash
139
+ CUDA_VISIBLE_DEVICES=0 torchrun generate.py \
140
+ --base_model 'yahma/llama-7b-hf' \
141
+ --lora_weights './trained_models/llama-lora'
142
+ ```
143
+
144
+ ## Evaluation (evaluate.py)
145
+
146
+ To evaluate the performance of the finetuned model on the Arithmetic Reasoning tasks, you can use the following command:
147
+
148
+ ```bash
149
+ # --model:   the base model
+ # --adapter: the adapter name, one of ["LoRA", "AdapterH", "AdapterP", "Parallel", "Scaled_Parallel"]
+ # --dataset: the test dataset
+ CUDA_VISIBLE_DEVICES=0 python evaluate.py \
150
+ --model LLaMA-7B \
151
+ --adapter LoRA \
152
+ --dataset SVAMP \
153
+ --base_model 'yahma/llama-7b-hf' \
154
+ --lora_weights './trained_models/llama-lora'
155
+ ```
156
+
157
+ <!-- ## Resource Consumption
158
+
159
+ The table below lists the resources needed for different adapters: Trainable Parameters, GPU RAM Usage, and Fine-tuning Time on the Arithmetic Reasoning dataset `math_10k.json`.
160
+
161
+ Hyper-parameter setting: num_epochs=3, lora_r=8, lora_alpha=16, bottleneck_size=256
162
+
163
+ Models: LLaMA-13B, LLaMA-7B, BLOOM-6.7B, GPT-j-6B
164
+ Dataset: 3.2K math word problems
165
+
166
+ Hardware: 2*3090 GPUs
167
+
168
+ | Model | Trainable Parameters | GPU RAM Usage | Fine-tuning Time |
169
+ |-----------------------|----------------------|---------------|------------------|
170
+ | LLaMA-7B-LoRA | 4.2M | 18GB | 4h |
171
+ | LLaMA-7B-AdapterH | 200M | 22GB | 4h |
172
+ | LLaMA-7B-AdapterP | 200M | 22GB | 4h |
173
+ | LLaMA-7B-Parallel | 200M | 22GB | 4h | -->
174
+
175
+
176
+ ## Finetune Result
177
+ Below are the fine-tuning results of different models on four math reasoning datasets: GSM8K, AQuA, SVAMP, and MAWPS. In this table, we use the optimal configuration and placement of Prefix-Tuning, Series Adapter, LoRA, and Parallel Adapter according to the empirical study in our [paper](https://aclanthology.org/2023.emnlp-main.319/).
178
+
179
+ | Model | GSM8K | AQuA | MAWPS | SVAMP | Average |
180
+ |-----------------------|--------|--------|----------|--------|---------|
181
+ | GPT-3.5 |**56.4**|**38.9**| **87.4** |**69.9**|**63.2** |
182
+ | BLOOMz-7B-Prefix | 13.8 | 12.5 | 47.5 | 24.1 | 24.5 |
183
+ | BLOOMz-7B-Series | 14.3 | 20.5 | 62.2 | 38.1 | 33.8 |
184
+ | BLOOMz-7B-Parallel | 18.5 | 18.9 | 70.6 | 36.4 | 36.1 |
185
+ | BLOOMz-7B-LoRA | 17.4 | 21.3 | 70.2 | 41.0 | 37.5 |
186
+ | GPT-j-6B-Prefix | 16.0 | 14.7 | 59.2 | 31.0 | 30.2 |
187
+ | GPT-j-6B-Series | 19.5 | 15.0 | 80.3 | 43.6 | 39.6 |
188
+ | GPT-j-6B-Parallel | 18.9 | 17.9 | 78.2 | 41.1 | 39.0 |
189
+ | GPT-j-6B-LoRA | 23.0 | 16.1 | 79.4 | 46.0 | 41.1 |
190
+ | LLaMA-7B-Prefix | 24.4 | 14.2 | 63.4 | 38.1 | 35.0 |
191
+ | LLaMA-7B-Series | 33.3 | 15.0 | 77.7 | 52.3 | 44.6 |
192
+ | LLaMA-7B-Parallel | 35.3 | 18.1 | 82.4 | 49.6 | 46.4 |
193
+ | LLaMA-7B-LoRA | 37.5 | 18.9 | 79.0 | 52.1 | 46.9 |
194
+ | LLaMA-13B-Prefix | 31.1 | 15.7 | 66.8 | 41.4 | 38.8 |
195
+ | LLaMA-13B-Series | 44.0 | 22.0 | 78.6 | 50.8 | 48.9 |
196
+ | LLaMA-13B-Parallel | 43.3 | 20.5 | 81.1 | 55.7 | 50.2 |
197
+ | LLaMA-13B-LoRA | 47.5 | 18.5 | 83.6 | 54.6 | 51.1 |
198
+
199
+
200
+ Below are the fine-tuning results of different models on eight commonsense reasoning datasets.
201
+
202
+ | Model | BoolQ | PIQA | SIQA | HellaSwag | WinoGrande | ARC-e | ARC-c | OBQA | Average |
203
+ |-----------------------|---------|--------|--------|-------------|--------------|---------|---------|--------|-----------|
204
+ | ChatGPT | **73.1**|**85.4**| 68.5 | 78.5 | 66.1 |**89.8** |**79.9** | 74.8 | 77.0 |
205
+ | BLOOMz-7B-Prefix | 45.6 | 53.7 | 46.3 | 26.7 | 49.5 | 52.1 | 39.7 | 44.3 | 44.7 |
206
+ | BLOOMz-7B-Series | 65.4 | 70.4 | 73.6 | 53.4 | 69.3 | 72.3 | 55.9 | 68.0 | 66.0 |
207
+ | BLOOMz-7B-Parallel | 64.1 | 71.5 | 72.1 | 52.9 | 67.0 | 70.5 | 54.7 | 69.6 | 65.3 |
208
+ | BLOOMz-7B-LoRA | 65.9 | 75.3 | 74.5 | 57.3 | 72.5 | 74.6 | 57.8 | 73.4 | 68.9 |
209
+ | GPT-j-6B-Prefix | 63.1 | 66.9 | 68.7 | 34.4 | 64.5 | 64.4 | 46.8 | 59.0 | 58.5 |
210
+ | GPT-j-6B-Series | 62.1 | 63.5 | 72.3 | 30.6 | 68.0 | 63.9 | 48.1 | 63.8 | 59.0 |
211
+ | GPT-j-6B-Parallel | 62.2 | 69.7 | 70.0 | 41.7 | 65.0 | 60.2 | 44.6 | 58.2 | 59.0 |
212
+ | GPT-j-6B-LoRA | 62.4 | 68.6 | 49.5 | 43.1 | 57.3 | 43.4 | 31.0 | 46.6 | 50.2 |
213
+ | LLaMA-7B-Prefix | 64.3 | 76.8 | 73.9 | 42.1 | 72.1 | 72.9 | 54.0 | 60.6 | 64.6 |
214
+ | LLaMA-7B-Series | 63.0 | 79.2 | 76.3 | 67.9 | 75.7 | 74.5 | 57.1 | 72.4 | 70.8 |
215
+ | LLaMA-7B-Parallel | 67.9 | 76.4 | 78.8 | 69.8 | 78.9 | 73.7 | 57.3 | 75.2 | 72.3 |
216
+ | LLaMA-7B-LoRA | 68.9 | 80.7 | 77.4 | 78.1 | 78.8 | 77.8 | 61.3 | 74.8 | 74.7 |
217
+ | LLaMA-13B-Prefix | 65.3 | 75.4 | 72.1 | 55.2 | 68.6 | 79.5 | 62.9 | 68.0 | 68.4 |
218
+ | LLaMA-13B-Series | 71.8 | 83.0 | 79.2 | 88.1 | 82.4 | 82.5 | 67.3 | 81.8 | 79.5 |
219
+ | LLaMA-13B-Parallel | 72.5 | 84.8 | 79.8 |**92.1** |**84.7** | 84.2 | 71.2 |**82.4**|**81.5** |
220
+ | LLaMA-13B-LoRA | 72.1 | 83.5 |**80.5**| 90.5 | 83.7 | 82.8 | 68.3 |**82.4**| 80.5 |
221
+
222
+
223
+ ### Adapter support matrix
224
+ This matrix shows whether different models can use LoRA, AdapterH, AdapterP, Parallel, and Scaled Parallel adapters.
225
+
226
+ | Adapter | LoRA | AdapterH | AdapterP | Parallel| Prefix Tuning |P-Tuning|Prompt Tuning|
227
+ |--------------|-------|-------|----------|-------|-------|-------|-------|
228
+ | LLaMA | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |
229
+ | BLOOM | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |
230
+ | GPT-J | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |
231
+ | OPT | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |
232
+ | GPT-2 | ✅ | 🔧Developing | 🔧Developing|🔧Developing | ✅ | ✅ | ✅ |
233
+ | GPT-Neo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
234
+ | GPT-NeoX-20B | ✅ | 🔧Developing | 🔧Developing|🔧Developing | ✅ | ✅ | ✅ |
235
+ | ChatGLM | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |
236
+
237
+
238
+ ### TODO List
239
+ - [x] Add AdapterH
240
+ - [x] Add AdapterP
241
+ - [x] Add Parallel Adapter
242
+ - [ ] Support More LLMs
243
+ - [ ] Support Multiple Adapter
244
+ - [ ] Support Adapter Composition
245
+ - [ ] Support Adapter Fusion
246
+
247
+
248
+ ## :star: Star History
249
+
250
+ [![Star History Chart](https://api.star-history.com/svg?repos=AGI-Edgerunners/LLM-Adapters&type=Date)](https://star-history.com/#AGI-Edgerunners/LLM-Adapters&Date)
251
+
252
+ ## Citing <img src="picture.jpg" width="14px" height="14px"> LLM-Adapters
253
+
254
+ If you use <img src="picture.jpg" width="14px" height="14px"> LLM-Adapters in your publication, please cite it by using the following BibTeX entry.
255
+
256
+ ```bibtex
257
+ @article{hu2023llm,
258
+ title={LLM-Adapters: An Adapter Family for Parameter-Efficient Fine-Tuning of Large Language Models},
259
+ author={Hu, Zhiqiang and Lan, Yihuai and Wang, Lei and Xu, Wanyu and Lim, Ee-Peng and Lee, Roy Ka-Wei and Bing, Lidong and Poria, Soujanya},
260
+ journal={arXiv preprint arXiv:2304.01933},
261
+ year={2023}
262
+ }
263
+ ```
264
+
265
+ ## Acknowledgement
266
+
267
+ This repo benefits from [PEFT](https://github.com/huggingface/peft), [Adapter-Transformer](https://github.com/adapter-hub/adapter-transformers), [Alpaca-lora](https://github.com/tloen/alpaca-lora). Thanks for their wonderful works. Additionally, we thank DONG Shan and [dream.ai](https://dream.ai/create) for the exceptional logo design, which has added immense value to our project.
SVFT-main/LLM-Adapters/commonsense_evaluate.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+ import os
4
+ import re
5
+ import sys
6
+ import argparse
7
+
8
+ import fire
9
+
10
+ import torch
11
+
12
+ sys.path.append(os.path.join(os.getcwd(), "peft/src/"))
13
+ from peft import PeftModel
14
+ from tqdm import tqdm
15
+ from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer
16
+
17
# Select the torch device used by the rest of the script: CUDA when it is
# available, otherwise CPU; MPS replaces that choice when its backend reports
# itself as available.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except AttributeError:
    # torch builds without a `torch.backends.mps` module: keep the CUDA/CPU
    # choice made above instead of swallowing every possible exception.
    pass
27
+
28
+
29
def main(
    load_8bit: bool = False,
    base_model: str = "",
    lora_weights: str = "tloen/alpaca-lora-7b",
    share_gradio: bool = False,
):
    """Run batched evaluation of an adapter-tuned model on one benchmark.

    All settings are read from the command line through ``parse_args()``.
    The keyword parameters of this function are not read by the body; they
    are kept only so the signature stays compatible with existing callers.

    For each batch the generated responses are reduced to an answer token with
    ``extract_answer`` and compared with the sample's ``answer`` field. The
    accumulated per-sample records are rewritten to
    ``experiment/<model>-<adapter>-<dataset>.json`` after every batch.
    """
    args = parse_args()

    def evaluate(
        instructions,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=4,
        max_new_tokens=32,
        **kwargs,
    ):
        """Generate one response per instruction; return the stripped texts."""
        prompts = [generate_prompt(instruction, input) for instruction in instructions]
        inputs = tokenizer(prompts, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(device)
        # The prompts of a batch are padded to a common length. Hand the
        # tokenizer's mask to generate() explicitly so padding positions are
        # never attended to, rather than relying on generate() to infer the
        # mask from a pad token id.
        attention_mask = inputs["attention_mask"].to(device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences
        outputs = tokenizer.batch_decode(s, skip_special_tokens=True)
        # The decoded text contains the prompt; keep what follows the marker.
        outputs = [o.split("### Response:")[1].strip() for o in outputs]
        print(outputs)
        return outputs

    save_file = f'experiment/{args.model}-{args.adapter}-{args.dataset}.json'
    create_dir('experiment/')

    dataset = load_data(args)
    batches = create_batch(dataset, args.batch_size)
    tokenizer, model = load_model(args)
    total = len(batches)
    correct = 0
    current = 0  # number of samples evaluated so far
    output_data = []
    pbar = tqdm(total=total)
    for idx, batch in enumerate(batches):
        current += len(batch)
        instructions = [data.get('instruction') for data in batch]

        outputs = evaluate(instructions)

        for data, output in zip(batch, outputs):
            label = data.get('answer')
            flag = False
            predict = extract_answer(args, output)
            if label == predict:
                correct += 1
                flag = True
            new_data = copy.deepcopy(data)
            new_data['output_pred'] = output
            new_data['pred'] = predict
            new_data['flag'] = flag
            output_data.append(new_data)
            print(data["instruction"])
            print(output)
            print('prediction:', predict)
            print('label:', label)
        print('---------------')
        print(f'\rtest:{idx + 1}/{total} | accuracy {correct} {correct / current}')
        print('---------------')
        # Rewrite the whole results file after each batch.
        with open(save_file, 'w+') as f:
            json.dump(output_data, f, indent=4)
        pbar.update(1)
    pbar.close()
    print('\n')
    print('test finished')
113
+
114
+
115
def create_dir(dir_path):
    """Create ``dir_path``, including missing parents, if it does not exist.

    Calling it for a directory that already exists is a no-op. A path that
    exists but is not a directory raises ``FileExistsError``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_path, exist_ok=True)
119
+
120
+
121
+ # Build the instruction-following prompt that is sent to the model: a fixed
+ # preamble, an "### Instruction:" section, an "### Input:" section when
+ # `input` is truthy, and a trailing "### Response:" marker.
+ # NOTE(review): this rendering collapses leading whitespace, so the exact
+ # indentation inside the triple-quoted templates cannot be confirmed from
+ # here - verify it against the prompt used at training time before
+ # reformatting these strings.
+ def generate_prompt(instruction, input=None):
122
+ if input:
123
+ return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
124
+
125
+ ### Instruction:
126
+ {instruction}
127
+
128
+ ### Input:
129
+ {input}
130
+
131
+ ### Response:
132
+ """ # noqa: E501
133
+ else:
134
+ return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
135
+
136
+ ### Instruction:
137
+ {instruction}
138
+
139
+ ### Response:
140
+ """ # noqa: E501
141
+
142
+
143
def load_data(args) -> list:
    """Read the test split of the selected dataset.

    Args:
        args: object with a ``dataset`` attribute; the file read is
            ``dataset/<args.dataset>/test.json`` relative to the working
            directory.

    Returns:
        The parsed JSON content of that file.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    file_path = f'dataset/{args.dataset}/test.json'
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"can not find dataset file : {file_path}")
    # A context manager closes the handle that json.load(open(...)) leaked.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
157
+
158
def create_batch(dataset, batch_size):
    """Split ``dataset`` into consecutive slices of at most ``batch_size`` items.

    Args:
        dataset: a sliceable sequence of samples.
        batch_size: maximum number of samples per batch; must be positive.

    Returns:
        A list of slices in the original order; the last one may be shorter.
        An empty dataset yields an empty list.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    return [
        dataset[start:start + batch_size]
        for start in range(0, len(dataset), batch_size)
    ]
165
+
166
+
167
def parse_args():
    """Parse the evaluation options from the command line.

    Every option except ``--load_8bit`` is required. ``--dataset``,
    ``--model`` and ``--adapter`` are restricted to fixed choice lists, and
    ``--batch_size`` is parsed as an integer.

    Returns:
        The ``argparse.Namespace`` produced by the parser.
    """
    dataset_choices = [
        "boolq", "piqa", "social_i_qa", "hellaswag",
        "winogrande", "ARC-Challenge", "ARC-Easy", "openbookqa",
    ]
    model_choices = ['LLaMA-7B', "LLaMA-13B", 'BLOOM-7B', 'GPT-j-6B']
    adapter_choices = ['LoRA', 'AdapterP', 'AdapterH', 'Parallel']

    cli = argparse.ArgumentParser()
    restricted = (
        ('--dataset', dataset_choices),
        ('--model', model_choices),
        ('--adapter', adapter_choices),
    )
    for flag, options in restricted:
        cli.add_argument(flag, choices=options, required=True)
    for flag in ('--base_model', '--lora_weights'):
        cli.add_argument(flag, required=True)
    cli.add_argument('--batch_size', type=int, required=True)
    cli.add_argument('--load_8bit', action='store_true', default=False)

    return cli.parse_args()
180
+
181
+
182
def load_model(args) -> tuple:
    """Load the tokenizer and the adapter-wrapped model for evaluation.

    Args:
        args: parsed options; reads ``base_model``, ``lora_weights``,
            ``load_8bit`` and ``model``.

    Returns:
        tuple(tokenizer, model)

    Raises:
        ValueError: if ``base_model`` or ``lora_weights`` is empty.
    """
    base_model = args.base_model
    if not base_model:
        raise ValueError(f'can not find base model name by the value: {args.model}')
    lora_weights = args.lora_weights
    if not lora_weights:
        raise ValueError(f'can not find lora weight, the value is: {lora_weights}')

    load_8bit = args.load_8bit
    tokenizer_cls = LlamaTokenizer if "LLaMA" in args.model else AutoTokenizer
    tokenizer = tokenizer_cls.from_pretrained(base_model)
    # Batched prompts are padded on the left.
    tokenizer.padding_side = "left"
    # unk. we want this to be different from the eos token
    tokenizer.pad_token_id = 0

    # Keyword arguments for the base model and for the adapter wrapper differ
    # per device; the two from_pretrained calls themselves are shared.
    if device == "cuda":
        base_kwargs = dict(
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,  # NOTE(review): runs code shipped with the checkpoint
        )
        adapter_kwargs = dict(torch_dtype=torch.float16, device_map={"": 0})
    elif device == "mps":
        base_kwargs = dict(device_map={"": device}, torch_dtype=torch.float16)
        adapter_kwargs = dict(device_map={"": device}, torch_dtype=torch.float16)
    else:
        base_kwargs = dict(device_map={"": device}, low_cpu_mem_usage=True)
        adapter_kwargs = dict(device_map={"": device})

    model = AutoModelForCausalLM.from_pretrained(base_model, **base_kwargs)
    model = PeftModel.from_pretrained(model, lora_weights, **adapter_kwargs)

    # unwind broken decapoda-research config
    # NOTE(review): these ids are applied for every --model choice, not only
    # LLaMA - confirm they are right for the other supported models.
    model.config.pad_token_id = 0  # unk
    tokenizer.pad_token_id = 0
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_8bit:
        model.half()  # seems to fix bugs for some users.

    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    return tokenizer, model
256
+
257
+
258
def load_instruction(args) -> str:
    """Placeholder: the instruction is an empty string, so this always raises.

    Raises:
        ValueError: on every call, because no instruction text is configured.
    """
    instruction = ''
    if instruction:
        return instruction
    raise ValueError('instruct not initialized')
263
+
264
+
265
# Regular expression of the answer tokens searched for, per dataset name.
_ANSWER_PATTERNS = {
    'boolq': r'true|false',
    'piqa': r'solution1|solution2',
    'social_i_qa': r'answer1|answer2|answer3|answer4|answer5',
    'ARC-Challenge': r'answer1|answer2|answer3|answer4|answer5',
    'ARC-Easy': r'answer1|answer2|answer3|answer4|answer5',
    'openbookqa': r'answer1|answer2|answer3|answer4|answer5',
    'hellaswag': r'ending1|ending2|ending3|ending4',
    'winogrande': r'option1|option2',
}


def extract_answer(args, sentence: str) -> "str | None":
    """Return the first answer token found in a model response.

    Args:
        args: object whose ``dataset`` attribute selects the token pattern.
        sentence: the generated response text.

    Returns:
        The first matching token, ``""`` when the response contains none, or
        ``None`` when ``args.dataset`` is not one of the known datasets.
    """
    pattern = _ANSWER_PATTERNS.get(args.dataset)
    if pattern is None:
        return None
    match = re.search(pattern, sentence.strip())
    return match.group(0) if match else ""
297
+
298
+
299
+ if __name__ == "__main__":
300
+ main()
SVFT-main/LLM-Adapters/evaluate.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+ import os
4
+ import re
5
+ import sys
6
+ import argparse
7
+
8
+ import fire
9
+
10
+ import torch
11
+
12
+ sys.path.append(os.path.join(os.getcwd(), "peft/src/"))
13
+ from peft import PeftModel
14
+ from tqdm import tqdm
15
+ from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer
16
+
17
# Select the torch device used by the rest of the script: CUDA when it is
# available, otherwise CPU; MPS replaces that choice when its backend reports
# itself as available.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except AttributeError:
    # torch builds without a `torch.backends.mps` module: keep the CUDA/CPU
    # choice made above instead of swallowing every possible exception.
    pass
27
+
28
+
29
def main(
    load_8bit: bool = False,
    base_model: str = "",
    lora_weights: str = "tloen/alpaca-lora-7b",
    share_gradio: bool = False,
):
    """Evaluate an adapter-tuned model on one arithmetic-reasoning dataset.

    All settings are read from the command line through ``parse_args()``.
    The keyword parameters of this function are not read by the body; they
    are kept only so the signature stays compatible with existing callers.

    Samples are processed one at a time. For the AQuA dataset the predicted
    letter must equal the label; for every other dataset the last number in
    the response must be within ``miss`` of the label. The accumulated records
    are rewritten to ``experiment/<model>-<adapter>-<dataset>.json`` after
    every sample.
    """
    args = parse_args()

    def evaluate(
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=4,
        max_new_tokens=256,
        **kwargs,
    ):
        """Generate a response for one instruction; return the stripped text."""
        prompt = generate_prompt(instruction, input)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
                use_cache=False,
            )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s)
        # The decoded text contains the prompt; keep what follows the marker.
        return output.split("### Response:")[1].strip()

    save_file = f'experiment/{args.model}-{args.adapter}-{args.dataset}.json'
    create_dir('experiment/')

    dataset = load_data(args)
    tokenizer, model = load_model(args)
    total = len(dataset)
    correct = 0
    miss = 0.001  # absolute tolerance when comparing numeric answers
    output_data = []
    pbar = tqdm(total=total)
    for idx, data in enumerate(dataset):
        instruction = data.get('instruction')

        outputs = evaluate(instruction)
        label = data.get('answer')
        flag = False
        if args.dataset.lower() in ['aqua']:
            predict = extract_answer_letter(args, outputs)
            if label == predict:
                correct += 1
                flag = True
        else:
            if isinstance(label, str):
                label = float(label)
            predict = extract_answer_number(args, outputs)
            if abs(label - predict) <= miss:
                correct += 1
                flag = True
        new_data = copy.deepcopy(data)
        new_data['output_pred'] = outputs
        new_data['pred'] = predict
        new_data['flag'] = flag
        output_data.append(new_data)
        print(' ')
        print('---------------')
        print(outputs)
        print('prediction:', predict)
        print('label:', label)
        print('---------------')
        print(f'\rtest:{idx + 1}/{total} | accuracy {correct} {correct / (idx + 1)}')
        # Rewrite the whole results file after each sample.
        with open(save_file, 'w+') as f:
            json.dump(output_data, f, indent=4)
        pbar.update(1)
    pbar.close()
    print('\n')
    print('test finished')
133
+
134
+
135
def create_dir(dir_path):
    """Create ``dir_path``, including missing parents, if it does not exist.

    Calling it for a directory that already exists is a no-op. A path that
    exists but is not a directory raises ``FileExistsError``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_path, exist_ok=True)
139
+
140
+
141
+ # Build the instruction-following prompt that is sent to the model: a fixed
+ # preamble, an "### Instruction:" section, an "### Input:" section when
+ # `input` is truthy, and a trailing "### Response:" marker.
+ # NOTE(review): this rendering collapses leading whitespace, so the exact
+ # indentation inside the triple-quoted templates cannot be confirmed from
+ # here - verify it against the prompt used at training time before
+ # reformatting these strings.
+ def generate_prompt(instruction, input=None):
142
+ if input:
143
+ return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
144
+
145
+ ### Instruction:
146
+ {instruction}
147
+
148
+ ### Input:
149
+ {input}
150
+
151
+ ### Response:
152
+ """ # noqa: E501
153
+ else:
154
+ return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
155
+
156
+ ### Instruction:
157
+ {instruction}
158
+
159
+ ### Response:
160
+ """ # noqa: E501
161
+
162
+
163
def load_data(args) -> list:
    """Read the test split of the selected dataset.

    Args:
        args: object with a ``dataset`` attribute; the file read is
            ``dataset/<args.dataset>/test.json`` relative to the working
            directory.

    Returns:
        The parsed JSON content of that file.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    file_path = f'dataset/{args.dataset}/test.json'
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"can not find dataset file : {file_path}")
    # A context manager closes the handle that json.load(open(...)) leaked.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
177
+
178
+
179
def parse_args():
    """Parse the evaluation options from the command line.

    Every option except ``--load_8bit`` is required. ``--dataset``,
    ``--model`` and ``--adapter`` are restricted to fixed choice lists.

    Returns:
        The ``argparse.Namespace`` produced by the parser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', choices=['AddSub', 'MultiArith', 'SingleEq', 'gsm8k', 'AQuA', 'SVAMP'],
                        required=True)
    # 'LLaMA-13B' is accepted here as well, matching the model choices of
    # commonsense_evaluate.py; the previously accepted values are unchanged.
    parser.add_argument('--model', choices=['LLaMA-7B', 'LLaMA-13B', 'BLOOM-7B', 'GPT-j-6B'], required=True)
    parser.add_argument('--adapter', choices=['LoRA', 'AdapterP', 'AdapterH', 'Parallel', 'Prefix'],
                        required=True)
    parser.add_argument('--base_model', required=True)
    parser.add_argument('--lora_weights', required=True)
    parser.add_argument('--load_8bit', action='store_true', default=False)

    return parser.parse_args()
191
+
192
+
193
def load_model(args) -> tuple:
    """Load the tokenizer and the adapter-wrapped model for evaluation.

    Args:
        args: parsed options; reads ``base_model``, ``lora_weights``,
            ``load_8bit`` and ``model``.

    Returns:
        tuple(tokenizer, model)

    Raises:
        ValueError: if ``base_model`` or ``lora_weights`` is empty.
    """
    base_model = args.base_model
    if not base_model:
        raise ValueError(f'can not find base model name by the value: {args.model}')
    lora_weights = args.lora_weights
    if not lora_weights:
        raise ValueError(f'can not find lora weight, the value is: {lora_weights}')

    load_8bit = args.load_8bit
    # Use the LLaMA tokenizer for every LLaMA variant (same rule as
    # commonsense_evaluate.py) instead of matching only the 'LLaMA-7B' name.
    if "LLaMA" in args.model:
        tokenizer = LlamaTokenizer.from_pretrained(base_model)
    else:
        tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Keyword arguments for the base model and for the adapter wrapper differ
    # per device; the two from_pretrained calls themselves are shared.
    if device == "cuda":
        base_kwargs = dict(
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,  # NOTE(review): runs code shipped with the checkpoint
        )
        adapter_kwargs = dict(torch_dtype=torch.float16, device_map={"": 0})
    elif device == "mps":
        base_kwargs = dict(device_map={"": device}, torch_dtype=torch.float16)
        adapter_kwargs = dict(device_map={"": device}, torch_dtype=torch.float16)
    else:
        base_kwargs = dict(device_map={"": device}, low_cpu_mem_usage=True)
        adapter_kwargs = dict(device_map={"": device})

    model = AutoModelForCausalLM.from_pretrained(base_model, **base_kwargs)
    model = PeftModel.from_pretrained(model, lora_weights, **adapter_kwargs)

    # unwind broken decapoda-research config
    # NOTE(review): these ids are applied for every --model choice, not only
    # LLaMA - confirm they are right for the other supported models.
    model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_8bit:
        model.half()  # seems to fix bugs for some users.

    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    return tokenizer, model
263
+
264
+
265
def load_instruction(args) -> str:
    """Placeholder: the instruction is an empty string, so this always raises.

    Raises:
        ValueError: on every call, because no instruction text is configured.
    """
    instruction = ''
    if instruction:
        return instruction
    raise ValueError('instruct not initialized')
270
+
271
+
272
def extract_answer_number(args, sentence: str) -> float:
    """Return the last number that appears in a model response.

    Commas are removed first, so ``1,234`` is read as ``1234``.

    Args:
        args: object whose ``dataset`` attribute names the dataset
            (compared case-insensitively).
        sentence: the generated response text.

    Returns:
        The last number in ``sentence`` as a float, or ``float('inf')`` when
        the text contains no number.

    Raises:
        NotImplementedError: if the dataset is not one of the supported
            numeric-answer datasets.
    """
    dataset = args.dataset.lower()
    if dataset not in ["multiarith", "addsub", "singleeq", "gsm8k", "svamp"]:
        raise NotImplementedError(' not support dataset: {}'.format(dataset))

    numbers = re.findall(r'-?\d+\.?\d*', sentence.replace(',', ''))
    if not numbers:
        return float('inf')
    # The final number is taken as the answer.
    return float(numbers[-1])
288
+
289
+
290
def extract_answer_letter(args, sentence: str) -> str:
    """Return the first capital letter A-E found in a model response.

    Args:
        args: unused; kept for a signature parallel to
            ``extract_answer_number``.
        sentence: the generated response text.

    Returns:
        The first of ``A``, ``B``, ``C``, ``D`` or ``E`` occurring anywhere in
        ``sentence``, or ``''`` when none occurs.
    """
    # NOTE(review): the match is not anchored to word boundaries, so a capital
    # inside a word (e.g. the "A" of "Answer") is also picked up.
    match = re.search(r'A|B|C|D|E', sentence.strip())
    return match.group(0) if match else ''
299
+
300
+
301
if __name__ == "__main__":
    # main() reads every option through argparse (parse_args) and ignores its
    # own parameters, so call it directly - as commonsense_evaluate.py does -
    # rather than through fire.Fire, which would also try to interpret the
    # same command-line flags.
    main()
SVFT-main/LLM-Adapters/export_hf_checkpoint.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import transformers
5
+ from peft import PeftModel
6
+ from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: F402
7
+
8
# Script: load a base LLaMA model on CPU, apply a LoRA adapter, and write the
# resulting weights (without the LoRA tensors) to ./hf_ckpt.

# Hugging Face id or local path of the base checkpoint (required).
BASE_MODEL = os.environ.get("BASE_MODEL", None)
if not BASE_MODEL:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise ValueError(
        "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=decapoda-research/llama-7b-hf`"  # noqa: E501
    )

# LoRA adapter to merge; overridable via the environment, defaulting to the
# adapter this script previously hard-coded.
LORA_WEIGHTS = os.environ.get("LORA_WEIGHTS", "tloen/alpaca-lora-7b")

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

base_model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Keep a live reference and a snapshot of one weight so the checks below can
# tell whether loading/merging the adapter modified the base weights.
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()

lora_model = PeftModel.from_pretrained(
    base_model,
    LORA_WEIGHTS,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

# Loading the adapter alone must not have touched the base weight.
assert torch.allclose(first_weight_old, first_weight)

# merge weights
for layer in lora_model.base_model.model.model.layers:
    layer.self_attn.q_proj.merge_weights = True
    layer.self_attn.v_proj.merge_weights = True

# NOTE(review): presumably switching to eval mode is what triggers the merge
# in the bundled peft version — confirm against peft/tuners/lora.py.
lora_model.train(False)

# did we do anything?
assert not torch.allclose(first_weight_old, first_weight)

lora_model_sd = lora_model.state_dict()
# Strip the PEFT wrapper prefix and drop the LoRA-specific tensors so the
# keys line up with a plain LlamaForCausalLM state dict.
deloreanized_sd = {
    k.replace("base_model.model.", ""): v
    for k, v in lora_model_sd.items()
    if "lora" not in k
}

LlamaForCausalLM.save_pretrained(
    base_model, "./hf_ckpt", state_dict=deloreanized_sd, max_shard_size="400MB"
)
SVFT-main/LLM-Adapters/export_state_dict_checkpoint.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import torch
5
+ import transformers
6
+ from peft import PeftModel
7
+ from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: E402
8
+
9
# Script setup: load a base LLaMA model on CPU, apply a LoRA adapter, and
# capture the resulting state dict for conversion further below.

# Hugging Face id or local path of the base checkpoint (required).
BASE_MODEL = os.environ.get("BASE_MODEL", None)
if not BASE_MODEL:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise ValueError(
        "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=decapoda-research/llama-7b-hf`"  # noqa: E501
    )

# LoRA adapter to merge; overridable via the environment, defaulting to the
# adapter this script previously hard-coded.
LORA_WEIGHTS = os.environ.get("LORA_WEIGHTS", "tloen/alpaca-lora-7b")

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

base_model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

lora_model = PeftModel.from_pretrained(
    base_model,
    LORA_WEIGHTS,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

# merge weights
for layer in lora_model.base_model.model.model.layers:
    layer.self_attn.q_proj.merge_weights = True
    layer.self_attn.v_proj.merge_weights = True

# NOTE(review): presumably switching to eval mode is what triggers the merge
# in the bundled peft version — confirm against peft/tuners/lora.py.
lora_model.train(False)

lora_model_sd = lora_model.state_dict()

# Values written verbatim to ./ckpt/params.json at the end of the script.
# NOTE(review): these look like the 7B model dimensions — confirm before
# using the script with a different model size.
params = {
    "dim": 4096,
    "multiple_of": 256,
    "n_heads": 32,
    "n_layers": 32,
    "norm_eps": 1e-06,
    "vocab_size": -1,
}
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
# Not referenced again in the visible part of this script.
inv_freq = 1.0 / (
    base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
)
55
+
56
+
57
def permute(w):
    """Regroup the rows of a ``(dim, dim)`` weight, head by head.

    The tensor is viewed as ``(n_heads, dim // n_heads // 2, 2, dim)``, axes 1
    and 2 are swapped, and the result is flattened back to ``(dim, dim)``.
    Reads the module-level ``n_heads`` and ``dim``.
    """
    half_head_dim = dim // n_heads // 2
    grouped = w.view(n_heads, half_head_dim, 2, dim)
    return grouped.transpose(1, 2).reshape(dim, dim)
63
+
64
+
65
def unpermute(w):
    """Apply the inverse row regrouping of ``permute``.

    The tensor is viewed as ``(n_heads, 2, dim // n_heads // 2, dim)``, axes 1
    and 2 are swapped, and the result is flattened back to ``(dim, dim)``.
    Reads the module-level ``n_heads`` and ``dim``.
    """
    half_head_dim = dim // n_heads // 2
    grouped = w.view(n_heads, 2, half_head_dim, dim)
    return grouped.transpose(1, 2).reshape(dim, dim)
71
+
72
+
73
def translate_state_dict_key(k):  # noqa: C901
    """Map a PEFT/Hugging Face state-dict key to its target checkpoint name.

    The ``base_model.model.`` wrapper prefix is removed first. Returns the
    translated key, or ``None`` for entries that must be dropped (rotary
    ``inv_freq`` buffers and anything containing ``lora``).

    Raises:
        NotImplementedError: for any key that matches none of the known
            names; the offending key is printed beforehand.
    """
    key = k.replace("base_model.model.", "")

    # Keys that live outside the per-layer stack.
    top_level = {
        "model.embed_tokens.weight": "tok_embeddings.weight",
        "model.norm.weight": "norm.weight",
        "lm_head.weight": "output.weight",
    }
    if key in top_level:
        return top_level[key]

    if not key.startswith("model.layers."):
        print(key)
        raise NotImplementedError

    layer = key.split(".")[2]
    # (source suffix, target name inside the layer), checked in order.
    per_layer = (
        (".self_attn.q_proj.weight", "attention.wq.weight"),
        (".self_attn.k_proj.weight", "attention.wk.weight"),
        (".self_attn.v_proj.weight", "attention.wv.weight"),
        (".self_attn.o_proj.weight", "attention.wo.weight"),
        (".mlp.gate_proj.weight", "feed_forward.w1.weight"),
        (".mlp.down_proj.weight", "feed_forward.w2.weight"),
        (".mlp.up_proj.weight", "feed_forward.w3.weight"),
        (".input_layernorm.weight", "attention_norm.weight"),
        (".post_attention_layernorm.weight", "ffn_norm.weight"),
    )
    for suffix, target in per_layer:
        if key.endswith(suffix):
            return f"layers.{layer}.{target}"

    if key.endswith("rotary_emb.inv_freq") or "lora" in key:
        return None

    print(layer, key)
    raise NotImplementedError
109
+
110
+
111
# Build the converted state dict: rename every key, skip the entries the
# translator maps to None, and un-permute the query/key projection weights.
new_state_dict = {}
for old_key, tensor in lora_model_sd.items():
    new_key = translate_state_dict_key(old_key)
    if new_key is None:
        continue
    is_query_or_key = "wq" in new_key or "wk" in new_key
    new_state_dict[new_key] = unpermute(tensor) if is_query_or_key else tensor

# Write the consolidated weights and the matching params.json side by side.
os.makedirs("./ckpt", exist_ok=True)
torch.save(new_state_dict, "./ckpt/consolidated.00.pth")
with open("./ckpt/params.json", "w") as params_file:
    json.dump(params, params_file)
SVFT-main/LLM-Adapters/finetune.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from typing import List
4
+
5
+ import fire
6
+ import torch
7
+ import argparse
8
+ import transformers
9
+ from datasets import load_dataset
10
+ from typing import List, Optional, Union
11
+
12
+ from tqdm import tqdm
13
+ import sys
14
+ from functools import partial, reduce
15
+ sys.path.append("../")
16
+ from svft.svft_layers import LinearWithSVFT, create_and_replace_modules, get_target_modules_list, replace_svft_with_fused_linear
17
+
18
+ """
19
+ Unused imports:
20
+ import torch.nn as nn
21
+ import bitsandbytes as bnb
22
+ """
23
+ sys.path.append(os.path.join(os.getcwd(), "peft/src/"))
24
+
25
+ from peft import ( # noqa: E402
26
+ LoraConfig, BOFTConfig, VeraConfig,
27
+ PrefixTuningConfig,
28
+ get_peft_model,
29
+ get_peft_model_state_dict,
30
+ set_peft_model_state_dict,
31
+ )
32
+ from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, AutoModel # noqa: F402
33
+
34
+
35
def train(
    # model/data params
    base_model: str = "",  # the only required argument
    data_path: str = "yahma/alpaca-cleaned",
    output_dir: str = "./lora-alpaca",
    adapter_name: str = "lora",
    load_8bit: bool = False,
    # training hyperparams
    batch_size: int = 128,
    micro_batch_size: int = 4,
    num_epochs: int = 3,
    learning_rate: float = 3e-4,
    cutoff_len: int = 256,
    val_set_size: int = 2000,
    use_gradient_checkpointing: bool = False,
    eval_step: int = 200,
    save_step: int = 200,
    # lora hyperparams
    lora_r: int = None,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    lora_target_modules: List[str] = None,
    # bottleneck adapter hyperparams
    bottleneck_size: int = 256,
    non_linearity: str = "tanh",
    adapter_dropout: float = 0.0,
    use_parallel_adapter: bool = False,
    use_adapterp: bool = False,
    target_modules: List[str] = None,
    scaling: Union[float, str] = 1.0,
    # prefix tuning hyperparams
    num_virtual_tokens: int = 30,
    # llm hyperparams
    train_on_inputs: bool = True,  # if False, masks out inputs in loss
    group_by_length: bool = False,  # faster, but produces an odd training loss curve
    # wandb params
    wandb_project: str = "",
    wandb_run_name: str = "",
    wandb_watch: str = "",  # options: false | gradients | all
    wandb_log_model: str = "",  # options: false | true
    resume_from_checkpoint: str = None,  # either training checkpoint or final adapter
    off_diag: int = 0,
    pattern: str = "banded",
    fill_orthonormal: bool = False,
):
    """Fine-tune a causal LM with the selected adapter and save the result.

    Loads ``base_model``, attaches the adapter chosen by ``adapter_name``,
    tokenizes the instruction dataset at ``data_path``, trains with the
    Hugging Face ``Trainer``, folds the adapter back into the base weights
    (except for ``full_ft``), and writes model and tokenizer to
    ``output_dir``.

    Args:
        base_model: model id or path passed to ``from_pretrained``.
        data_path: a ``.json`` file or a dataset name for ``load_dataset``.
        output_dir: where checkpoints and the final model are written.
        adapter_name: one of ``lora``, ``dora``, ``boft``, ``boft_r1``,
            ``vera``, ``svft`` or ``full_ft``.
        load_8bit: load the base model in 8-bit.
        batch_size / micro_batch_size: their ratio is the number of gradient
            accumulation steps (further divided by ``WORLD_SIZE`` under DDP).
        num_epochs, learning_rate: forwarded to ``TrainingArguments``.
        cutoff_len: maximum number of tokens per example.
        val_set_size: examples held out for evaluation; ``0`` disables
            evaluation.
        eval_step, save_step: evaluation and checkpoint intervals in steps.
        lora_r, lora_alpha, lora_dropout, lora_target_modules: adapter
            hyperparameters; ``lora_r`` and ``lora_target_modules`` are also
            used by the ``vera`` and ``svft`` adapters.
        train_on_inputs: if False, the prompt tokens are masked with ``-100``
            in the labels.
        group_by_length: forwarded to ``TrainingArguments``.
        wandb_project, wandb_run_name, wandb_watch, wandb_log_model:
            Weights & Biases settings; non-empty values are exported to the
            corresponding ``WANDB_*`` environment variables.
        resume_from_checkpoint: forwarded to ``Trainer.train``.
        off_diag, pattern, fill_orthonormal: SVFT layer options.

    ``use_gradient_checkpointing``, the bottleneck-adapter parameters,
    ``target_modules`` and ``num_virtual_tokens`` are accepted and (partly)
    echoed in the startup banner but are not otherwise used by this function.

    Raises:
        ValueError: if ``adapter_name`` is not one of the supported values.
    """
    print(
        f"Finetuning model with params:\n"
        f"base_model: {base_model}\n"
        f"data_path: {data_path}\n"
        f"output_dir: {output_dir}\n"
        f"batch_size: {batch_size}\n"
        f"micro_batch_size: {micro_batch_size}\n"
        f"num_epochs: {num_epochs}\n"
        f"learning_rate: {learning_rate}\n"
        f"cutoff_len: {cutoff_len}\n"
        f"val_set_size: {val_set_size}\n"
        f"use_gradient_checkpointing: {use_gradient_checkpointing}\n"
        f"lora_r: {lora_r}\n"
        f"lora_alpha: {lora_alpha}\n"
        f"lora_dropout: {lora_dropout}\n"
        f"lora_target_modules: {lora_target_modules}\n"
        f"bottleneck_size: {bottleneck_size}\n"
        f"non_linearity: {non_linearity}\n"
        f"adapter_dropout: {adapter_dropout}\n"
        f"use_parallel_adapter: {use_parallel_adapter}\n"
        f"use_adapterp: {use_adapterp}\n"
        f"train_on_inputs: {train_on_inputs}\n"
        f"scaling: {scaling}\n"
        f"adapter_name: {adapter_name}\n"
        f"target_modules: {target_modules}\n"
        f"group_by_length: {group_by_length}\n"
        f"wandb_project: {wandb_project}\n"
        f"wandb_run_name: {wandb_run_name}\n"
        f"wandb_watch: {wandb_watch}\n"
        f"wandb_log_model: {wandb_log_model}\n"
        f"resume_from_checkpoint: {resume_from_checkpoint}\n"
    )

    print(base_model)

    # Fail fast on an adapter this function cannot configure. Previously an
    # unknown name only surfaced as a NameError on `config`, after the model
    # had already been loaded.
    supported_adapters = (
        "lora", "dora", "boft", "boft_r1", "vera", "svft", "full_ft",
    )
    if adapter_name not in supported_adapters:
        raise ValueError(
            f"Unsupported adapter_name {adapter_name!r}; "
            f"expected one of {supported_adapters}"
        )

    gradient_accumulation_steps = batch_size // micro_batch_size

    device_map = "auto"
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
        gradient_accumulation_steps = gradient_accumulation_steps // world_size

    # Check if parameter passed or if set within environ
    use_wandb = len(wandb_project) > 0 or (
        "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
    )
    # Only overwrite environ if wandb param passed. The values must be the
    # caller's strings: os.environ rejects non-str values (assigning `False`
    # raised TypeError), and hard-coded values silently ignored the arguments.
    if len(wandb_project) > 0:
        os.environ["WANDB_PROJECT"] = str(wandb_project)
    if len(wandb_watch) > 0:
        os.environ["WANDB_WATCH"] = str(wandb_watch)
    if len(wandb_log_model) > 0:
        os.environ["WANDB_LOG_MODEL"] = str(wandb_log_model)

    if load_8bit:
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map=device_map,
            trust_remote_code=True,
        )
    else:
        # Full-precision load pinned to this process's local device.
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=False,
            torch_dtype=torch.float32,
            device_map={"": int(os.environ.get("LOCAL_RANK") or 0)},
            trust_remote_code=True,
        )

    if model.config.model_type == "llama":
        # Due to the name of transformers' LlamaTokenizer, we have to do this
        tokenizer = LlamaTokenizer.from_pretrained(base_model)
    else:
        tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    # unk. we want this to be different from the eos token
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, add_eos_token=True):
        """Tokenize ``prompt``, append EOS when there is room, add labels."""
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            if "chatglm" not in base_model:
                result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        if "chatglm" in base_model:
            # chatglm examples are returned without an attention mask.
            return {"input_ids": result["input_ids"], "labels": result["labels"]}
        return result

    def generate_and_tokenize_prompt(data_point):
        """Render one example as a prompt and tokenize it for training."""
        full_prompt = generate_prompt(data_point)
        tokenized_full_prompt = tokenize(full_prompt)
        if not train_on_inputs:
            # Mask the prompt part of the labels so the loss only covers the
            # response tokens.
            user_prompt = generate_prompt({**data_point, "output": ""})
            tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            tokenized_full_prompt["labels"] = (
                [-100] * user_prompt_len
                + tokenized_full_prompt["labels"][user_prompt_len:]
            )
        return tokenized_full_prompt

    # Build the PEFT config for the baseline adapters; svft and full_ft do not
    # go through get_peft_model and leave it as None.
    config = None
    if adapter_name == "lora":
        config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=lora_target_modules,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        )
    elif adapter_name == "dora":
        config = LoraConfig(
            use_dora=True,
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=lora_target_modules,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        )
    elif adapter_name == "boft":
        config = BOFTConfig(
            boft_block_size=8,
            boft_n_butterfly_factor=2,
            target_modules=lora_target_modules,
            boft_dropout=0.05,
            bias="boft_only",
        )
    elif adapter_name == "boft_r1":
        config = BOFTConfig(
            boft_block_size=1,
            boft_n_butterfly_factor=1,
            target_modules=lora_target_modules,
            boft_dropout=0.05,
            bias="boft_only",
        )
    elif adapter_name == "vera":
        config = VeraConfig(r=lora_r, target_modules=lora_target_modules)

    if adapter_name == 'svft':
        # for SVFT turn off gradient requirement for all layers
        # PEFT library handles this internally
        for param in model.parameters():
            param.requires_grad = False

        print(f"Target Modules: {lora_target_modules}")
        assign_svft_layer = partial(
            LinearWithSVFT,
            off_diag=off_diag,
            pattern=pattern,
            rank=lora_r,
            fill_orthonormal=fill_orthonormal,
        )

        create_and_replace_modules(
            model,
            get_target_modules_list(model, lora_target_modules),
            assign_svft_layer,
        )
    elif adapter_name == "full_ft":
        pass
    else:
        # for baseline peft models
        model = get_peft_model(model, config)

    if data_path.endswith(".json"):  # todo: support jsonl
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)

    print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
    print(f"Output Dir: {output_dir}")

    if val_set_size > 0:
        train_val = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=42
        )
        train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    if not ddp and torch.cuda.device_count() > 1:
        # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
        model.is_parallelizable = True
        model.model_parallel = True

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            bf16=True,
            logging_steps=10,
            optim="adamw_torch",
            evaluation_strategy="steps" if val_set_size > 0 else "no",
            save_strategy="steps",
            eval_steps=eval_step if val_set_size > 0 else None,
            save_steps=save_step,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=False,
            ddp_find_unused_parameters=False if ddp else None,
            group_by_length=group_by_length,
            # NOTE(review): in some transformers versions report_to=None
            # falls back to the default reporting targets rather than
            # disabling reporting — confirm whether "none" is intended here.
            report_to="wandb" if use_wandb else None,
            run_name=wandb_run_name if use_wandb else None,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    model.config.use_cache = False

    # boft and svft keep the float32 weights they were loaded with.
    if adapter_name not in ['boft', 'svft']:
        model = model.bfloat16()

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    model.generation_config.temperature = 1.0
    model.generation_config.top_p = 1.0

    # Fold the trained adapter back into plain weights before saving.
    if adapter_name == 'svft':
        replace_svft_with_fused_linear(
            model, get_target_modules_list(model, lora_target_modules)
        )
    elif adapter_name == "full_ft":
        pass
    else:
        model = model.merge_and_unload()

    # Make every parameter contiguous before serialization.
    for param in model.parameters():
        param.data = param.data.contiguous()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(
        "\n If there's a warning about missing keys above, please disregard :)"
    )
357
+
358
+
359
+ def generate_prompt(data_point):
360
+ # sorry about the formatting disaster gotta move fast
361
+ if data_point["input"]:
362
+ return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
363
+
364
+ ### Instruction:
365
+ {data_point["instruction"]}
366
+
367
+ ### Input:
368
+ {data_point["input"]}
369
+
370
+ ### Response:
371
+ {data_point["output"]}""" # noqa: E501
372
+ else:
373
+ return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
374
+
375
+ ### Instruction:
376
+ {data_point["instruction"]}
377
+
378
+ ### Response:
379
+ {data_point["output"]}""" # noqa: E501
380
+
381
+
382
def _float_or_str(value):
    """argparse converter: return ``value`` as a float when possible, else as the raw string."""
    try:
        return float(value)
    except ValueError:
        return value


def parse_args():
    """Parse the command-line options mirroring the parameters of ``train``.

    Returns:
        argparse.Namespace whose attribute names match ``train``'s keyword
        arguments, so it can be forwarded as ``train(**vars(args))``.
    """
    parser = argparse.ArgumentParser(description='Train a model')

    # model/data params
    parser.add_argument('--base_model', type=str, required=True, help='Base model')
    parser.add_argument('--data_path', type=str, default='yahma/alpaca-cleaned', help='Data path')
    parser.add_argument('--output_dir', type=str, default='./lora-alpaca', help='Output directory')
    parser.add_argument('--adapter_name', type=str, default='lora', help='Adapter name')
    parser.add_argument('--load_8bit', action='store_true', help='Load 8-bit')

    # training hyperparams
    parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
    parser.add_argument('--micro_batch_size', type=int, default=4, help='Micro batch size')
    parser.add_argument('--num_epochs', type=int, default=3, help='Number of epochs')
    parser.add_argument('--learning_rate', type=float, default=3e-4, help='Learning rate')
    parser.add_argument('--cutoff_len', type=int, default=256, help='Cutoff length')
    parser.add_argument('--val_set_size', type=int, default=2000, help='Validation set size')
    parser.add_argument('--use_gradient_checkpointing', action='store_true', help='Use gradient checkpointing')
    parser.add_argument('--eval_step', type=int, default=200, help='Evaluation step')
    parser.add_argument('--save_step', type=int, default=200, help='Save step')

    # lora hyperparams
    parser.add_argument('--lora_r', type=int, default=8, help='Lora r')
    parser.add_argument('--lora_alpha', type=int, default=16, help='Lora alpha')
    parser.add_argument('--lora_dropout', type=float, default=0.05, help='Lora dropout')
    parser.add_argument('--lora_target_modules', nargs='+', help='Lora target modules')

    # bottleneck adapter hyperparams
    parser.add_argument('--bottleneck_size', type=int, default=256, help='Bottleneck size')
    parser.add_argument('--non_linearity', type=str, default='tanh', help='Non-linearity')
    parser.add_argument('--adapter_dropout', type=float, default=0.0, help='Adapter dropout')
    parser.add_argument('--use_parallel_adapter', action='store_true', help='Use parallel adapter')
    parser.add_argument('--use_adapterp', action='store_true', help='Use adapterp')
    parser.add_argument('--target_modules', nargs='+', help='Target modules')
    # `type` must be a callable; a typing.Union is not, so any explicit
    # --scaling value used to be rejected.
    parser.add_argument('--scaling', type=_float_or_str, default=1.0, help='Scaling')

    # prefix tuning hyperparams
    parser.add_argument('--num_virtual_tokens', type=int, default=30, help='Number of virtual tokens')

    # llm hyperparams
    # NOTE(review): store_true makes this default to False here, whereas
    # train() defaults train_on_inputs to True — confirm which is intended.
    parser.add_argument('--train_on_inputs', action='store_true', help='Train on inputs')
    parser.add_argument('--group_by_length', action='store_true', help='Group by length')

    # wandb params
    parser.add_argument('--wandb_project', type=str, default='', help='Wandb project')
    parser.add_argument('--wandb_run_name', type=str, default='', help='Wandb run name')
    parser.add_argument('--wandb_watch', type=str, default='', help='Wandb watch')
    parser.add_argument('--wandb_log_model', type=str, default='', help='Wandb log model')
    parser.add_argument('--resume_from_checkpoint', type=str, help='Resume from checkpoint')

    # svft hyperparams (same defaults as train())
    parser.add_argument('--off_diag', type=int, default=0, help='SVFT off-diagonal count')
    parser.add_argument('--pattern', type=str, default='banded', help='SVFT sparsity pattern')
    parser.add_argument('--fill_orthonormal', action='store_true', help='SVFT fill orthonormal')

    return parser.parse_args()
433
+
434
+ if __name__ == "__main__":
435
+ fire.Fire(train)
436
+
437
+ # args = parse_args()
438
+ # train(**vars(args))
SVFT-main/LLM-Adapters/ft-training_set/commonsense_15k.json ADDED
The diff for this file is too large to render. See raw diff
 
SVFT-main/LLM-Adapters/generate.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ import fire
5
+ import gradio as gr
6
+ import torch
7
+ import transformers
8
+ from peft import PeftModel
9
+ from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
10
+
11
# Pick the inference device: CUDA when available, otherwise CPU, with Apple
# MPS taking precedence whenever the backend reports itself as available.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    # Best-effort probe (e.g. torch builds without an MPS backend). Unlike a
    # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
    pass
21
+
22
+
23
def main(
    load_8bit: bool = False,
    base_model: str = "",
    lora_weights: str = "tloen/alpaca-lora-7b",
    share_gradio: bool = False,
):
    """Load a LLaMA model with a LoRA adapter and serve it through Gradio.

    Args:
        load_8bit: load the base model in 8-bit (only applied on CUDA).
        base_model: model id or path of the base LLaMA checkpoint (required).
        lora_weights: id or path of the LoRA adapter to apply.
        share_gradio: forwarded to ``launch(share=...)``.

    Raises:
        ValueError: if ``base_model`` is empty.
    """
    if not base_model:
        # Explicit raise instead of `assert`, which is stripped under -O.
        raise ValueError(
            "Please specify a --base_model, e.g. --base_model='decapoda-research/llama-7b-hf'"
        )

    tokenizer = LlamaTokenizer.from_pretrained(base_model)
    if device == "cuda":
        model = LlamaForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        model = PeftModel.from_pretrained(
            model,
            lora_weights,
            torch_dtype=torch.float16,
        )
    elif device == "mps":
        model = LlamaForCausalLM.from_pretrained(
            base_model,
            device_map={"": device},
            torch_dtype=torch.float16,
        )
        model = PeftModel.from_pretrained(
            model,
            lora_weights,
            device_map={"": device},
            torch_dtype=torch.float16,
        )
    else:
        model = LlamaForCausalLM.from_pretrained(
            base_model, device_map={"": device}, low_cpu_mem_usage=True
        )
        model = PeftModel.from_pretrained(
            model,
            lora_weights,
            device_map={"": device},
        )

    # unwind broken decapoda-research config
    model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_8bit:
        model.half()  # seems to fix bugs for some users.

    model.eval()
    # Compare the major version numerically; comparing the version strings
    # is lexicographic and would rank "10.0" below "2".
    torch_major = int(torch.__version__.split(".")[0])
    if torch_major >= 2 and sys.platform != "win32":
        model = torch.compile(model)

    def evaluate(
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=4,
        max_new_tokens=128,
        **kwargs,
    ):
        """Generate a response for one instruction/input pair.

        Returns the decoded text that follows the "### Response:" marker.
        """
        prompt = generate_prompt(instruction, input)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s)
        return output.split("### Response:")[1].strip()

    gr.Interface(
        fn=evaluate,
        inputs=[
            gr.components.Textbox(
                lines=2,
                label="Instruction",
                placeholder="Tell me about alpacas.",
            ),
            gr.components.Textbox(lines=2, label="Input", placeholder="none"),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.1, label="Temperature"
            ),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.75, label="Top p"
            ),
            gr.components.Slider(
                minimum=0, maximum=100, step=1, value=40, label="Top k"
            ),
            gr.components.Slider(
                minimum=1, maximum=4, step=1, value=4, label="Beams"
            ),
            gr.components.Slider(
                minimum=1, maximum=2000, step=1, value=128, label="Max tokens"
            ),
        ],
        outputs=[
            # Same component namespace as the inputs above, instead of the
            # legacy `gr.inputs` module.
            gr.components.Textbox(
                lines=5,
                label="Output",
            )
        ],
        title="LLM-Adapters",
        description="This is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation.",  # noqa: E501
    ).launch(share=share_gradio)
166
+
167
+
168
+ def generate_prompt(instruction, input=None):
169
+ if input:
170
+ return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
171
+
172
+ ### Instruction:
173
+ {instruction}
174
+
175
+ ### Input:
176
+ {input}
177
+
178
+ ### Response:
179
+ """ # noqa: E501
180
+ else:
181
+ return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
182
+
183
+ ### Instruction:
184
+ {instruction}
185
+
186
+ ### Response:
187
+ """ # noqa: E501
188
+
189
+
190
+ if __name__ == "__main__":
191
+ fire.Fire(main)
SVFT-main/LLM-Adapters/lengths.ipynb ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/home/eric/miniconda3/envs/dl3/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n",
14
+ "Found cached dataset json (/home/eric/.cache/huggingface/datasets/json/default-789f51900889f651/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
15
+ "100%|██████████| 1/1 [00:00<00:00, 784.28it/s]\n",
16
+ "Loading cached processed dataset at /home/eric/.cache/huggingface/datasets/json/default-789f51900889f651/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-f691ee34ec2034cb.arrow\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "from datasets import load_dataset\n",
22
+ "from transformers import LlamaTokenizer\n",
23
+ "\n",
24
+ "\n",
25
+ "tokenizer = LlamaTokenizer.from_pretrained(\n",
26
+ " \"decapoda-research/llama-7b-hf\", add_eos_token=True\n",
27
+ ")\n",
28
+ "tokenizer.pad_token = tokenizer.eos_token\n",
29
+ "tokenizer.pad_token_id = tokenizer.eos_token_id\n",
30
+ "\n",
31
+ "data = load_dataset(\"json\", data_files=\"alpaca_data.json\")\n",
32
+ "\n",
33
+ "\n",
34
+ "def generate_prompt(data_point):\n",
35
+ " # sorry about the formatting disaster gotta move fast\n",
36
+ " if data_point[\"input\"]:\n",
37
+ " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
38
+ "\n",
39
+ "### Instruction:\n",
40
+ "{data_point[\"instruction\"]}\n",
41
+ "\n",
42
+ "### Input:\n",
43
+ "{data_point[\"input\"]}\n",
44
+ "\n",
45
+ "### Response:\n",
46
+ "{data_point[\"output\"]}\"\"\"\n",
47
+ " else:\n",
48
+ " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
49
+ "\n",
50
+ "### Instruction:\n",
51
+ "{data_point[\"instruction\"]}\n",
52
+ "\n",
53
+ "### Response:\n",
54
+ "{data_point[\"output\"]}\"\"\"\n",
55
+ "\n",
56
+ "\n",
57
+ "data = data.map(\n",
58
+ " lambda data_point: {\"prompt\": tokenizer(generate_prompt(data_point))}\n",
59
+ ")"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 2,
65
+ "metadata": {},
66
+ "outputs": [
67
+ {
68
+ "data": {
69
+ "text/plain": [
70
+ "<matplotlib.lines.Line2D at 0x7f6f1af20af0>"
71
+ ]
72
+ },
73
+ "execution_count": 2,
74
+ "metadata": {},
75
+ "output_type": "execute_result"
76
+ },
77
+ {
78
+ "data": {
79
+ "image/png": "",
80
+ "text/plain": [
81
+ "<Figure size 640x480 with 1 Axes>"
82
+ ]
83
+ },
84
+ "metadata": {},
85
+ "output_type": "display_data"
86
+ }
87
+ ],
88
+ "source": [
89
+ "import matplotlib.pyplot as plt\n",
90
+ "\n",
91
+ "lens = [len(x[\"prompt\"][\"input_ids\"]) for x in data[\"train\"]]\n",
92
+ "plt.hist(lens, bins=100)\n",
93
+ "plt.title(\"Distribution of prompt lengths\")\n",
94
+ "plt.axvline(256, color=\"red\")"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 3,
100
+ "metadata": {},
101
+ "outputs": [
102
+ {
103
+ "data": {
104
+ "text/plain": [
105
+ "<matplotlib.lines.Line2D at 0x7f6eef316ce0>"
106
+ ]
107
+ },
108
+ "execution_count": 3,
109
+ "metadata": {},
110
+ "output_type": "execute_result"
111
+ },
112
+ {
113
+ "data": {
114
+ "image/png": "",
115
+ "text/plain": [
116
+ "<Figure size 640x480 with 1 Axes>"
117
+ ]
118
+ },
119
+ "metadata": {},
120
+ "output_type": "display_data"
121
+ }
122
+ ],
123
+ "source": [
124
+ "plt.plot([len([l for l in lens if l <= m]) for m in range(max(lens) + 1)])\n",
125
+ "plt.title(\"Number of fully covered examples as a function of max length\")\n",
126
+ "plt.axvline(x=256, color=\"red\")"
127
+ ]
128
+ },
129
+ {
130
+ "attachments": {},
131
+ "cell_type": "markdown",
132
+ "metadata": {},
133
+ "source": [
134
+ "Percentage of tokens left out:"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 4,
140
+ "metadata": {},
141
+ "outputs": [
142
+ {
143
+ "data": {
144
+ "text/plain": [
145
+ "<matplotlib.lines.Line2D at 0x7f6eef392020>"
146
+ ]
147
+ },
148
+ "execution_count": 4,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
+ },
152
+ {
153
+ "data": {
154
+ "image/png": "",
155
+ "text/plain": [
156
+ "<Figure size 640x480 with 1 Axes>"
157
+ ]
158
+ },
159
+ "metadata": {},
160
+ "output_type": "display_data"
161
+ }
162
+ ],
163
+ "source": [
164
+ "plt.plot([sum(min(l, m) for l in lens) for m in range(max(lens) + 1)])\n",
165
+ "plt.title(\"Token coverage as a function of max length\")\n",
166
+ "plt.axvline(x=256, color=\"red\")"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": []
175
+ }
176
+ ],
177
+ "metadata": {
178
+ "kernelspec": {
179
+ "display_name": "dl3",
180
+ "language": "python",
181
+ "name": "python3"
182
+ },
183
+ "language_info": {
184
+ "codemirror_mode": {
185
+ "name": "ipython",
186
+ "version": 3
187
+ },
188
+ "file_extension": ".py",
189
+ "mimetype": "text/x-python",
190
+ "name": "python",
191
+ "nbconvert_exporter": "python",
192
+ "pygments_lexer": "ipython3",
193
+ "version": "3.10.8"
194
+ },
195
+ "orig_nbformat": 4,
196
+ "vscode": {
197
+ "interpreter": {
198
+ "hash": "90bfda469df5ac7fed8d7e225d563f60a7a7aa420ccfadb091c914debf775e49"
199
+ }
200
+ }
201
+ },
202
+ "nbformat": 4,
203
+ "nbformat_minor": 2
204
+ }
SVFT-main/LLM-Adapters/mathqa.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from datasets import load_dataset
4
+
5
# Download (or load from the local cache) the MathQA dataset; only its "test" split is used below.
dataset = load_dataset("math_qa")
save_path = "dataset/mathqa/test.json"

# exist_ok=True replaces the separate os.path.exists() check, which was redundant
# and racy (the directory could appear between the check and the creation).
os.makedirs("dataset/mathqa/", exist_ok=True)
10
+
11
+
12
def writer(data, save_path):
    """Serialize ``data`` as JSON (4-space indent) into ``save_path``, overwriting any existing file."""
    serialized = json.dumps(data, indent=4)
    with open(save_path, "w") as handle:
        handle.write(serialized)
15
+
16
def _uppercase_option_labels(options):
    """Upper-case only the option labels ("a )" -> "A )"), leaving the option text untouched.

    The previous chain of ``str.replace`` calls upper-cased *every* a-f letter in the
    string, so an option such as "e ) none of these" became "E ) nonE oF thEsE".
    """
    # Function-scope import keeps this block self-contained (the file does not import re).
    import re

    # NOTE(review): assumes labels are a single letter followed by " )" — confirm against the dataset.
    return re.sub(r"\b([a-f]) \)", lambda match: match.group(1).upper() + " )", options)


# Convert every MathQA test sample into the instruction/input/output/answer
# record layout and write the whole list out as one JSON file.
test_data = []
for sample in dataset["test"]:
    options = _uppercase_option_labels(sample["options"])
    test_data.append({
        "instruction": f"{sample['Problem']} The options: {options}",
        "input": "",
        "output": "",
        # The gold label is upper-cased so it matches the upper-cased option labels.
        "answer": sample["correct"].upper(),
    })

writer(test_data, save_path)
27
+
SVFT-main/LLM-Adapters/multi_dataset_eval.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from concurrent.futures import ProcessPoolExecutor
2
+ import queue
3
+ import subprocess
4
+ import os
5
+
6
def evaluate(dataset, gpu):
    """
    Run the commonsense evaluation script for one dataset, pinned to one GPU.

    The script is launched through the shell with ``CUDA_VISIBLE_DEVICES`` set to ``gpu``; its
    output is piped through ``tee -a`` so it is shown live and appended to
    ``results/<model_name>/<dataset>.txt``.

    Args:
        dataset (`str`): Name of the dataset passed to ``--dataset``.
        gpu (`int`): GPU index exported as ``CUDA_VISIBLE_DEVICES``.

    Returns:
        The ``gpu`` value that was passed in, so the caller can hand it to the next task.
    """
    print('*******dataset:', dataset)
    model_name = "Pythia_2B_143000_SVFT_CR15K"
    save_dir = "results/" + model_name

    # exist_ok=True makes concurrent creation by several worker processes safe; a genuine
    # failure is reported instead of being silently swallowed by a bare ``except``.
    try:
        os.makedirs(save_dir, exist_ok=True)
    except OSError as err:
        print(f"Warning: could not create {save_dir}: {err}")

    save_path = os.path.join(save_dir, dataset + ".txt")
    command = (
        f"CUDA_VISIBLE_DEVICES={gpu} python commonsense_evaluate_latest.py "
        "--model LLaMA-7B "
        "--adapter LoRA "
        f"--dataset {dataset} "
        f"--base_model './{model_name}' "
        f"--batch_size 1 | tee -a {save_path}"
    )

    # shell=True is required for the environment-variable prefix and the pipe into tee.
    # Output is not captured (tee already prints it), so there is no ``stdout`` to report here;
    # the old message interpolated ``result.stdout``, which was always None.
    subprocess.run(command, shell=True, text=True, capture_output=False)
    print(f"Evaluation for dataset {dataset} on GPU {gpu} finished; output appended to {save_path}")
    return gpu
28
+
29
+
30
datasets = ["boolq", "social_i_qa", "piqa", "ARC-Easy", "ARC-Challenge", "winogrande", "openbookqa", "hellaswag"]

# One entry per concurrent worker slot; repeated ids mean several evaluations share that GPU.
gpus = [0, 0, 0, 0]

# The pool must only be created when this file is run as a script: under the "spawn"
# start method each worker re-imports this module, and an unguarded pool would be
# re-created recursively in every child.
if __name__ == "__main__":
    tasks_queue = queue.Queue()
    gpu_queue = queue.Queue()

    for gpu in gpus:
        gpu_queue.put(gpu)
    for task in datasets:
        tasks_queue.put(task)

    num_processes = min(len(datasets), len(gpus))  # number of processes to run in parallel

    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        # Start one evaluation per available GPU slot.
        futures = [executor.submit(evaluate, tasks_queue.get(), gpu_queue.get()) for _ in range(num_processes)]
        # ``futures`` grows while it is being iterated: each finished job returns its GPU
        # slot, which is immediately reused for the next pending dataset.
        for future in futures:
            gpu_id = future.result()
            gpu_queue.put(gpu_id)
            if tasks_queue.qsize() > 0:
                futures.append(executor.submit(evaluate, tasks_queue.get(), gpu_queue.get()))
SVFT-main/LLM-Adapters/peft/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
SVFT-main/LLM-Adapters/peft/Makefile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: quality style test docs
2
+
3
+ check_dirs := src tests examples
4
+
5
+ # Check that source code meets quality standards
6
+
7
+ # this target runs checks on all files
8
+ quality:
9
+ black --check $(check_dirs)
10
+ ruff $(check_dirs)
11
+ doc-builder style src tests --max_len 119 --check_only
12
+
13
+ # Format source code automatically and check if there are any problems left that need manual fixing
14
+ style:
15
+ black $(check_dirs)
16
+ ruff $(check_dirs) --fix
17
+ doc-builder style src tests --max_len 119
18
+
19
+ test:
20
+ pytest tests/
SVFT-main/LLM-Adapters/peft/pyproject.toml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.black]
2
+ line-length = 119
3
+ target-version = ['py36']
4
+
5
+ [tool.ruff]
6
+ ignore = ["C901", "E501", "E741", "W605"]
7
+ select = ["C", "E", "F", "I", "W"]
8
+ line-length = 119
9
+
10
+ [tool.ruff.isort]
11
+ lines-after-imports = 2
12
+ known-first-party = ["peft"]
13
+
14
+ [isort]
15
+ default_section = "FIRSTPARTY"
16
+ known_first_party = "peft"
17
+ known_third_party = [
18
+ "numpy",
19
+ "torch",
20
+ "accelerate",
21
+ "transformers",
22
+ ]
23
+ line_length = 119
24
+ lines_after_imports = 2
25
+ multi_line_output = 3
26
+ include_trailing_comma = true
27
+ force_grid_wrap = 0
28
+ use_parentheses = true
29
+ ensure_newline_before_comments = true
30
+
31
+ [tool.pytest]
32
+ doctest_optionflags = [
33
+ "NORMALIZE_WHITESPACE",
34
+ "ELLIPSIS",
35
+ "NUMBER",
36
+ ]
SVFT-main/LLM-Adapters/peft/setup.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from setuptools import find_packages, setup
16
+
17
+ extras = {}
18
+ extras["quality"] = ["black ~= 22.0", "ruff>=0.0.241"]
19
+ extras["docs_specific"] = ["hf-doc-builder"]
20
+ extras["dev"] = extras["quality"] + extras["docs_specific"]
21
+ extras["test"] = extras["dev"] + ["pytest", "pytest-xdist", "parameterized"]
22
+
23
+ setup(
24
+ name="peft",
25
+ version="0.3.0.dev0",
26
+ description="Parameter-Efficient Fine-Tuning (PEFT)",
27
+ license_files=["LICENSE"],
28
+ keywords="deep learning",
29
+ license="Apache",
30
+ author="The AGI-Edgerunners team",
31
+ author_email="[email protected]",
32
+ url="https://github.com/AGI-Edgerunners/LLM-Adapters",
33
+ package_dir={"": "src"},
34
+ packages=find_packages("src"),
35
+ entry_points={},
36
+ python_requires=">=3.7.0",
37
+ install_requires=[
38
+ "numpy>=1.17",
39
+ "packaging>=20.0",
40
+ "psutil",
41
+ "pyyaml",
42
+ "torch>=1.13.0",
43
+ "transformers",
44
+ "accelerate",
45
+ ],
46
+ extras_require=extras,
47
+ classifiers=[
48
+ "Development Status :: 5 - Production/Stable",
49
+ "Intended Audience :: Developers",
50
+ "Intended Audience :: Education",
51
+ "Intended Audience :: Science/Research",
52
+ "License :: OSI Approved :: Apache Software License",
53
+ "Operating System :: OS Independent",
54
+ "Programming Language :: Python :: 3",
55
+ "Programming Language :: Python :: 3.7",
56
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
57
+ ],
58
+ )
59
+
60
+ # Release checklist
61
+ # 1. Change the version in __init__.py and setup.py.
62
+ # 2. Commit these changes with the message: "Release: VERSION"
63
+ # 3. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' "
64
+ # Push the tag to git: git push --tags origin main
65
+ # 4. Run the following commands in the top-level directory:
66
+ # python setup.py bdist_wheel
67
+ # python setup.py sdist
68
+ # 5. Upload the package to the pypi test server first:
69
+ # twine upload dist/* -r pypitest
70
+ # twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
71
+ # 6. Check that you can install it in a virtualenv by running:
72
+ # pip install -i https://testpypi.python.org/pypi peft
73
+ # 7. Upload the final version to actual pypi:
74
+ # twine upload dist/* -r pypi
75
+ # 8. Add release notes to the tag in github once everything is looking hunky-dory.
76
+ # 9. Update the version in __init__.py, setup.py to the new version "-dev" and push to master
SVFT-main/LLM-Adapters/peft/src/peft/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+ # There's no way to ignore "F401 '...' imported but unused" warnings in this
3
+ # module, but to preserve other warnings. So, don't check this module at all.
4
+
5
+ # coding=utf-8
6
+ # Copyright 2023-present the HuggingFace Inc. team.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ __version__ = "0.3.0.dev0"
21
+
22
+ from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING, get_peft_config, get_peft_model
23
+ from .peft_model import (
24
+ PeftModel,
25
+ PeftModelForCausalLM,
26
+ PeftModelForSeq2SeqLM,
27
+ PeftModelForSequenceClassification,
28
+ PeftModelForTokenClassification,
29
+ )
30
+ from .tuners import (
31
+ LoraConfig,
32
+ LoraModel,
33
+ BottleneckConfig,
34
+ BottleneckModel,
35
+ PrefixEncoder,
36
+ PrefixTuningConfig,
37
+ PromptEmbedding,
38
+ PromptEncoder,
39
+ PromptEncoderConfig,
40
+ PromptEncoderReparameterizationType,
41
+ PromptTuningConfig,
42
+ PromptTuningInit,
43
+ )
44
+ from .utils import (
45
+ TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
46
+ PeftConfig,
47
+ PeftType,
48
+ PromptLearningConfig,
49
+ TaskType,
50
+ bloom_model_postprocess_past_key_value,
51
+ get_peft_model_state_dict,
52
+ prepare_model_for_int8_training,
53
+ set_peft_model_state_dict,
54
+ shift_tokens_right,
55
+ )
SVFT-main/LLM-Adapters/peft/src/peft/mapping.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from .peft_model import (
17
+ PeftModel,
18
+ PeftModelForCausalLM,
19
+ PeftModelForSeq2SeqLM,
20
+ PeftModelForSequenceClassification,
21
+ PeftModelForTokenClassification,
22
+ )
23
+ from .tuners import LoraConfig, PrefixTuningConfig, PromptEncoderConfig, PromptTuningConfig, BottleneckConfig
24
+ from .utils import PromptLearningConfig
25
+
26
+
27
# Task-type string -> PeftModel subclass used for that task.
# NOTE(review): keys presumably mirror the TaskType values — confirm against utils.
MODEL_TYPE_TO_PEFT_MODEL_MAPPING = {
    "SEQ_CLS": PeftModelForSequenceClassification,
    "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
    "CAUSAL_LM": PeftModelForCausalLM,
    "TOKEN_CLS": PeftModelForTokenClassification,
}

# "peft_type" string -> config class; get_peft_config() looks up config_dict["peft_type"] here.
PEFT_TYPE_TO_CONFIG_MAPPING = {
    "PROMPT_TUNING": PromptTuningConfig,
    "PREFIX_TUNING": PrefixTuningConfig,
    "P_TUNING": PromptEncoderConfig,
    "LORA": LoraConfig,
    "BOTTLENECK": BottleneckConfig,
}

# model_type -> default LoRA target module names; used by _prepare_lora_config()
# when peft_config.target_modules is None.
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
    "t5": ["q", "v"],
    "mt5": ["q", "v"],
    "bart": ["q_proj", "v_proj"],
    "gpt2": ["c_attn"],
    "bloom": ["query_key_value"],
    "opt": ["q_proj", "v_proj"],
    "gptj": ["q_proj", "v_proj"],
    "gpt_neox": ["query_key_value"],
    "gpt_neo": ["q_proj", "v_proj"],
    "bert": ["query", "value"],
    "roberta": ["query", "value"],
    "xlm-roberta": ["query", "value"],
    "electra": ["query", "value"],
    "deberta-v2": ["query_proj", "value_proj"],
    "deberta": ["in_proj"],
    "layoutlm": ["query", "value"],
    "llama": ["q_proj", "v_proj"],
    "chatglm": ["query_key_value"],
}

# model_type -> module names for the three bottleneck-style adapter tables below.
# NOTE(review): their consumer is not visible in this part of the file; presumably
# _prepare_bottleneck_config() picks one of them as the default target_modules — confirm.
TRANSFORMERS_MODELS_TO_BOTTLENECK_TARGET_MODULES_MAPPING = {
    "bloom": ["dense_h_to_4h", "dense_4h_to_h"],
    "gptj": ["fc_in", "fc_out"],
    "gpt_neo": ["c_fc", "c_proj"],
    "llama": ["gate_proj", "up_proj", "down_proj"],
    "opt": ["fc1", "fc2"],
    "chatglm": ["dense_h_to_4h", "dense_4h_to_h"],
}

TRANSFORMERS_MODELS_TO_ADAPTERP_TARGET_MODULES_MAPPING = {
    "bloom": ["dense_4h_to_h"],
    "gptj": ["fc_out"],
    "gpt_neo": ["c_proj"],
    "llama": ["down_proj"],
    "opt": ["fc2"],
    "chatglm": ["dense_4h_to_h"],
}

TRANSFORMERS_MODELS_TO_PARALLEL_TARGET_MODULES_MAPPING = {
    "bloom": ["query_key_value"],
    "gptj": ["q_proj", "v_proj", "k_proj"],
    "gpt_neo": ["q_proj", "v_proj", "k_proj"],
    "llama": ["q_proj", "v_proj", "k_proj"],
    "opt": ["q_proj", "v_proj", "k_proj"],
    "chatglm": ["query_key_value"],
}
89
+
90
+
91
+
92
def get_peft_config(config_dict):
    """
    Build a Peft config object from a plain dictionary.

    The config class is selected through the dictionary's `"peft_type"` entry and is then instantiated with every
    entry of the dictionary as a keyword argument.

    Args:
        config_dict (`Dict[str, Any]`): Dictionary containing the configuration parameters.
    """
    config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[config_dict["peft_type"]]
    return config_cls(**config_dict)
101
+
102
+
103
+ def _prepare_prompt_learning_config(peft_config, model_config):
104
+ if peft_config.num_layers is None:
105
+ if "num_hidden_layers" in model_config:
106
+ num_layers = model_config["num_hidden_layers"]
107
+ elif "num_layers" in model_config:
108
+ num_layers = model_config["num_layers"]
109
+ elif "n_layer" in model_config:
110
+ num_layers = model_config["n_layer"]
111
+ else:
112
+ raise ValueError("Please specify `num_layers` in `peft_config`")
113
+ peft_config.num_layers = num_layers
114
+
115
+ if peft_config.token_dim is None:
116
+ if "hidden_size" in model_config:
117
+ token_dim = model_config["hidden_size"]
118
+ elif "n_embd" in model_config:
119
+ token_dim = model_config["n_embd"]
120
+ elif "d_model" in model_config:
121
+ token_dim = model_config["d_model"]
122
+ else:
123
+ raise ValueError("Please specify `token_dim` in `peft_config`")
124
+ peft_config.token_dim = token_dim
125
+
126
+ if peft_config.num_attention_heads is None:
127
+ if "num_attention_heads" in model_config:
128
+ num_attention_heads = model_config["num_attention_heads"]
129
+ elif "n_head" in model_config:
130
+ num_attention_heads = model_config["n_head"]
131
+ elif "num_heads" in model_config:
132
+ num_attention_heads = model_config["num_heads"]
133
+ elif "encoder_attention_heads" in model_config:
134
+ num_attention_heads = model_config["encoder_attention_heads"]
135
+ else:
136
+ raise ValueError("Please specify `num_attention_heads` in `peft_config`")
137
+ peft_config.num_attention_heads = num_attention_heads
138
+
139
+ if getattr(peft_config, "encoder_hidden_size", None) is None:
140
+ setattr(peft_config, "encoder_hidden_size", token_dim)
141
+
142
+ return peft_config
143
+
144
+
145
+ def _prepare_lora_config(peft_config, model_config):
146
+ if peft_config.target_modules is None:
147
+ if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
148
+ raise ValueError("Please specify `target_modules` in `peft_config`")
149
+ peft_config.target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
150
+ if len(peft_config.target_modules) == 1:
151
+ peft_config.fan_in_fan_out = True
152
+ peft_config.enable_lora = [True, False, True]
153
+ if peft_config.inference_mode:
154
+ peft_config.merge_weights = True
155
+ return peft_config
156
+
157
+
158
+ def _prepare_bottleneck_config(peft_config, model_config):
159
+ if peft_config.target_modules is None:
160
+ if peft_config.use_parallel_adapter:
161
+ if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_PARALLEL_TARGET_MODULES_MAPPING:
162
+ raise ValueError("Please specify `target_modules` in `peft_config`")
163
+ peft_config.target_modules = TRANSFORMERS_MODELS_TO_PARALLEL_TARGET_MODULES_MAPPING[model_config["model_type"]]
164
+ elif peft_config.use_adapterp:
165
+ if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ADAPTERP_TARGET_MODULES_MAPPING:
166
+ raise ValueError("Please specify `target_modules` in `peft_config`")
167
+ peft_config.target_modules = TRANSFORMERS_MODELS_TO_ADAPTERP_TARGET_MODULES_MAPPING[model_config["model_type"]]
168
+ else:
169
+ if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_BOTTLENECK_TARGET_MODULES_MAPPING:
170
+ raise ValueError("Please specify `target_modules` in `peft_config`")
171
+ peft_config.target_modules = TRANSFORMERS_MODELS_TO_BOTTLENECK_TARGET_MODULES_MAPPING[model_config["model_type"]]
172
+
173
+ return peft_config
174
+
175
+
176
+
177
def get_peft_model(model, peft_config):
    """
    Wrap `model` in the Peft model class that matches `peft_config`.

    The base model's `name_or_path` is recorded on the config first. LoRA and bottleneck configs whose `task_type`
    has no entry in `MODEL_TYPE_TO_PEFT_MODEL_MAPPING` are wrapped in the generic `PeftModel`; every other
    combination is prepared and handed to the class registered for its `task_type`.

    Args:
        model ([`transformers.PreTrainedModel`]): Model to be wrapped.
        peft_config ([`PeftConfig`]): Configuration object containing the parameters of the Peft model.
    """
    model_config = model.config.to_dict()
    peft_config.base_model_name_or_path = model.__dict__.get("name_or_path", None)

    if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys():
        # No task-specific wrapper is registered: adapter-style configs use the generic wrapper.
        if peft_config.peft_type == "LORA":
            return PeftModel(model, _prepare_lora_config(peft_config, model_config))
        if peft_config.peft_type == "BOTTLENECK":
            return PeftModel(model, _prepare_bottleneck_config(peft_config, model_config))

    if isinstance(peft_config, PromptLearningConfig):
        peft_config = _prepare_prompt_learning_config(peft_config, model_config)
    elif peft_config.peft_type == "BOTTLENECK":
        peft_config = _prepare_bottleneck_config(peft_config, model_config)
    elif peft_config.peft_type == "LORA":
        peft_config = _prepare_lora_config(peft_config, model_config)

    peft_model_cls = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type]
    return peft_model_cls(model, peft_config)
SVFT-main/LLM-Adapters/peft/src/peft/peft_model.py ADDED
@@ -0,0 +1,974 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ import os
18
+ import warnings
19
+ from contextlib import contextmanager
20
+
21
+ import torch
22
+ from accelerate import dispatch_model, infer_auto_device_map
23
+ from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules
24
+ from accelerate.utils import get_balanced_memory
25
+ from huggingface_hub import hf_hub_download
26
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
27
+ from transformers import PreTrainedModel
28
+ from transformers.modeling_outputs import SequenceClassifierOutput, TokenClassifierOutput
29
+ from transformers.utils import PushToHubMixin
30
+
31
+ from .tuners import LoraModel, BottleneckModel, PrefixEncoder, PromptEmbedding, PromptEncoder
32
+ from .utils import (
33
+ TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
34
+ WEIGHTS_NAME,
35
+ PeftConfig,
36
+ PeftType,
37
+ PromptLearningConfig,
38
+ TaskType,
39
+ _set_trainable,
40
+ get_peft_model_state_dict,
41
+ set_peft_model_state_dict,
42
+ shift_tokens_right,
43
+ )
44
+
45
+
46
class PeftModel(PushToHubMixin, torch.nn.Module):
    """
    Parameter-Efficient Fine-Tuning Model. Base model encompassing various Peft methods.

    Args:
        model ([`PreTrainedModel`]): The base transformer model used for Peft.
        peft_config ([`PeftConfig`]): The configuration of the Peft model.


    **Attributes**:
        - **base_model** ([`PreTrainedModel`]) -- The base transformer model used for Peft.
        - **peft_config** ([`PeftConfig`]) -- The configuration of the Peft model.
        - **modules_to_save** (`list` of `str`) -- The list of sub-module names to save when
        saving the model.
        - **prompt_encoder** ([`PromptEncoder`]) -- The prompt encoder used for Peft if
        `isinstance(self.peft_config, PromptLearningConfig)`.
        - **prompt_tokens** (`torch.Tensor`) -- The virtual prompt tokens used for Peft if
        `isinstance(self.peft_config, PromptLearningConfig)`.
        - **transformer_backbone_name** (`str`) -- The name of the transformer
        backbone in the base model if `isinstance(self.peft_config, PromptLearningConfig)`.
        - **word_embeddings** (`torch.nn.Embedding`) -- The word embeddings of the transformer backbone
        in the base model if `isinstance(self.peft_config, PromptLearningConfig)`.
    """

    def __init__(self, model, peft_config: PeftConfig):
        super().__init__()
        self.peft_config = peft_config
        self.base_model = model
        self.config = self.base_model.config
        self.modules_to_save = None
        if isinstance(self.peft_config, PromptLearningConfig):
            self._setup_prompt_encoder()
        else:
            # Adapter-style methods replace the base model with a wrapper that injects the adapter layers.
            if self.peft_config.peft_type == PeftType.LORA:
                self.base_model = LoraModel(peft_config, model)
            elif self.peft_config.peft_type == PeftType.BOTTLENECK:
                self.base_model = BottleneckModel(peft_config, model)
        if getattr(self.peft_config, "modules_to_save", None) is not None:
            self.modules_to_save = self.peft_config.modules_to_save
            _set_trainable(self)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.base_model_torch_dtype = getattr(model, "dtype", None)

    def save_pretrained(self, save_directory, **kwargs):
        r"""
        Save the adapter weights and the adapter configuration to a directory, so that they can be re-loaded with
        the `from_pretrained` class method.

        Args:
            save_directory (`str`):
                Directory where the adapter model and configuration files will be saved (will be created if it does
                not exist).
            **kwargs:
                Only `state_dict` is read here; when given it is passed to `get_peft_model_state_dict`.

        Raises:
            ValueError: If `save_directory` points to an existing file.
        """
        if os.path.isfile(save_directory):
            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
        os.makedirs(save_directory, exist_ok=True)

        # save only the trainable weights
        output_state_dict = get_peft_model_state_dict(self, kwargs.get("state_dict", None))
        torch.save(output_state_dict, os.path.join(save_directory, WEIGHTS_NAME))

        # save the config and change the inference mode to `True`
        if self.peft_config.base_model_name_or_path is None:
            self.peft_config.base_model_name_or_path = (
                self.base_model.__dict__.get("name_or_path", None)
                if isinstance(self.peft_config, PromptLearningConfig)
                else self.base_model.model.__dict__.get("name_or_path", None)
            )
        inference_mode = self.peft_config.inference_mode
        self.peft_config.inference_mode = True
        try:
            self.peft_config.save_pretrained(save_directory)
        finally:
            # Restore the caller's mode even when writing the config fails, so a failed save cannot silently
            # leave a training config in inference mode.
            self.peft_config.inference_mode = inference_mode

    @classmethod
    def from_pretrained(cls, model, model_id, **kwargs):
        r"""
        Instantiate a Peft model from a pretrained adapter configuration and its weights.

        Args:
            model (`transformers.PreTrainedModel`):
                The model to be adapted. The model should be initialized with the `from_pretrained` method from the
                `transformers` library.
            model_id (`str`):
                The name of the adapter configuration to use. Can be either:
                    - A string, the `model id` of an adapter configuration hosted inside a model repo on the
                      Hugging Face Hub
                    - A path to a directory containing an adapter configuration file saved using the
                      `save_pretrained` method, e.g., ``./my_lora_config_directory/``.
            **kwargs:
                `device_map` (default `"auto"`) and `max_memory` are read when the loaded model exposes an
                `hf_device_map`.

        Raises:
            ValueError: If the weights file is neither in the local directory nor downloadable from the Hub.
        """
        from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING

        # load the config
        config = PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig.from_pretrained(model_id).peft_type].from_pretrained(model_id)

        if getattr(model, "hf_device_map", None) is not None:
            remove_hook_from_submodules(model)

        if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys():
            model = cls(model, config)
        else:
            model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](model, config)

        # load weights if any
        local_weights = os.path.join(model_id, WEIGHTS_NAME)
        if os.path.exists(local_weights):
            filename = local_weights
        else:
            try:
                filename = hf_hub_download(model_id, WEIGHTS_NAME)
            except Exception as err:
                # `Exception` (not a bare `except`) so Ctrl-C / SystemExit propagate untouched; chain the cause so
                # the underlying download error stays visible.
                raise ValueError(
                    f"Can't find weights for {model_id} in {model_id} or in the Hugging Face Hub. "
                    f"Please check that the file {WEIGHTS_NAME} is present at {model_id}."
                ) from err

        # NOTE(review): `torch.load` unpickles the file; only load adapter weights from sources you trust.
        adapters_weights = torch.load(
            filename, map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu")
        )
        # load the weights into the model
        model = set_peft_model_state_dict(model, adapters_weights)
        if getattr(model, "hf_device_map", None) is not None:
            device_map = kwargs.get("device_map", "auto")
            max_memory = kwargs.get("max_memory", None)
            no_split_module_classes = model._no_split_modules
            if device_map != "sequential":
                max_memory = get_balanced_memory(
                    model,
                    max_memory=max_memory,
                    no_split_module_classes=no_split_module_classes,
                    low_zero=(device_map == "balanced_low_0"),
                )
            if isinstance(device_map, str):
                device_map = infer_auto_device_map(
                    model, max_memory=max_memory, no_split_module_classes=no_split_module_classes
                )
            model = dispatch_model(model, device_map=device_map)
            hook = AlignDevicesHook(io_same_device=True)
            if model.peft_config.peft_type == PeftType.LORA or model.peft_config.peft_type == PeftType.BOTTLENECK:
                add_hook_to_module(model.base_model.model, hook)
            else:
                remove_hook_from_submodules(model.prompt_encoder)
                add_hook_to_module(model.base_model, hook)
        return model

    def _setup_prompt_encoder(self):
        """Freeze the base model's children and build the prompt encoder and virtual prompt token ids."""
        transformer_backbone = None
        for name, module in self.base_model.named_children():
            for param in module.parameters():
                param.requires_grad = False
            if isinstance(module, PreTrainedModel):
                # Remember the first `PreTrainedModel` child as the transformer backbone
                if transformer_backbone is None:
                    transformer_backbone = module
                    self.transformer_backbone_name = name

        if self.peft_config.num_transformer_submodules is None:
            self.peft_config.num_transformer_submodules = (
                2 if self.peft_config.task_type == TaskType.SEQ_2_SEQ_LM else 1
            )

        # The first parameter whose leading dimension equals the vocabulary size is taken as the word embedding.
        for named_param, value in list(transformer_backbone.named_parameters()):
            if value.shape[0] == self.base_model.config.vocab_size:
                self.word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", ""))
                break

        if self.peft_config.peft_type == PeftType.PROMPT_TUNING:
            prompt_encoder = PromptEmbedding(self.peft_config, self.word_embeddings)
        elif self.peft_config.peft_type == PeftType.P_TUNING:
            prompt_encoder = PromptEncoder(self.peft_config)
        elif self.peft_config.peft_type == PeftType.PREFIX_TUNING:
            prompt_encoder = PrefixEncoder(self.peft_config)
        else:
            raise ValueError("Not supported")
        self.prompt_encoder = prompt_encoder
        self.prompt_tokens = torch.arange(
            self.peft_config.num_virtual_tokens * self.peft_config.num_transformer_submodules
        ).long()

    def get_prompt_embedding_to_save(self):
        """
        Returns the prompt embedding to save when saving the model. Only applicable when `peft_config.peft_type !=
        PeftType.LORA`.
        """
        prompt_tokens = self.prompt_tokens.unsqueeze(0).expand(1, -1).to(self.device)
        if self.peft_config.peft_type == PeftType.PREFIX_TUNING:
            prompt_tokens = prompt_tokens[:, : self.peft_config.num_virtual_tokens]
        prompt_embeddings = self.prompt_encoder(prompt_tokens)
        return prompt_embeddings[0].detach().cpu()

    def get_prompt(self, batch_size):
        """
        Returns the virtual prompts to use for Peft. Only applicable when `peft_config.peft_type != PeftType.LORA`.

        For prefix tuning the result is the reshaped `past_key_values` structure (optionally post-processed per
        model type); for the other prompt-learning methods it is the prompt embeddings for `batch_size` rows.
        """
        prompt_tokens = self.prompt_tokens.unsqueeze(0).expand(batch_size, -1).to(self.device)
        if self.peft_config.peft_type == PeftType.PREFIX_TUNING:
            prompt_tokens = prompt_tokens[:, : self.peft_config.num_virtual_tokens]
            if self.peft_config.inference_mode:
                past_key_values = self.prompt_encoder.embedding.weight.repeat(batch_size, 1, 1)
            else:
                past_key_values = self.prompt_encoder(prompt_tokens)
            past_key_values = past_key_values.view(
                batch_size,
                self.peft_config.num_virtual_tokens,
                self.peft_config.num_layers * 2,
                self.peft_config.num_attention_heads,
                self.peft_config.token_dim // self.peft_config.num_attention_heads,
            )
            if self.peft_config.num_transformer_submodules == 2:
                past_key_values = torch.cat([past_key_values, past_key_values], dim=2)
            past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(
                self.peft_config.num_transformer_submodules * 2
            )
            if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None:
                post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type]
                past_key_values = post_process_fn(past_key_values)
            return past_key_values
        else:
            if self.peft_config.inference_mode:
                prompts = self.prompt_encoder.embedding.weight.repeat(batch_size, 1, 1)
            else:
                prompts = self.prompt_encoder(prompt_tokens)
            return prompts

    def print_trainable_parameters(self):
        """
        Prints the number of trainable parameters in the model.
        """
        trainable_params = 0
        all_param = 0
        for _, param in self.named_parameters():
            num_params = param.numel()
            # if using DS Zero 3 and the weights are initialized empty
            if num_params == 0 and hasattr(param, "ds_numel"):
                num_params = param.ds_numel

            all_param += num_params
            if param.requires_grad:
                trainable_params += num_params
        print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
        )

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.base_model, name)

    def forward(self, *args, **kwargs):
        """
        Forward pass of the model.
        """
        return self.get_base_model()(*args, **kwargs)

    @contextmanager
    def disable_adapter(self):
        """
        Context manager that disables the adapter module for the duration of the `with` block.

        The adapter is re-enabled on exit even when the block raises.
        """
        is_prompt_learning = isinstance(self.peft_config, PromptLearningConfig)
        if is_prompt_learning:
            old_forward = self.forward
            self.forward = self.base_model.forward
        else:
            self.base_model.disable_adapter_layers()
        try:
            yield
        finally:
            # Without `finally`, an exception inside the `with` body would leave the adapter switched off.
            if is_prompt_learning:
                self.forward = old_forward
            else:
                self.base_model.enable_adapter_layers()

    def get_base_model(self):
        """
        Returns the base model.
        """
        return self.base_model if isinstance(self.peft_config, PromptLearningConfig) else self.base_model.model
322
+
323
+
324
class PeftModelForSequenceClassification(PeftModel):
    """
    Peft model for sequence classification tasks.

    Args:
        model ([`PreTrainedModel`]): Base transformer model
        peft_config ([`PeftConfig`]): Peft config.

    **Attributes**:
        - **config** ([`PretrainedConfig`]) -- The configuration object of the base model.
        - **cls_layer_name** (`str`) -- The name of the classification layer.

    Example::

        >>> from transformers import AutoModelForSequenceClassification
        >>> from peft import PeftModelForSequenceClassification, get_peft_config
        >>> config = {
        ...     'peft_type': 'PREFIX_TUNING', 'task_type': 'SEQ_CLS', 'inference_mode': False,
        ...     'num_virtual_tokens': 20, 'token_dim': 768, 'num_transformer_submodules': 1,
        ...     'num_attention_heads': 12, 'num_layers': 12, 'encoder_hidden_size': 768,
        ...     'prefix_projection': False, 'postprocess_past_key_value_function': None
        ... }
        >>> peft_config = get_peft_config(config)
        >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
        >>> peft_model = PeftModelForSequenceClassification(model, peft_config)
        >>> peft_model.print_trainable_parameters()
        trainable params: 370178 || all params: 108680450 || trainable%: 0.3406113979101117
    """

    def __init__(self, model, peft_config: PeftConfig):
        super().__init__(model, peft_config)
        self.modules_to_save = ["classifier", "score"]

        # The first child whose name contains one of the head names is recorded as the classification layer.
        for name, _ in self.base_model.named_children():
            if any(module_name in name for module_name in self.modules_to_save):
                self.cls_layer_name = name
                break

        # to make sure classifier layer is trainable
        _set_trainable(self)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        """
        Run the wrapped classifier, prepending the virtual prompt when a prompt-learning method is configured.

        Either `input_ids` or `inputs_embeds` must be given on the prompt-learning paths; the batch size is taken
        from whichever is present.

        Raises:
            ValueError: On the prompt-learning paths when both `input_ids` and `inputs_embeds` are `None`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if not isinstance(self.peft_config, PromptLearningConfig):
            return self.base_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                labels=labels,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs,
            )

        # `input_ids` may legitimately be None when the caller passes `inputs_embeds` only.
        batch_source = input_ids if input_ids is not None else inputs_embeds
        if batch_source is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        batch_size = batch_source.shape[0]
        if attention_mask is not None:
            # concat prompt attention mask
            prefix_attention_mask = torch.ones(batch_size, self.peft_config.num_virtual_tokens).to(self.device)
            attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)
        if kwargs.get("position_ids", None) is not None:
            warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
            kwargs["position_ids"] = None
        kwargs.update(
            {
                "attention_mask": attention_mask,
                "labels": labels,
                "output_attentions": output_attentions,
                "output_hidden_states": output_hidden_states,
                "return_dict": return_dict,
            }
        )

        if self.peft_config.peft_type == PeftType.PREFIX_TUNING:
            if input_ids is None:
                # Forward the embeddings only when they are the sole input, so calls that pass `input_ids`
                # behave exactly as before.
                kwargs["inputs_embeds"] = inputs_embeds
            return self._prefix_tuning_forward(input_ids=input_ids, **kwargs)
        else:
            if kwargs.get("token_type_ids", None) is not None:
                # Virtual tokens get token type 0.
                kwargs["token_type_ids"] = torch.cat(
                    (
                        torch.zeros(batch_size, self.peft_config.num_virtual_tokens).to(self.device),
                        kwargs["token_type_ids"],
                    ),
                    dim=1,
                ).long()
            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)
            prompts = self.get_prompt(batch_size=batch_size)
            prompts = prompts.to(inputs_embeds.dtype)
            inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
            return self.base_model(inputs_embeds=inputs_embeds, **kwargs)

    def _prefix_tuning_forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        """
        Forward pass for prefix tuning: the virtual prompt is injected as `past_key_values`.

        When the base model's `forward` does not accept `past_key_values`, the transformer backbone is called
        directly and the classification head and loss are applied here.

        Raises:
            ValueError: If both `input_ids` and `inputs_embeds` are `None`, or if neither the base model nor its
                backbone accepts `past_key_values`.
        """
        batch_source = input_ids if input_ids is not None else inputs_embeds
        if batch_source is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        batch_size = batch_source.shape[0]
        past_key_values = self.get_prompt(batch_size)
        fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys())
        kwargs.update(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "inputs_embeds": inputs_embeds,
                "output_attentions": output_attentions,
                "output_hidden_states": output_hidden_states,
                "return_dict": return_dict,
                "past_key_values": past_key_values,
            }
        )
        if "past_key_values" in fwd_params:
            return self.base_model(labels=labels, **kwargs)
        else:
            transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name)
            fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys())
            if "past_key_values" not in fwd_params:
                raise ValueError("Model does not support past key values which are required for prefix tuning.")
            outputs = transformer_backbone_name(**kwargs)
            pooled_output = outputs[1] if len(outputs) > 1 else outputs[0]
            if "dropout" in [name for name, _ in list(self.base_model.named_children())]:
                pooled_output = self.base_model.dropout(pooled_output)
            logits = self.base_model.get_submodule(self.cls_layer_name)(pooled_output)

            loss = None
            if labels is not None:
                # Infer the problem type once from the label count and label dtype when it is not configured.
                if self.config.problem_type is None:
                    if self.base_model.num_labels == 1:
                        self.config.problem_type = "regression"
                    elif self.base_model.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                        self.config.problem_type = "single_label_classification"
                    else:
                        self.config.problem_type = "multi_label_classification"

                if self.config.problem_type == "regression":
                    loss_fct = MSELoss()
                    if self.base_model.num_labels == 1:
                        loss = loss_fct(logits.squeeze(), labels.squeeze())
                    else:
                        loss = loss_fct(logits, labels)
                elif self.config.problem_type == "single_label_classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, self.base_model.num_labels), labels.view(-1))
                elif self.config.problem_type == "multi_label_classification":
                    loss_fct = BCEWithLogitsLoss()
                    loss = loss_fct(logits, labels)
            if not return_dict:
                output = (logits,) + outputs[2:]
                return ((loss,) + output) if loss is not None else output

            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )
493
+
494
+
495
+ class PeftModelForCausalLM(PeftModel):
496
+ """
497
+ Peft model for Causal LM
498
+
499
+ Args:
500
+ model ([`PreTrainedModel`]): Base transformer model
501
+ peft_config ([`PeftConfig`]): Peft config.
502
+
503
+
504
+ Example::
505
+
506
+ >>> from transformers import AutoModelForCausalLM >>> from peft import PeftModelForCausalLM, get_peft_config
507
+ >>> config = {
508
+ 'peft_type': 'PREFIX_TUNING', 'task_type': 'CAUSAL_LM', 'inference_mode': False, 'num_virtual_tokens':
509
+ 20, 'token_dim': 1280, 'num_transformer_submodules': 1, 'num_attention_heads': 20, 'num_layers': 36,
510
+ 'encoder_hidden_size': 1280, 'prefix_projection': False, 'postprocess_past_key_value_function': None
511
+ }
512
+ >>> peft_config = get_peft_config(config) >>> model = AutoModelForCausalLM.from_pretrained("gpt2-large") >>>
513
+ peft_model = PeftModelForCausalLM(model, peft_config) >>> peft_model.print_trainable_parameters() trainable
514
+ params: 1843200 || all params: 775873280 || trainable%: 0.23756456724479544
515
+ """
516
+
517
+ def __init__(self, model, peft_config: PeftConfig):
518
+ super().__init__(model, peft_config)
519
+ self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
520
+
521
+ def forward(
522
+ self,
523
+ input_ids=None,
524
+ attention_mask=None,
525
+ inputs_embeds=None,
526
+ labels=None,
527
+ output_attentions=None,
528
+ output_hidden_states=None,
529
+ return_dict=None,
530
+ **kwargs,
531
+ ):
532
+ if not isinstance(self.peft_config, PromptLearningConfig):
533
+ return self.base_model(
534
+ input_ids=input_ids,
535
+ attention_mask=attention_mask,
536
+ inputs_embeds=inputs_embeds,
537
+ labels=labels,
538
+ output_attentions=output_attentions,
539
+ output_hidden_states=output_hidden_states,
540
+ return_dict=return_dict,
541
+ **kwargs,
542
+ )
543
+
544
+ batch_size = input_ids.shape[0]
545
+ if attention_mask is not None:
546
+ # concat prompt attention mask
547
+ prefix_attention_mask = torch.ones(batch_size, self.peft_config.num_virtual_tokens).to(self.device)
548
+ attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)
549
+
550
+ if kwargs.get("position_ids", None) is not None:
551
+ warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
552
+ kwargs["position_ids"] = None
553
+ if kwargs.get("token_type_ids", None) is not None:
554
+ warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids")
555
+ kwargs["token_type_ids"] = None
556
+ kwargs.update(
557
+ {
558
+ "attention_mask": attention_mask,
559
+ "labels": labels,
560
+ "output_attentions": output_attentions,
561
+ "output_hidden_states": output_hidden_states,
562
+ "return_dict": return_dict,
563
+ }
564
+ )
565
+
566
+ if self.peft_config.peft_type == PeftType.PREFIX_TUNING:
567
+ past_key_values = self.get_prompt(batch_size)
568
+ return self.base_model(input_ids=input_ids, past_key_values=past_key_values, **kwargs)
569
+ else:
570
+ if inputs_embeds is None:
571
+ inputs_embeds = self.word_embeddings(input_ids)
572
+ # concat prompt labels
573
+ if labels is not None:
574
+ prefix_labels = torch.full((batch_size, self.peft_config.num_virtual_tokens), -100).to(self.device)
575
+ kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1)
576
+ prompts = self.get_prompt(batch_size=batch_size)
577
+ prompts = prompts.to(inputs_embeds.dtype)
578
+ inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
579
+ return self.base_model(inputs_embeds=inputs_embeds, **kwargs)
580
+
581
def generate(self, **kwargs):
    """
    Generate with the wrapped base model.

    For the duration of the call `self.base_model.prepare_inputs_for_generation` is replaced by
    `self.prepare_inputs_for_generation`; the saved original
    (`self.base_model_prepare_inputs_for_generation`) is put back afterwards, whether generation
    succeeds or raises.

    Args:
        **kwargs: Forwarded to `self.base_model.generate`. For prompt-learning configs `input_ids` is
            required, a non-None `attention_mask` is extended on the left with ones for the virtual
            tokens, and non-None `position_ids` / `token_type_ids` are reset to None with a warning.

    Returns:
        Whatever `self.base_model.generate` returns.

    Raises:
        ValueError: If a prompt-learning config is used and `input_ids` is missing from `kwargs`.
    """
    self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation
    try:
        if not isinstance(self.peft_config, PromptLearningConfig):
            return self.base_model.generate(**kwargs)

        if "input_ids" not in kwargs:
            raise ValueError("input_ids must be provided for Peft model generation")
        if kwargs.get("attention_mask", None) is not None:
            # concat prompt attention mask
            prefix_attention_mask = torch.ones(
                kwargs["input_ids"].shape[0], self.peft_config.num_virtual_tokens
            ).to(kwargs["input_ids"].device)
            kwargs["attention_mask"] = torch.cat((prefix_attention_mask, kwargs["attention_mask"]), dim=1)

        if kwargs.get("position_ids", None) is not None:
            warnings.warn(
                "Position ids are not supported for parameter efficient tuning. Ignoring position ids."
            )
            kwargs["position_ids"] = None
        if kwargs.get("token_type_ids", None) is not None:
            warnings.warn(
                "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids"
            )
            kwargs["token_type_ids"] = None

        return self.base_model.generate(**kwargs)
    finally:
        # Single restore point: runs on success and on any exception (which keeps propagating),
        # replacing the former bare `except:` / `else:` pair that duplicated this assignment.
        self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation
614
+
615
def prepare_inputs_for_generation(self, *args, **kwargs):
    """
    Build the per-step generation inputs and inject the learned prompt.

    The base model's own `prepare_inputs_for_generation` (saved on
    `self.base_model_prepare_inputs_for_generation`) is called first. For non prompt-learning configs
    its result is returned untouched. Otherwise:

    * prefix tuning: the attention mask is extended on the left with ones for the virtual tokens and,
      when no `past_key_values` are present yet, the prompt from `self.get_prompt` is installed as
      `past_key_values` (cast to `self.base_model_torch_dtype` when that is set);
    * other prompt-learning types: when no `past_key_values` are present yet, the prompt embeddings are
      prepended to the word embeddings of `input_ids`, stored as `inputs_embeds`, and `input_ids` is
      set to None.

    Returns:
        The (possibly modified) dict returned by the base model's preparation function.
    """
    model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs)
    if not isinstance(self.peft_config, PromptLearningConfig):
        return model_kwargs

    uses_prefix_tuning = self.peft_config.peft_type == PeftType.PREFIX_TUNING
    if uses_prefix_tuning:
        token_ids = model_kwargs["input_ids"]
        virtual_mask = torch.ones(token_ids.shape[0], self.peft_config.num_virtual_tokens).to(token_ids.device)
        model_kwargs["attention_mask"] = torch.cat((virtual_mask, model_kwargs["attention_mask"]), dim=1)

    if model_kwargs["past_key_values"] is None and uses_prefix_tuning:
        prefix_cache = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0])
        target_dtype = self.base_model_torch_dtype
        if target_dtype is not None:

            def _cast(tensors):
                return tuple(tensor.to(target_dtype) for tensor in tensors)

            # handle the case for Bloom where it outputs tuple of tuples
            if isinstance(prefix_cache[0], tuple):
                prefix_cache = tuple(_cast(layer_cache) for layer_cache in prefix_cache)
            else:
                prefix_cache = _cast(prefix_cache)

        model_kwargs["past_key_values"] = prefix_cache
    elif model_kwargs["past_key_values"] is None:
        token_embeds = self.word_embeddings(model_kwargs["input_ids"])
        prompt_embeds = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0])
        prompt_embeds = prompt_embeds.to(token_embeds.dtype)
        model_kwargs["inputs_embeds"] = torch.cat((prompt_embeds, token_embeds), dim=1)
        model_kwargs["input_ids"] = None

    return model_kwargs
653
+
654
+
655
class PeftModelForSeq2SeqLM(PeftModel):
    """
    Peft model for Seq2Seq LM

    Args:
        model ([`PreTrainedModel`]): Base transformer model
        peft_config ([`PeftConfig`]): Peft config.


    Example::

        >>> from transformers import AutoModelForSeq2SeqLM
        >>> from peft import PeftModelForSeq2SeqLM, get_peft_config
        >>> config = {
        ...     'peft_type': 'LORA', 'task_type': 'SEQ_2_SEQ_LM', 'inference_mode': False, 'r': 8,
        ...     'target_modules': ['q', 'v'], 'lora_alpha': 32, 'lora_dropout': 0.1, 'merge_weights': False,
        ...     'fan_in_fan_out': False, 'enable_lora': None, 'bias': 'none'
        ... }
        >>> peft_config = get_peft_config(config)
        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
        >>> peft_model = PeftModelForSeq2SeqLM(model, peft_config)
        >>> peft_model.print_trainable_parameters()
        trainable params: 884736 || all params: 223843584 || trainable%: 0.3952474242013566
    """

    def __init__(self, model, peft_config: PeftConfig):
        super().__init__(model, peft_config)
        # Keep the base model's original generation hooks so `generate` can restore them after
        # temporarily overriding them.
        self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
        self.base_model_prepare_encoder_decoder_kwargs_for_generation = (
            self.base_model._prepare_encoder_decoder_kwargs_for_generation
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        decoder_inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        """
        Run the base model, injecting the learned prompt for prompt-learning configs.

        Non prompt-learning configs forward every argument to `self.base_model` unchanged. For prefix
        tuning the prompt is passed as `past_key_values`; for the remaining prompt-learning types the
        prompt embeddings are prepended to the encoder embeddings (and, when
        `num_transformer_submodules == 2`, to the decoder embeddings as well).

        Returns:
            The output of `self.base_model`.
        """
        if not isinstance(self.peft_config, PromptLearningConfig):
            return self.base_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                decoder_inputs_embeds=decoder_inputs_embeds,
                labels=labels,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs,
            )

        batch_size = input_ids.shape[0]
        if decoder_attention_mask is not None:
            # concat prompt attention mask
            prefix_attention_mask = torch.ones(batch_size, self.peft_config.num_virtual_tokens).to(self.device)
            decoder_attention_mask = torch.cat((prefix_attention_mask, decoder_attention_mask), dim=1)

        if kwargs.get("position_ids", None) is not None:
            warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
            kwargs["position_ids"] = None
        if kwargs.get("token_type_ids", None) is not None:
            warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids")
            kwargs["token_type_ids"] = None
        kwargs.update(
            {
                "attention_mask": attention_mask,
                "decoder_attention_mask": decoder_attention_mask,
                "labels": labels,
                "output_attentions": output_attentions,
                "output_hidden_states": output_hidden_states,
                "return_dict": return_dict,
            }
        )

        if self.peft_config.peft_type == PeftType.PREFIX_TUNING:
            past_key_values = self.get_prompt(batch_size)
            return self.base_model(
                input_ids=input_ids, decoder_input_ids=decoder_input_ids, past_key_values=past_key_values, **kwargs
            )
        else:
            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)
            if decoder_inputs_embeds is None and decoder_input_ids is None:
                # No decoder inputs were given: derive them from the labels.
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )
                decoder_inputs_embeds = self.word_embeddings(decoder_input_ids)

            if attention_mask is not None:
                # concat prompt attention mask
                prefix_attention_mask = torch.ones(batch_size, self.peft_config.num_virtual_tokens).to(self.device)
                kwargs["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1)
            # concat prompt labels
            if labels is not None:
                if self.peft_config.num_transformer_submodules == 1:
                    kwargs["labels"] = labels
                elif self.peft_config.num_transformer_submodules == 2:
                    # -100 marks the virtual-token positions in the labels
                    prefix_labels = torch.full((batch_size, self.peft_config.num_virtual_tokens), -100).to(self.device)
                    kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1)
            prompts = self.get_prompt(batch_size=batch_size)
            prompts = prompts.to(inputs_embeds.dtype)
            # The first `num_virtual_tokens` prompt positions go to the encoder, the rest to the decoder.
            inputs_embeds = torch.cat((prompts[:, : self.peft_config.num_virtual_tokens], inputs_embeds), dim=1)
            if self.peft_config.num_transformer_submodules == 1:
                return self.base_model(inputs_embeds=inputs_embeds, **kwargs)
            elif self.peft_config.num_transformer_submodules == 2:
                decoder_inputs_embeds = torch.cat(
                    (prompts[:, self.peft_config.num_virtual_tokens :], decoder_inputs_embeds), dim=1
                )
                return self.base_model(
                    inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, **kwargs
                )
            # NOTE(review): any other `num_transformer_submodules` value falls through and returns None.

    def generate(self, **kwargs):
        """
        Generate with the wrapped base model.

        The base model's `prepare_inputs_for_generation` and
        `_prepare_encoder_decoder_kwargs_for_generation` hooks are overridden for the duration of the
        call and restored afterwards, whether generation succeeds or raises.

        Raises:
            ValueError: If a prompt-learning config is used and `input_ids` is missing from `kwargs`.
            NotImplementedError: For prompt-learning configs other than prefix tuning.
        """
        self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation
        self.base_model._prepare_encoder_decoder_kwargs_for_generation = (
            self._prepare_encoder_decoder_kwargs_for_generation
        )
        try:
            if not isinstance(self.peft_config, PromptLearningConfig):
                return self.base_model.generate(**kwargs)

            if "input_ids" not in kwargs:
                raise ValueError("input_ids must be provided for Peft model generation")
            if kwargs.get("position_ids", None) is not None:
                warnings.warn(
                    "Position ids are not supported for parameter efficient tuning. Ignoring position ids."
                )
                kwargs["position_ids"] = None
            if kwargs.get("token_type_ids", None) is not None:
                warnings.warn(
                    "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids"
                )
                kwargs["token_type_ids"] = None

            if self.peft_config.peft_type == PeftType.PREFIX_TUNING:
                return self.base_model.generate(**kwargs)
            raise NotImplementedError
        finally:
            # Single restore point: runs on success and on any exception (which keeps propagating),
            # replacing the former bare `except:` / `else:` pair that duplicated these assignments.
            self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation
            self.base_model._prepare_encoder_decoder_kwargs_for_generation = (
                self.base_model_prepare_encoder_decoder_kwargs_for_generation
            )

    def prepare_inputs_for_generation(self, *args, **kwargs):
        """
        Build the per-step generation inputs; for prefix tuning, install the prompt as
        `past_key_values` when none are present yet.
        """
        model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs)
        if model_kwargs["past_key_values"] is None and self.peft_config.peft_type == PeftType.PREFIX_TUNING:
            batch_size = model_kwargs["decoder_input_ids"].shape[0]
            past_key_values = self.get_prompt(batch_size)
            model_kwargs["past_key_values"] = past_key_values
        return model_kwargs
820
+
821
+
822
class PeftModelForTokenClassification(PeftModel):
    """
    Peft model for token classification tasks.

    Args:
        model ([`PreTrainedModel`]): Base transformer model
        peft_config ([`PeftConfig`]): Peft config.

    **Attributes**:
        - **config** ([`PretrainedConfig`]) -- The configuration object of the base model.
        - **cls_layer_name** (`str`) -- The name of the classification layer.

    Example::

        >>> from transformers import AutoModelForTokenClassification
        >>> from peft import PeftModelForTokenClassification, get_peft_config
        >>> config = {
        ...     'peft_type': 'PREFIX_TUNING', 'task_type': 'TOKEN_CLS', 'inference_mode': False,
        ...     'num_virtual_tokens': 20, 'token_dim': 768, 'num_transformer_submodules': 1,
        ...     'num_attention_heads': 12, 'num_layers': 12, 'encoder_hidden_size': 768,
        ...     'prefix_projection': False, 'postprocess_past_key_value_function': None
        ... }
        >>> peft_config = get_peft_config(config)
        >>> model = AutoModelForTokenClassification.from_pretrained("bert-base-cased")
        >>> peft_model = PeftModelForTokenClassification(model, peft_config)
        >>> peft_model.print_trainable_parameters()
        trainable params: 370178 || all params: 108680450 || trainable%: 0.3406113979101117
    """

    def __init__(self, model, peft_config: PeftConfig):
        super().__init__(model, peft_config)
        self.modules_to_save = ["classifier", "score"]

        # The first direct child whose name contains one of the entries above is taken as the
        # classification head.
        for name, _ in self.base_model.named_children():
            if any(module_name in name for module_name in self.modules_to_save):
                self.cls_layer_name = name
                break

        # to make sure classifier layer is trainable
        _set_trainable(self)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        """
        Run the base model, injecting the learned prompt for prompt-learning configs.

        Non prompt-learning configs forward every argument to `self.base_model` unchanged. Prefix tuning
        is delegated to `_prefix_tuning_forward`; other prompt-learning types prepend the prompt
        embeddings to the input embeddings (extending `attention_mask` and `token_type_ids` to match).

        Returns:
            The output of `self.base_model` (or of `_prefix_tuning_forward` for prefix tuning).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if not isinstance(self.peft_config, PromptLearningConfig):
            return self.base_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                labels=labels,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs,
            )

        batch_size = input_ids.shape[0]
        if attention_mask is not None:
            # concat prompt attention mask
            prefix_attention_mask = torch.ones(batch_size, self.peft_config.num_virtual_tokens).to(self.device)
            attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)
        if kwargs.get("position_ids", None) is not None:
            warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
            kwargs["position_ids"] = None
        kwargs.update(
            {
                "attention_mask": attention_mask,
                "labels": labels,
                "output_attentions": output_attentions,
                "output_hidden_states": output_hidden_states,
                "return_dict": return_dict,
            }
        )

        if self.peft_config.peft_type == PeftType.PREFIX_TUNING:
            return self._prefix_tuning_forward(input_ids=input_ids, **kwargs)
        else:
            if kwargs.get("token_type_ids", None) is not None:
                # The virtual tokens get token type 0.
                kwargs["token_type_ids"] = torch.cat(
                    (
                        torch.zeros(batch_size, self.peft_config.num_virtual_tokens).to(self.device),
                        kwargs["token_type_ids"],
                    ),
                    dim=1,
                ).long()
            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)
            prompts = self.get_prompt(batch_size=batch_size)
            prompts = prompts.to(inputs_embeds.dtype)
            inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)
            return self.base_model(inputs_embeds=inputs_embeds, **kwargs)

    def _prefix_tuning_forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        """
        Forward pass for prefix tuning: the prompt is supplied as `past_key_values`.

        If the base model's `forward` accepts `past_key_values`, it is called directly. Otherwise the
        backbone submodule named by `self.transformer_backbone_name` is called and the classification
        head (plus the base model's `dropout` child, when present) and the loss are applied here.

        Raises:
            ValueError: If neither the base model nor its backbone accepts `past_key_values`.
        """
        batch_size = input_ids.shape[0]
        past_key_values = self.get_prompt(batch_size)
        fwd_params = inspect.signature(self.base_model.forward).parameters
        kwargs.update(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "inputs_embeds": inputs_embeds,
                "output_attentions": output_attentions,
                "output_hidden_states": output_hidden_states,
                "return_dict": return_dict,
                "past_key_values": past_key_values,
            }
        )
        if "past_key_values" in fwd_params:
            return self.base_model(labels=labels, **kwargs)
        else:
            transformer_backbone = self.base_model.get_submodule(self.transformer_backbone_name)
            fwd_params = inspect.signature(transformer_backbone.forward).parameters
            if "past_key_values" not in fwd_params:
                raise ValueError("Model does not support past key values which are required for prefix tuning.")
            outputs = transformer_backbone(**kwargs)
            sequence_output = outputs[0]
            if any(name == "dropout" for name, _ in self.base_model.named_children()):
                sequence_output = self.base_model.dropout(sequence_output)
            logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output)

            loss = None
            if labels is not None:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

            if not return_dict:
                output = (logits,) + outputs[2:]
                return ((loss,) + output) if loss is not None else output

            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )
SVFT-main/LLM-Adapters/peft/src/peft/tuners/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+ # There's no way to ignore "F401 '...' imported but unused" warnings in this
3
+ # module, but to preserve other warnings. So, don't check this module at all
4
+
5
+ # coding=utf-8
6
+ # Copyright 2023-present the HuggingFace Inc. team.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ from .lora import LoraConfig, LoraModel
21
+ from .bottleneck import BottleneckConfig, BottleneckModel
22
+ from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType
23
+ from .prefix_tuning import PrefixEncoder, PrefixTuningConfig
24
+ from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit
SVFT-main/LLM-Adapters/peft/src/peft/tuners/bottleneck.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import math
3
+ import re
4
+ import warnings
5
+ from dataclasses import asdict, dataclass, field
6
+ from enum import Enum
7
+ from typing import List, Optional, Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from ..utils import PeftConfig, PeftType, transpose
14
+ from transformers.activations import ACT2FN
15
+
16
+
17
# For each supported `model_type`, maps a target module name to the kind of bottleneck adapter
# ("mh_adapter" or "output_adapter") that is attached to it when parallel adapters are not used.
TRANSFORMERS_MODELS_TO_ADAPTER_TYPE_MAPPING = {
    "bloom": {
        "dense_h_to_4h": "mh_adapter",
        "dense_4h_to_h": "output_adapter",
    },
    "gptj": {
        "fc_in": "mh_adapter",
        "fc_out": "output_adapter",
    },
    "gpt_neo": {
        "c_fc": "mh_adapter",
        "c_proj": "output_adapter",
    },
    "llama": {
        "gate_proj": "mh_adapter",
        "up_proj": "mh_adapter",
        "down_proj": "output_adapter",
    },
    "opt": {
        "fc1": "mh_adapter",
        "fc2": "output_adapter",
    },
    "chatglm": {
        "dense_h_to_4h": "mh_adapter",
        "dense_4h_to_h": "output_adapter",
    },
}
25
+
26
def is_bnb_available():
    """Return True when a module spec for the optional `bitsandbytes` package can be found."""
    # `import importlib` alone does not guarantee that the `importlib.util` submodule is loaded,
    # so import it explicitly instead of relying on some other module having done so.
    import importlib.util

    return importlib.util.find_spec("bitsandbytes") is not None
28
+
29
+
30
+ if is_bnb_available():
31
+ import bitsandbytes as bnb
32
+
33
@dataclass
class BottleneckConfig(PeftConfig):
    """
    Configuration for a bottleneck adapter ([`~peft.Bottleneck`]).

    Args:
        bottleneck_size (`int`): Width of the bottleneck. Defaults to 256.
        non_linearity (`str`): Name of the non-linearity applied inside the bottleneck. Defaults to "tanh".
        adapter_dropout (`float`, optional): Dropout probability of the bottleneck. Defaults to 0.0.
        target_modules (`Union[List[str], str]`): Module names, or a regex over module names, that
            receive an adapter.
        use_parallel_adapter (`bool`, optional): Whether to use the parallel adapter. Defaults to False.
        use_adapterp (`bool`, optional): Whether to use adapterp. Defaults to False.
        scaling (`float` or `str`, optional): Factor for the scaled addition of adapter outputs, as done by
            He et al. (2021): either a constant (float) or the string "learned", in which case the
            factor is learned. Defaults to 1.0.
        bias (`str`): Bias type for Bottleneck: 'none', 'all' or 'adapter_only'. Defaults to 'none'.
        init_weights (`str`, optional): How adapter weights are initialised: "bert" (default) or
            "mam_adapter".
        modules_to_save (`List[str]`): Modules other than the adapter layers that are set as trainable
            and saved in the final checkpoint.
    """

    bottleneck_size: int = field(default=256, metadata={"help": "The size of the bottleneck"})
    non_linearity: str = field(default="tanh", metadata={"help": "The non-linearity to apply to the bottleneck"})
    adapter_dropout: float = field(
        default=0.0, metadata={"help": "The dropout probability of the bottleneck, default to 0.0"}
    )
    target_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with Adapter."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
        },
    )
    use_parallel_adapter: bool = field(default=False, metadata={"help": "Whether to use parallel adapter"})
    use_adapterp: bool = field(default=False, metadata={"help": "Whether to use adapterp"})
    scaling: Union[float, str] = 1.0
    bias: str = field(
        default="none", metadata={"help": "Bias type for Bottleneck. Can be 'none', 'all' or 'adapter_only'"}
    )
    init_weights: str = field(
        default="bert", metadata={"help": "Initialization method for the weights of the adapter modules."}
    )
    modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from Adapter layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )

    def __post_init__(self):
        # Every instance is tagged as a bottleneck config, whatever was passed in.
        self.peft_type = PeftType.BOTTLENECK
80
+
81
+
82
class BottleneckModel(torch.nn.Module):
    """
    Creates Bottleneck adapter model for a pretrained transformers model.

    Args:
        config (`BottleneckConfig`): The configuration of the Bottleneck adapter.
        model ('transformers.PreTrainedModel'): The pretrained model to be adapted.

    Returns:
        `torch.nn.Module`: The Bottleneck adapter model.

    Example::

        >>> from transformers import AutoModelForCausalLM
        >>> from peft import BottleneckModel, BottleneckConfig
        >>> config = BottleneckConfig(
        ...     peft_type="BOTTLENECK", task_type="CAUSAL_LM",
        ...     target_modules=["gate_proj", "up_proj", "down_proj"],
        ...     bottleneck_size=256, non_linearity="tanh",
        ... )
        >>> model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")
        >>> bottleneck_model = BottleneckModel(config, model)

    **Attribute**:
        - **model** (`transformers.PreTrainedModel`): The pretrained model to be adapted.
        - **peft_config** (`BottleneckConfig`): The configuration of the Bottleneck adapter.
    """

    def __init__(self, config, model):
        super().__init__()
        self.model = model
        self.peft_config = config
        self._find_and_replace()
        mark_only_adapter_as_trainable(self.model, self.peft_config.bias)
        # Calls on this wrapper go straight to the wrapped model's forward.
        self.forward = self.model.forward

    def _find_and_replace(self):
        """
        Replace every module matched by `peft_config.target_modules` with an adapter-wrapped layer.

        `target_modules` is either a regex that must fully match the module name, or a list of suffixes
        the module name may end with.

        Raises:
            ImportError: If the model is loaded in 8-bit but `bitsandbytes` is not installed.
            ValueError: If a matched module is of an unsupported type, or if no module matched at all.
        """
        loaded_in_8bit = getattr(self.model, "is_loaded_in_8bit", False)
        if loaded_in_8bit and not is_bnb_available():
            raise ImportError(
                "To use Adapter with 8-bit quantization, please install the `bitsandbytes` package. "
                "You can install it with `pip install bitsandbytes`."
            )
        is_target_modules_in_base_model = False
        common_kwargs = {
            "bottleneck_size": self.peft_config.bottleneck_size,
            "non_linearity": self.peft_config.non_linearity,
            "adapter_dropout": self.peft_config.adapter_dropout,
            "scaling": self.peft_config.scaling,
            "init_weights": self.peft_config.init_weights,
        }
        target_modules = self.peft_config.target_modules
        # Snapshot the names first: modules are replaced while iterating.
        key_list = [key for key, _ in self.model.named_modules()]
        for key in key_list:
            if isinstance(target_modules, str):
                target_module_found = re.fullmatch(target_modules, key)
            else:
                target_module_found = any(key.endswith(target_key) for target_key in target_modules)
            if not target_module_found:
                continue

            is_target_modules_in_base_model = True
            parent, target, target_name = self._get_submodules(key)
            # determine the type of adapter to be used, this will affect the forward pass
            if self.peft_config.use_parallel_adapter:
                adapter_type = "parallel_adapter"
            else:
                adapter_type = TRANSFORMERS_MODELS_TO_ADAPTER_TYPE_MAPPING[self.model.config.model_type][target_name]

            # Fresh kwargs per target, so 8-bit-only options added below never leak into the
            # construction of a later, non-8-bit layer.
            kwargs = dict(common_kwargs, adapter_type=adapter_type)
            if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
                module_cls = Linear8bitLt
                kwargs.update(
                    {
                        "has_fp16_weights": target.state.has_fp16_weights,
                        "memory_efficient_backward": target.state.memory_efficient_backward,
                        "threshold": target.state.threshold,
                        "index": target.index,
                    }
                )
            elif isinstance(target, torch.nn.Linear):
                module_cls = Linear
            else:
                # Previously this case left `new_module` unbound (or pointing at the wrapper built
                # for the previous target); fail loudly instead.
                raise ValueError(
                    f"Target module '{key}' of type {type(target).__name__} is not supported. "
                    f"Only `torch.nn.Linear` (and 8-bit linear) modules can be adapted."
                )

            bias = target.bias is not None
            if adapter_type == "mh_adapter":
                in_features, out_features = target.in_features, target.in_features
            elif adapter_type == "output_adapter":
                in_features, out_features = target.out_features, target.out_features
            elif adapter_type == "parallel_adapter":
                in_features, out_features = target.in_features, target.out_features
            else:
                raise ValueError(f"Unknown adapter type '{adapter_type}' for target module '{key}'.")
            new_module = module_cls(in_features, out_features, bias=bias, **kwargs)
            self._replace_module(parent, target_name, new_module, target)
        if not is_target_modules_in_base_model:
            raise ValueError(
                f"Target modules {self.peft_config.target_modules} not found in the base model. "
                f"Please check the target modules and try again."
            )

    def _get_submodules(self, key):
        """Return `(parent module, target module, target's attribute name)` for the dotted name `key`."""
        parent = self.model.get_submodule(".".join(key.split(".")[:-1]))
        target_name = key.split(".")[-1]
        target = self.model.get_submodule(key)
        return parent, target, target_name

    def _replace_module(self, parent_module, child_name, new_module, old_module):
        """Install `new_module` on the parent, reusing the old module's weight, bias and state."""
        setattr(parent_module, child_name, new_module)
        new_module.weight = old_module.weight
        if old_module.bias is not None:
            new_module.bias = old_module.bias
        if getattr(old_module, "state", None) is not None:
            new_module.state = old_module.state
            new_module.to(old_module.weight.device)

        # dispatch to correct device
        for name, module in new_module.named_modules():
            if "adapter_" in name:
                module.to(old_module.weight.device)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.model, name)

    @property
    def modules_to_save(self):
        return None

    def get_peft_config_as_dict(self, inference: bool = False):
        """Return the config as a plain dict (enum members replaced by their values).

        When `inference` is True, `inference_mode` is forced to True in the result.
        """
        config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(self.peft_config).items()}
        if inference:
            config["inference_mode"] = True
        return config

    def _set_adapter_layers(self, enabled=True):
        """Enable or disable every `AdapterLayer` found in the wrapped model."""
        for module in self.model.modules():
            if isinstance(module, AdapterLayer):
                module.disable_adapters = not enabled

    def enable_adapter_layers(self):
        self._set_adapter_layers(enabled=True)

    def disable_adapter_layers(self):
        self._set_adapter_layers(enabled=False)
227
+
228
+
229
+ # The code below is based on https://github.com/adapter-hub/adapter-transformers/blob/master/src/transformers/adapters/modeling.py and lora.py from Hugging Face PEFT
230
+ # and modified to work with PyTorch FSDP
231
+
232
+
233
+ # ------------------------------------------------------------------------------------------
234
+ # Copyright (c) Microsoft Corporation. All rights reserved.
235
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
236
+ # ------------------------------------------------------------------------------------------
237
+
238
+
239
+ # Copy from lora.py
240
+ # had to adapt it for `lora_only` to work
241
def mark_only_adapter_as_trainable(model: nn.Module, bias: str = "none") -> None:
    """
    Freeze every parameter whose name does not contain "adapter_", then optionally unfreeze biases.

    Args:
        model: Module whose parameters are updated in place.
        bias: "none" leaves all non-adapter parameters frozen; "all" re-enables gradients for every
            parameter whose name contains "bias"; "adapter_only" re-enables the `bias` of each
            `AdapterLayer` module that has one.

    Raises:
        NotImplementedError: For any other `bias` value (raised after the freezing step has run).
    """
    for param_name, param in model.named_parameters():
        if "adapter_" not in param_name:
            param.requires_grad = False

    if bias == "none":
        return

    if bias == "all":
        for param_name, param in model.named_parameters():
            if "bias" in param_name:
                param.requires_grad = True
        return

    if bias == "adapter_only":
        for module in model.modules():
            if isinstance(module, AdapterLayer) and getattr(module, "bias", None) is not None:
                module.bias.requires_grad = True
        return

    raise NotImplementedError
257
+
258
+
259
class AdapterLayer:
    """
    Mixin holding the settings shared by adapter-wrapped layers.

    Stores the bottleneck size, the non-linearity name and the scaling setting, builds the dropout
    callable, and starts with adapters enabled (`disable_adapters = False`).
    """

    def __init__(
        self,
        bottleneck_size: int,
        non_linearity: str,
        adapter_dropout: float,
        scaling: Union[float, str],
    ):
        self.bottleneck_size = bottleneck_size
        self.non_linearity = non_linearity
        self.scaling = scaling
        # Dropout is optional: without it a pass-through callable is used instead.
        use_dropout = adapter_dropout > 0.0
        self.adapter_dropout = nn.Dropout(p=adapter_dropout) if use_dropout else (lambda x: x)
        self.disable_adapters = False
276
+
277
+
278
+ class Linear(nn.Linear, AdapterLayer):
279
+ """
280
+ Bottleneck adapter in a dense layer. The adapter can be applied after the multi-head attention layer and/or
281
+ after the feed-forward layer.
282
+ """
283
+ def __init__(
284
+ self,
285
+ in_features: int,
286
+ out_features: int,
287
+ adapter_type: str,
288
+ bottleneck_size: int,
289
+ non_linearity: str,
290
+ adapter_dropout: float,
291
+ scaling: Union[float, str],
292
+ init_weights: str,
293
+ **kwargs,
294
+ ):
295
+ nn.Linear.__init__(self, in_features, out_features, **kwargs)
296
+ AdapterLayer.__init__(self, bottleneck_size=bottleneck_size,
297
+ non_linearity=non_linearity,
298
+ adapter_dropout=adapter_dropout,
299
+ scaling=scaling)
300
+
301
+ self.init_weights = init_weights
302
+ self.adapter_type = adapter_type
303
+ if isinstance(scaling, float):
304
+ self.adapter_scaling = scaling
305
+ elif scaling == "learned":
306
+ self.adapter_scaling = nn.Parameter(torch.ones(1))
307
+ # Actual trainable parameters
308
+ self.adapter_down = nn.Linear(in_features, bottleneck_size, bias=False)
309
+ self.adapter_up = nn.Linear(bottleneck_size, out_features, bias=False)
310
+ self.act_fn = ACT2FN[self.non_linearity]
311
+ #Freezing the pre-trained weight matrix
312
+ self.weight.requires_grad = False
313
+ self.reset_parameters()
314
+
315
+ def reset_parameters(self):
316
+ nn.Linear.reset_parameters(self)
317
+ # if we want to initialize with the bert strategy then this function is called for all the linear layers
318
+ if hasattr(self, "adapter_down"):
319
+ if self.init_weights == "bert":
320
+ self.adapter_down.apply(self.init_bert_weights)
321
+ self.adapter_up.apply(self.init_bert_weights)
322
+ elif self.init_weights == "mam_adapter":
323
+ nn.init.kaiming_uniform_(self.adapter_down.weight, a=math.sqrt(5))
324
+ nn.init.zeros_(self.adapter_up.weight)
325
+ else:
326
+ raise ValueError("Unknown init_weights type: {}".format(config["init_weights"]))
327
+
328
+ # This is copied from the BertPreTrainedModel class to make this a self containing class.
329
+ @staticmethod
330
+ def init_bert_weights(module):
331
+ """Initialize the weights."""
332
+ if isinstance(module, (nn.Linear, nn.Embedding)):
333
+ # std defaults to 0.02, this might need to be changed
334
+ module.weight.data.normal_(mean=0.0, std=0.02)
335
+ elif isinstance(module, nn.LayerNorm):
336
+ module.bias.data.zero_()
337
+ module.weight.data.fill_(1.0)
338
+ if isinstance(module, nn.Linear) and module.bias is not None:
339
+ module.bias.data.zero_()
340
+
341
def train(self, mode: bool = True):
    """Set training mode on the layer and both adapter projections.

    Args:
        mode: True for training mode, False for evaluation mode.

    Returns:
        This module, matching the `nn.Module.train` contract so that calls
        can be chained (the original returned None).
    """
    nn.Linear.train(self, mode)
    self.adapter_down.train(mode)
    self.adapter_up.train(mode)
    return self
345
+
346
def eval(self):
    """Switch the layer and both adapter projections to evaluation mode.

    Returns:
        This module, matching the `nn.Module.eval` contract (the original
        returned None).
    """
    nn.Linear.eval(self)
    self.adapter_down.eval()
    self.adapter_up.eval()
    return self
350
+
351
def forward(self, x: torch.Tensor):
    """Apply the frozen linear layer combined with the bottleneck adapter.

    Args:
        x: Input tensor accepted by `F.linear` with `self.weight`.

    Returns:
        The plain linear output when adapters are disabled; otherwise the
        output with the adapter applied as selected by `self.adapter_type`.

    Raises:
        ValueError: If `self.adapter_type` is not one of the three supported
            placements (the original fell through to an unbound `result`).
    """
    if self.disable_adapters:
        return F.linear(x, self.weight, bias=self.bias)

    def run_adapter(inp, out_dtype):
        # The adapter is evaluated on a float32 copy of the input and its
        # output is cast to `out_dtype` before scaling.
        if inp.dtype != torch.float32:
            inp = inp.float()
        hidden = self.act_fn(self.adapter_down(self.adapter_dropout(inp)))
        return self.adapter_up(hidden).to(out_dtype) * self.adapter_scaling

    if self.adapter_type == "mh_adapter":
        # x passes through the adapter (with a residual) and then the linear layer
        adapted = run_adapter(x, x.dtype) + x
        return F.linear(adapted, self.weight, bias=self.bias)
    if self.adapter_type == "output_adapter":
        # x passes through the linear layer and then the adapter (with a residual)
        base = F.linear(x, self.weight, bias=self.bias)
        return run_adapter(base, base.dtype) + base
    if self.adapter_type == "parallel_adapter":
        # the adapter runs on x in parallel; its output is added to the linear output
        base = F.linear(x, self.weight, bias=self.bias)
        return base + run_adapter(x, base.dtype)
    raise ValueError(f"Unknown adapter_type: {self.adapter_type!r}")
391
+
392
+
393
if is_bnb_available():

    class Linear8bitLt(bnb.nn.Linear8bitLt, AdapterLayer):
        """Bottleneck adapter wrapped around a bitsandbytes 8-bit linear layer."""

        def __init__(
            self,
            in_features: int,
            out_features: int,
            adapter_type: str,
            bottleneck_size: int,
            non_linearity: str,
            adapter_dropout: float,
            scaling: Union[float, str],
            init_weights: str,
            **kwargs,
        ):
            """Build the 8-bit base layer and its trainable adapter projections.

            Args:
                in_features: Input width of the wrapped linear layer.
                out_features: Output width of the wrapped linear layer.
                adapter_type: "mh_adapter", "output_adapter" or "parallel_adapter".
                bottleneck_size: Width of the down-projected adapter space.
                non_linearity: Key looked up in `ACT2FN` for the activation.
                adapter_dropout: Forwarded to `AdapterLayer.__init__`.
                scaling: A fixed numeric factor, or "learned" for a trainable scalar.
                init_weights: "bert" or "mam_adapter" (see `reset_parameters`).
                **kwargs: `bias`, `has_fp16_weights`, `memory_efficient_backward`,
                    `threshold` and `index` are read and passed to bitsandbytes.

            Raises:
                ValueError: If `scaling` is neither a number nor "learned".
            """
            bnb.nn.Linear8bitLt.__init__(
                self,
                in_features,
                out_features,
                bias=kwargs.get("bias", True),
                has_fp16_weights=kwargs.get("has_fp16_weights", True),
                memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
                threshold=kwargs.get("threshold", 0.0),
                index=kwargs.get("index", None),
            )
            AdapterLayer.__init__(
                self,
                bottleneck_size=bottleneck_size,
                non_linearity=non_linearity,
                adapter_dropout=adapter_dropout,
                scaling=scaling,
            )

            self.init_weights = init_weights
            self.adapter_type = adapter_type
            # Integers are accepted too; previously they left `adapter_scaling`
            # unset and forward() failed with AttributeError.
            if isinstance(scaling, (int, float)):
                self.adapter_scaling = scaling
            elif scaling == "learned":
                self.adapter_scaling = nn.Parameter(torch.ones(1))
            else:
                raise ValueError(f"Unknown scaling: {scaling!r}; expected a number or 'learned'")
            # Actual trainable parameters
            self.adapter_down = nn.Linear(in_features, bottleneck_size, bias=False)
            self.adapter_up = nn.Linear(bottleneck_size, out_features, bias=False)
            # presumably `self.non_linearity` is set by AdapterLayer.__init__ -- confirm
            self.act_fn = ACT2FN[self.non_linearity]
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
            self.reset_parameters()

        def reset_parameters(self):
            """Re-initialise the base weights and, once present, the adapter.

            Raises:
                ValueError: If `self.init_weights` is not "bert" or "mam_adapter".
            """
            nn.Linear.reset_parameters(self)
            # May run before the adapter projections exist (hence the guard).
            if not hasattr(self, "adapter_down"):
                return
            if self.init_weights == "bert":
                self.adapter_down.apply(self.init_bert_weights)
                self.adapter_up.apply(self.init_bert_weights)
            elif self.init_weights == "mam_adapter":
                nn.init.kaiming_uniform_(self.adapter_down.weight, a=math.sqrt(5))
                nn.init.zeros_(self.adapter_up.weight)
            else:
                # The original referenced an undefined `config` here (NameError).
                raise ValueError("Unknown init_weights type: {}".format(self.init_weights))

        # This is copied from the BertPreTrainedModel class to make this a self-contained class.
        @staticmethod
        def init_bert_weights(module):
            """Initialize the weights."""
            if isinstance(module, (nn.Linear, nn.Embedding)):
                # std defaults to 0.02, this might need to be changed
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

        def _run_adapter(self, hidden):
            """Return up(act(down(dropout(hidden)))) without scaling or casting."""
            return self.adapter_up(self.act_fn(self.adapter_down(self.adapter_dropout(hidden))))

        def forward(self, x: torch.Tensor):
            """Apply the 8-bit linear layer combined with the bottleneck adapter.

            Outside autocast the adapter runs on a float32 copy of its input and
            its output is cast back to the dtype recorded before that cast.

            Raises:
                ValueError: If `self.adapter_type` is not a supported placement
                    (the original fell through to an unbound `result`).
            """
            # NOTE(review): this base projection is also computed for "mh_adapter",
            # where its value is only used when adapters are disabled.
            result_pre_forward = super().forward(x)

            if self.disable_adapters:
                return result_pre_forward

            autocast = torch.is_autocast_enabled()

            if self.adapter_type == "mh_adapter":
                if autocast:
                    output = self._run_adapter(x) * self.adapter_scaling
                    output = output + x
                else:
                    expected_dtype = x.dtype
                    if x.dtype != torch.float32:
                        x = x.float()
                    output = self._run_adapter(x).to(expected_dtype) * self.adapter_scaling
                    output = (output + x).to(expected_dtype)
                return super().forward(output)

            if self.adapter_type == "output_adapter":
                if autocast:
                    output = self._run_adapter(result_pre_forward) * self.adapter_scaling
                    return output + result_pre_forward
                expected_dtype = result_pre_forward.dtype
                if result_pre_forward.dtype != torch.float32:
                    result_pre_forward = result_pre_forward.float()
                output = self._run_adapter(result_pre_forward).to(expected_dtype) * self.adapter_scaling
                return (output + result_pre_forward).to(expected_dtype)

            if self.adapter_type == "parallel_adapter":
                if autocast:
                    output = self._run_adapter(x) * self.adapter_scaling
                    return result_pre_forward + output
                expected_dtype = result_pre_forward.dtype
                if x.dtype != torch.float32:
                    x = x.float()
                output = self._run_adapter(x).to(expected_dtype) * self.adapter_scaling
                return result_pre_forward + output

            raise ValueError(f"Unknown adapter_type: {self.adapter_type!r}")
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+
529
+
530
+
531
+
532
+
SVFT-main/LLM-Adapters/peft/src/peft/tuners/lora.py ADDED
@@ -0,0 +1,624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import importlib
16
+ import math
17
+ import re
18
+ import warnings
19
+ from dataclasses import asdict, dataclass, field
20
+ from enum import Enum
21
+ from typing import List, Optional, Union
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+ from transformers.pytorch_utils import Conv1D
27
+
28
+ from ..utils import PeftConfig, PeftType, transpose
29
+
30
+
31
def is_bnb_available() -> bool:
    """Return True when the optional `bitsandbytes` package can be found."""
    # `import importlib` alone does not guarantee that the `util` submodule is
    # loaded; import it explicitly so `find_spec` is always reachable.
    import importlib.util

    return importlib.util.find_spec("bitsandbytes") is not None
33
+
34
+
35
+ if is_bnb_available():
36
+ import bitsandbytes as bnb
37
+
38
+
39
@dataclass
class LoraConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`~peft.Lora`].

    Args:
        r (`int`): Lora attention dimension. Defaults to 8.
        target_modules (`Union[List[str],str]`): The names of the modules to apply Lora to, either as a list of
            name suffixes or as a single regex matched against the full module name.
        lora_alpha (`int`): The alpha parameter for Lora scaling. Defaults to `None`; set it explicitly, since the
            Lora layers in this file compute `lora_alpha / r`.
        lora_dropout (`float`): The dropout probability for Lora layers. Defaults to `None`; set it explicitly,
            since `LoraLayer` compares it against `0.0`.
        merge_weights (`bool`):
            Whether to merge the weights of the Lora layers with the base transformer model in `eval` mode.
        fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        enable_lora ( `List[bool]`): Used with `lora.MergedLinear`.
        bias (`str`): Bias type for Lora. Can be 'none', 'all' or 'lora_only'
        modules_to_save (`List[str]`):List of modules apart from LoRA layers to be set as trainable
            and saved in the final checkpoint.
    """

    r: int = field(default=8, metadata={"help": "Lora attention dimension"})
    target_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with Lora."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
        },
    )
    # NOTE(review): the two fields below default to None although annotated as int / float.
    lora_alpha: int = field(default=None, metadata={"help": "Lora alpha"})
    lora_dropout: float = field(default=None, metadata={"help": "Lora dropout"})
    merge_weights: bool = field(
        default=False, metadata={"help": "Merge weights of the original model and the Lora model"}
    )
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    enable_lora: Optional[List[bool]] = field(default=None, metadata={"help": "Used with `lora.MergedLinear`."})
    bias: str = field(default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"})
    modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )

    def __post_init__(self):
        # Tag this config so it is identified as a LoRA configuration.
        self.peft_type = PeftType.LORA
88
+
89
+
90
class LoraModel(torch.nn.Module):
    """
    Creates Low Rank Adapter (Lora) model from a pretrained transformers model.

    Args:
        model ([`transformers.PreTrainedModel`]): The model to be adapted.
        config ([`LoraConfig`]): The configuration of the Lora model.

    Returns:
        `torch.nn.Module`: The Lora model.

    Example::

        >>> from transformers import AutoModelForSeq2SeqLM
        >>> from peft import LoraModel, LoraConfig
        >>> config = LoraConfig(
        ...     peft_type="LORA", task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, target_modules=["q", "v"],
        ...     lora_dropout=0.01,
        ... )
        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
        >>> lora_model = LoraModel(config, model)

    **Attributes**:
        - **model** ([`transformers.PreTrainedModel`]) -- The model to be adapted.
        - **peft_config** ([`LoraConfig`]): The configuration of the Lora model.
    """

    def __init__(self, config, model):
        super().__init__()
        self.peft_config = config
        self.model = model
        self._find_and_replace()
        mark_only_lora_as_trainable(self.model, self.peft_config.bias)
        # Calls on this wrapper go straight to the wrapped model's forward.
        self.forward = self.model.forward

    def _find_and_replace(self):
        """Swap every targeted sub-module of `self.model` for its LoRA counterpart.

        Raises:
            ImportError: If the model is loaded in 8-bit but bitsandbytes is missing.
            ValueError: If a matched module has an unsupported type, or if no
                module matches `peft_config.target_modules`.
        """
        loaded_in_8bit = getattr(self.model, "is_loaded_in_8bit", False)
        if loaded_in_8bit and not is_bnb_available():
            raise ImportError(
                "To use Lora with 8-bit quantization, please install the `bitsandbytes` package. "
                "You can install it with `pip install bitsandbytes`."
            )
        is_target_modules_in_base_model = False
        is_hf_device_map_available = hasattr(self.model, "hf_device_map")
        enable_lora = self.peft_config.enable_lora
        kwargs = {
            "r": self.peft_config.r,
            "lora_alpha": self.peft_config.lora_alpha,
            "lora_dropout": self.peft_config.lora_dropout,
            "fan_in_fan_out": self.peft_config.fan_in_fan_out,
            "merge_weights": (self.peft_config.merge_weights or self.peft_config.inference_mode)
            and not is_hf_device_map_available,
        }
        # Snapshot the names first: the module tree is mutated inside the loop.
        key_list = [key for key, _ in self.model.named_modules()]
        for key in key_list:
            if isinstance(self.peft_config.target_modules, str):
                target_module_found = re.fullmatch(self.peft_config.target_modules, key)
            else:
                target_module_found = any(key.endswith(target_key) for target_key in self.peft_config.target_modules)
            if not target_module_found:
                continue
            is_target_modules_in_base_model = True
            parent, target, target_name = self._get_submodules(key)
            bias = target.bias is not None
            if loaded_in_8bit and isinstance(target, bnb.nn.Linear8bitLt):
                # Per-target copy: the 8-bit-only keys must not leak into the
                # kwargs used for later, non-quantized targets.
                eightbit_kwargs = dict(kwargs)
                eightbit_kwargs.update(
                    {
                        "has_fp16_weights": target.state.has_fp16_weights,
                        "memory_efficient_backward": target.state.memory_efficient_backward,
                        "threshold": target.state.threshold,
                        "index": target.index,
                    }
                )
                if enable_lora is None:
                    new_module = Linear8bitLt(target.in_features, target.out_features, bias=bias, **eightbit_kwargs)
                else:
                    eightbit_kwargs["enable_lora"] = enable_lora
                    new_module = MergedLinear8bitLt(
                        target.in_features, target.out_features, bias=bias, **eightbit_kwargs
                    )
            elif isinstance(target, torch.nn.Linear) and enable_lora is None:
                new_module = Linear(target.in_features, target.out_features, bias=bias, **kwargs)
            elif enable_lora is not None:
                if isinstance(target, Conv1D):
                    in_features, out_features = (
                        target.weight.ds_shape if hasattr(target.weight, "ds_shape") else target.weight.shape
                    )
                else:
                    in_features, out_features = target.in_features, target.out_features
                    if kwargs["fan_in_fan_out"]:
                        warnings.warn(
                            "fan_in_fan_out is set to True but the target module is not a Conv1D. "
                            "Setting fan_in_fan_out to False."
                        )
                        # As before, the reset persists for later targets and in the config.
                        kwargs["fan_in_fan_out"] = self.peft_config.fan_in_fan_out = False
                new_module = MergedLinear(in_features, out_features, bias=bias, enable_lora=enable_lora, **kwargs)
            else:
                # Previously `new_module` was unbound here, or stale from the
                # preceding iteration, which silently reused another layer.
                raise ValueError(
                    f"Target module {key} of type {type(target).__name__} is not supported. "
                    "Only `torch.nn.Linear` is supported when `enable_lora` is not set."
                )
            self._replace_module(parent, target_name, new_module, target)
        if not is_target_modules_in_base_model:
            raise ValueError(
                f"Target modules {self.peft_config.target_modules} not found in the base model. "
                f"Please check the target modules and try again."
            )

    def _get_submodules(self, key):
        """Return `(parent module, target module, attribute name)` for a dotted key."""
        parent = self.model.get_submodule(".".join(key.split(".")[:-1]))
        target_name = key.split(".")[-1]
        target = self.model.get_submodule(key)
        return parent, target, target_name

    def _replace_module(self, parent_module, child_name, new_module, old_module):
        """Install `new_module` on the parent, reusing the old weight, bias and state."""
        setattr(parent_module, child_name, new_module)
        new_module.weight = old_module.weight
        if old_module.bias is not None:
            new_module.bias = old_module.bias
        if getattr(old_module, "state", None) is not None:
            new_module.state = old_module.state
            new_module.to(old_module.weight.device)

        # dispatch to correct device
        for name, module in new_module.named_modules():
            if "lora_" in name:
                module.to(old_module.weight.device)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.model, name)

    @property
    def modules_to_save(self):
        return None

    def get_peft_config_as_dict(self, inference: bool = False):
        """Return the config as a plain dict, with Enum members replaced by their values."""
        config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(self.peft_config).items()}
        if inference:
            config["inference_mode"] = True
        return config

    def _set_adapter_layers(self, enabled=True):
        """Set `disable_adapters` on every `LoraLayer` inside the wrapped model."""
        for module in self.model.modules():
            if isinstance(module, LoraLayer):
                module.disable_adapters = not enabled

    def enable_adapter_layers(self):
        self._set_adapter_layers(enabled=True)

    def disable_adapter_layers(self):
        self._set_adapter_layers(enabled=False)
235
+
236
+
237
+ # Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
238
+ # and modified to work with PyTorch FSDP
239
+
240
+
241
+ # ------------------------------------------------------------------------------------------
242
+ # Copyright (c) Microsoft Corporation. All rights reserved.
243
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
244
+ # ------------------------------------------------------------------------------------------
245
+
246
+
247
+ # had to adapt it for `lora_only` to work
248
def mark_only_lora_as_trainable(model: nn.Module, bias: str = "none") -> None:
    """Freeze every parameter whose name lacks "lora_", then re-enable biases.

    Args:
        model: Module whose parameters are updated in place.
        bias: "none" leaves all biases frozen, "all" re-enables every parameter
            with "bias" in its name, "lora_only" re-enables the bias of each
            `LoraLayer` module.

    Raises:
        NotImplementedError: For any other `bias` value (raised after the
            freezing step has already been applied).
    """
    for param_name, param in model.named_parameters():
        if "lora_" not in param_name:
            param.requires_grad = False

    if bias == "none":
        return

    if bias == "all":
        for param_name, param in model.named_parameters():
            if "bias" in param_name:
                param.requires_grad = True
        return

    if bias == "lora_only":
        for module in model.modules():
            if not isinstance(module, LoraLayer):
                continue
            if getattr(module, "bias", None) is not None:
                module.bias.requires_grad = True
        return

    raise NotImplementedError
264
+
265
+
266
class LoraLayer:
    """Mixin holding the LoRA hyper-parameters and merge/disable state flags."""

    def __init__(
        self,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
        merge_weights: bool,
    ):
        """Store the LoRA settings.

        Args:
            r: Rank of the low-rank update.
            lora_alpha: Numerator of the scaling factor used by subclasses.
            lora_dropout: Dropout probability applied to the LoRA input;
                `None` or a value <= 0 disables dropout.
            merge_weights: Whether subclasses may fold the update into the
                base weight.
        """
        self.r = r
        self.lora_alpha = lora_alpha
        # Optional dropout. `None` is the LoraConfig default and previously
        # raised TypeError in the comparison below.
        if lora_dropout is not None and lora_dropout > 0.0:
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_dropout = lambda x: x
        # Mark the weight as unmerged
        self.merged = False
        self.merge_weights = merge_weights
        self.disable_adapters = False
285
+
286
+
287
class Linear(nn.Linear, LoraLayer):
    """Lora implemented in a dense layer."""

    def __init__(
        self,
        in_features: int,
        out_features: int,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        merge_weights: bool = True,
        **kwargs,
    ):
        """Create the base linear layer and, when `r > 0`, the LoRA projections.

        Args:
            in_features: Input width.
            out_features: Output width.
            r: LoRA rank; 0 creates no LoRA parameters.
            lora_alpha: Scaling numerator (`scaling = lora_alpha / r`).
            lora_dropout: Dropout probability on the LoRA input.
            fan_in_fan_out: True if the replaced layer stores its weight as
                (fan_in, fan_out).
            merge_weights: Fold the update into the weight in eval mode.
            **kwargs: Forwarded to `nn.Linear.__init__`.
        """
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        LoraLayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=merge_weights)

        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Linear(in_features, r, bias=False)
            self.lora_B = nn.Linear(r, out_features, bias=False)
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.T

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, "lora_A"):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B.weight)

    def _lora_delta(self):
        """Return the scaled low-rank update in the layout of `self.weight`."""
        return transpose(self.lora_B.weight @ self.lora_A.weight, self.fan_in_fan_out) * self.scaling

    def train(self, mode: bool = True):
        """Set the mode and merge (eval) or un-merge (train) the LoRA update.

        Returns:
            This module, matching the `nn.Module.train` contract.
        """
        nn.Linear.train(self, mode)
        # lora_A / lora_B only exist when r > 0 (the original raised
        # AttributeError here for the default r == 0).
        if self.r > 0:
            self.lora_A.train(mode)
            self.lora_B.train(mode)
        if not self.merge_weights:
            return self
        if not mode and not self.merged:
            # Merge the weights and mark it
            if self.r > 0:
                self.weight.data += self._lora_delta()
            self.merged = True
        elif mode and self.merged:
            # Make sure that the weights are not merged. Only done when going
            # back to training: the original also un-merged on a repeated
            # eval(), toggling the weights on every call.
            if self.r > 0:
                self.weight.data -= self._lora_delta()
            self.merged = False
        return self

    def eval(self):
        """Switch to evaluation mode (merging when configured); returns self."""
        nn.Linear.eval(self)
        if self.r > 0:
            self.lora_A.eval()
            self.lora_B.eval()
        return self

    def forward(self, x: torch.Tensor):
        """Return the linear output plus the LoRA update when it is active.

        The result is cast to the dtype of `self.weight`.
        """
        previous_dtype = self.weight.dtype

        if self.disable_adapters and self.r > 0 and self.merged:
            # Adapters were disabled while merged: strip the update again.
            matmul_output = self.lora_B.weight @ self.lora_A.weight
            self.weight.data -= transpose(matmul_output.to(previous_dtype), self.fan_in_fan_out) * self.scaling
            self.merged = False

        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

        if not self.disable_adapters and self.r > 0 and not self.merged:
            result += self.lora_B(self.lora_A(self.lora_dropout(x.to(self.lora_A.weight.dtype)))) * self.scaling

        if result.dtype != previous_dtype:
            result = result.to(previous_dtype)

        return result
367
+
368
class MergedLinear(nn.Linear, LoraLayer):
    """Lora implemented in a dense layer whose output is split into equal groups.

    `enable_lora` selects which output groups receive a LoRA update.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        enable_lora: List[bool] = [False],
        fan_in_fan_out: bool = False,
        merge_weights: bool = True,
        **kwargs,
    ):
        """Create the base layer and the grouped LoRA projections.

        Args:
            in_features: Input width.
            out_features: Output width; must be divisible by `len(enable_lora)`.
            r: LoRA rank per enabled group; 0 creates no LoRA parameters.
            lora_alpha: Scaling numerator (`scaling = lora_alpha / r`).
            lora_dropout: Dropout probability on the LoRA input.
            enable_lora: One flag per output group.
            fan_in_fan_out: True if the replaced layer stores its weight as
                (fan_in, fan_out).
            merge_weights: Fold the update into the weight in eval mode.
            **kwargs: Forwarded to `nn.Linear.__init__`.

        Raises:
            ValueError: If `len(enable_lora)` does not divide `out_features`.
        """
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        LoraLayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=merge_weights)
        if out_features % len(enable_lora) != 0:
            raise ValueError("The length of enable_lora must divide out_features")
        # Copy so the shared default list is never aliased by an instance.
        self.enable_lora = list(enable_lora)
        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0 and any(enable_lora):
            self.lora_A = nn.Linear(in_features, r * sum(enable_lora), bias=False)
            # NOTE(review): `groups` is fixed at 2 here while the merge path uses
            # groups=sum(enable_lora); the two agree only when exactly two
            # groups are enabled -- confirm before changing either.
            self.lora_B = nn.Conv1d(
                r * sum(enable_lora),
                out_features // len(enable_lora) * sum(enable_lora),
                kernel_size=1,
                groups=2,
                bias=False,
            )
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
            # Compute the indices
            self.lora_ind = self.weight.new_zeros((out_features,), dtype=torch.bool).view(len(enable_lora), -1)
            self.lora_ind[enable_lora, :] = True
            self.lora_ind = self.lora_ind.view(-1)
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.T

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, "lora_A"):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B.weight)

    def zero_pad(self, x):
        """Scatter `x` into the enabled output columns of an all-zero tensor."""
        result = x.new_zeros((*x.shape[:-1], self.out_features))
        result = result.view(-1, self.out_features)
        result[:, self.lora_ind] = x.reshape(-1, self.out_features // len(self.enable_lora) * sum(self.enable_lora))
        return result.view((*x.shape[:-1], self.out_features))

    def _has_lora(self):
        """True when LoRA parameters were created for this layer."""
        return self.r > 0 and any(self.enable_lora)

    def _delta_weight(self):
        """Return the unscaled, un-padded low-rank update (grouped B @ A)."""
        return (
            F.conv1d(
                self.lora_A.weight.data.unsqueeze(0),
                self.lora_B.weight.data,
                groups=sum(self.enable_lora),
            )
            .squeeze(0)
            .transpose(-2, -1)
        )

    def train(self, mode: bool = True):
        """Set the mode and merge (eval) or un-merge (train) the LoRA update.

        Returns:
            This module, matching the `nn.Module.train` contract.
        """
        nn.Linear.train(self, mode)
        # lora_A / lora_B only exist when LoRA is enabled (the original raised
        # AttributeError here otherwise).
        if self._has_lora():
            self.lora_A.train(mode)
            self.lora_B.train(mode)
        if not self.merge_weights:
            return self
        if not mode and not self.merged:
            # Merge the weights and mark it
            if self._has_lora():
                delta_w = self._delta_weight()
                self.weight.data += transpose(self.zero_pad(delta_w * self.scaling), not self.fan_in_fan_out)
            self.merged = True
        elif mode and self.merged:
            # Make sure that the weights are not merged. Only done when going
            # back to training: the original also un-merged on a repeated eval().
            if self._has_lora():
                delta_w = self._delta_weight()
                self.weight.data -= transpose(self.zero_pad(delta_w * self.scaling), not self.fan_in_fan_out)
            self.merged = False
        return self

    def eval(self):
        """Switch to evaluation mode (merging when configured); returns self."""
        nn.Linear.eval(self)
        if self._has_lora():
            self.lora_A.eval()
            self.lora_B.eval()
        return self

    def forward(self, x: torch.Tensor):
        """Return the linear output plus the grouped LoRA update when active.

        The result is cast back to the dtype of `x`.
        """
        previous_dtype = x.dtype
        has_lora = self._has_lora()

        if self.disable_adapters and has_lora and self.merged:
            # Adapters were disabled while merged: strip the update again.
            delta_w = self._delta_weight().to(self.weight.dtype)
            self.weight.data -= transpose(self.zero_pad(delta_w * self.scaling), not self.fan_in_fan_out)
            self.merged = False

        result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)

        # `has_lora` also covers enable_lora being all False, where lora_A
        # was never created.
        if not self.disable_adapters and not self.merged and has_lora:
            after_A = self.lora_A(self.lora_dropout(x.to(self.lora_A.weight.dtype)))
            after_B = self.lora_B(after_A.transpose(-2, -1)).transpose(-2, -1)
            result += self.zero_pad(after_B) * self.scaling

        return result.to(previous_dtype)
488
+
489
+
490
if is_bnb_available():

    class Linear8bitLt(bnb.nn.Linear8bitLt, LoraLayer):
        """Lora implemented on top of a bitsandbytes 8-bit dense layer."""

        def __init__(
            self,
            in_features,
            out_features,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.0,
            **kwargs,
        ):
            """Create the 8-bit base layer and, when `r > 0`, the LoRA projections.

            `bias`, `has_fp16_weights`, `memory_efficient_backward`, `threshold`
            and `index` are read from `kwargs`; other keys are ignored. Weight
            merging is always disabled for this layer.
            """
            bnb.nn.Linear8bitLt.__init__(
                self,
                in_features,
                out_features,
                bias=kwargs.get("bias", True),
                has_fp16_weights=kwargs.get("has_fp16_weights", True),
                memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
                threshold=kwargs.get("threshold", 0.0),
                index=kwargs.get("index", None),
            )
            LoraLayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=False)
            # Actual trainable parameters
            if r > 0:
                self.lora_A = nn.Linear(in_features, r, bias=False)
                self.lora_B = nn.Linear(r, out_features, bias=False)
                self.scaling = self.lora_alpha / self.r
                # Freezing the pre-trained weight matrix
                self.weight.requires_grad = False
            self.reset_parameters()

        def reset_parameters(self):
            if hasattr(self, "lora_A"):
                # initialize A the same way as the default for nn.Linear and B to zero
                nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
                nn.init.zeros_(self.lora_B.weight)

        def forward(self, x: torch.Tensor):
            """Return the 8-bit linear output plus the LoRA update when active.

            Outside autocast the LoRA path runs on a float32 copy of `x` and is
            cast to the base output's dtype before being added.
            """
            result = super().forward(x)

            if self.disable_adapters:
                return result
            elif self.r > 0:
                if not torch.is_autocast_enabled():
                    expected_dtype = result.dtype

                    if x.dtype != torch.float32:
                        x = x.float()
                    output = self.lora_B(self.lora_A(self.lora_dropout(x))).to(expected_dtype) * self.scaling
                    result += output
                else:
                    output = self.lora_B(self.lora_A(self.lora_dropout(x))) * self.scaling
                    result += output
            return result

    class MergedLinear8bitLt(bnb.nn.Linear8bitLt, LoraLayer):
        """Grouped Lora on top of a bitsandbytes 8-bit dense layer.

        `enable_lora` selects which equal-sized output groups receive an update.
        """

        def __init__(
            self,
            in_features: int,
            out_features: int,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.0,
            enable_lora: List[bool] = [False],
            **kwargs,
        ):
            """Create the 8-bit base layer and the grouped LoRA projections.

            Raises:
                ValueError: If `len(enable_lora)` does not divide `out_features`.
            """
            bnb.nn.Linear8bitLt.__init__(
                self,
                in_features,
                out_features,
                bias=kwargs.get("bias", True),
                has_fp16_weights=kwargs.get("has_fp16_weights", True),
                memory_efficient_backward=kwargs.get("memory_efficient_backward", False),
                threshold=kwargs.get("threshold", 0.0),
                index=kwargs.get("index", None),
            )
            LoraLayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=False)
            if out_features % len(enable_lora) != 0:
                raise ValueError("The length of enable_lora must divide out_features")
            # Copy so the shared default list is never aliased by an instance.
            self.enable_lora = list(enable_lora)
            # Actual trainable parameters
            if r > 0 and any(enable_lora):
                self.lora_A = nn.Linear(in_features, r * sum(enable_lora), bias=False)
                # NOTE(review): `groups` is fixed at 2 regardless of how many
                # entries of `enable_lora` are set -- confirm this is intended.
                self.lora_B = nn.Conv1d(
                    r * sum(enable_lora),
                    out_features // len(enable_lora) * sum(enable_lora),
                    kernel_size=1,
                    groups=2,
                    bias=False,
                )
                self.scaling = self.lora_alpha / self.r
                # Freezing the pre-trained weight matrix
                self.weight.requires_grad = False
                # Compute the indices
                self.lora_ind = self.weight.new_zeros((out_features,), dtype=torch.bool).view(len(enable_lora), -1)
                self.lora_ind[enable_lora, :] = True
                self.lora_ind = self.lora_ind.view(-1)
            self.reset_parameters()

        def reset_parameters(self):
            if hasattr(self, "lora_A"):
                # initialize A the same way as the default for nn.Linear and B to zero
                nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
                nn.init.zeros_(self.lora_B.weight)

        def zero_pad(self, x):
            """Scatter `x` into the enabled output columns of an all-zero tensor."""
            result = x.new_zeros((*x.shape[:-1], self.out_features))
            result = result.view(-1, self.out_features)
            result[:, self.lora_ind] = x.reshape(
                -1, self.out_features // len(self.enable_lora) * sum(self.enable_lora)
            )
            return result.view((*x.shape[:-1], self.out_features))

        def forward(self, x: torch.Tensor):
            """Return the 8-bit linear output plus the grouped LoRA update.

            The update is skipped when adapters are disabled or when no LoRA
            parameters exist (`r == 0` or no `enable_lora` entry set; the latter
            case previously raised AttributeError on `lora_A`).
            """
            result = super().forward(x)
            if self.disable_adapters:
                return result
            elif self.r > 0 and any(self.enable_lora):
                if not torch.is_autocast_enabled():
                    expected_dtype = result.dtype
                    if x.dtype != torch.float32:
                        x = x.float()
                    after_A = self.lora_A(self.lora_dropout(x))
                    after_B = self.lora_B(after_A.transpose(-2, -1)).transpose(-2, -1)
                    output = self.zero_pad(after_B).to(expected_dtype) * self.scaling
                    result += output
                else:
                    after_A = self.lora_A(self.lora_dropout(x))
                    after_B = self.lora_B(after_A.transpose(-2, -1)).transpose(-2, -1)
                    output = self.zero_pad(after_B) * self.scaling
                    result += output
            return result
SVFT-main/LLM-Adapters/peft/src/peft/tuners/p_tuning.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import enum
17
+ import warnings
18
+ from dataclasses import dataclass, field
19
+ from typing import Union
20
+
21
+ import torch
22
+
23
+ from ..utils import PeftType, PromptLearningConfig
24
+
25
+
26
class PromptEncoderReparameterizationType(str, enum.Enum):
    """Names of the reparameterization heads a prompt encoder can be built with.

    Members also subclass `str`, so each one compares equal to its plain string value.
    """

    MLP = "MLP"
    LSTM = "LSTM"
29
+
30
+
31
@dataclass
class PromptEncoderConfig(PromptLearningConfig):
    """
    This is the configuration class to store the configuration of a [`~peft.PromptEncoder`].

    Args:
        encoder_reparameterization_type
            (Union[[`PromptEncoderReparameterizationType`], `str`]): The type of reparameterization to use.
            Defaults to `MLP`.
        encoder_hidden_size (`int`): The hidden size of the prompt encoder. Defaults to `None`.
        encoder_num_layers (`int`): The number of layers of the prompt encoder. Defaults to `2`.
        encoder_dropout (`float`): The dropout probability of the prompt encoder. Defaults to `0.0`.
    """

    encoder_reparameterization_type: Union[str, PromptEncoderReparameterizationType] = field(
        default=PromptEncoderReparameterizationType.MLP,
        metadata={"help": "How to reparameterize the prompt encoder"},
    )
    encoder_hidden_size: int = field(
        default=None,
        metadata={"help": "The hidden size of the prompt encoder"},
    )
    encoder_num_layers: int = field(
        default=2,
        metadata={"help": "The number of layers of the prompt encoder"},
    )
    encoder_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout of the prompt encoder"},
    )

    def __post_init__(self):
        # Always tag this config as P-tuning, overriding any `peft_type` passed in.
        self.peft_type = PeftType.P_TUNING
63
+
64
+
65
+ # Based on https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/prompt_encoder.py
66
+ # with some refactor
67
class PromptEncoder(torch.nn.Module):
    """
    The prompt encoder network that is used to generate the virtual token embeddings for p-tuning.

    Args:
        config ([`PromptEncoderConfig`]): The configuration of the prompt encoder.

    Example::

        >>> from peft import PromptEncoder, PromptEncoderConfig
        >>> config = PromptEncoderConfig(
        ...     peft_type="P_TUNING", task_type="SEQ_2_SEQ_LM", num_virtual_tokens=20, token_dim=768,
        ...     num_transformer_submodules=1, num_attention_heads=12, num_layers=12,
        ...     encoder_reparameterization_type="MLP", encoder_hidden_size=768
        ... )
        >>> prompt_encoder = PromptEncoder(config)

    **Attributes**:
        - **embedding** ([`~torch.nn.Embedding`]) -- The embedding layer of the prompt encoder.
        - **mlp_head** ([`~torch.nn.Sequential`]) -- The MLP head of the prompt encoder if `inference_mode=False`.
        - **lstm_head** ([`~torch.nn.LSTM`]) -- The LSTM head of the prompt encoder if `inference_mode=False` and
          `encoder_reparameterization_type="LSTM"`.
        - **token_dim** (`int`) -- The hidden embedding dimension of the base transformer model.
        - **input_size** (`int`) -- The input size of the prompt encoder.
        - **output_size** (`int`) -- The output size of the prompt encoder.
        - **hidden_size** (`int`) -- The hidden size of the prompt encoder.
        - **total_virtual_tokens** (`int`): The total number of virtual tokens of the
          prompt encoder.
        - **encoder_type** (Union[[`PromptEncoderReparameterizationType`], `str`]):
          The encoder type of the prompt encoder.

    Raises:
        ValueError: If `encoder_reparameterization_type` is neither `MLP` nor `LSTM`.

    Input shape: (batch_size, total_virtual_tokens)

    Output shape: (batch_size, total_virtual_tokens, token_dim)
    """

    def __init__(self, config):
        super().__init__()
        self.token_dim = config.token_dim
        self.input_size = self.token_dim
        self.output_size = self.token_dim
        self.hidden_size = config.encoder_hidden_size
        self.total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules
        self.encoder_type = config.encoder_reparameterization_type

        # embedding
        self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim)
        # The reparameterization heads are only built outside inference mode.
        if not config.inference_mode:
            if self.encoder_type == PromptEncoderReparameterizationType.LSTM:
                lstm_dropout = config.encoder_dropout
                num_layers = config.encoder_num_layers
                # LSTM
                self.lstm_head = torch.nn.LSTM(
                    input_size=self.input_size,
                    hidden_size=self.hidden_size,
                    num_layers=num_layers,
                    dropout=lstm_dropout,
                    bidirectional=True,
                    batch_first=True,
                )

                # The LSTM is bidirectional, hence the doubled feature width.
                self.mlp_head = torch.nn.Sequential(
                    torch.nn.Linear(self.hidden_size * 2, self.hidden_size * 2),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size * 2, self.output_size),
                )

            elif self.encoder_type == PromptEncoderReparameterizationType.MLP:
                # `encoder_num_layers` has no effect on the MLP head, so only warn when the
                # caller actually moved it away from the config default instead of on every
                # construction.
                default_num_layers = PromptEncoderConfig.encoder_num_layers
                if getattr(config, "encoder_num_layers", default_num_layers) != default_num_layers:
                    warnings.warn(
                        f"for {self.encoder_type}, the `encoder_num_layers` is ignored. Exactly 2 MLP layers are used."
                    )
                layers = [
                    torch.nn.Linear(self.input_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.output_size),
                ]
                self.mlp_head = torch.nn.Sequential(*layers)

            else:
                raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

    def forward(self, indices):
        """Embed the virtual-token `indices` and pass them through the configured head."""
        input_embeds = self.embedding(indices)
        if self.encoder_type == PromptEncoderReparameterizationType.LSTM:
            # nn.LSTM returns (output, state); only the per-token output is used.
            output_embeds = self.mlp_head(self.lstm_head(input_embeds)[0])
        elif self.encoder_type == PromptEncoderReparameterizationType.MLP:
            output_embeds = self.mlp_head(input_embeds)
        else:
            raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

        return output_embeds
SVFT-main/LLM-Adapters/peft/src/peft/tuners/prefix_tuning.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from dataclasses import dataclass, field
18
+
19
+ import torch
20
+
21
+ from ..utils import PeftType, PromptLearningConfig
22
+
23
+
24
@dataclass
class PrefixTuningConfig(PromptLearningConfig):
    """
    This is the configuration class to store the configuration of a [`~peft.PrefixEncoder`].

    Args:
        encoder_hidden_size (`int`): The hidden size of the prompt encoder. Defaults to `None`.
        prefix_projection (`bool`): Whether to project the prefix embeddings. Defaults to `False`.
    """

    encoder_hidden_size: int = field(
        default=None,
        metadata={"help": "The hidden size of the encoder"},
    )
    prefix_projection: bool = field(
        default=False,
        metadata={"help": "Whether to project the prefix tokens"},
    )

    def __post_init__(self):
        # Always tag this config as prefix tuning, overriding any `peft_type` passed in.
        self.peft_type = PeftType.PREFIX_TUNING
45
+
46
+
47
+ # Based on https://github.com/THUDM/P-tuning-v2/blob/main/model/prefix_encoder.py
48
+ # with some refactor
49
class PrefixEncoder(torch.nn.Module):
    r"""
    The torch.nn model to encode the prefix

    Args:
        config ([`PrefixTuningConfig`]): The configuration of the prefix encoder.

    Example::

        >>> from peft import PrefixEncoder, PrefixTuningConfig
        >>> config = PrefixTuningConfig(
        ...     peft_type="PREFIX_TUNING", task_type="SEQ_2_SEQ_LM", num_virtual_tokens=20, token_dim=768,
        ...     num_transformer_submodules=1, num_attention_heads=12, num_layers=12, encoder_hidden_size=768
        ... )
        >>> prefix_encoder = PrefixEncoder(config)


    **Attributes**:
        - **embedding** (`torch.nn.Embedding`) --
            The embedding layer of the prefix encoder.
        - **transform** (`torch.nn.Sequential`) -- The
            two-layer MLP to transform the prefix embeddings if `prefix_projection` is `True` and the config is
            not in inference mode.
        - **prefix_projection** (`bool`) -- Whether to project the prefix embeddings.

    Input shape: (batch_size, num_virtual_tokens)

    Output shape: (batch_size, num_virtual_tokens, 2*layers*hidden)
    """

    def __init__(self, config):
        super().__init__()
        self.prefix_projection = config.prefix_projection
        token_dim = config.token_dim
        num_layers = config.num_layers
        encoder_hidden_size = config.encoder_hidden_size
        num_virtual_tokens = config.num_virtual_tokens
        # The projection MLP only exists outside inference mode. Remember that decision so
        # `forward` does not reach for a `transform` module that was never created.
        self._use_projection = bool(self.prefix_projection) and not config.inference_mode
        if self._use_projection:
            # Use a two-layer MLP to encode the prefix
            self.embedding = torch.nn.Embedding(num_virtual_tokens, token_dim)
            self.transform = torch.nn.Sequential(
                torch.nn.Linear(token_dim, encoder_hidden_size),
                torch.nn.Tanh(),
                torch.nn.Linear(encoder_hidden_size, num_layers * 2 * token_dim),
            )
        else:
            # The embedding table directly holds the full-width prefix vectors.
            self.embedding = torch.nn.Embedding(num_virtual_tokens, num_layers * 2 * token_dim)

    def forward(self, prefix: torch.Tensor):
        """Look up the prefix vectors for `prefix`, projecting them when the MLP was built."""
        if self._use_projection:
            prefix_tokens = self.embedding(prefix)
            past_key_values = self.transform(prefix_tokens)
        else:
            past_key_values = self.embedding(prefix)
        return past_key_values
SVFT-main/LLM-Adapters/peft/src/peft/tuners/prompt_tuning.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import enum
17
+ import math
18
+ from dataclasses import dataclass, field
19
+ from typing import Optional, Union
20
+
21
+ import torch
22
+
23
+ from ..utils import PeftType, PromptLearningConfig
24
+
25
+
26
class PromptTuningInit(str, enum.Enum):
    """Ways the prompt embedding can be initialised.

    Members also subclass `str`, so each one compares equal to its plain string value.
    """

    TEXT = "TEXT"
    RANDOM = "RANDOM"
29
+
30
+
31
@dataclass
class PromptTuningConfig(PromptLearningConfig):
    """
    This is the configuration class to store the configuration of a [`~peft.PromptEmbedding`].

    Args:
        prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): The initialization of the prompt embedding.
            Defaults to `RANDOM`.
        prompt_tuning_init_text ( Optional[`str`]): The text to initialize the prompt embedding.
            Only used if `prompt_tuning_init` is `TEXT`
        tokenizer_name_or_path ( Optional[`str`]): The name or path of the tokenizer.
            Only used if `prompt_tuning_init` is `TEXT`
    """

    prompt_tuning_init: Union[PromptTuningInit, str] = field(
        default=PromptTuningInit.RANDOM,
        metadata={"help": "How to initialize the prompt tuning parameters"},
    )
    prompt_tuning_init_text: Optional[str] = field(
        default=None,
        metadata={
            "help": "The text to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
        },
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
        },
    )

    def __post_init__(self):
        # Always tag this config as prompt tuning, overriding any `peft_type` passed in.
        self.peft_type = PeftType.PROMPT_TUNING
63
+
64
+
65
class PromptEmbedding(torch.nn.Module):
    """
    The model to encode virtual tokens into prompt embeddings.

    Args:
        config ([`PromptTuningConfig`]): The configuration of the prompt embedding.
        word_embeddings (`torch.nn.Module`): The word embeddings of the base transformer model.

    **Attributes**:
        **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prompt embedding.

    Example::

        >>> from peft import PromptEmbedding, PromptTuningConfig
        >>> config = PromptTuningConfig(
        ...     peft_type="PROMPT_TUNING", task_type="SEQ_2_SEQ_LM", num_virtual_tokens=20, token_dim=768,
        ...     num_transformer_submodules=1, num_attention_heads=12, num_layers=12, prompt_tuning_init="TEXT",
        ...     prompt_tuning_init_text="Predict if sentiment of this review is positive, negative or neutral",
        ...     tokenizer_name_or_path="t5-base",
        ... )
        >>> # t5_model.shared is the word embeddings of the base model
        >>> prompt_embedding = PromptEmbedding(config, t5_model.shared)


    Input Shape: (batch_size, total_virtual_tokens)

    Output Shape: (batch_size, total_virtual_tokens, token_dim)
    """

    def __init__(self, config, word_embeddings):
        super().__init__()

        total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules
        self.embedding = torch.nn.Embedding(total_virtual_tokens, config.token_dim)
        if config.prompt_tuning_init == PromptTuningInit.TEXT:
            # Imported lazily so `transformers` is only needed for text initialisation.
            from transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name_or_path)
            init_text = config.prompt_tuning_init_text
            init_token_ids = tokenizer(init_text)["input_ids"]
            # Trim or iterate until num_text_tokens matches total_virtual_tokens
            num_text_tokens = len(init_token_ids)
            if num_text_tokens > total_virtual_tokens:
                init_token_ids = init_token_ids[:total_virtual_tokens]
            elif num_text_tokens < total_virtual_tokens:
                num_reps = math.ceil(total_virtual_tokens / num_text_tokens)
                init_token_ids = init_token_ids * num_reps
            init_token_ids = init_token_ids[:total_virtual_tokens]

            init_token_ids = torch.LongTensor(init_token_ids)
            # Put the ids on the same device as the base embeddings; a CPU index tensor
            # cannot be looked up in an embedding table that lives on an accelerator.
            reference_param = next(word_embeddings.parameters(), None)
            if reference_param is not None:
                init_token_ids = init_token_ids.to(reference_param.device)

            word_embedding_weights = word_embeddings(init_token_ids).detach().clone()
            word_embedding_weights = word_embedding_weights.to(torch.float32)
            self.embedding.weight = torch.nn.Parameter(word_embedding_weights)

    def forward(self, indices):
        """Return the prompt embeddings for the given virtual-token `indices`."""
        # Just get embeddings
        prompt_embeddings = self.embedding(indices)
        return prompt_embeddings
SVFT-main/LLM-Adapters/peft/src/peft/utils/__init__.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+ # There's no way to ignore "F401 '...' imported but unused" warnings in this
3
+ # module, but to preserve other warnings. So, don't check this module at all
4
+
5
+ # coding=utf-8
6
+ # Copyright 2023-present the HuggingFace Inc. team.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ from .adapters_utils import CONFIG_NAME, WEIGHTS_NAME
21
+ from .config import PeftConfig, PeftType, PromptLearningConfig, TaskType
22
+ from .other import (
23
+ TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
24
+ _set_trainable,
25
+ bloom_model_postprocess_past_key_value,
26
+ prepare_model_for_int8_training,
27
+ shift_tokens_right,
28
+ transpose,
29
+ )
30
+ from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict
SVFT-main/LLM-Adapters/peft/src/peft/utils/adapters_utils.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
# File name used for the serialized adapter weights.
WEIGHTS_NAME = "adapter_model.bin"
# File name used for the adapter configuration (JSON).
CONFIG_NAME = "adapter_config.json"
17
+
18
+ # TODO: add automapping and superclass here?
SVFT-main/LLM-Adapters/peft/src/peft/utils/config.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import enum
16
+ import json
17
+ import os
18
+ from dataclasses import asdict, dataclass, field
19
+ from typing import Optional, Union
20
+
21
+ from huggingface_hub import hf_hub_download
22
+ from transformers.utils import PushToHubMixin
23
+
24
+ from .adapters_utils import CONFIG_NAME
25
+
26
+
27
class PeftType(str, enum.Enum):
    """Identifiers of the PEFT methods known to this package.

    Members also subclass `str`, so each one compares equal to its plain string value.
    """

    PROMPT_TUNING = "PROMPT_TUNING"
    P_TUNING = "P_TUNING"
    PREFIX_TUNING = "PREFIX_TUNING"
    LORA = "LORA"
    BOTTLENECK = "BOTTLENECK"
33
+
34
+
35
+
36
class TaskType(str, enum.Enum):
    """Identifiers of the task types a PEFT config can declare.

    Members also subclass `str`, so each one compares equal to its plain string value.
    """

    SEQ_CLS = "SEQ_CLS"
    SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM"
    CAUSAL_LM = "CAUSAL_LM"
    TOKEN_CLS = "TOKEN_CLS"
41
+
42
+
43
@dataclass
class PeftConfigMixin(PushToHubMixin):
    r"""
    This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all
    PEFT adapter models. This class inherits from `transformers.utils.PushToHubMixin` which contains the methods to
    push your model to the Hub. The method `save_pretrained` will save the configuration of your adapter model in a
    directory. The method `from_pretrained` will load the configuration of your adapter model from a directory.

    Args:
        peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
    """
    peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."})

    @property
    def __dict__(self):
        # Expose the dataclass fields as a plain dict so the config can be serialized as JSON.
        return asdict(self)

    def to_dict(self):
        r"""
        Returns the configuration as a dictionary of its dataclass fields.
        """
        return self.__dict__

    def save_pretrained(self, save_directory, **kwargs):
        r"""
        This method saves the configuration of your adapter model in a directory.

        Args:
            save_directory (`str`):
                The directory where the configuration will be saved. It is created if it does not exist.
            **kwargs:
                Accepted for interface compatibility; this method does not use them.

        Raises:
            AssertionError: If `save_directory` points to an existing file.
        """
        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

        os.makedirs(save_directory, exist_ok=True)

        output_dict = self.__dict__
        output_path = os.path.join(save_directory, CONFIG_NAME)

        # save it
        with open(output_path, "w", encoding="utf-8") as writer:
            writer.write(json.dumps(output_dict, indent=2, sort_keys=True))

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        This method loads the configuration of your adapter model from a directory.

        Args:
            pretrained_model_name_or_path (`str`):
                The directory or the hub-id where the configuration is saved.
            **kwargs:
                Additional keyword arguments passed along to the child class initialization.

        Raises:
            ValueError: If the configuration file is neither in the local directory nor downloadable from the Hub.
        """
        config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
        if not os.path.isfile(config_file):
            # Not a local directory holding the config: try the Hub instead.
            try:
                config_file = hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)
            except Exception as err:
                # Name the file that was actually looked up and keep the original cause attached.
                raise ValueError(f"Can't find '{CONFIG_NAME}' at '{pretrained_model_name_or_path}'") from err

        loaded_attributes = cls.from_json_file(config_file)

        config = cls(**kwargs)

        # Values from the file override the constructor values; unknown keys are ignored.
        for key, value in loaded_attributes.items():
            if hasattr(config, key):
                setattr(config, key, value)

        return config

    @classmethod
    def from_json_file(cls, path_json_file, **kwargs):
        r"""
        Loads a configuration file from a json file.

        Args:
            path_json_file (`str`):
                The path to the json file.

        Returns:
            The parsed JSON content (not an instance of the configuration class).
        """
        with open(path_json_file, "r", encoding="utf-8") as file:
            json_object = json.load(file)

        return json_object
128
+
129
+
130
@dataclass
class PeftConfig(PeftConfigMixin):
    """
    This is the base configuration class to store the configuration of a :class:`~peft.PeftModel`.

    Args:
        base_model_name_or_path (`str`): The name of the base model to use. Defaults to `None`.
        peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
        task_type (Union[[`~peft.utils.config.TaskType`], `str`]): The type of task to perform.
        inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode.
    """

    base_model_name_or_path: str = field(default=None, metadata={"help": "The name of the base model to use."})
    peft_type: Union[str, PeftType] = field(default=None, metadata={"help": "Peft type"})
    task_type: Union[str, TaskType] = field(default=None, metadata={"help": "Task type"})
    inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"})
145
+
146
+
147
@dataclass
class PromptLearningConfig(PeftConfig):
    """
    This is the base configuration class to store the configuration of a Union[[`~peft.PrefixTuning`],
    [`~peft.PromptEncoder`], [`~peft.PromptTuning`]].

    Every field defaults to `None`.

    Args:
        num_virtual_tokens (`int`): The number of virtual tokens to use.
        token_dim (`int`): The hidden embedding dimension of the base transformer model.
        num_transformer_submodules (`int`): The number of transformer submodules in the base transformer model.
        num_attention_heads (`int`): The number of attention heads in the base transformer model.
        num_layers (`int`): The number of layers in the base transformer model.
    """

    num_virtual_tokens: int = field(default=None, metadata={"help": "Number of virtual tokens"})
    token_dim: int = field(
        default=None, metadata={"help": "The hidden embedding dimension of the base transformer model"}
    )
    num_transformer_submodules: Optional[int] = field(
        default=None, metadata={"help": "Number of transformer submodules"}
    )
    num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"})
    num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"})
SVFT-main/LLM-Adapters/peft/src/peft/utils/other.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+
18
+
19
+ # needed for prefix-tuning of bloom model
20
def bloom_model_postprocess_past_key_value(past_key_values):
    """Split concatenated prefix tensors into per-layer ``(key, value)`` pairs.

    The input tensors are concatenated along the first axis into one 5-D tensor. Its
    first half along that axis is treated as keys and the second half as values. Keys
    are reshaped to ``(batch * heads, head_dim, tokens)`` per layer and values to
    ``(batch * heads, tokens, head_dim)`` per layer.

    Returns:
        A tuple with one ``(key, value)`` pair per layer.
    """
    stacked = torch.cat(past_key_values)
    total_layers, batch_size, num_attention_heads, num_virtual_tokens, head_dim = stacked.shape
    half = total_layers // 2
    merged_heads = batch_size * num_attention_heads

    key_part, value_part = stacked[:half], stacked[half:]
    keys = key_part.transpose(2, 3).reshape(half, merged_heads, head_dim, num_virtual_tokens)
    values = value_part.reshape(half, merged_heads, num_virtual_tokens, head_dim)
    return tuple(zip(keys, values))
31
+
32
+
33
def prepare_model_for_int8_training(
    model, output_embedding_layer_name="lm_head", use_gradient_checkpointing=True, layer_norm_names=("layer_norm",)
):
    r"""
    This method wraps the entire protocol for preparing a model before running a training. This includes:

        1. Freezing every parameter of the model.
        2. Casting the layer norm parameters to fp32 (8-bit models only).
        3. Making the input embeddings' output require grads and enabling gradient checkpointing (8-bit models
           only, when `use_gradient_checkpointing` is set).
        4. Wrapping the output embedding layer so that its result is upcast to fp32.

    Args:
        model, (`transformers.PreTrainedModel`):
            The loaded model from `transformers`
        output_embedding_layer_name (`str`, defaults to `"lm_head"`):
            Name of the attribute on `model` that holds the output embedding layer. Nothing is wrapped if the
            model has no such attribute.
        use_gradient_checkpointing (`bool`, defaults to `True`):
            Whether to enable gradient checkpointing. Only takes effect for models loaded in 8-bit.
        layer_norm_names (sequence of `str`, defaults to `("layer_norm",)`):
            Substrings identifying layer norm parameters by name.

    Returns:
        The same `model` object, modified in place.
    """
    loaded_in_8bit = getattr(model, "is_loaded_in_8bit", False)

    for name, param in model.named_parameters():
        # freeze base model's layers
        param.requires_grad = False

        if loaded_in_8bit:
            # cast layer norm in fp32 for stability for 8bit models
            if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names):
                param.data = param.data.to(torch.float32)

    if loaded_in_8bit and use_gradient_checkpointing:
        # For backward compatibility
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:

            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        # enable gradient checkpointing for memory efficiency
        model.gradient_checkpointing_enable()

    if hasattr(model, output_embedding_layer_name):
        output_embedding_layer = getattr(model, output_embedding_layer_name)
        input_dtype = output_embedding_layer.weight.dtype

        class CastOutputToFloat(torch.nn.Sequential):
            r"""
            Manually cast to the expected dtype of the lm_head as sometimes there is a final layer norm that is casted
            in fp32

            """

            def forward(self, x):
                # Feed the head in its own weight dtype, then hand fp32 back to the caller.
                return super().forward(x.to(input_dtype)).to(torch.float32)

        setattr(model, output_embedding_layer_name, CastOutputToFloat(output_embedding_layer))

    return model
87
+
88
+
89
# Maps a model-type string to the function that post-processes its prefix-tuning past key values.
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = {
    "bloom": bloom_model_postprocess_past_key_value,
}
92
+
93
+
94
+ # copied from transformers.models.bart.modeling_bart
95
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Build a copy of `input_ids` shifted one position to the right.

    The first column of the result is `decoder_start_token_id`, the last token of every row is dropped, and any
    `-100` left in the result is replaced by `pad_token_id`. `input_ids` itself is not modified.

    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids
        pad_token_id (`int`): The id of the `padding` token.
        decoder_start_token_id (`int`): The id of the `start` token.

    Raises:
        ValueError: If `pad_token_id` is `None`.
    """
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 1:] = input_ids[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")

    # replace possible -100 values in labels by `pad_token_id`
    ignored_positions = shifted == -100
    shifted.masked_fill_(ignored_positions, pad_token_id)
    return shifted
114
+
115
+
116
+ def _set_trainable(model):
117
+ if model.modules_to_save is not None:
118
+ for name, param in model.named_parameters():
119
+ if any(module_name in name for module_name in model.modules_to_save):
120
+ param.requires_grad = True
121
+
122
+
123
def fsdp_auto_wrap_policy(model):
    """Build the combined auto-wrap policy used to shard a PEFT model with FSDP.

    The returned callable is an "or" of two policies: a lambda policy that accepts
    child-less modules whose ``weight`` requires gradients, and a transformer policy
    over the prompt-learning encoders plus the layer class named by the
    ``FSDP_TRANSFORMER_CLS_TO_WRAP`` environment variable.
    """
    import functools
    import os

    from accelerate import FullyShardedDataParallelPlugin
    from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy

    from ..tuners import PrefixEncoder, PromptEmbedding, PromptEncoder

    def lambda_policy_fn(module):
        # Only child-less modules are candidates.
        if len(list(module.named_children())) != 0:
            return False
        if getattr(module, "weight", None) is None:
            return False
        return True if module.weight.requires_grad else False

    lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)

    # Layer class selected through the environment; an empty name is passed when unset.
    env_layer_name = os.environ.get("FSDP_TRANSFORMER_CLS_TO_WRAP", "")
    env_layer_cls = FullyShardedDataParallelPlugin.get_module_class_from_name(model, env_layer_name)
    layer_classes = (PrefixEncoder, PromptEncoder, PromptEmbedding, env_layer_cls)
    transformer_wrap_policy = functools.partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls=layer_classes,
    )

    return functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy])
156
+
157
+
158
def transpose(weight, fan_in_fan_out):
    """Return ``weight.T`` when ``fan_in_fan_out`` is truthy, else ``weight`` unchanged."""
    if fan_in_fan_out:
        return weight.T
    return weight
SVFT-main/LLM-Adapters/peft/src/peft/utils/save_and_load.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from .config import PeftType
17
+
18
+
19
def _select_adapter_entries(state_dict, marker, bias, marker_only_mode):
    """Pick the adapter entries (and, depending on ``bias``, bias entries) of ``state_dict``.

    Adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py`
    to work directly on a state dict, which is necessary when using DeepSpeed
    or FSDP.

    Args:
        state_dict (`dict`): full state dict to filter.
        marker (`str`): substring identifying adapter keys (``"lora_"`` or ``"adapter_"``).
        bias (`str`): ``"none"``, ``"all"`` or ``marker_only_mode``.
        marker_only_mode (`str`): name of the mode that keeps only the biases
            sitting next to adapter weights (``"lora_only"`` / ``"adapter_only"``).

    Raises:
        NotImplementedError: if ``bias`` is none of the supported modes.
    """
    if bias == "none":
        return {k: state_dict[k] for k in state_dict if marker in k}
    if bias == "all":
        return {k: state_dict[k] for k in state_dict if marker in k or "bias" in k}
    if bias == marker_only_mode:
        selected = {}
        for k in state_dict:
            if marker in k:
                selected[k] = state_dict[k]
                # the bias that shares the key prefix in front of the marker
                bias_name = k.split(marker)[0] + "bias"
                if bias_name in state_dict:
                    selected[bias_name] = state_dict[bias_name]
        return selected
    raise NotImplementedError(
        f"Unsupported bias mode {bias!r}; expected 'none', 'all' or {marker_only_mode!r}."
    )


def get_peft_model_state_dict(model, state_dict=None):
    """
    Get the state dict of the Peft model.

    Args:
        model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP,
        the model should be the underlying model/unwrapped model (i.e. model.module).
        state_dict (`dict`, *optional*, defaults to `None`):
            The state dict of the model. If not provided, the state dict of the model
        will be used.

    Returns:
        `dict`: the adapter entries for LoRA / Bottleneck configs, or a single
        ``"prompt_embeddings"`` entry for every other PEFT type, plus any entry
        whose key contains a name from ``model.modules_to_save``.

    Raises:
        NotImplementedError: if ``model.peft_config.bias`` is not a supported mode
            for a LoRA or Bottleneck config.
    """
    if state_dict is None:
        state_dict = model.state_dict()

    peft_type = model.peft_config.peft_type
    if peft_type == PeftType.LORA:
        to_return = _select_adapter_entries(state_dict, "lora_", model.peft_config.bias, "lora_only")
    elif peft_type == PeftType.BOTTLENECK:
        # return the state dict of the model with Bottleneck adapters
        to_return = _select_adapter_entries(state_dict, "adapter_", model.peft_config.bias, "adapter_only")
    else:
        to_return = {}
        if model.peft_config.inference_mode:
            prompt_embeddings = model.prompt_encoder.embedding.weight
        else:
            prompt_embeddings = model.get_prompt_embedding_to_save()
        to_return["prompt_embeddings"] = prompt_embeddings

    if model.modules_to_save is not None:
        for key, value in state_dict.items():
            if any(module_name in key for module_name in model.modules_to_save):
                to_return[key] = value
    return to_return
80
+
81
+
82
def set_peft_model_state_dict(model, peft_model_state_dict):
    """
    Load a Peft state dict into ``model`` and return the model.

    The state dict is first loaded non-strictly into the whole model. For every
    PEFT type other than LoRA and Bottleneck, the ``"prompt_embeddings"`` entry
    is then loaded strictly into ``model.prompt_encoder.embedding``.

    Args:
        model ([`PeftModel`]): The Peft model.
        peft_model_state_dict (`dict`): The state dict of the Peft model.
    """
    model.load_state_dict(peft_model_state_dict, strict=False)

    peft_type = model.peft_config.peft_type
    if peft_type == PeftType.LORA or peft_type == PeftType.BOTTLENECK:
        return model

    embedding_state = {"weight": peft_model_state_dict["prompt_embeddings"]}
    model.prompt_encoder.embedding.load_state_dict(embedding_state, strict=True)
    return model
SVFT-main/LLM-Adapters/peft/tests/__init__.py ADDED
File without changes
SVFT-main/LLM-Adapters/peft/tests/test_config.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import os
16
+ import tempfile
17
+ import unittest
18
+
19
+ from peft import LoraConfig, PrefixTuningConfig, PromptEncoderConfig, PromptTuningConfig
20
+
21
+
22
class PeftConfigTestMixin:
    """Mixin providing the tuple of config classes the config tests iterate over."""

    all_config_classes = (LoraConfig, PromptEncoderConfig, PrefixTuningConfig, PromptTuningConfig)
29
+
30
+
31
class PeftConfigTester(unittest.TestCase, PeftConfigTestMixin):
    """API and save/load round-trip checks run against every class in ``all_config_classes``."""

    def test_methods(self):
        """
        Test if all configs have the expected methods. Here we test
        - to_dict
        - save_pretrained
        - from_pretrained
        - from_json_file
        """
        expected_methods = ("to_dict", "save_pretrained", "from_pretrained", "from_json_file")
        for config_class in self.all_config_classes:
            config = config_class()
            for method_name in expected_methods:
                self.assertTrue(hasattr(config, method_name))

    def test_task_type(self):
        """Constructing a config with an arbitrary ``task_type`` must not raise."""
        for config_class in self.all_config_classes:
            config_class(task_type="test")

    def test_save_pretrained(self):
        """
        Test if the config is correctly saved and loaded using
        - save_pretrained
        """
        for config_class in self.all_config_classes:
            original = config_class()
            with tempfile.TemporaryDirectory() as save_dir:
                original.save_pretrained(save_dir)

                reloaded = config_class.from_pretrained(save_dir)
                self.assertEqual(original.to_dict(), reloaded.to_dict())

    def test_from_json_file(self):
        """The dict read back from ``adapter_config.json`` equals ``to_dict()`` of the saved config."""
        for config_class in self.all_config_classes:
            original = config_class()
            with tempfile.TemporaryDirectory() as save_dir:
                original.save_pretrained(save_dir)

                json_path = os.path.join(save_dir, "adapter_config.json")
                loaded = config_class.from_json_file(json_path)
                self.assertEqual(original.to_dict(), loaded)

    def test_to_dict(self):
        """
        Test if the config can be correctly converted to a dict using:
        - to_dict
        - __dict__
        """
        for config_class in self.all_config_classes:
            config = config_class()
            self.assertEqual(config.to_dict(), config.__dict__)
            self.assertIsInstance(config.to_dict(), dict)

    def test_set_attributes(self):
        """A manually set attribute survives a save/load round trip."""
        for config_class in self.all_config_classes:
            customised = config_class(peft_type="test")

            # save pretrained
            with tempfile.TemporaryDirectory() as save_dir:
                customised.save_pretrained(save_dir)

                reloaded = config_class.from_pretrained(save_dir)
                self.assertEqual(customised.to_dict(), reloaded.to_dict())
SVFT-main/LLM-Adapters/peft/tests/test_peft_model.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import os
16
+ import tempfile
17
+ import unittest
18
+
19
+ import torch
20
+ from parameterized import parameterized
21
+ from transformers import AutoModelForCausalLM
22
+
23
+ from peft import (
24
+ PeftModel,
25
+ get_peft_model,
26
+ get_peft_model_state_dict,
27
+ prepare_model_for_int8_training,
28
+ )
29
+
30
+ from .testing_common import PeftTestConfigManager
31
+
32
+
33
# Each entry has to be in the order:
#   (model_id, lora_kwargs, prefix_tuning_kwargs, prompt_encoder_kwargs, prompt_tuning_kwargs)
PEFT_MODELS_TO_TEST = [
    (
        "hf-internal-testing/tiny-random-OPTForCausalLM",
        dict(target_modules=["q_proj", "v_proj"]),
        dict(),
        dict(),
        dict(),
    ),
]
37
+
38
+
39
class PeftTestMixin:
    """Mixin exposing the device the model tests run on."""

    # "cuda" when torch reports CUDA as available, "cpu" otherwise
    torch_device = "cpu" if not torch.cuda.is_available() else "cuda"
41
+
42
+
43
class PeftModelTester(unittest.TestCase, PeftTestMixin):
    """
    Test if the PeftModel behaves as expected. This includes:
    - test if the model has the expected methods

    We use parameterized.expand for debugging purposes to test each model individually.
    """

    @parameterized.expand(PeftTestConfigManager.get_grid_parameters(PEFT_MODELS_TO_TEST))
    def test_attributes_parametrized(self, test_name, model_id, config_cls, config_kwargs):
        self._test_model_attr(model_id, config_cls, config_kwargs)

    def _test_model_attr(self, model_id, config_cls, config_kwargs):
        """The wrapped model exposes ``save_pretrained``, ``from_pretrained`` and ``push_to_hub``."""
        base_model = AutoModelForCausalLM.from_pretrained(model_id)
        peft_config = config_cls(
            base_model_name_or_path=model_id,
            **config_kwargs,
        )
        peft_model = get_peft_model(base_model, peft_config)

        for attribute in ("save_pretrained", "from_pretrained", "push_to_hub"):
            self.assertTrue(hasattr(peft_model, attribute))

    def _test_prepare_for_training(self, model_id, config_cls, config_kwargs):
        """Input embeddings require grad only after the int8 preparation path enables it."""
        device = self.torch_device

        base_model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
        peft_config = config_cls(
            base_model_name_or_path=model_id,
            **config_kwargs,
        )
        peft_model = get_peft_model(base_model, peft_config)

        token_ids = torch.LongTensor([[1, 1, 1]]).to(device)
        embedded = peft_model.get_input_embeddings()(token_ids)

        self.assertFalse(embedded.requires_grad)

        # load with `prepare_model_for_int8_training`
        base_model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
        base_model = prepare_model_for_int8_training(base_model)

        for parameter in base_model.parameters():
            self.assertFalse(parameter.requires_grad)

        peft_config = config_cls(
            base_model_name_or_path=model_id,
            **config_kwargs,
        )
        peft_model = get_peft_model(base_model, peft_config)

        # For backward compatibility
        if hasattr(peft_model, "enable_input_require_grads"):
            peft_model.enable_input_require_grads()
        else:

            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            peft_model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        token_ids = torch.LongTensor([[1, 1, 1]]).to(device)
        embedded = peft_model.get_input_embeddings()(token_ids)

        self.assertTrue(embedded.requires_grad)

    @parameterized.expand(PeftTestConfigManager.get_grid_parameters(PEFT_MODELS_TO_TEST))
    def test_prepare_for_training_parametrized(self, test_name, model_id, config_cls, config_kwargs):
        self._test_prepare_for_training(model_id, config_cls, config_kwargs)

    def _test_save_pretrained(self, model_id, config_cls, config_kwargs):
        """Saving then reloading an adapter keeps its state dict and writes only adapter files."""
        device = self.torch_device

        base_model = AutoModelForCausalLM.from_pretrained(model_id)
        peft_config = config_cls(
            base_model_name_or_path=model_id,
            **config_kwargs,
        )
        peft_model = get_peft_model(base_model, peft_config)
        peft_model = peft_model.to(device)

        with tempfile.TemporaryDirectory() as tmp_dirname:
            peft_model.save_pretrained(tmp_dirname)

            reloaded_model = AutoModelForCausalLM.from_pretrained(model_id)
            reloaded_model = PeftModel.from_pretrained(reloaded_model, tmp_dirname)

            # check if the state dicts are equal
            saved_state = get_peft_model_state_dict(peft_model)
            reloaded_state = get_peft_model_state_dict(reloaded_model)

            # check if same keys
            self.assertEqual(saved_state.keys(), reloaded_state.keys())

            # check if tensors equal
            for key in saved_state:
                left = saved_state[key].to(device)
                right = reloaded_state[key].to(device)
                self.assertTrue(torch.allclose(left, right))

            # `adapter_model.bin` and `adapter_config.json` must be present
            for filename in ("adapter_model.bin", "adapter_config.json"):
                self.assertTrue(os.path.exists(os.path.join(tmp_dirname, filename)))

            # `pytorch_model.bin` and `config.json` must not be present
            for filename in ("pytorch_model.bin", "config.json"):
                self.assertFalse(os.path.exists(os.path.join(tmp_dirname, filename)))

    @parameterized.expand(PeftTestConfigManager.get_grid_parameters(PEFT_MODELS_TO_TEST))
    def test_save_pretrained(self, test_name, model_id, config_cls, config_kwargs):
        self._test_save_pretrained(model_id, config_cls, config_kwargs)
SVFT-main/LLM-Adapters/peft/tests/testing_common.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from collections import OrderedDict
16
+
17
+ from peft import (
18
+ LoraConfig,
19
+ PrefixTuningConfig,
20
+ PromptEncoderConfig,
21
+ PromptTuningConfig,
22
+ )
23
+
24
+
25
# Config classes under test; CONFIG_TESTING_KWARGS below is index-aligned with this tuple.
CONFIG_CLASSES = (LoraConfig, PrefixTuningConfig, PromptEncoderConfig, PromptTuningConfig)

# Constructor kwargs used for each config class, in CONFIG_CLASSES order.
CONFIG_TESTING_KWARGS = (
    # LoraConfig
    {
        "r": 8,
        "lora_alpha": 32,
        "target_modules": None,
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },
    # PrefixTuningConfig
    {"num_virtual_tokens": 10, "task_type": "CAUSAL_LM"},
    # PromptEncoderConfig
    {"num_virtual_tokens": 10, "encoder_hidden_size": 32, "task_type": "CAUSAL_LM"},
    # PromptTuningConfig
    {"num_virtual_tokens": 10, "task_type": "CAUSAL_LM"},
)

# method name -> (config class, testing kwargs)
CLASSES_MAPPING = dict(
    zip(
        ("lora", "prefix_tuning", "prompt_encoder", "prompt_tuning"),
        zip(CONFIG_CLASSES, CONFIG_TESTING_KWARGS),
    )
)
61
+
62
+
63
+ # Adapted from https://github.com/huggingface/transformers/blob/48327c57182fdade7f7797d1eaad2d166de5c55b/src/transformers/activations.py#LL166C7-L166C22
64
class ClassInstantier(OrderedDict):
    """Ordered mapping from a PEFT method name to a ``(config_class, config_kwargs)`` pair."""

    def __getitem__(self, key, *args, **kwargs):
        """Return the entry for ``key``, optionally with overridden kwargs.

        When at least one keyword argument names a key already present in the
        stored ``config_kwargs``, a new ``(config_class, merged_kwargs)`` tuple
        is returned in which all given keyword arguments are applied on top of
        a copy of the stored kwargs. Otherwise the lookup is delegated to
        ``OrderedDict.__getitem__`` unchanged.
        """
        if kwargs:
            config_cls, base_kwargs = super().__getitem__(key)
            # check if any of the kwargs is inside the config class kwargs
            if any(name in base_kwargs for name in kwargs):
                merged_kwargs = base_kwargs.copy()
                merged_kwargs.update(kwargs)
                return (config_cls, merged_kwargs)

        return super().__getitem__(key, *args, **kwargs)

    def get_grid_parameters(self, model_list):
        """
        Returns a list of all possible combinations of the parameters in the config classes.

        Args:
            model_list: iterable of 5-tuples ``(model_id, lora_kwargs,
                prefix_tuning_kwargs, prompt_encoder_kwargs, prompt_tuning_kwargs)``;
                each ``*_kwargs`` is a dict of overrides or ``None``.

        Returns:
            A list of ``(test_name, model_id, config_class, config_kwargs)``
            tuples, one per (model, registered method) pair. Every
            ``config_kwargs`` is a fresh dict: the stored defaults are never
            mutated, so overrides for one model cannot leak into the rows of
            another model or into later calls.
        """
        grid_parameters = []
        for model_tuple in model_list:
            model_id, lora_kwargs, prefix_tuning_kwargs, prompt_encoder_kwargs, prompt_tuning_kwargs = model_tuple
            overrides_by_key = {
                "lora": lora_kwargs,
                "prefix_tuning": prefix_tuning_kwargs,
                "prompt_encoder": prompt_encoder_kwargs,
            }
            for key, (config_cls, base_kwargs) in self.items():
                # any key other than the three above uses the prompt-tuning overrides
                overrides = overrides_by_key.get(key, prompt_tuning_kwargs)
                config_kwargs = dict(base_kwargs)
                if overrides is not None:
                    config_kwargs.update(overrides)
                grid_parameters.append((f"test_{model_id}_{key}", model_id, config_cls, config_kwargs))

        return grid_parameters
101
+
102
+
103
# Shared registry built from CLASSES_MAPPING; test modules call
# PeftTestConfigManager.get_grid_parameters(...) to build their parameter grids.
PeftTestConfigManager = ClassInstantier(CLASSES_MAPPING)
SVFT-main/LLM-Adapters/peft/tests/testing_utils.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023-present the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import unittest
16
+
17
+ import torch
18
+
19
+
20
def require_torch_gpu(test_case):
    """
    Decorator marking a test that requires a GPU. Will be skipped when no GPU is available.
    """
    if torch.cuda.is_available():
        return test_case
    return unittest.skip("test requires GPU")(test_case)
28
+
29
+
30
def require_torch_multi_gpu(test_case):
    """
    Decorator marking a test that requires multiple GPUs. Will be skipped when less than 2 GPUs are available.
    """
    has_multiple_gpus = torch.cuda.is_available() and torch.cuda.device_count() >= 2
    if has_multiple_gpus:
        return test_case
    return unittest.skip("test requires multiple GPUs")(test_case)
38
+
39
+
40
def require_bitsandbytes(test_case):
    """
    Decorator marking a test that requires the bitsandbytes library. Will be skipped when the library is not installed.
    """
    try:
        import bitsandbytes  # noqa: F401
    except ImportError:
        return unittest.skip("test requires bitsandbytes")(test_case)
    # import succeeded: hand the test back untouched
    return test_case
SVFT-main/LLM-Adapters/picture.jpg ADDED
SVFT-main/LLM-Adapters/pyproject.toml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [tool.black]
2
+ line-length = 79
3
+
4
+ [tool.isort]
5
+ include_trailing_comma = true
6
+ line_length = 79
7
+ multi_line_output = 3
8
+ profile = "black"
SVFT-main/LLM-Adapters/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ accelerate
2
+ appdirs
3
+ bitsandbytes
4
+ black
5
+ black[jupyter]
6
+ datasets
7
+ fire
8
+ git+https://github.com/huggingface/transformers.git
9
+ gradio
SVFT-main/LLM-Adapters/run_commonsense.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
# SVFT_PLAIN: SVFT adapter with --off_diag 0 and the "banded" pattern,
# fine-tuning google/gemma-2b on commonsense_15k with learning rate 5e-2.
WORLD_SIZE=1 CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --master_port=3191 finetune.py \
    --base_model 'google/gemma-2b' \
    --data_path './ft-training_set/commonsense_15k.json' \
    --output_dir './Gemma_2B_svft_CR15K/' \
    --batch_size 64 \
    --micro_batch_size 4 \
    --num_epochs 3 \
    --learning_rate 5e-2 \
    --cutoff_len 512 \
    --val_set_size 120 \
    --adapter_name svft \
    --off_diag 0 \
    --pattern "banded" \
    --lora_target_modules "q_proj","v_proj","k_proj","o_proj","up_proj","down_proj","gate_proj"

# SVFT_Random_d=16: SVFT adapter with --off_diag 16 and the "random" pattern,
# same model and data, learning rate 5e-3.
WORLD_SIZE=1 CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --master_port=3191 finetune.py \
    --base_model 'google/gemma-2b' \
    --data_path './ft-training_set/commonsense_15k.json' \
    --output_dir './Gemma_2B_svft_16diag_random_CR15K/' \
    --batch_size 64 \
    --micro_batch_size 4 \
    --num_epochs 3 \
    --learning_rate 5e-3 \
    --cutoff_len 512 \
    --val_set_size 120 \
    --adapter_name svft \
    --off_diag 16 \
    --pattern "random" \
    --lora_target_modules "q_proj","v_proj","k_proj","o_proj","up_proj","down_proj","gate_proj"
SVFT-main/MetaMath/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
SVFT-main/MetaMath/README.MD ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MetaMath: Bootstrap Your Own Mathematical Questions for Large Language Models
2
+
3
+ [![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-green.svg)](CODE_LICENSE)
4
+ [![Model Weight License](https://img.shields.io/badge/Model%20Weights%20License-LLaMA2-yellow)](MetaMath/LICENSE)
5
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/release/python-390/)
6
+
7
+ <p align="center">
8
+ 🤗 <a href="https://huggingface.co/meta-math" target="_blank">HF Repo</a> • 📃 <a href="https://arxiv.org/abs/2309.12284" target="_blank">[MetaMath]</a><br>
9
+ </p>
10
+
11
+ <p align="center" width="100%">
12
+ <a ><img src="./imgs/metamath.svg" alt="MetaMath" style="width: 80%; min-width: 300px; display: block; margin: auto;"></a>
13
+ </p>
14
+
15
+
16
+ ## News
17
+ - 🔥 Our **MetaMath-Llemma-7B** model achieves **30.0 pass@1** on the MATH benchmark, surpassing all SOTA open-source LLMs at the 7B-13B scale! All the training scripts and the model are open-sourced.
18
+ - 🔥 Our **MetaMath-Mistral-7B** model achieves **77.7 pass@1** on the [GSM8k Benchmarks](https://github.com/openai/grade-school-math), surpassing all SOTA open-source LLMs! All the training scripts and the model are open-sourced.
19
+ - 🔥 The full **MetaMathQA** dataset is now released in the huggingface [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA/tree/main)!
20
+ - 🔥 The GSM8K_Backward dataset is also released on Hugging Face as [GSM8K_Backward](https://huggingface.co/datasets/meta-math/GSM8K_Backward) to evaluate backward (reversal) mathematical reasoning ability!
21
+ - 🔥 Although the data augmentation for **MetaMathQA** is sourced from **ChatGPT 3.5**, our **MetaMath-70B** model outperforms the closed-source LLM **ChatGPT 3.5** on GSM8K!
22
+ - 🔥 Our **MetaMath-7B** model achieves **66.5 pass@1** on the [GSM8k Benchmarks](https://github.com/openai/grade-school-math), **11.6** points higher than the SOTA open-source LLM!
23
+ - 🔥 Our **MetaMath-7B** model achieves **19.8 pass@1** on the [MATH Benchmarks](https://github.com/hendrycks/math), **9.1** points higher than the SOTA open-source LLM!
24
+
25
+ | Model | Checkpoint | Paper | GSM8k | MATH | License|
26
+ | ----- |------| ---- |------|-------| ----- |
27
+ | MetaMath-70B-V1.0 | 🤗 <a href="https://huggingface.co/meta-math/MetaMath-70B-V1.0" target="_blank">HF Link</a> | 📃 <a href="https://arxiv.org/abs/2309.12284" target="_blank">[MetaMath]</a>| **82.3** | **26.6** | <a href="https://ai.meta.com/resources/models-and-libraries/llama-downloads/" target="_blank">Llama 2 </a> |
28
+ | MetaMath-13B-V1.0 | 🤗 <a href="https://huggingface.co/meta-math/MetaMath-13B-V1.0" target="_blank">HF Link</a> | 📃 <a href="https://arxiv.org/abs/2309.12284" target="_blank">[MetaMath]</a>| **72.3** | **22.4** | <a href="https://ai.meta.com/resources/models-and-libraries/llama-downloads/" target="_blank">Llama 2 </a> |
29
+ | MetaMath-7B-V1.0 | 🤗 <a href="https://huggingface.co/meta-math/MetaMath-7B-V1.0" target="_blank">HF Link</a> | 📃 <a href="https://arxiv.org/abs/2309.12284" target="_blank">[MetaMath]</a>| **66.5** | **19.8** | <a href="https://ai.meta.com/resources/models-and-libraries/llama-downloads/" target="_blank">Llama 2 </a>|
30
+ | MetaMath-Mistral-7B | 🤗 <a href="https://huggingface.co/meta-math/MetaMath-Mistral-7B" target="_blank">HF Link</a> | 📃 <a href="https://arxiv.org/abs/2309.12284" target="_blank">[MetaMath]</a>| **77.7** | **28.2** | <a href="http://www.apache.org/licenses/" target="_blank">Apache License 2.0 </a>|
31
+ | MetaMath-Llemma-7B | 🤗 <a href="https://huggingface.co/meta-math/MetaMath-Llemma-7B" target="_blank">HF Link</a> | 📃 <a href="https://arxiv.org/abs/2309.12284" target="_blank">[MetaMath]</a>| **69.2** | **30.0** | <a href="http://www.apache.org/licenses/" target="_blank">Apache License 2.0 </a>|
32
+
33
+
34
+
35
+ ## Comparing MetaMath with other LLMs
36
+
37
+ 🔥 Comprehensive Results
38
+
39
+ | Model | GSM8k Pass@1 | MATH Pass@1 |
40
+ |---------------------|--------------|-------------|
41
+ | MPT-7B | 6.8 | 3.0 |
42
+ | Falcon-7B | 6.8 | 2.3 |
43
+ | LLaMA-1-7B | 11.0 | 2.9 |
44
+ | LLaMA-2-7B | 14.6 | 2.5 |
45
+ | MPT-30B | 15.2 | 3.1 |
46
+ | LLaMA-1-13B | 17.8 | 3.9 |
47
+ | GPT-Neo-2.7B | 19.5 | -- |
48
+ | Falcon-40B | 19.6 | 2.5 |
49
+ | Baichuan-chat-13B | 23.9 | -- |
50
+ | Vicuna-v1.3-13B | 27.6 | -- |
51
+ | LLaMA-2-13B | 28.7 | 3.9 |
52
+ | InternLM-7B | 31.2 | -- |
53
+ | ChatGLM-2-6B | 32.4 | -- |
54
+ | GPT-J-6B | 34.9 | -- |
55
+ | LLaMA-1-33B | 35.6 | 3.9 |
56
+ | LLaMA-2-34B | 42.2 | 6.24 |
57
+ | RFT-7B | 50.3 | -- |
58
+ | LLaMA-1-65B | 50.9 | 10.6 |
59
+ | Qwen-7B | 51.6 | -- |
60
+ | WizardMath-7B | 54.9 | 10.7 |
61
+ | LLaMA-2-70B | 56.8 | 13.5 |
62
+ | WizardMath-13B | 63.9 | 14.0 |
63
+ | 🔥 MetaMath-7B | **66.5** | **19.8** |
64
+ | 🔥 MetaMath-13B | **72.3** | **22.4** |
65
+ | 🔥 MetaMath-Mistral-7B | **77.7** | **28.2** |
66
+ | 🔥 MetaMath-Llemma-7B | **69.2** | **30.0** |
67
+ | WizardMath-70B | 81.6 | 22.7 |
68
+ | 🔥 MetaMath-70B | **82.3** | **26.6** |
69
+
70
+ <h2 id="env">Quick Start</h2>
71
+
72
+ Clone Metamath and install the required packages:
73
+
74
+ ```bash
75
+ git clone https://github.com/meta-math/MetaMath.git
76
+ cd MetaMath
77
+ pip install -r requirements.txt
78
+ ```
79
+
80
+ If you encounter a Ray installation problem, please run:
81
+
82
+ ```bash
83
+ pip install --upgrade ray
84
+ pip install --upgrade pyarrow
85
+ pip install pandas
86
+ ```
87
+
88
+ <h2 id="Inference">Dataset Usage</h2>
89
+
90
+ Run the following command to load the data:
91
+
92
+ ```python
93
+ from datasets import load_dataset
94
+ dataset = load_dataset("meta-math/MetaMathQA")
95
+ ```
96
+
97
+
98
+ <h2 id="train">Training</h2>
99
+
100
+ You need to prepare the Llama-2 base model and our **MetaMathQA** dataset, available on Hugging Face: [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA/tree/main)
101
+
102
+ ```
103
+ bash run.sh
104
+ ```
105
+ or
106
+
107
+ ```
108
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch --master_addr ${MASTER_ADDR} --master_port ${MASTER_PORT} --nproc_per_node=8 --use_env train_math.py \
109
+ --model_name_or_path "meta-llama/Llama-2-7b-hf" \
110
+ --data_path "path/to/metamathqa" \
111
+ --data_length 10000000 \
112
+ --bf16 True \
113
+ --output_dir "path/to/save" \
114
+ --num_train_epochs 3 \
115
+ --per_device_train_batch_size 4 \
116
+ --per_device_eval_batch_size 4 \
117
+ --gradient_accumulation_steps 4 \
118
+ --evaluation_strategy "no" \
119
+ --save_strategy "steps" \
120
+ --save_steps 1000 \
121
+ --save_total_limit 2 \
122
+ --learning_rate 2e-5 \
123
+ --weight_decay 0. \
124
+ --warmup_ratio 0.03 \
125
+ --lr_scheduler_type "cosine" \
126
+ --logging_steps 1 \
127
+ --fsdp "full_shard auto_wrap" \
128
+ --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
129
+ --tf32 True
130
+ ```
131
+
132
+ ### Supervised fine-tuning
133
+
134
+ We supervised fine-tune MetaMath-7B with the following hyperparameters:
135
+
136
+ | Hyperparameter | LLaMA 2 7B |
137
+ |----------------|-------------|
138
+ | Batch size | 128 |
139
+ | Learning rate | 2e-5 |
140
+ | Epochs | 3 |
141
+ | Max length | 512 |
142
+ | LR scheduler | cosine |
143
+
144
+ <h2 id="evaluation">Evaluation</h2>
145
+
146
+ We use vLLM for fast generation:
147
+
148
+ ```
149
+ python eval_gsm8k.py --model "path/to/save" --data_file ./data/test/GSM8K_test.jsonl
150
+ python eval_math.py --model "path/to/save" --data_file ./data/test/MATH_test.jsonl
151
+ ```
152
+ Here "path/to/save" should be replaced by the path to the fine-tuned model. You can also download our series of MetaMath models from Hugging Face:
153
+ 🤗 <a href="https://huggingface.co/meta-math/MetaMath-7B-V1.0" target="_blank">MetaMath 7B</a> 🤗 <a href="https://huggingface.co/meta-math/MetaMath-13B-V1.0" target="_blank">MetaMath 13B</a> 🤗 <a href="https://huggingface.co/meta-math/MetaMath-70B-V1.0" target="_blank">MetaMath 70B</a>
154
+
155
+ The inference prompt for our MetaMath is:
156
+ ```
157
+ "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
158
+ ```
159
+
160
+ Thanks to the open-source code of [WizardMath](https://github.com/nlpxucan/WizardLM/tree/main/WizardMath) and [RFT](https://github.com/OFA-Sys/gsm8k-ScRel/tree/main). Some of our code is based on them.
161
+
162
+ <h2 id="citation">Citation</h2>
163
+ Please cite the paper if you refer to our model, code, data or paper from MetaMath.
164
+
165
+ ```
166
+ @article{yu2023metamath,
167
+ title={MetaMath: Bootstrap Your Own Mathematical Questions for Large Language Models},
168
+ author={Yu, Longhui and Jiang, Weisen and Shi, Han and Yu, Jincheng and Liu, Zhengying and Zhang, Yu and Kwok, James T and Li, Zhenguo and Weller, Adrian and Liu, Weiyang},
169
+ journal={arXiv preprint arXiv:2309.12284},
170
+ year={2023}
171
+ }
172
+ ```
SVFT-main/MetaMath/data/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # MetaMathQA Data
2
+
3
+ ## Train Data
4
+ The full **MetaMathQA** dataset is now released in the huggingface [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA/tree/main)!
5
+
6
+ ## Test Data
7
+ The GSM8K_Backward dataset is also released on Hugging Face as [GSM8K_Backward](https://huggingface.co/datasets/meta-math/GSM8K_Backward) to evaluate backward (reversal) mathematical reasoning ability!
SVFT-main/MetaMath/data/test/GSM8K_Backward.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SVFT-main/MetaMath/data/test/GSM8K_test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SVFT-main/MetaMath/data/test/MATH_test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
SVFT-main/MetaMath/data/train/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # MetaMathQA
2
+
3
+ The full **MetaMathQA** dataset is now released in the huggingface [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA/tree/main)
SVFT-main/MetaMath/eval_gsm8k.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import re
4
+ import jsonlines
5
+ from fraction import Fraction
6
+ from vllm import LLM, SamplingParams
7
+ import sys
8
+ MAX_INT = sys.maxsize
9
+
10
def is_number(s):
    """Return True if *s* can be interpreted as a number.

    A value counts as numeric when ``float()`` can parse it (for example
    ``"3.5"``, ``"-2"`` or ``"1e3"``) or when it is a single Unicode
    character with a numeric value (for example ``"½"``).

    Args:
        s: Candidate value, normally a string.

    Returns:
        bool: True when *s* is numeric, False otherwise. ``None`` and other
        unsupported types yield False instead of raising.
    """
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError covers None and other types float() cannot accept.
        pass
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        # TypeError: not a single character; ValueError: no numeric value.
        pass
    return False
23
+
24
def extract_answer_number(completion):
    """Extract the final numeric answer from a model completion.

    The text following the last ``'The answer is: '`` marker is searched for
    the first number, which may be an integer, a decimal, a number with
    thousands separators (``1,234,567``) or a simple fraction (``3/4``).

    Args:
        completion: Generated text to inspect.

    Returns:
        The answer rounded to the nearest integer, or None when the marker
        is missing, no number follows it, the fraction's numerator is not a
        number, or the value is too large to represent as a finite float.
        A fraction with a zero denominator evaluates to its numerator.
    """
    marker = 'The answer is: '
    if marker not in completion:
        return None
    answer_text = completion.split(marker)[-1].strip()

    # Remove thousands separators (a comma between a digit and a group of
    # exactly three digits) so "1,234,567" and "1,234.56" are read in full
    # rather than being cut off after the first comma-separated group.
    answer_text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', answer_text)

    match = re.search(r'[\-+]?\d*[\.,/]?\d+', answer_text)
    if not match:
        return None

    # Any comma still present is treated as a digit-group separator.
    token = match.group().replace(',', '')
    try:
        if '/' in token:
            # The pattern admits exactly one separator, so there is one '/'.
            numerator_text, denominator_text = token.split('/')
            numerator = float(numerator_text)
            denominator = float(denominator_text)
            # Division by zero is avoided by falling back to the numerator.
            value = numerator if denominator == 0 else numerator / denominator
        else:
            value = float(token)
    except ValueError:
        # Empty or sign-only numerator such as "/5" or "+/5".
        return None

    # Extremely long digit strings overflow to +/-inf, which cannot be rounded.
    if abs(value) == float('inf'):
        return None
    return round(value)
51
+
52
def batch_data(data_list, batch_size=1):
    """Split *data_list* into consecutive batches of at most *batch_size* items.

    Order is preserved, so concatenating the batches reproduces the input.
    Every batch except possibly the last holds exactly *batch_size* items;
    the last one holds the remainder.

    Args:
        data_list: Sequence to split (anything supporting ``len`` and slicing).
        batch_size: Maximum number of items per batch; must be at least 1.

    Returns:
        list: The batches, in order. Empty input yields an empty list.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))
    return [
        data_list[offset:offset + batch_size]
        for offset in range(0, len(data_list), batch_size)
    ]
64
+
65
+
66
def gsm8k_test(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
    """Evaluate a model on GSM8K-style data and print its accuracy.

    Each record of the JSON Lines file at *data_path* must provide a
    ``"query"`` (the question) and a ``"response"`` whose reference answer
    follows a ``'#### '`` marker. Questions are wrapped in the instruction
    prompt, generated with vLLM, and the number extracted from each
    completion is compared with the reference answer.

    Args:
        model: Model name or path passed to ``vllm.LLM``.
        data_path: Path to the JSON Lines test file.
        start: Index of the first sample to evaluate.
        end: Index one past the last sample to evaluate.
        batch_size: Batch size passed to ``batch_data`` when grouping prompts.
        tensor_parallel_size: Tensor-parallel degree passed to ``vllm.LLM``.

    Returns:
        None. Statistics are printed to stdout.
    """
    gsm8k_ins = []
    gsm8k_answers = []
    problem_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
    )
    print('promt =====', problem_prompt)
    # The data file is only read, so open it read-only ("r+" would also
    # demand write permission on the file).
    with open(data_path, "r", encoding="utf8") as f:
        for item in jsonlines.Reader(f):
            gsm8k_ins.append(problem_prompt.format(instruction=item["query"]))
            temp_ans = item['response'].split('#### ')[1]
            gsm8k_answers.append(int(temp_ans.replace(',', '')))

    gsm8k_ins = gsm8k_ins[start:end]
    gsm8k_answers = gsm8k_answers[start:end]
    print('lenght ====', len(gsm8k_ins))
    batch_gsm8k_ins = batch_data(gsm8k_ins, batch_size=batch_size)

    stop_tokens = ["Question:", "Question", "USER:", "USER", "ASSISTANT:", "ASSISTANT", "Instruction:", "Instruction", "Response:", "Response"]
    sampling_params = SamplingParams(temperature=0.0, top_p=1, max_tokens=512, stop=stop_tokens)
    print('sampleing =====', sampling_params)
    llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size)

    # Generate batch by batch; completions are collected in prompt order.
    # NOTE(review): assumes llm.generate returns outputs in the order of the
    # submitted prompts - confirm against the installed vLLM version.
    res_completions = []
    for prompts in batch_gsm8k_ins:
        if not isinstance(prompts, list):
            prompts = [prompts]
        for output in llm.generate(prompts, sampling_params):
            res_completions.append(output.outputs[0].text)

    result = []
    invalid_outputs = []
    for prompt, completion, prompt_answer in zip(gsm8k_ins, res_completions, gsm8k_answers):
        y_pred = extract_answer_number(completion)
        if y_pred is not None:
            result.append(float(y_pred) == float(prompt_answer))
        else:
            # No parsable answer: count as wrong and keep it for inspection.
            result.append(False)
            invalid_outputs.append({'question': prompt, 'output': completion, 'answer': prompt_answer})

    # Guard against an empty evaluation slice (e.g. start beyond the data).
    acc = sum(result) / len(result) if result else 0.0
    print('len invalid outputs ====', len(invalid_outputs), ', invalid_outputs===', invalid_outputs)
    print('start===', start, ', end====', end)
    print('gsm8k length====', len(result), ', gsm8k acc====', acc)
121
+
122
+
123
def parse_args():
    """Parse the command-line options of the GSM8K evaluation script.

    Returns:
        argparse.Namespace with the attributes ``model``, ``data_file``,
        ``start``, ``end``, ``batch_size`` and ``tensor_parallel_size``.
    """
    # One (flag, type, extra add_argument keywords) entry per option.
    option_specs = (
        ("--model", str, {}),                            # model path
        ("--data_file", str, {"default": ''}),           # data path
        ("--start", int, {"default": 0}),                # start index
        ("--end", int, {"default": MAX_INT}),            # end index
        ("--batch_size", int, {"default": 400}),         # batch_size
        ("--tensor_parallel_size", int, {"default": 8}), # tensor_parallel_size
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, extra in option_specs:
        cli.add_argument(flag, type=value_type, **extra)
    return cli.parse_args()
132
if __name__ == "__main__":
    # Script entry point: forward the parsed CLI options to the evaluator.
    args = parse_args()
    gsm8k_test(
        model=args.model,
        data_path=args.data_file,
        start=args.start,
        end=args.end,
        batch_size=args.batch_size,
        tensor_parallel_size=args.tensor_parallel_size,
    )
SVFT-main/MetaMath/eval_math.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import pdb
4
+ import jsonlines
5
+
6
+ import util
7
+ from vllm import LLM, SamplingParams
8
+ import sys
9
+ MAX_INT = sys.maxsize
10
+ INVALID_ANS = "[invalid]"
11
+
12
+ invalid_outputs = []
13
def remove_boxed(s):
    r"""Return the content wrapped by a ``\boxed{...}`` expression.

    Args:
        s: Candidate string, expected to start with ``\boxed{`` and to end
            with ``}``.

    Returns:
        The text between the outer braces, or None when *s* is not a string
        or does not have the expected wrapper.
    """
    left = "\\boxed{"
    # Validate explicitly rather than with assert: assert statements are
    # removed under ``python -O``, which would disable the checks entirely.
    if not isinstance(s, str):
        return None
    if not s.startswith(left) or not s.endswith("}"):
        return None
    return s[len(left):-1]
21
+
22
def process_results(doc, completion, answer):
    """Judge whether a completion's final answer matches the reference.

    The candidate answer is the text after the last ``'The answer is: '``
    marker, cut at the first ``'.\\n'`` and stripped of surrounding
    whitespace and one trailing period. It is compared with *answer* via
    ``util.is_equiv``.

    Args:
        doc: The prompt/question, stored only when the output is invalid.
        completion: Generated text to inspect.
        answer: Reference answer.

    Returns:
        bool: True when the candidate is equivalent to *answer*. When the
        marker is absent the sample is appended to the module-level
        ``invalid_outputs`` list and False is returned.
    """
    marker = 'The answer is: '
    if marker not in completion:
        # No answer marker: record the sample for later inspection.
        invalid_outputs.append({'question': doc, 'output': completion, 'answer': answer})
        return False

    tail = completion.split(marker)[-1]
    candidate = tail.split('.\n')[0].strip()
    if candidate.endswith('.'):
        candidate = candidate[:-1]
    candidate = candidate.strip()
    return True if util.is_equiv(candidate, answer) else False
41
def batch_data(data_list, batch_size=1):
    """Split *data_list* into consecutive batches of at most *batch_size* items.

    Order is preserved, so concatenating the batches reproduces the input.
    Every batch except possibly the last holds exactly *batch_size* items;
    the last one holds the remainder.

    Args:
        data_list: Sequence to split (anything supporting ``len`` and slicing).
        batch_size: Maximum number of items per batch; must be at least 1.

    Returns:
        list: The batches, in order. Empty input yields an empty list.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))
    return [
        data_list[offset:offset + batch_size]
        for offset in range(0, len(data_list), batch_size)
    ]
53
+
54
def test_hendrycks_math(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
    """Evaluate a model on MATH-style data and print its accuracy.

    Each record of the JSON Lines file at *data_path* must provide an
    ``"instruction"`` (the problem) and an ``"output"`` (the reference
    solution). The reference answer is the content of the boxed expression
    that ``util.last_boxed_only_string`` selects from the solution.
    Problems are wrapped in the instruction prompt, generated with vLLM,
    and each completion is scored with ``process_results``.

    Args:
        model: Model name or path passed to ``vllm.LLM``.
        data_path: Path to the JSON Lines test file.
        start: Index of the first sample to evaluate.
        end: Index one past the last sample to evaluate.
        batch_size: Batch size passed to ``batch_data`` when grouping prompts.
        tensor_parallel_size: Tensor-parallel degree passed to ``vllm.LLM``.

    Returns:
        None. Statistics are printed to stdout.
    """
    hendrycks_math_ins = []
    hendrycks_math_answers = []
    problem_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
    )
    print('promt =====', problem_prompt)
    # The data file is only read, so open it read-only ("r+" would also
    # demand write permission on the file).
    with open(data_path, "r", encoding="utf8") as f:
        for item in jsonlines.Reader(f):
            hendrycks_math_ins.append(problem_prompt.format(instruction=item["instruction"]))
            solution = item['output']
            hendrycks_math_answers.append(remove_boxed(util.last_boxed_only_string(solution)))

    print('total length ===', len(hendrycks_math_ins))
    hendrycks_math_ins = hendrycks_math_ins[start:end]
    hendrycks_math_answers = hendrycks_math_answers[start:end]
    print('lenght ====', len(hendrycks_math_ins))
    batch_hendrycks_math_ins = batch_data(hendrycks_math_ins, batch_size=batch_size)

    stop_tokens = ["Question:", "Question", "USER:", "USER", "ASSISTANT:", "ASSISTANT", "Instruction:", "Instruction", "Response:", "Response"]
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=2048, stop=stop_tokens)
    print('sampleing =====', sampling_params)
    llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size)

    # Generate batch by batch; completions are collected in prompt order.
    # NOTE(review): assumes llm.generate returns outputs in the order of the
    # submitted prompts - confirm against the installed vLLM version.
    res_completions = []
    for prompts in batch_hendrycks_math_ins:
        if not isinstance(prompts, list):
            prompts = [prompts]
        for output in llm.generate(prompts, sampling_params):
            res_completions.append(output.outputs[0].text)

    results = [
        process_results(prompt, completion, prompt_answer)
        for prompt, completion, prompt_answer in zip(hendrycks_math_ins, res_completions, hendrycks_math_answers)
    ]

    # Guard against an empty evaluation slice (e.g. start beyond the data).
    acc = sum(results) / len(results) if results else 0.0
    print('len invalid outputs ====', len(invalid_outputs), ', invalid_outputs===', invalid_outputs)
    print('start===', start, ', end====', end)
    print('length====', len(results), ', acc====', acc)
102
+
103
def parse_args():
    """Parse the command-line options of the MATH evaluation script.

    Returns:
        argparse.Namespace with the attributes ``model``, ``data_file``,
        ``start``, ``end``, ``batch_size`` and ``tensor_parallel_size``.
    """
    # One (flag, type, default) entry per option.
    option_specs = (
        ("--model", str, ''),                  # model path
        ("--data_file", str, ''),              # data path
        ("--start", int, 0),                   # start index
        ("--end", int, MAX_INT),               # end index
        ("--batch_size", int, 400),            # batch_size
        ("--tensor_parallel_size", int, 8),    # tensor_parallel_size
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, default_value in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value)
    return cli.parse_args()
112
+
113
if __name__ == "__main__":
    # Script entry point: forward the parsed CLI options to the evaluator.
    args = parse_args()
    test_hendrycks_math(
        model=args.model,
        data_path=args.data_file,
        start=args.start,
        end=args.end,
        batch_size=args.batch_size,
        tensor_parallel_size=args.tensor_parallel_size,
    )
SVFT-main/MetaMath/requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.34.0
2
+ wandb==0.15.3
3
+ torch==2.0.1
4
+ sentencepiece==0.1.99
5
+ tokenizers==0.13.3
6
+ accelerate==0.21.0
7
+ bitsandbytes==0.40.0
8
+ vllm
9
+ fraction
10
+ tqdm
11
+ numpy
12
+ fire
13
+ openai
14
+ scipy
15
+ jsonlines
16
+ pandas