Compare commits
651 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d426b58b9e | ||
|
|
1e82cd8457 | ||
|
|
d18d47be67 | ||
|
|
ff6f1492e8 | ||
|
|
7365f01d43 | ||
|
|
3a21d6da6b | ||
|
|
0aa40e9569 | ||
|
|
8036ceb7c5 | ||
|
|
6664ea943d | ||
|
|
5a6b138b00 | ||
|
|
7fe294bf07 | ||
|
|
b85c59f997 | ||
|
|
f9bc3f5771 | ||
|
|
0b13fb822a | ||
|
|
71a382319f | ||
|
|
01a14ebd8d | ||
|
|
9fa836a73f | ||
|
|
b43cb09aaa | ||
|
|
df27648bd9 | ||
|
|
93dccf527b | ||
|
|
90787fed81 | ||
|
|
73306d028b | ||
|
|
ce2f4227ab | ||
|
|
f0a4fc6cd6 | ||
|
|
a5381495e6 | ||
|
|
83446a88d9 | ||
|
|
9fde13a3ac | ||
|
|
e63a81dd25 | ||
|
|
217349016a | ||
|
|
adb8c93134 | ||
|
|
c69b082601 | ||
|
|
ca1d66734d | ||
|
|
5e3c72842d | ||
|
|
0731fa1587 | ||
|
|
a3998e76ae | ||
|
|
b5625f131d | ||
|
|
44a5b4bbe7 | ||
|
|
7fc628d98e | ||
|
|
64ca855617 | ||
|
|
9d87eafd11 | ||
|
|
a3b3638f6f | ||
|
|
c96ca70f25 | ||
|
|
7b5eda32bb | ||
|
|
c63d91dd1c | ||
|
|
b2907cd06e | ||
|
|
2fec88ee02 | ||
|
|
7e03d2bd7c | ||
|
|
335dd5e68a | ||
|
|
ea2600bd5f | ||
|
|
5c3d441ee1 | ||
|
|
f5a236c3ca | ||
|
|
6b4c3ee234 | ||
|
|
79815bf666 | ||
|
|
5004d5af42 | ||
|
|
9ca21c838b | ||
|
|
e0849a66ac | ||
|
|
6b081f04e6 | ||
|
|
0e31e06a75 | ||
|
|
ea56d305be | ||
|
|
d440e21f5b | ||
|
|
875c4ae48f | ||
|
|
f09f42d4d3 | ||
|
|
bac51fba3a | ||
|
|
babd41e7fa | ||
|
|
974d083c7b | ||
|
|
983fef469c | ||
|
|
009fcb0ec1 | ||
|
|
11b13e94a3 | ||
|
|
1ce3fb5cc7 | ||
|
|
62f5804608 | ||
|
|
908230d261 | ||
|
|
24d5ad1dcc | ||
|
|
9ddf60b694 | ||
|
|
0e9899f451 | ||
|
|
48ac24020d | ||
|
|
7511f3dd89 | ||
|
|
980211a63a | ||
|
|
6bc966793a | ||
|
|
db1a7f27a1 | ||
|
|
b28020f590 | ||
|
|
3e1bc27e1b | ||
|
|
f44ff574d3 | ||
|
|
264eb23912 | ||
|
|
ccebcae75f | ||
|
|
92b3cb786d | ||
|
|
cd656fb21a | ||
|
|
83fa8d9fb5 | ||
|
|
98edad418e | ||
|
|
96d21ad06b | ||
|
|
850795c487 | ||
|
|
1487b840d3 | ||
|
|
bd0d3fd76e | ||
|
|
dbeb7fb4e6 | ||
|
|
cd77c750c5 | ||
|
|
3922a2497e | ||
|
|
00df3d4de0 | ||
|
|
f81b6c95f2 | ||
|
|
632675ea88 | ||
|
|
eaa6b9afc6 | ||
|
|
9bab9b83d2 | ||
|
|
64abd3e0aa | ||
|
|
837577256b | ||
|
|
90b7df444f | ||
|
|
119dc50e2a | ||
|
|
34a3c25a30 | ||
|
|
1a8e87be4e | ||
|
|
b94cf7faac | ||
|
|
2eaa8b6e56 | ||
|
|
801aaa5508 | ||
|
|
56d4ba8ddb | ||
|
|
c7f79815e7 | ||
|
|
15579e2d55 | ||
|
|
088fa7b759 | ||
|
|
073219b43f | ||
|
|
983c484fa2 | ||
|
|
cefd51c50c | ||
|
|
ca6ce3040d | ||
|
|
908cd5ea27 | ||
|
|
6e6c8c52ed | ||
|
|
23c6998bf4 | ||
|
|
65a89a8976 | ||
|
|
6d5049a24d | ||
|
|
23a2cea8cb | ||
|
|
99f9243de5 | ||
|
|
9d8fd2d40e | ||
|
|
6e2c28a14a | ||
|
|
b8f43cb273 | ||
|
|
258ed2eaa8 | ||
|
|
50ee59578d | ||
|
|
1c9333584a | ||
|
|
e25b6fe354 | ||
|
|
27c7b99015 | ||
|
|
99d4515572 | ||
|
|
dc17f2a111 | ||
|
|
880854846b | ||
|
|
d9fa1bad72 | ||
|
|
a98b2ca8c0 | ||
|
|
83a41d39b3 | ||
|
|
cd51893d37 | ||
|
|
248aeaa842 | ||
|
|
c76c3cebed | ||
|
|
eb59e9f705 | ||
|
|
e184ad13cf | ||
|
|
dfe012ad9d | ||
|
|
c024ab98df | ||
|
|
9aeb0b9b8a | ||
|
|
715fa638a7 | ||
|
|
100e3b6f21 | ||
|
|
6c32d8bb95 | ||
|
|
760164d63b | ||
|
|
387217bd3e | ||
|
|
7d1bb7f256 | ||
|
|
a1cb100460 | ||
|
|
c11b6fd393 | ||
|
|
632682726f | ||
|
|
2b566c182e | ||
|
|
764f836d52 | ||
|
|
d5831acb07 | ||
|
|
ed6cd597cc | ||
|
|
5cb463a714 | ||
|
|
afc24ea5d4 | ||
|
|
894812c652 | ||
|
|
b20f11d4ca | ||
|
|
0304628590 | ||
|
|
1fc855e456 | ||
|
|
3c86b6f3c5 | ||
|
|
b803b067bf | ||
|
|
896a0eb1fd | ||
|
|
0d6c17fc1b | ||
|
|
a3085020ed | ||
|
|
cf8a70bf68 | ||
|
|
6bb3edc300 | ||
|
|
c6f682c1eb | ||
|
|
4d1c98c012 | ||
|
|
2f32dfd33b | ||
|
|
e83d9f1c1d | ||
|
|
ebba9e929d | ||
|
|
055e80cfad | ||
|
|
b1e1a9f9b2 | ||
|
|
fd8423321f | ||
|
|
0cd81fb99f | ||
|
|
90d3b787f6 | ||
|
|
84c0aa1868 | ||
|
|
331065e62d | ||
|
|
414e9e7122 | ||
|
|
3cdb38a7c0 | ||
|
|
ebd45980a0 | ||
|
|
45634f87f8 | ||
|
|
af1ee9e648 | ||
|
|
164c794eb3 | ||
|
|
801f2ac8c7 | ||
|
|
bfec203d4e | ||
|
|
f599623a99 | ||
|
|
f26a353057 | ||
|
|
16ce15ed4b | ||
|
|
f24232cd1b | ||
|
|
1b59b57b57 | ||
|
|
569da80ced | ||
|
|
43114b89ba | ||
|
|
d6a677b14b | ||
|
|
27c1b656cc | ||
|
|
24df44d9c7 | ||
|
|
73be60c47b | ||
|
|
6806f8204e | ||
|
|
176d3b3079 | ||
|
|
9261c7f771 | ||
|
|
91d33c798b | ||
|
|
2926852f14 | ||
|
|
e2810edc8f | ||
|
|
c301faa92b | ||
|
|
81d6841b4b | ||
|
|
dd4df80f0b | ||
|
|
1efc208ff3 | ||
|
|
c45d0cf60f | ||
|
|
bf89be77b9 | ||
|
|
bf8d4bc674 | ||
|
|
74755c89b9 | ||
|
|
0ffc8eaf53 | ||
|
|
f01b3e6680 | ||
|
|
78528742f1 | ||
|
|
12e0aa4368 | ||
|
|
80faf22b4a | ||
|
|
d0e594f9db | ||
|
|
629b22adcf | ||
|
|
594ca6dead | ||
|
|
0df4e62da0 | ||
|
|
f75bf05ce6 | ||
|
|
0d467fd6de | ||
|
|
d8293e84f3 | ||
|
|
4d6c93e923 | ||
|
|
9b2badf3c9 | ||
|
|
f78ebc22ad | ||
|
|
bfe870be65 | ||
|
|
74ea432847 | ||
|
|
492bea9aa0 | ||
|
|
e213900fa2 | ||
|
|
9f5f646442 | ||
|
|
9024b19994 | ||
|
|
3233b58ad4 | ||
|
|
e6ec24fa88 | ||
|
|
599db139f9 | ||
|
|
835b76a46f | ||
|
|
7ead04ce14 | ||
|
|
1f82a5d910 | ||
|
|
8c67b529f6 | ||
|
|
7211541ade | ||
|
|
0f6017bee3 | ||
|
|
87c8fca9bc | ||
|
|
88def24c45 | ||
|
|
822f725a07 | ||
|
|
fc84bd5254 | ||
|
|
deff792bb6 | ||
|
|
9398058e19 | ||
|
|
90cda45e9e | ||
|
|
6bca56fdb0 | ||
|
|
365ccd0af2 | ||
|
|
d039c679d2 | ||
|
|
7e0c5c731a | ||
|
|
eeaa402cd4 | ||
|
|
7bb4271291 | ||
|
|
267587c258 | ||
|
|
d891fd0ae0 | ||
|
|
aeef4823ab | ||
|
|
0412f3d929 | ||
|
|
8742c95461 | ||
|
|
1240be3ed9 | ||
|
|
b262577d17 | ||
|
|
83a2347952 | ||
|
|
cea04a2443 | ||
|
|
e1844d9a45 | ||
|
|
9fb7addd4d | ||
|
|
734d29b03d | ||
|
|
2818e50569 | ||
|
|
31c56f2e0b | ||
|
|
951ae99bea | ||
|
|
041eac2d6d | ||
|
|
3471ff0d35 | ||
|
|
18e5bdbec5 | ||
|
|
f18ac4c28e | ||
|
|
359dc43837 | ||
|
|
d98a384cb0 | ||
|
|
3e0cf49514 | ||
|
|
35d32308de | ||
|
|
81db12c3ba | ||
|
|
10724a8123 | ||
|
|
a8d34e534e | ||
|
|
e74c73a85d | ||
|
|
e6c0019c80 | ||
|
|
495580dad1 | ||
|
|
71f94a8a1c | ||
|
|
81422c4e6d | ||
|
|
072750f4dc | ||
|
|
4621ad6f9d | ||
|
|
a31d4a2971 | ||
|
|
c8b0c1e551 | ||
|
|
4c09a96096 | ||
|
|
5565dcdd35 | ||
|
|
8a6881822a | ||
|
|
7a865821d9 | ||
|
|
70373a5f7c | ||
|
|
c3783399db | ||
|
|
d79e9c9a9a | ||
|
|
d73eb552e8 | ||
|
|
9fcc532df6 | ||
|
|
76a1417f2a | ||
|
|
9fc8dcb2a0 | ||
|
|
f2522869ea | ||
|
|
7cef764ec0 | ||
|
|
23dad8447c | ||
|
|
d8e33dbd67 | ||
|
|
59b123bc50 | ||
|
|
ba2378ced5 | ||
|
|
e4e2a666c9 | ||
|
|
398bb03f98 | ||
|
|
ce50305e5b | ||
|
|
1a948d7020 | ||
|
|
1c62e87b34 | ||
|
|
d6eaf4e6d2 | ||
|
|
45841eaf7b | ||
|
|
0dddc1494d | ||
|
|
75a23d24af | ||
|
|
798b3b3899 | ||
|
|
8af25b1664 | ||
|
|
6b2200fc88 | ||
|
|
c824d15aa1 | ||
|
|
b6ea0f43ae | ||
|
|
5daca95ddd | ||
|
|
54abc67aec | ||
|
|
00204f2b4c | ||
|
|
a3c5883f2c | ||
|
|
daf8bebcdd | ||
|
|
345c23a60f | ||
|
|
7e98e211f0 | ||
|
|
6be7cdda66 | ||
|
|
ced0a94204 | ||
|
|
067395d5c5 | ||
|
|
698f9e3d7a | ||
|
|
c11b3e2926 | ||
|
|
2a34d5b71b | ||
|
|
c9270086ea | ||
|
|
577a03664d | ||
|
|
7c6812645a | ||
|
|
939148b050 | ||
|
|
783a616999 | ||
|
|
80327a13ea | ||
|
|
654e051e2a | ||
|
|
fa2ccbc081 | ||
|
|
2ab78325f0 | ||
|
|
631be27078 | ||
|
|
b0f7db73cd | ||
|
|
ea89bec185 | ||
|
|
fd2f17a7a1 | ||
|
|
5eab3cf6bc | ||
|
|
7dce8dc7ac | ||
|
|
eed46f38b7 | ||
|
|
b1de7ae08a | ||
|
|
357db7098c | ||
|
|
f9c5317db2 | ||
|
|
28e608a2c2 | ||
|
|
1efa0a7552 | ||
|
|
d0c9fe277a | ||
|
|
5ca054757f | ||
|
|
9e80fc7b2f | ||
|
|
158e82e061 | ||
|
|
9d00f78f16 | ||
|
|
b668a740ca | ||
|
|
bc1715c1e0 | ||
|
|
36883c1192 | ||
|
|
6e5291a915 | ||
|
|
fa84ae26d6 | ||
|
|
63e3827c6b | ||
|
|
645713e2cb | ||
|
|
73f6e9817c | ||
|
|
77676c27d2 | ||
|
|
344126fe58 | ||
|
|
5b7fb6a4a1 | ||
|
|
6f68d559ab | ||
|
|
1ab25c49d3 | ||
|
|
b03872aae0 | ||
|
|
518ba748e0 | ||
|
|
18601c3b6e | ||
|
|
6e7102cfb3 | ||
|
|
deceb00161 | ||
|
|
eeb70cdd77 | ||
|
|
ed9b84816e | ||
|
|
f86ed23189 | ||
|
|
cfa0380515 | ||
|
|
300ec3003c | ||
|
|
1c37746892 | ||
|
|
7e17f09fb5 | ||
|
|
8a2be93b4e | ||
|
|
562f864038 | ||
|
|
8618bf15d6 | ||
|
|
2fa8737c44 | ||
|
|
f15f087143 | ||
|
|
fae4d1c266 | ||
|
|
b8e924e10d | ||
|
|
767bc3ca68 | ||
|
|
343c094f21 | ||
|
|
80caf79d07 | ||
|
|
bb3bfa2d29 | ||
|
|
29cbab98f0 | ||
|
|
a4c9338b83 | ||
|
|
b670c26684 | ||
|
|
b67fa1a8d2 | ||
|
|
286d5bb6b7 | ||
|
|
478e456e83 | ||
|
|
12726f8556 | ||
|
|
ac1b449cc9 | ||
|
|
3e52915fa7 | ||
|
|
228f52867c | ||
|
|
a80778f40e | ||
|
|
3df1d2d144 | ||
|
|
a436574bfd | ||
|
|
d0f8b9a978 | ||
|
|
a557836a70 | ||
|
|
655fd06853 | ||
|
|
e5812462fc | ||
|
|
4775ec354b | ||
|
|
cb6d54bfda | ||
|
|
f79a7dc661 | ||
|
|
a241011057 | ||
|
|
e37ca8e11a | ||
|
|
ceae85ad60 | ||
|
|
71883b6ddc | ||
|
|
8d5a47c79b | ||
|
|
79e4a6a25c | ||
|
|
bbaaec046c | ||
|
|
1c12ee0e55 | ||
|
|
65c75fc587 | ||
|
|
fb393ad994 | ||
|
|
90debb9ff2 | ||
|
|
b98ff88544 | ||
|
|
3a2c4e6f63 | ||
|
|
4e3f745ba4 | ||
|
|
db0795b5d0 | ||
|
|
7f74084528 | ||
|
|
c37815f130 | ||
|
|
73fcebf7ec | ||
|
|
59941c5d1f | ||
|
|
15dda5ea32 | ||
|
|
01ffc65e9b | ||
|
|
825697cad4 | ||
|
|
1fa93ca1ea | ||
|
|
ca6bdb28f6 | ||
|
|
61d9ee45e3 | ||
|
|
ff36e6d8d7 | ||
|
|
e516a34a15 | ||
|
|
9d0d1cd339 | ||
|
|
15d897ff4a | ||
|
|
f25e9b6f77 | ||
|
|
a5a06a851e | ||
|
|
1718fb9e74 | ||
|
|
9a399ead25 | ||
|
|
3376adc051 | ||
|
|
e4baa68ddb | ||
|
|
149dc376aa | ||
|
|
407093b3fa | ||
|
|
c7be096c39 | ||
|
|
a305067f2d | ||
|
|
3492a6ec17 | ||
|
|
33adab2b91 | ||
|
|
a1f1dce0ae | ||
|
|
62c1fc3c1e | ||
|
|
284572efc0 | ||
|
|
ed6ba93912 | ||
|
|
81a911cce5 | ||
|
|
faef6f6191 | ||
|
|
5664327c24 | ||
|
|
3b29322d4c | ||
|
|
fc624716aa | ||
|
|
f516cf3956 | ||
|
|
d72fa2a0f6 | ||
|
|
bcc99fd92e | ||
|
|
a26ce4dee1 | ||
|
|
ec5d6c6a70 | ||
|
|
fe9aab1055 | ||
|
|
5c5f67a256 | ||
|
|
db90e12114 | ||
|
|
d0724d0794 | ||
|
|
7711403bbd | ||
|
|
8bb166db5d | ||
|
|
f09d999641 | ||
|
|
dd7a958fd6 | ||
|
|
d35405b7a3 | ||
|
|
3e89fca543 | ||
|
|
128cfdee9b | ||
|
|
e778dd854d | ||
|
|
04b602f96f | ||
|
|
64a971a915 | ||
|
|
036831e279 | ||
|
|
41a13a6375 | ||
|
|
0c88c856d5 | ||
|
|
8efc6dd544 | ||
|
|
a2978465a2 | ||
|
|
01b68be34f | ||
|
|
3d2096f516 | ||
|
|
ca31abc6d6 | ||
|
|
8e5587fb79 | ||
|
|
cce3089b65 | ||
|
|
641a8decdc | ||
|
|
e347725d8c | ||
|
|
94c99db34c | ||
|
|
7ffa817390 | ||
|
|
c5f35e61db | ||
|
|
abc43ffbff | ||
|
|
8ac840ff87 | ||
|
|
a0d386455b | ||
|
|
ea636440d1 | ||
|
|
a4df2e0113 | ||
|
|
77d397202b | ||
|
|
bbc0c86f9b | ||
|
|
5e289f69bc | ||
|
|
2cff4bd8f3 | ||
|
|
55397dfb9b | ||
|
|
b6938916ac | ||
|
|
d303f84e7b | ||
|
|
2fde5a2489 | ||
|
|
2f1c745cde | ||
|
|
83bc5235cf | ||
|
|
d7c62661a3 | ||
|
|
f349826a57 | ||
|
|
f061606277 | ||
|
|
805c21aeba | ||
|
|
d000195ee6 | ||
|
|
3c6efd0ca3 | ||
|
|
3f5ccb183e | ||
|
|
3cb51299c3 | ||
|
|
18a879f475 | ||
|
|
d803409215 | ||
|
|
a468870fd2 | ||
|
|
855ff0e91d | ||
|
|
d064009b72 | ||
|
|
a701a0cee1 | ||
|
|
59a1aefb1c | ||
|
|
69f4f058fa | ||
|
|
a648ff738c | ||
|
|
9ed09cb4a3 | ||
|
|
d3549b66af | ||
|
|
a096e2a88b | ||
|
|
71b4750517 | ||
|
|
43a4e1bbe4 | ||
|
|
46ccbb42fc | ||
|
|
bbc707cf39 | ||
|
|
9c391277cc | ||
|
|
1bbdbacd5b | ||
|
|
955d7ecb57 | ||
|
|
031ad4eb37 | ||
|
|
db0a9ee6e0 | ||
|
|
a4d07b983a | ||
|
|
d3418a94ff | ||
|
|
56e98ba81a | ||
|
|
8669598abd | ||
|
|
1b8613acb3 | ||
|
|
8e3b1c860f | ||
|
|
f1971bf303 | ||
|
|
cc0135134b | ||
|
|
dc667ce1a7 | ||
|
|
7140363e09 | ||
|
|
a52d56c8d9 | ||
|
|
e92bcb7eb6 | ||
|
|
cbb368ca06 | ||
|
|
b6d4284b26 | ||
|
|
a1faaf9962 | ||
|
|
c7780700f5 | ||
|
|
76f0d99f02 | ||
|
|
8e9526b4b5 | ||
|
|
5c00e344c1 | ||
|
|
0b51532ce9 | ||
|
|
110394b2ba | ||
|
|
8ade204098 | ||
|
|
47f0e3cfb7 | ||
|
|
8938b546bf | ||
|
|
1ca52567a4 | ||
|
|
28e64ad5a4 | ||
|
|
be5bf7b81b | ||
|
|
80eacb8f16 | ||
|
|
33e72b08d5 | ||
|
|
9b312f9d41 | ||
|
|
40ed717232 | ||
|
|
3fd71c4431 | ||
|
|
f19dad61c7 | ||
|
|
f69dbecc38 | ||
|
|
6709739a05 | ||
|
|
c28273793e | ||
|
|
b040bff6df | ||
|
|
fafd4c86ec | ||
|
|
6aa919469d | ||
|
|
89896fe04f | ||
|
|
fdc05cd68f | ||
|
|
854ec5784e | ||
|
|
9a24e0cf76 | ||
|
|
b72f9d340e | ||
|
|
ec6fb25c21 | ||
|
|
418589244d | ||
|
|
67a8be8e90 | ||
|
|
07bc8efbc3 | ||
|
|
63e36007ee | ||
|
|
f2538c1274 | ||
|
|
a5df980c5b | ||
|
|
40a39ab650 | ||
|
|
7c3a15ace9 | ||
|
|
981a5c8c17 | ||
|
|
8ae1044f80 | ||
|
|
aae74065df | ||
|
|
a7d3794a29 | ||
|
|
fe0f552e00 | ||
|
|
348e19aa21 | ||
|
|
c2407fdd88 | ||
|
|
f116cf599c | ||
|
|
6e61e06051 | ||
|
|
02110485b0 | ||
|
|
e1d89cb24d | ||
|
|
0558c9cb9b | ||
|
|
81babb227e | ||
|
|
31a3a73ee3 | ||
|
|
7c1697562a | ||
|
|
b81ab431f2 | ||
|
|
2d8559731a | ||
|
|
72c36b9ea2 | ||
|
|
608a8f5b56 | ||
|
|
df3961121f | ||
|
|
8e651f56b7 | ||
|
|
808bb8da7e | ||
|
|
b016dd16c9 | ||
|
|
169fea6855 | ||
|
|
f3776df0f3 | ||
|
|
ca99a2d500 | ||
|
|
7da3ef24cd | ||
|
|
268d4f2099 | ||
|
|
b4fcd59a5a | ||
|
|
15e53c4e87 | ||
|
|
f03c0c1423 | ||
|
|
4321c54125 | ||
|
|
727a79b305 | ||
|
|
8fda532c3c | ||
|
|
ba10065c4b | ||
|
|
076a207935 | ||
|
|
73f2c342f5 | ||
|
|
3835e1e651 | ||
|
|
88e5bef58f | ||
|
|
568c0ffb7e | ||
|
|
60a5babd57 | ||
|
|
dfb61caf77 | ||
|
|
d36680df54 | ||
|
|
ec276d6aba | ||
|
|
6e011690a9 | ||
|
|
3a52b65795 | ||
|
|
86a630702d | ||
|
|
b5d73976ad | ||
|
|
22e7c4edaf |
@@ -1,109 +1,110 @@
|
||||
version: 2
|
||||
jobs:
|
||||
build_py3_torch_and_tf:
|
||||
run_tests_torch_and_tf:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
resource_class: xlarge
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install torch
|
||||
- run: sudo pip install tensorflow
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest codecov pytest-cov
|
||||
- run: sudo pip install tensorboardX scikit-learn
|
||||
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||
- run: sudo pip install .[sklearn,tf,torch,testing]
|
||||
- run: sudo pip install codecov pytest-cov
|
||||
- run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
|
||||
- run: codecov
|
||||
build_py3_torch:
|
||||
run_all_tests_torch_and_tf:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
RUN_SLOW: yes
|
||||
RUN_CUSTOM_TOKENIZERS: yes
|
||||
resource_class: xlarge
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install torch
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest codecov pytest-cov
|
||||
- run: sudo pip install tensorboardX scikit-learn
|
||||
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||
- run: python -m pytest -sv ./examples/
|
||||
- run: codecov
|
||||
build_py3_tf:
|
||||
- run: sudo pip install .[mecab,sklearn,tf,torch,testing]
|
||||
- run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/
|
||||
run_tests_torch:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
- image: circleci/python:3.7
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
resource_class: xlarge
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install tensorflow
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest codecov pytest-cov
|
||||
- run: sudo pip install tensorboardX scikit-learn
|
||||
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||
- run: sudo pip install .[sklearn,torch,testing]
|
||||
- run: sudo pip install codecov pytest-cov
|
||||
- run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
|
||||
- run: codecov
|
||||
build_py2_torch:
|
||||
run_tests_tf:
|
||||
working_directory: ~/transformers
|
||||
resource_class: large
|
||||
parallelism: 1
|
||||
docker:
|
||||
- image: circleci/python:2.7
|
||||
- image: circleci/python:3.7
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
resource_class: xlarge
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install torch
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest codecov pytest-cov
|
||||
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||
- run: sudo pip install .[sklearn,tf,testing]
|
||||
- run: sudo pip install codecov pytest-cov
|
||||
- run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
|
||||
- run: codecov
|
||||
build_py2_tf:
|
||||
working_directory: ~/transformers
|
||||
resource_class: large
|
||||
parallelism: 1
|
||||
docker:
|
||||
- image: circleci/python:2.7
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install tensorflow
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest codecov pytest-cov
|
||||
- run: python -m pytest -sv ./transformers/tests/ --cov
|
||||
- run: codecov
|
||||
build_py3_custom_tokenizers:
|
||||
run_tests_custom_tokenizers:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
environment:
|
||||
RUN_CUSTOM_TOKENIZERS: yes
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest
|
||||
- run: sudo pip install mecab-python3
|
||||
- run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
|
||||
build_py2_custom_tokenizers:
|
||||
- run: sudo pip install .[mecab,testing]
|
||||
- run: python -m pytest -sv ./tests/test_tokenization_bert_japanese.py
|
||||
run_examples_torch:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:2.7
|
||||
- image: circleci/python:3.5
|
||||
environment:
|
||||
OMP_NUM_THREADS: 1
|
||||
resource_class: xlarge
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: sudo pip install --progress-bar off .
|
||||
- run: sudo pip install pytest
|
||||
- run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
|
||||
- run: sudo pip install mecab-python
|
||||
- run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
|
||||
- run: sudo pip install .[sklearn,torch,testing]
|
||||
- run: sudo pip install -r examples/requirements.txt
|
||||
- run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
|
||||
deploy_doc:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
steps:
|
||||
- add_ssh_keys:
|
||||
fingerprints:
|
||||
- "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
|
||||
fingerprints:
|
||||
- "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
|
||||
- checkout
|
||||
- run: sudo pip install --progress-bar off -r docs/requirements.txt
|
||||
- run: sudo pip install --progress-bar off -r requirements.txt
|
||||
- run: sudo pip install .[tf,torch,docs]
|
||||
- run: ./.circleci/deploy.sh
|
||||
repository_consistency:
|
||||
check_code_quality:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.6
|
||||
resource_class: medium
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
# we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
|
||||
- run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
|
||||
- run: sudo pip install .[tf,torch,quality]
|
||||
- run: black --check --line-length 119 --target-version py35 examples templates tests src utils
|
||||
- run: isort --check-only --recursive examples templates tests src utils
|
||||
- run: flake8 examples templates tests src utils
|
||||
check_repository_consistency:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
- image: circleci/python:3.5
|
||||
@@ -122,12 +123,21 @@ workflows:
|
||||
version: 2
|
||||
build_and_test:
|
||||
jobs:
|
||||
- repository_consistency
|
||||
- build_py3_custom_tokenizers
|
||||
- build_py2_custom_tokenizers
|
||||
- build_py3_torch_and_tf
|
||||
- build_py3_torch
|
||||
- build_py3_tf
|
||||
- build_py2_torch
|
||||
- build_py2_tf
|
||||
- check_code_quality
|
||||
- check_repository_consistency
|
||||
- run_examples_torch
|
||||
- run_tests_custom_tokenizers
|
||||
- run_tests_torch_and_tf
|
||||
- run_tests_torch
|
||||
- run_tests_tf
|
||||
- deploy_doc: *workflow_filters
|
||||
run_slow_tests:
|
||||
triggers:
|
||||
- schedule:
|
||||
cron: "0 4 * * 1"
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- master
|
||||
jobs:
|
||||
- run_all_tests_torch_and_tf
|
||||
|
||||
@@ -3,7 +3,7 @@ cd docs
|
||||
function deploy_doc(){
|
||||
echo "Creating doc at commit $1 and pushing to folder $2"
|
||||
git checkout $1
|
||||
if [ ! -z "$2" ]
|
||||
if [ ! -z "$2" ]
|
||||
then
|
||||
if [ -d "$dir/$2" ]; then
|
||||
echo "Directory" $2 "already exists"
|
||||
@@ -17,10 +17,12 @@ function deploy_doc(){
|
||||
fi
|
||||
}
|
||||
|
||||
deploy_doc "master"
|
||||
deploy_doc "master"
|
||||
deploy_doc "b33a385" v1.0.0
|
||||
deploy_doc "fe02e45" v1.1.0
|
||||
deploy_doc "89fd345" v1.2.0
|
||||
deploy_doc "fc9faa8" v2.0.0
|
||||
deploy_doc "3ddce1d" v2.1.1
|
||||
deploy_doc "3616209" v2.2.0
|
||||
deploy_doc "d0f8b9a" v2.3.0
|
||||
deploy_doc "6664ea9" v2.4.0
|
||||
8
.github/ISSUE_TEMPLATE/---new-benchmark.md
vendored
8
.github/ISSUE_TEMPLATE/---new-benchmark.md
vendored
@@ -1,17 +1,17 @@
|
||||
---
|
||||
name: "\U0001F5A5 New Benchmark"
|
||||
about: You benchmark a part of this library and would like to share your results
|
||||
name: "\U0001F5A5 New benchmark"
|
||||
about: Benchmark a part of this library and share your results
|
||||
title: "[Benchmark]"
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# Benchmarking Transformers
|
||||
# 🖥 Benchmarking `transformers`
|
||||
|
||||
## Benchmark
|
||||
|
||||
Which part of Transformers did you benchmark?
|
||||
Which part of `transformers` did you benchmark?
|
||||
|
||||
## Set-up
|
||||
|
||||
|
||||
12
.github/ISSUE_TEMPLATE/--new-model-addition.md
vendored
12
.github/ISSUE_TEMPLATE/--new-model-addition.md
vendored
@@ -1,5 +1,5 @@
|
||||
---
|
||||
name: "\U0001F31FNew model addition"
|
||||
name: "\U0001F31F New model addition"
|
||||
about: Submit a proposal/request to implement a new Transformer-based model
|
||||
title: ''
|
||||
labels: ''
|
||||
@@ -7,18 +7,14 @@ assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# 🌟New model addition
|
||||
# 🌟 New model addition
|
||||
|
||||
## Model description
|
||||
|
||||
<!-- Important information -->
|
||||
|
||||
## Open Source status
|
||||
## Open source status
|
||||
|
||||
* [ ] the model implementation is available: (give details)
|
||||
* [ ] the model weights are available: (give details)
|
||||
* [ ] who are the authors: (mention them)
|
||||
|
||||
## Additional context
|
||||
|
||||
<!-- Add any other context about the problem here. -->
|
||||
* [ ] who are the authors: (mention them, if possible by @gh-username)
|
||||
|
||||
34
.github/ISSUE_TEMPLATE/bug-report.md
vendored
34
.github/ISSUE_TEMPLATE/bug-report.md
vendored
@@ -1,29 +1,29 @@
|
||||
---
|
||||
name: "\U0001F41B Bug Report"
|
||||
about: Submit a bug report to help us improve PyTorch Transformers
|
||||
about: Submit a bug report to help us improve transformers
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
## 🐛 Bug
|
||||
# 🐛 Bug
|
||||
|
||||
<!-- Important information -->
|
||||
## Information
|
||||
|
||||
Model I am using (Bert, XLNet....):
|
||||
Model I am using (Bert, XLNet ...):
|
||||
|
||||
Language I am using the model on (English, Chinese....):
|
||||
Language I am using the model on (English, Chinese ...):
|
||||
|
||||
The problem arise when using:
|
||||
* [ ] the official example scripts: (give details)
|
||||
* [ ] my own modified scripts: (give details)
|
||||
The problem arises when using:
|
||||
* [ ] the official example scripts: (give details below)
|
||||
* [ ] my own modified scripts: (give details below)
|
||||
|
||||
The task I am working on is:
|
||||
* [ ] an official GLUE/SQUaD task: (give the name)
|
||||
* [ ] my own task or dataset: (give details)
|
||||
* [ ] my own task or dataset: (give details below)
|
||||
|
||||
## To Reproduce
|
||||
## To reproduce
|
||||
|
||||
Steps to reproduce the behavior:
|
||||
|
||||
@@ -31,22 +31,20 @@ Steps to reproduce the behavior:
|
||||
2.
|
||||
3.
|
||||
|
||||
<!-- If you have a code sample, error messages, stack traces, please provide it here as well. -->
|
||||
<!-- If you have code snippets, error messages, stack traces please provide them here as well.
|
||||
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
|
||||
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.-->
|
||||
|
||||
## Expected behavior
|
||||
|
||||
<!-- A clear and concise description of what you expected to happen. -->
|
||||
<!-- A clear and concise description of what you would expect to happen. -->
|
||||
|
||||
## Environment
|
||||
|
||||
* OS:
|
||||
* Python version:
|
||||
* PyTorch version:
|
||||
* PyTorch Transformers version (or branch):
|
||||
* `transformers` version (or branch):
|
||||
* Using GPU ?
|
||||
* Distributed of parallel setup ?
|
||||
* Distributed or parallel setup ?
|
||||
* Any other relevant information:
|
||||
|
||||
## Additional context
|
||||
|
||||
<!-- Add any other context about the problem here. -->
|
||||
|
||||
19
.github/ISSUE_TEMPLATE/feature-request.md
vendored
19
.github/ISSUE_TEMPLATE/feature-request.md
vendored
@@ -1,20 +1,25 @@
|
||||
---
|
||||
name: "\U0001F680 Feature Request"
|
||||
about: Submit a proposal/request for a new PyTorch Transformers feature
|
||||
name: "\U0001F680 Feature request"
|
||||
about: Submit a proposal/request for a new transformers feature
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Feature
|
||||
# 🚀 Feature request
|
||||
|
||||
<!-- A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. -->
|
||||
<!-- A clear and concise description of the feature proposal.
|
||||
Please provide a link to the paper and code in case they exist. -->
|
||||
|
||||
## Motivation
|
||||
|
||||
<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. -->
|
||||
<!-- Please outline the motivation for the proposal. Is your feature request
|
||||
related to a problem? e.g., I'm always frustrated when [...]. If this is related
|
||||
to another GitHub issue, please link here too. -->
|
||||
|
||||
## Additional context
|
||||
## Your contribution
|
||||
|
||||
<!-- Add any other context or screenshots about the feature request here. -->
|
||||
<!-- Is there any way that you could help, e.g. by submitting a PR?
|
||||
Make sure to read the CONTRIBUTING.MD readme:
|
||||
https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md -->
|
||||
|
||||
41
.github/ISSUE_TEMPLATE/migration.md
vendored
41
.github/ISSUE_TEMPLATE/migration.md
vendored
@@ -1,47 +1,52 @@
|
||||
---
|
||||
name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
|
||||
about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
|
||||
name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
|
||||
about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
## 📚 Migration
|
||||
# 📚 Migration
|
||||
|
||||
## Information
|
||||
|
||||
<!-- Important information -->
|
||||
|
||||
Model I am using (Bert, XLNet....):
|
||||
Model I am using (Bert, XLNet ...):
|
||||
|
||||
Language I am using the model on (English, Chinese....):
|
||||
Language I am using the model on (English, Chinese ...):
|
||||
|
||||
The problem arise when using:
|
||||
* [ ] the official example scripts: (give details)
|
||||
* [ ] my own modified scripts: (give details)
|
||||
The problem arises when using:
|
||||
* [ ] the official example scripts: (give details below)
|
||||
* [ ] my own modified scripts: (give details below)
|
||||
|
||||
The task I am working on is:
|
||||
* [ ] an official GLUE/SQUaD task: (give the name)
|
||||
* [ ] my own task or dataset: (give details)
|
||||
* [ ] my own task or dataset: (give details below)
|
||||
|
||||
Details of the issue:
|
||||
## Details
|
||||
|
||||
<!-- A clear and concise description of the migration issue. If you have code snippets, please provide it here as well. -->
|
||||
<!-- A clear and concise description of the migration issue.
|
||||
If you have code snippets, please provide them here as well.
|
||||
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
|
||||
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
|
||||
-->
|
||||
|
||||
## Environment
|
||||
|
||||
* OS:
|
||||
* Python version:
|
||||
* PyTorch version:
|
||||
* PyTorch Transformers version (or branch):
|
||||
* Using GPU ?
|
||||
* Distributed of parallel setup ?
|
||||
* `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch):
|
||||
* `transformers` version (or branch):
|
||||
* Using GPU?
|
||||
* Distributed or parallel setup?
|
||||
* Any other relevant information:
|
||||
|
||||
## Checklist
|
||||
|
||||
- [ ] I have read the migration guide in the readme.
|
||||
([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
|
||||
[pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))
|
||||
- [ ] I checked if a related official extension example runs on my machine.
|
||||
|
||||
## Additional context
|
||||
|
||||
<!-- Add any other context about the problem here. -->
|
||||
|
||||
25
.github/ISSUE_TEMPLATE/question-help.md
vendored
25
.github/ISSUE_TEMPLATE/question-help.md
vendored
@@ -1,12 +1,29 @@
|
||||
---
|
||||
name: "❓Questions & Help"
|
||||
about: Start a general discussion related to PyTorch Transformers
|
||||
name: "❓ Questions & Help"
|
||||
about: Post your general questions on Stack Overflow tagged huggingface-transformers
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
## ❓ Questions & Help
|
||||
# ❓ Questions & Help
|
||||
|
||||
<!-- A clear and concise description of the question. -->
|
||||
<!-- The GitHub issue tracker is primarily intended for bugs, feature requests,
|
||||
new models and benchmarks, and migration questions. For all other questions,
|
||||
we direct you to Stack Overflow (SO) where a whole community of PyTorch and
|
||||
TensorFlow enthusiasts can help you out. Make sure to tag your question with the
|
||||
right deep learning framework as well as the huggingface-transformers tag:
|
||||
https://stackoverflow.com/questions/tagged/huggingface-transformers
|
||||
|
||||
If your question wasn't answered after a period of time on Stack Overflow, you
|
||||
can always open a question on GitHub. You should then link to the SO question
|
||||
that you posted.
|
||||
-->
|
||||
|
||||
## Details
|
||||
<!-- Description of your issue -->
|
||||
|
||||
<!-- You should first ask your question on SO, and only if
|
||||
you didn't get an answer ask it here on GitHub. -->
|
||||
**A link to original question on Stack Overflow**:
|
||||
121
CONTRIBUTING.md
121
CONTRIBUTING.md
@@ -100,9 +100,10 @@ Follow these steps to start contributing:
|
||||
|
||||
1. Fork the [repository](https://github.com/huggingface/transformers) by
|
||||
clicking on the 'Fork' button on the repository's page. This creates a copy of the code
|
||||
under your github user account.
|
||||
under your GitHub user account.
|
||||
|
||||
2. Clone your fork to your local disk, and add the base repository as a remote:
|
||||
|
||||
|
||||
```bash
|
||||
$ git clone git@github.com:<your Github handle>/transformers.git
|
||||
$ cd transformers
|
||||
@@ -114,43 +115,78 @@ Follow these steps to start contributing:
|
||||
```bash
|
||||
$ git checkout -b a-descriptive-name-for-my-changes
|
||||
```
|
||||
|
||||
|
||||
**do not** work on the `master` branch.
|
||||
|
||||
|
||||
4. Set up a development environment by running the following command in a virtual environment:
|
||||
|
||||
```bash
|
||||
$ pip install -r requirements-dev.txt
|
||||
$ pip install -e ".[dev]"
|
||||
```
|
||||
|
||||
5. Develop the features on your branch. Add changed files using `git add` and
|
||||
then `git commit` to record your changes locally:
|
||||
|
||||
(If transformers was already installed in the virtual environment, remove
|
||||
it with `pip uninstall transformers` before reinstalling it in editable
|
||||
mode with the `-e` flag.)
|
||||
|
||||
Right now, we need an unreleased version of `isort` to avoid a
|
||||
[bug](https://github.com/timothycrosley/isort/pull/1000):
|
||||
|
||||
```bash
|
||||
$ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
|
||||
```
|
||||
|
||||
5. Develop the features on your branch.
|
||||
|
||||
As you work on the features, you should make sure that the test suite
|
||||
passes:
|
||||
|
||||
```bash
|
||||
$ make test
|
||||
```
|
||||
|
||||
`transformers` relies on `black` and `isort` to format its source code
|
||||
consistently. After you make changes, format them with:
|
||||
|
||||
```bash
|
||||
$ make style
|
||||
```
|
||||
|
||||
`transformers` also uses `flake8` to check for coding mistakes. Quality
|
||||
control runs in CI, however you can also run the same checks with:
|
||||
|
||||
```bash
|
||||
$ make quality
|
||||
```
|
||||
|
||||
Once you're happy with your changes, add changed files using `git add` and
|
||||
make a commit with `git commit` to record your changes locally:
|
||||
|
||||
```bash
|
||||
$ git add modified_file.py
|
||||
$ git commit
|
||||
```
|
||||
|
||||
|
||||
Please write [good commit
|
||||
messages](https://chris.beams.io/posts/git-commit/). It
|
||||
is a good idea to sync your copy of the code with the original repository
|
||||
regularly. This way you can quickly account for changes:
|
||||
|
||||
messages](https://chris.beams.io/posts/git-commit/).
|
||||
|
||||
It is a good idea to sync your copy of the code with the original
|
||||
repository regularly. This way you can quickly account for changes:
|
||||
|
||||
```bash
|
||||
$ git fetch upstream
|
||||
$ git rebase upstream/master
|
||||
```
|
||||
|
||||
|
||||
Push the changes to your account using:
|
||||
|
||||
|
||||
```bash
|
||||
$ git push -u origin a-descriptive-name-for-my-changes
|
||||
```
|
||||
|
||||
|
||||
6. Once you are satisfied (**and the checklist below is happy too**), go to the
|
||||
webpage of your fork on Github. Click on 'Pull request' to send your changes
|
||||
webpage of your fork on GitHub. Click on 'Pull request' to send your changes
|
||||
to the project maintainers for review.
|
||||
|
||||
|
||||
7. It's ok if maintainers ask you for changes. It happens to core contributors
|
||||
too! So everyone can see the changes in the Pull request, work in your local
|
||||
branch and push the changes to your fork. They will automatically appear in
|
||||
@@ -168,7 +204,54 @@ Follow these steps to start contributing:
|
||||
to be merged;
|
||||
4. Make sure pre-existing tests still pass;
|
||||
5. Add high-coverage tests. No quality test, no merge;
|
||||
6. All public methods must have informative doctrings;
|
||||
6. All public methods must have informative docstrings;
|
||||
|
||||
|
||||
### Tests
|
||||
|
||||
You can run 🤗 Transformers tests with `unittest` or `pytest`.
|
||||
|
||||
We like `pytest` and `pytest-xdist` because it's faster. From the root of the
|
||||
repository, here's how to run tests with `pytest` for the library:
|
||||
|
||||
```bash
|
||||
$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
|
||||
```
|
||||
|
||||
and for the examples:
|
||||
|
||||
```bash
|
||||
$ pip install -r examples/requirements.txt # only needed the first time
|
||||
$ python -m pytest -n auto --dist=loadfile -s -v ./examples/
|
||||
```
|
||||
|
||||
In fact, that's how `make test` and `make test-examples` are implemented!
|
||||
|
||||
You can specify a smaller set of tests in order to test only the feature
|
||||
you're working on.
|
||||
|
||||
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
|
||||
`yes` to run them. This will download many gigabytes of models — make sure you
|
||||
have enough disk space and a good Internet connection, or a lot of patience!
|
||||
|
||||
```bash
|
||||
$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
|
||||
$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/
|
||||
```
|
||||
|
||||
Likewise, set the `RUN_CUSTOM_TOKENIZERS` environment variable to `yes` to run
|
||||
tests for custom tokenizers, which don't run by default either.
|
||||
|
||||
🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
|
||||
`pytest`-specific features in the test suite itself.
|
||||
|
||||
This means `unittest` is fully supported. Here's how to run tests with
|
||||
`unittest`:
|
||||
|
||||
```bash
|
||||
$ python -m unittest discover -s tests -t . -v
|
||||
$ python -m unittest discover -s examples -t examples -v
|
||||
```
|
||||
|
||||
|
||||
### Style guide
|
||||
|
||||
24
Makefile
Normal file
24
Makefile
Normal file
@@ -0,0 +1,24 @@
|
||||
.PHONY: quality style test test-examples
|
||||
|
||||
# Check that source code meets quality standards
|
||||
|
||||
quality:
|
||||
black --check --line-length 119 --target-version py35 examples templates tests src utils
|
||||
isort --check-only --recursive examples templates tests src utils
|
||||
flake8 examples templates tests src utils
|
||||
|
||||
# Format source code automatically
|
||||
|
||||
style:
|
||||
black --line-length 119 --target-version py35 examples templates tests src utils
|
||||
isort --recursive examples templates tests src utils
|
||||
|
||||
# Run tests for the library
|
||||
|
||||
test:
|
||||
python -m pytest -n auto --dist=loadfile -s -v ./tests/
|
||||
|
||||
# Run tests for examples
|
||||
|
||||
test-examples:
|
||||
python -m pytest -n auto --dist=loadfile -s -v ./examples/
|
||||
149
README.md
149
README.md
@@ -55,14 +55,22 @@ Choose the right framework for every part of a model's lifetime
|
||||
| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
|
||||
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
|
||||
| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
|
||||
| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrapper around tokenizer and models to use finetuned models |
|
||||
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
|
||||
| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
|
||||
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
|
||||
| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
|
||||
| [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
||||
| [Documentation][(v2.4.0)](https://huggingface.co/transformers/v2.4.0)[(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
|
||||
|
||||
## Installation
|
||||
|
||||
This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
|
||||
This repo is tested on Python 3.5+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
|
||||
|
||||
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
|
||||
|
||||
Create a virtual environment with the version of Python you're going to use and activate it.
|
||||
|
||||
Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you must install it from source.
|
||||
|
||||
### With pip
|
||||
|
||||
@@ -83,43 +91,48 @@ Please refer to [TensorFlow installation page](https://www.tensorflow.org/instal
|
||||
When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
|
||||
|
||||
```bash
|
||||
pip install [--editable] .
|
||||
git clone https://github.com/huggingface/transformers
|
||||
cd transformers
|
||||
pip install .
|
||||
```
|
||||
|
||||
When you update the repository, you should upgrade the transformers installation and its dependencies as follows:
|
||||
|
||||
```bash
|
||||
git pull
|
||||
pip install --upgrade .
|
||||
```
|
||||
|
||||
### Run the examples
|
||||
|
||||
Examples are included in the repository but are not shipped with the library.
|
||||
Therefore, in order to run the latest versions of the examples you also need to install from source. To do so, create a new virtual environment and follow these steps:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/transformers
|
||||
cd transformers
|
||||
pip install [--editable] .
|
||||
```
|
||||
Therefore, in order to run the latest versions of the examples, you need to install from source, as described above.
|
||||
|
||||
Look at the [README](https://github.com/huggingface/transformers/blob/master/examples/README.md) for how to run examples.
|
||||
|
||||
### Tests
|
||||
|
||||
A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||
|
||||
These tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||
A series of tests are included for the library and for some example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||
|
||||
Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
|
||||
|
||||
You can run the tests from the root of the cloned repository with the commands:
|
||||
Here's the easiest way to run tests for the library:
|
||||
|
||||
```bash
|
||||
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||
pip install -e ".[testing]"
|
||||
make test
|
||||
```
|
||||
|
||||
or
|
||||
and for the examples:
|
||||
|
||||
```bash
|
||||
python -m pytest -sv ./transformers/tests/
|
||||
python -m pytest -sv ./examples/
|
||||
pip install -e ".[testing]"
|
||||
pip install -r examples/requirements.txt
|
||||
make test-examples
|
||||
```
|
||||
|
||||
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||
For details, refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests).
|
||||
|
||||
### Do you want to run a Transformer model on a mobile device?
|
||||
|
||||
@@ -131,7 +144,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training
|
||||
|
||||
## Model architectures
|
||||
|
||||
🤗 Transformers currently provides 10 NLU/NLG architectures:
|
||||
🤗 Transformers currently provides the following NLU/NLG architectures:
|
||||
|
||||
1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
|
||||
2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
|
||||
@@ -144,7 +157,12 @@ At some point in the future, you'll be able to seamlessly move from pre-training
|
||||
9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
|
||||
11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
|
||||
12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
|
||||
13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
||||
14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
|
||||
15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
16. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
|
||||
17. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
|
||||
|
||||
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
|
||||
|
||||
@@ -166,7 +184,7 @@ import torch
|
||||
from transformers import *
|
||||
|
||||
# Transformers has a unified API
|
||||
# for 8 transformer architectures and 30 pretrained weights.
|
||||
# for 10 transformer architectures and 30 pretrained weights.
|
||||
# Model | Tokenizer | Pretrained weights shortcut
|
||||
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
||||
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
|
||||
@@ -176,7 +194,9 @@ MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
||||
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
|
||||
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
|
||||
(DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
|
||||
(RobertaModel, RobertaTokenizer, 'roberta-base')]
|
||||
(RobertaModel, RobertaTokenizer, 'roberta-base'),
|
||||
(XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
|
||||
]
|
||||
|
||||
# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
|
||||
|
||||
@@ -244,7 +264,7 @@ valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer,
|
||||
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
|
||||
valid_dataset = valid_dataset.batch(64)
|
||||
|
||||
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
|
||||
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
||||
@@ -274,7 +294,7 @@ print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sen
|
||||
|
||||
## Quick tour of the fine-tuning/usage scripts
|
||||
|
||||
**Important**
|
||||
**Important**
|
||||
Before running the fine-tuning scripts, please read the
|
||||
[instructions](#run-the-examples) on how to
|
||||
setup your environment to run the examples.
|
||||
@@ -435,7 +455,7 @@ python ./examples/run_generation.py \
|
||||
--model_name_or_path=gpt2 \
|
||||
```
|
||||
|
||||
and from the Salesforce CTRL model:
|
||||
and from the Salesforce CTRL model:
|
||||
```shell
|
||||
python ./examples/run_generation.py \
|
||||
--model_type=ctrl \
|
||||
@@ -445,6 +465,83 @@ python ./examples/run_generation.py \
|
||||
--repetition_penalty=1.2 \
|
||||
```
|
||||
|
||||
## Quick tour of model sharing
|
||||
|
||||
Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
|
||||
|
||||
**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
|
||||
|
||||
```shell
|
||||
transformers-cli login
|
||||
# log in using the same credentials as on huggingface.co
|
||||
```
|
||||
Upload your model:
|
||||
```shell
|
||||
transformers-cli upload ./path/to/pretrained_model/
|
||||
|
||||
# ^^ Upload folder containing weights/tokenizer/config
|
||||
# saved via `.save_pretrained()`
|
||||
|
||||
transformers-cli upload ./config.json [--filename folder/foobar.json]
|
||||
|
||||
# ^^ Upload a single file
|
||||
# (you can optionally override its filename, which can be nested inside a folder)
|
||||
```
|
||||
|
||||
Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
|
||||
```python
|
||||
"username/pretrained_model"
|
||||
```
|
||||
|
||||
Anyone can load it from code:
|
||||
```python
|
||||
tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
|
||||
model = AutoModel.from_pretrained("username/pretrained_model")
|
||||
```
|
||||
|
||||
Finally, list all your files on S3:
|
||||
```shell
|
||||
transformers-cli s3 ls
|
||||
# List all your S3 objects.
|
||||
```
|
||||
|
||||
You can also delete files:
|
||||
|
||||
```shell
|
||||
transformers-cli s3 rm …
|
||||
```
|
||||
|
||||
## Quick tour of pipelines
|
||||
|
||||
New in version `v2.3`: `Pipeline`s are high-level objects which automatically handle tokenization, running your data through a transformers model
|
||||
and outputting the result in a structured object.
|
||||
|
||||
You can create `Pipeline` objects for the following down-stream tasks:
|
||||
|
||||
- `feature-extraction`: Generates a tensor representation for the input sequence
|
||||
- `ner`: Generates named entity mapping for each word in the input sequence.
|
||||
- `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence.
|
||||
- `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
|
||||
- `question-answering`: Provided some context and a question referring to the context, it will extract the answer to the question in the context.
|
||||
- `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and returns a list of the most probable filled sequences, with their probabilities.
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
# Allocate a pipeline for sentiment-analysis
|
||||
nlp = pipeline('sentiment-analysis')
|
||||
nlp('We are very happy to include pipeline into the transformers repository.')
|
||||
>>> {'label': 'POSITIVE', 'score': 0.99893874}
|
||||
|
||||
# Allocate a pipeline for question-answering
|
||||
nlp = pipeline('question-answering')
|
||||
nlp({
|
||||
'question': 'What is the name of the repository ?',
|
||||
'context': 'Pipeline have been included in the huggingface/transformers repository'
|
||||
})
|
||||
>>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
|
||||
```
|
||||
|
||||
## Migrating from pytorch-transformers to transformers
|
||||
|
||||
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
|
||||
|
||||
@@ -19,4 +19,5 @@ deploy_doc "fe02e45" v1.1.0
|
||||
deploy_doc "89fd345" v1.2.0
|
||||
deploy_doc "fc9faa8" v2.0.0
|
||||
deploy_doc "3ddce1d" v2.1.1
|
||||
deploy_doc "f2f3294" v2.2.0
|
||||
deploy_doc "f2f3294" v2.2.0
|
||||
deploy_doc "d0f8b9a" v2.3.0
|
||||
|
||||
@@ -1,25 +1,25 @@
|
||||
# Generating the documentation
|
||||
|
||||
To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
|
||||
you can install them using:
|
||||
you can install them with the following command, at the root of the code repository:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install -e ".[docs]"
|
||||
```
|
||||
|
||||
|
||||
## Packages installed
|
||||
|
||||
Here's an overview of all the packages installed. If you ran the previous command installing all packages from
|
||||
Here's an overview of all the packages installed. If you ran the previous command installing all packages from
|
||||
`requirements.txt`, you do not need to run the following commands.
|
||||
|
||||
Building it requires the package `sphinx` that you can
|
||||
Building it requires the package `sphinx` that you can
|
||||
install using:
|
||||
|
||||
```bash
|
||||
pip install -U sphinx
|
||||
```
|
||||
|
||||
You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
|
||||
You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
|
||||
[Read The Docs](https://readthedocs.org/). You can install it using the following command:
|
||||
|
||||
```bash
|
||||
@@ -34,7 +34,7 @@ pip install recommonmark
|
||||
|
||||
## Building the documentation
|
||||
|
||||
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
|
||||
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
|
||||
command to generate it:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
alabaster==0.7.12
|
||||
Babel==2.7.0
|
||||
certifi==2019.6.16
|
||||
chardet==3.0.4
|
||||
commonmark==0.9.0
|
||||
docutils==0.14
|
||||
future==0.17.1
|
||||
idna==2.8
|
||||
imagesize==1.1.0
|
||||
Jinja2==2.10.1
|
||||
MarkupSafe==1.1.1
|
||||
packaging==19.0
|
||||
Pygments==2.4.2
|
||||
pyparsing==2.4.0
|
||||
pytz==2019.1
|
||||
recommonmark==0.5.0
|
||||
requests==2.22.0
|
||||
six==1.12.0
|
||||
snowballstemmer==1.9.0
|
||||
Sphinx==2.1.2
|
||||
sphinx-rtd-theme==0.4.3
|
||||
sphinxcontrib-applehelp==1.0.1
|
||||
sphinxcontrib-devhelp==1.0.1
|
||||
sphinxcontrib-htmlhelp==1.0.2
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
sphinxcontrib-qthelp==1.0.2
|
||||
sphinxcontrib-serializinghtml==1.1.3
|
||||
urllib3==1.25.3
|
||||
sphinx-markdown-tables==0.0.9
|
||||
numpy==1.17.2
|
||||
tensorflow==2.0.0rc2
|
||||
torch==1.2.0
|
||||
@@ -14,7 +14,7 @@
|
||||
#
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(0, os.path.abspath('../..'))
|
||||
sys.path.insert(0, os.path.abspath('../../src'))
|
||||
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
@@ -26,7 +26,7 @@ author = u'huggingface'
|
||||
# The short X.Y version
|
||||
version = u''
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = u'2.2.2'
|
||||
release = u'2.4.1'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
@@ -3,6 +3,12 @@ Converting Tensorflow Checkpoints
|
||||
|
||||
A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into models that can be loaded using the ``from_pretrained`` methods of the library.
|
||||
|
||||
.. note::
|
||||
Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**)
|
||||
available in any transformers >= 2.3.0 installation.
|
||||
|
||||
The documentation below reflects the **transformers-cli convert** command format.
|
||||
|
||||
BERT
|
||||
^^^^
|
||||
|
||||
@@ -20,10 +26,10 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
|
||||
|
||||
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
|
||||
|
||||
transformers bert \
|
||||
$BERT_BASE_DIR/bert_model.ckpt \
|
||||
$BERT_BASE_DIR/bert_config.json \
|
||||
$BERT_BASE_DIR/pytorch_model.bin
|
||||
transformers-cli convert --model_type bert \
|
||||
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
|
||||
--config $BERT_BASE_DIR/bert_config.json \
|
||||
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
|
||||
|
||||
You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.
|
||||
|
||||
@@ -36,10 +42,12 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
|
||||
|
||||
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
|
||||
|
||||
transformers gpt \
|
||||
$OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
|
||||
$PYTORCH_DUMP_OUTPUT \
|
||||
[OPENAI_GPT_CONFIG]
|
||||
transformers-cli convert --model_type gpt \
|
||||
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--config OPENAI_GPT_CONFIG] \
|
||||
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
|
||||
|
||||
|
||||
OpenAI GPT-2
|
||||
^^^^^^^^^^^^
|
||||
@@ -50,10 +58,11 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
|
||||
|
||||
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
|
||||
|
||||
transformers gpt2 \
|
||||
$OPENAI_GPT2_CHECKPOINT_PATH \
|
||||
$PYTORCH_DUMP_OUTPUT \
|
||||
[OPENAI_GPT2_CONFIG]
|
||||
transformers-cli convert --model_type gpt2 \
|
||||
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--config OPENAI_GPT2_CONFIG] \
|
||||
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
|
||||
|
||||
Transformer-XL
|
||||
^^^^^^^^^^^^^^
|
||||
@@ -64,27 +73,28 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
|
||||
|
||||
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
|
||||
|
||||
transformers transfo_xl \
|
||||
$TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
|
||||
$PYTORCH_DUMP_OUTPUT \
|
||||
[TRANSFO_XL_CONFIG]
|
||||
transformers-cli convert --model_type transfo_xl \
|
||||
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--config TRANSFO_XL_CONFIG] \
|
||||
[--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
|
||||
|
||||
|
||||
XLNet
|
||||
^^^^^
|
||||
|
||||
Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
|
||||
Here is an example of the conversion process for a pre-trained XLNet model:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
|
||||
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
|
||||
|
||||
transformers xlnet \
|
||||
$TRANSFO_XL_CHECKPOINT_PATH \
|
||||
$TRANSFO_XL_CONFIG_PATH \
|
||||
$PYTORCH_DUMP_OUTPUT \
|
||||
STS-B \
|
||||
transformers-cli convert --model_type xlnet \
|
||||
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
|
||||
--config $TRANSFO_XL_CONFIG_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--finetuning_task_name XLNET_FINETUNED_TASK]
|
||||
|
||||
|
||||
XLM
|
||||
@@ -96,6 +106,8 @@ Here is an example of the conversion process for a pre-trained XLM model:
|
||||
|
||||
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
|
||||
|
||||
transformers xlm \
|
||||
$XLM_CHECKPOINT_PATH \
|
||||
$PYTORCH_DUMP_OUTPUT \
|
||||
transformers-cli convert --model_type xlm \
|
||||
--tf_checkpoint $XLM_CHECKPOINT_PATH \
|
||||
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
|
||||
[--config XLM_CONFIG] \
|
||||
[--finetuning_task_name XLM_FINETUNED_TASK]
|
||||
145
docs/source/glossary.rst
Normal file
145
docs/source/glossary.rst
Normal file
@@ -0,0 +1,145 @@
|
||||
Glossary
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
|
||||
detailed here alongside usage examples.
|
||||
|
||||
Input IDs
|
||||
--------------------------
|
||||
|
||||
The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
|
||||
numerical representations of tokens building the sequences that will be used as input by the model*.
|
||||
|
||||
Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
|
||||
tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ tokenizer:
|
||||
|
||||
::
|
||||
|
||||
from transformers import BertTokenizer
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
sequence = "A Titan RTX has 24GB of VRAM"
|
||||
|
||||
The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
|
||||
|
||||
::
|
||||
|
||||
# Continuation of the previous script
|
||||
tokenized_sequence = tokenizer.tokenize(sequence)
|
||||
assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
|
||||
|
||||
These tokens can then be converted into IDs which are understandable by the model. Several methods are available for
|
||||
this, the recommended being `encode` or `encode_plus`, which leverage the Rust implementation of
|
||||
`huggingface/tokenizers <https://github.com/huggingface/tokenizers>`__ for peak performance.
|
||||
|
||||
::
|
||||
|
||||
# Continuation of the previous script
|
||||
encoded_sequence = tokenizer.encode(sequence)
|
||||
assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
|
||||
|
||||
The `encode` and `encode_plus` methods automatically add "special tokens" which are special IDs the model uses.
|
||||
|
||||
Attention mask
|
||||
--------------------------
|
||||
|
||||
The attention mask is an optional argument used when batching sequences together. This argument indicates to the
|
||||
model which tokens should be attended to, and which should not.
|
||||
|
||||
For example, consider these two sequences:
|
||||
|
||||
::
|
||||
|
||||
from transformers import BertTokenizer
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
sequence_a = "This is a short sequence."
|
||||
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
|
||||
|
||||
encoded_sequence_a = tokenizer.encode(sequence_a)
|
||||
assert len(encoded_sequence_a) == 8
|
||||
|
||||
encoded_sequence_b = tokenizer.encode(sequence_b)
|
||||
assert len(encoded_sequence_b) == 19
|
||||
|
||||
These two sequences have different lengths and therefore can't be put together in the same tensor as-is. The first
|
||||
sequence needs to be padded up to the length of the second one, or the second one needs to be truncated down to
|
||||
the length of the first one.
|
||||
|
||||
In the first case, the list of IDs will be extended by the padding indices:
|
||||
|
||||
::
|
||||
|
||||
# Continuation of the previous script
|
||||
padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)
|
||||
|
||||
assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
|
||||
|
||||
These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
|
||||
the position of the padded indices so that the model does not attend to them. For the
|
||||
:class:`~transformers.BertTokenizer`, :obj:`1` indicates a value that should be attended to while :obj:`0` indicates
|
||||
a padded value.
|
||||
|
||||
The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to obtain the attention mask directly:
|
||||
|
||||
::
|
||||
|
||||
# Continuation of the previous script
|
||||
sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
|
||||
|
||||
assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
|
||||
|
||||
Token Type IDs
|
||||
--------------------------
|
||||
|
||||
Some models' purpose is to do sequence classification or question answering. These require two different sequences to
|
||||
be encoded in the same input IDs. They are usually separated by special tokens, such as the classifier and separator
|
||||
tokens. For example, the BERT model builds its two sequence input as such:
|
||||
|
||||
::
|
||||
|
||||
from transformers import BertTokenizer
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
# [CLS] SEQ_A [SEP] SEQ_B [SEP]
|
||||
|
||||
sequence_a = "HuggingFace is based in NYC"
|
||||
sequence_b = "Where is HuggingFace based?"
|
||||
|
||||
encoded_sequence = tokenizer.encode(sequence_a, sequence_b)
|
||||
assert tokenizer.decode(encoded_sequence) == "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
|
||||
|
||||
This is enough for some models to understand where one sequence ends and where another begins. However, other models
|
||||
such as BERT have an additional mechanism, which are the segment IDs. The Token Type IDs are a binary mask identifying
|
||||
the different sequences in the model.
|
||||
|
||||
We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output the Token Type IDs for us:
|
||||
|
||||
::
|
||||
|
||||
# Continuation of the previous script
|
||||
encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
|
||||
|
||||
assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
|
||||
assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
|
||||
The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
|
||||
question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
|
||||
additional token represented by a :obj:`2`.
|
||||
|
||||
|
||||
Position IDs
|
||||
--------------------------
|
||||
|
||||
The position IDs are used by the model to identify which token is at which position. Contrary to RNNs that have the
|
||||
position of each token embedded within them, transformers are unaware of the position of each token. The position
|
||||
IDs are created for this purpose.
|
||||
|
||||
They are an optional parameter. If no position IDs are passed to the model, they are automatically created as absolute
|
||||
positional embeddings.
|
||||
|
||||
Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
|
||||
use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
|
||||
@@ -50,6 +50,8 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
||||
9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
|
||||
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||
12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
||||
13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
@@ -57,7 +59,9 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
||||
|
||||
installation
|
||||
quickstart
|
||||
glossary
|
||||
pretrained_models
|
||||
model_sharing
|
||||
examples
|
||||
notebooks
|
||||
serialization
|
||||
@@ -94,3 +98,5 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
||||
model_doc/ctrl
|
||||
model_doc/camembert
|
||||
model_doc/albert
|
||||
model_doc/xlmroberta
|
||||
model_doc/flaubert
|
||||
@@ -1,6 +1,6 @@
|
||||
# Installation
|
||||
|
||||
Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
|
||||
Transformers is tested on Python 3.5+ and PyTorch 1.1.0
|
||||
|
||||
## With pip
|
||||
|
||||
@@ -17,34 +17,18 @@ To install from source, clone the repository and install with:
|
||||
``` bash
|
||||
git clone https://github.com/huggingface/transformers.git
|
||||
cd transformers
|
||||
pip install [--editable] .
|
||||
pip install .
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
|
||||
|
||||
Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
|
||||
|
||||
Run all the tests from the root of the cloned repository with the commands:
|
||||
|
||||
```bash
|
||||
python -m unittest discover -s transformers/tests -p "*test.py" -t .
|
||||
python -m unittest discover -s examples -p "*test.py" -t examples
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
``` bash
|
||||
python -m pytest -sv ./transformers/tests/
|
||||
python -m pytest -sv ./examples/
|
||||
```
|
||||
|
||||
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
|
||||
Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests.
|
||||
|
||||
## OpenAI GPT original tokenization workflow
|
||||
|
||||
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
|
||||
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:
|
||||
|
||||
``` bash
|
||||
pip install spacy ftfy==4.4.3
|
||||
|
||||
@@ -20,14 +20,12 @@ The ``.optimization`` module provides:
|
||||
:members:
|
||||
|
||||
.. autofunction:: transformers.create_optimizer
|
||||
:members:
|
||||
|
||||
Schedules
|
||||
----------------------------------------------------
|
||||
|
||||
Learning Rate Schedules
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
.. autofunction:: transformers.get_constant_schedule
|
||||
|
||||
|
||||
@@ -39,7 +37,6 @@ Learning Rate Schedules
|
||||
|
||||
|
||||
.. autofunction:: transformers.get_cosine_schedule_with_warmup
|
||||
:members:
|
||||
|
||||
.. image:: /imgs/warmup_cosine_schedule.png
|
||||
:target: /imgs/warmup_cosine_schedule.png
|
||||
@@ -63,7 +60,7 @@ Learning Rate Schedules
|
||||
``Warmup``
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.Warmup
|
||||
.. autoclass:: transformers.WarmUp
|
||||
:members:
|
||||
|
||||
Gradient Strategies
|
||||
|
||||
@@ -1,63 +1,92 @@
|
||||
ALBERT
|
||||
----------------------------------------------------
|
||||
|
||||
``AlbrtConfig``
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
|
||||
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
|
||||
two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:
|
||||
|
||||
- Splitting the embedding matrix into two smaller matrices
|
||||
- Using repeating layers split among groups
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Increasing model size when pretraining natural language representations often results in improved performance on
|
||||
downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations,
|
||||
longer training times, and unexpected model degradation. To address these problems, we present two parameter-reduction
|
||||
techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows
|
||||
that our proposed methods lead to models that scale much better compared to the original BERT. We also use a
|
||||
self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream
|
||||
tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE,
|
||||
RACE, and SQuAD benchmarks while having fewer parameters compared to BERT-large.*
|
||||
|
||||
Tips:
|
||||
|
||||
- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
- ALBERT uses repeating layers which results in a small memory footprint, however the computational cost remains
|
||||
similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
|
||||
number of (repeating) layers.
|
||||
|
||||
AlbertConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertConfig
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertTokenizer``
|
||||
AlbertTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertModel``
|
||||
AlbertModel
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertModel
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertForMaskedLM``
|
||||
AlbertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertForSequenceClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
AlbertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``AlbertForQuestionAnswering``
|
||||
AlbertForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AlbertForQuestionAnswering
|
||||
:members:
|
||||
|
||||
|
||||
``TFAlbertModel``
|
||||
TFAlbertModel
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFAlbertModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFAlbertForMaskedLM``
|
||||
TFAlbertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFAlbertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``TFAlbertForSequenceClassification``
|
||||
TFAlbertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFAlbertForSequenceClassification
|
||||
|
||||
@@ -3,7 +3,7 @@ AutoModels
|
||||
|
||||
In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
|
||||
|
||||
AutoClasses are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary:
|
||||
AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:
|
||||
|
||||
Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create an instance of ``BertModel``).
|
||||
|
||||
@@ -15,6 +15,13 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
|
||||
:members:
|
||||
|
||||
|
||||
``AutoTokenizer``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AutoTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``AutoModel``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@@ -22,8 +29,37 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
|
||||
:members:
|
||||
|
||||
|
||||
``AutoTokenizer``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
``AutoModelForPreTraining``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AutoTokenizer
|
||||
.. autoclass:: transformers.AutoModelForPreTraining
|
||||
:members:
|
||||
|
||||
|
||||
``AutoModelWithLMHead``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AutoModelWithLMHead
|
||||
:members:
|
||||
|
||||
|
||||
``AutoModelForSequenceClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AutoModelForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``AutoModelForQuestionAnswering``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AutoModelForQuestionAnswering
|
||||
:members:
|
||||
|
||||
|
||||
``AutoModelForTokenClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.AutoModelForTokenClassification
|
||||
:members:
|
||||
|
||||
|
||||
@@ -1,126 +1,160 @@
|
||||
BERT
|
||||
----------------------------------------------------
|
||||
|
||||
``BertConfig``
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__
|
||||
by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
|
||||
pre-trained using a combination of masked language modeling objective and next sentence prediction
|
||||
on a large corpus comprising the Toronto Book Corpus and Wikipedia.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations
|
||||
from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional
|
||||
representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result,
|
||||
the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models
|
||||
for a wide range of tasks, such as question answering and language inference, without substantial task-specific
|
||||
architecture modifications.*
|
||||
|
||||
*BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural
|
||||
language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI
|
||||
accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute
|
||||
improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).*
|
||||
|
||||
Tips:
|
||||
|
||||
- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
- BERT was trained with a masked language modeling (MLM) objective. It is therefore efficient at predicting masked
|
||||
tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language
|
||||
modeling (CLM) objective are better in that regard.
|
||||
- Alongside MLM, BERT was trained using a next sentence prediction (NSP) objective using the [CLS] token as a sequence
|
||||
approximation. The user may use this token (the first token in a sequence built with special tokens) to get a sequence
|
||||
prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
|
||||
the [CLS] token.
|
||||
|
||||
BertConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertConfig
|
||||
:members:
|
||||
|
||||
|
||||
``BertTokenizer``
|
||||
BertTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``BertModel``
|
||||
BertModel
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertModel
|
||||
:members:
|
||||
|
||||
|
||||
``BertForPreTraining``
|
||||
BertForPreTraining
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertForPreTraining
|
||||
:members:
|
||||
|
||||
|
||||
``BertForMaskedLM``
|
||||
BertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``BertForNextSentencePrediction``
|
||||
BertForNextSentencePrediction
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertForNextSentencePrediction
|
||||
:members:
|
||||
|
||||
|
||||
``BertForSequenceClassification``
|
||||
BertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``BertForMultipleChoice``
|
||||
BertForMultipleChoice
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertForMultipleChoice
|
||||
:members:
|
||||
|
||||
|
||||
``BertForTokenClassification``
|
||||
BertForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertForTokenClassification
|
||||
:members:
|
||||
|
||||
|
||||
``BertForQuestionAnswering``
|
||||
BertForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.BertForQuestionAnswering
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertModel``
|
||||
TFBertModel
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertForPreTraining``
|
||||
TFBertForPreTraining
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertForPreTraining
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertForMaskedLM``
|
||||
TFBertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertForNextSentencePrediction``
|
||||
TFBertForNextSentencePrediction
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertForNextSentencePrediction
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertForSequenceClassification``
|
||||
TFBertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertForMultipleChoice``
|
||||
TFBertForMultipleChoice
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertForMultipleChoice
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertForTokenClassification``
|
||||
TFBertForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertForTokenClassification
|
||||
:members:
|
||||
|
||||
|
||||
``TFBertForQuestionAnswering``
|
||||
TFBertForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFBertForQuestionAnswering
|
||||
|
||||
@@ -1,50 +1,99 @@
|
||||
CamemBERT
|
||||
----------------------------------------------------
|
||||
|
||||
``CamembertConfig``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
|
||||
by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
|
||||
Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
|
||||
trained on 138GB of French text.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success,
|
||||
most available models have either been trained on English data or on the concatenation of data in multiple
|
||||
languages. This makes practical use of such models --in all languages except English-- very limited. Aiming
|
||||
to address this issue for French, we release CamemBERT, a French version of the Bi-directional Encoders for
|
||||
Transformers (BERT). We measure the performance of CamemBERT compared to multilingual models in multiple
|
||||
downstream tasks, namely part-of-speech tagging, dependency parsing, named-entity recognition, and natural
|
||||
language inference. CamemBERT improves the state of the art for most of the tasks considered. We release the
|
||||
pretrained model for CamemBERT hoping to foster research and downstream applications for French NLP.*
|
||||
|
||||
Tips:
|
||||
|
||||
- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
|
||||
examples as well as the information relative to the inputs and outputs.
|
||||
|
||||
CamembertConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertConfig
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertTokenizer``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
CamembertTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertModel``
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
CamembertModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertModel
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForMaskedLM``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
CamembertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForSequenceClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
CamembertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForMultipleChoice``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
CamembertForMultipleChoice
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForMultipleChoice
|
||||
:members:
|
||||
|
||||
|
||||
``CamembertForTokenClassification``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
CamembertForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CamembertForTokenClassification
|
||||
:members:
|
||||
|
||||
|
||||
TFCamembertModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFCamembertModel
|
||||
:members:
|
||||
|
||||
|
||||
TFCamembertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFCamembertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
TFCamembertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFCamembertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
TFCamembertForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFCamembertForTokenClassification
|
||||
:members:
|
||||
|
||||
@@ -1,47 +1,73 @@
|
||||
CTRL
|
||||
----------------------------------------------------
|
||||
|
||||
Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl),
|
||||
you'll be able to convert from TF to our HuggingFace/Transformers format using the
|
||||
``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_).
|
||||
CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_
|
||||
by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
|
||||
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
|
||||
corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Large-scale language models show promising text generation capabilities, but users cannot easily control particular
|
||||
aspects of the generated text. We release CTRL, a 1.63 billion-parameter conditional transformer language model,
|
||||
trained to condition on control codes that govern style, content, and task-specific behavior. Control codes were
|
||||
derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning
|
||||
while providing more explicit control over text generation. These codes also allow CTRL to predict which parts of
|
||||
the training data are most likely given a sequence. This provides a potential method for analyzing large amounts
|
||||
of data via model-based source attribution.*
|
||||
|
||||
Tips:
|
||||
|
||||
- CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences
|
||||
or links to generate coherent text. Refer to the `original implementation <https://github.com/salesforce/ctrl>`__
|
||||
for more information.
|
||||
- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
- CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
|
||||
token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as
|
||||
it can be observed in the `run_generation.py` example script.
|
||||
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
|
||||
this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
|
||||
See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
|
||||
of this argument.
|
||||
|
||||
|
||||
``CTRLConfig``
|
||||
CTRLConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CTRLConfig
|
||||
:members:
|
||||
|
||||
|
||||
``CTRLTokenizer``
|
||||
CTRLTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CTRLTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``CTRLModel``
|
||||
CTRLModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CTRLModel
|
||||
:members:
|
||||
|
||||
|
||||
``CTRLLMHeadModel``
|
||||
CTRLLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.CTRLLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFCTRLModel``
|
||||
TFCTRLModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFCTRLModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFCTRLLMHeadModel``
|
||||
TFCTRLLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFCTRLLMHeadModel
|
||||
|
||||
@@ -1,69 +1,96 @@
|
||||
DistilBERT
|
||||
----------------------------------------------------
|
||||
|
||||
``DistilBertConfig``
|
||||
The DistilBERT model was proposed in the blog post
|
||||
`Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__,
|
||||
and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
|
||||
DistilBERT is a small, fast, cheap and light Transformer model trained by distilling Bert base. It has 40% fewer
|
||||
parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on
|
||||
the GLUE language understanding benchmark.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP),
|
||||
operating these large models in on-the-edge and/or under constrained computational training or inference budgets
|
||||
remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
|
||||
model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
|
||||
counterparts. While most prior work investigated the use of distillation for building task-specific models, we
|
||||
leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a
|
||||
BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage
|
||||
the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language
|
||||
modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train
|
||||
and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative
|
||||
on-device study.*
|
||||
|
||||
Tips:
|
||||
|
||||
- DistilBert doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`).
|
||||
- DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.
|
||||
|
||||
|
||||
DistilBertConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.DistilBertConfig
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertTokenizer``
|
||||
DistilBertTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.DistilBertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertModel``
|
||||
DistilBertModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.DistilBertModel
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertForMaskedLM``
|
||||
DistilBertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.DistilBertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertForSequenceClassification``
|
||||
DistilBertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.DistilBertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``DistilBertForQuestionAnswering``
|
||||
DistilBertForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.DistilBertForQuestionAnswering
|
||||
:members:
|
||||
|
||||
``TFDistilBertModel``
|
||||
TFDistilBertModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFDistilBertModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFDistilBertForMaskedLM``
|
||||
TFDistilBertForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFDistilBertForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``TFDistilBertForSequenceClassification``
|
||||
TFDistilBertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFDistilBertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``TFDistilBertForQuestionAnswering``
|
||||
TFDistilBertForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFDistilBertForQuestionAnswering
|
||||
|
||||
72
docs/source/model_doc/flaubert.rst
Normal file
72
docs/source/model_doc/flaubert.rst
Normal file
@@ -0,0 +1,72 @@
|
||||
FlauBERT
|
||||
----------------------------------------------------
|
||||
|
||||
The FlauBERT model was proposed in the paper
|
||||
`FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le et al.
|
||||
It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like).
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Language models have become a key step to achieve state-of-the art results in many different Natural Language
|
||||
Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient
|
||||
way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their
|
||||
contextualization at the sentence level. This has been widely demonstrated for English using contextualized
|
||||
representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et
|
||||
al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large
|
||||
and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre
|
||||
for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
|
||||
classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most
|
||||
of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified
|
||||
evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
|
||||
to the research community for further reproducible experiments in French NLP.*
|
||||
|
||||
|
||||
FlaubertConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaubertConfig
|
||||
:members:
|
||||
|
||||
|
||||
FlaubertTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaubertTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
FlaubertModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaubertModel
|
||||
:members:
|
||||
|
||||
|
||||
FlaubertWithLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaubertWithLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
FlaubertForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaubertForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
FlaubertForQuestionAnsweringSimple
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaubertForQuestionAnsweringSimple
|
||||
:members:
|
||||
|
||||
|
||||
FlaubertForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaubertForQuestionAnswering
|
||||
:members:
|
||||
|
||||
|
||||
@@ -1,56 +1,91 @@
|
||||
OpenAI GPT
|
||||
----------------------------------------------------
|
||||
|
||||
``OpenAIGPTConfig``
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
OpenAI GPT model was proposed in `Improving Language Understanding by Generative Pre-Training <https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf>`__
|
||||
by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional)
|
||||
transformer pre-trained using language modeling on a large corpus with long range dependencies, the Toronto Book Corpus.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Natural language understanding comprises a wide range of diverse tasks such
|
||||
as textual entailment, question answering, semantic similarity assessment, and
|
||||
document classification. Although large unlabeled text corpora are abundant,
|
||||
labeled data for learning these specific tasks is scarce, making it challenging for
|
||||
discriminatively trained models to perform adequately. We demonstrate that large
|
||||
gains on these tasks can be realized by generative pre-training of a language model
|
||||
on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each
|
||||
specific task. In contrast to previous approaches, we make use of task-aware input
|
||||
transformations during fine-tuning to achieve effective transfer while requiring
|
||||
minimal changes to the model architecture. We demonstrate the effectiveness of
|
||||
our approach on a wide range of benchmarks for natural language understanding.
|
||||
Our general task-agnostic model outperforms discriminatively trained models that
|
||||
use architectures specifically crafted for each task, significantly improving upon the
|
||||
state of the art in 9 out of the 12 tasks studied.*
|
||||
|
||||
Tips:
|
||||
|
||||
- GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
- GPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
|
||||
token in a sequence. Leveraging this feature allows GPT to generate syntactically coherent text as
|
||||
it can be observed in the `run_generation.py` example script.
|
||||
|
||||
`Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
|
||||
Hugging Face showcasing the generative capabilities of several models. GPT is one of them.
|
||||
|
||||
OpenAIGPTConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.OpenAIGPTConfig
|
||||
:members:
|
||||
|
||||
|
||||
``OpenAIGPTTokenizer``
|
||||
OpenAIGPTTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.OpenAIGPTTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``OpenAIGPTModel``
|
||||
OpenAIGPTModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.OpenAIGPTModel
|
||||
:members:
|
||||
|
||||
|
||||
``OpenAIGPTLMHeadModel``
|
||||
OpenAIGPTLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.OpenAIGPTLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``OpenAIGPTDoubleHeadsModel``
|
||||
OpenAIGPTDoubleHeadsModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFOpenAIGPTModel``
|
||||
TFOpenAIGPTModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFOpenAIGPTModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFOpenAIGPTLMHeadModel``
|
||||
TFOpenAIGPTLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFOpenAIGPTLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFOpenAIGPTDoubleHeadsModel``
|
||||
TFOpenAIGPTDoubleHeadsModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
|
||||
|
||||
@@ -1,56 +1,90 @@
|
||||
OpenAI GPT2
|
||||
----------------------------------------------------
|
||||
|
||||
``GPT2Config``
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
OpenAI GPT-2 model was proposed in
|
||||
`Language Models are Unsupervised Multitask Learners`_
|
||||
by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
|
||||
corpus of ~40 GB of text data.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset[1]
|
||||
of 8 million web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous
|
||||
words within some text. The diversity of the dataset causes this simple goal to contain naturally occurring
|
||||
demonstrations of many tasks across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X
|
||||
the parameters and trained on more than 10X the amount of data.*
|
||||
|
||||
Tips:
|
||||
|
||||
- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
|
||||
token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as
|
||||
it can be observed in the `run_generation.py` example script.
|
||||
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
|
||||
this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
|
||||
See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
|
||||
of this argument.
|
||||
|
||||
`Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
|
||||
Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
|
||||
different sizes: small, medium, large, xl and a distilled version of the small checkpoint: distilgpt-2.
|
||||
|
||||
|
||||
GPT2Config
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.GPT2Config
|
||||
:members:
|
||||
|
||||
|
||||
``GPT2Tokenizer``
|
||||
GPT2Tokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.GPT2Tokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``GPT2Model``
|
||||
GPT2Model
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.GPT2Model
|
||||
:members:
|
||||
|
||||
|
||||
``GPT2LMHeadModel``
|
||||
GPT2LMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.GPT2LMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``GPT2DoubleHeadsModel``
|
||||
GPT2DoubleHeadsModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.GPT2DoubleHeadsModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFGPT2Model``
|
||||
TFGPT2Model
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFGPT2Model
|
||||
:members:
|
||||
|
||||
|
||||
``TFGPT2LMHeadModel``
|
||||
TFGPT2LMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFGPT2LMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFGPT2DoubleHeadsModel``
|
||||
TFGPT2DoubleHeadsModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFGPT2DoubleHeadsModel
|
||||
|
||||
@@ -1,57 +1,94 @@
|
||||
RoBERTa
|
||||
----------------------------------------------------
|
||||
|
||||
``RobertaConfig``
|
||||
The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_
|
||||
by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
|
||||
Veselin Stoyanov. It is based on Google's BERT model released in 2018.
|
||||
|
||||
It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
|
||||
objective and training with much larger mini-batches and learning rates.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Language model pretraining has led to significant performance gains but careful comparison between different
|
||||
approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
|
||||
and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
|
||||
study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
|
||||
training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of
|
||||
every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These
|
||||
results highlight the importance of previously overlooked design choices, and raise questions about the source
|
||||
of recently reported improvements. We release our models and code.*
|
||||
|
||||
Tips:
|
||||
|
||||
- This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a
|
||||
setup for Roberta pretrained models.
|
||||
- `Camembert <./camembert.html>`__ is a wrapper around RoBERTa. Refer to this page for usage examples.
|
||||
|
||||
RobertaConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.RobertaConfig
|
||||
:members:
|
||||
|
||||
|
||||
``RobertaTokenizer``
|
||||
RobertaTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.RobertaTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``RobertaModel``
|
||||
RobertaModel
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.RobertaModel
|
||||
:members:
|
||||
|
||||
|
||||
``RobertaForMaskedLM``
|
||||
RobertaForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.RobertaForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``RobertaForSequenceClassification``
|
||||
RobertaForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.RobertaForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``TFRobertaModel``
|
||||
RobertaForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.RobertaForTokenClassification
|
||||
:members:
|
||||
|
||||
TFRobertaModel
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFRobertaModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFRobertaForMaskedLM``
|
||||
TFRobertaForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFRobertaForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
``TFRobertaForSequenceClassification``
|
||||
TFRobertaForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFRobertaForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
TFRobertaForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFRobertaForTokenClassification
|
||||
:members:
|
||||
|
||||
@@ -1,43 +1,72 @@
|
||||
Transformer XL
|
||||
----------------------------------------------------
|
||||
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``TransfoXLConfig``
|
||||
The Transformer-XL model was proposed in
|
||||
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__
|
||||
by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
|
||||
It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can reuse
|
||||
previously computed hidden-states to attend to longer context (memory).
|
||||
This model also uses adaptive softmax inputs and outputs (tied).
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
|
||||
setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
|
||||
beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and
|
||||
a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves
|
||||
the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and
|
||||
450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up
|
||||
to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results
|
||||
of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on
|
||||
Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
|
||||
coherent, novel text articles with thousands of tokens.*
|
||||
|
||||
Tips:
|
||||
|
||||
- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right.
|
||||
The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
|
||||
- Transformer-XL is one of the few models that has no sequence length limit.
|
||||
|
||||
|
||||
TransfoXLConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TransfoXLConfig
|
||||
:members:
|
||||
|
||||
|
||||
``TransfoXLTokenizer``
|
||||
TransfoXLTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TransfoXLTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``TransfoXLModel``
|
||||
TransfoXLModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TransfoXLModel
|
||||
:members:
|
||||
|
||||
|
||||
``TransfoXLLMHeadModel``
|
||||
TransfoXLLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TransfoXLLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFTransfoXLModel``
|
||||
TFTransfoXLModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFTransfoXLModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFTransfoXLLMHeadModel``
|
||||
TFTransfoXLLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFTransfoXLLMHeadModel
|
||||
|
||||
@@ -1,68 +1,105 @@
|
||||
XLM
|
||||
----------------------------------------------------
|
||||
|
||||
``XLMConfig``
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The XLM model was proposed in `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_
|
||||
by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
|
||||
|
||||
- a causal language modeling (CLM) objective (next token prediction),
|
||||
- a masked language modeling (MLM) objective (Bert-like), or
|
||||
- a Translation Language Modeling (TLM) objective (extension of Bert's MLM to multiple language inputs)
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
|
||||
In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining.
|
||||
We propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
|
||||
data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
|
||||
state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI,
|
||||
our approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation,
|
||||
we obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On
|
||||
supervised machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming
|
||||
the previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
|
||||
|
||||
Tips:
|
||||
|
||||
- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
|
||||
select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
|
||||
- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the
|
||||
`multi-lingual <../multilingual.html>`__ page for more information.
|
||||
|
||||
|
||||
XLMConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMConfig
|
||||
:members:
|
||||
|
||||
``XLMTokenizer``
|
||||
XLMTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMTokenizer
|
||||
:members:
|
||||
|
||||
``XLMModel``
|
||||
XLMModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMModel
|
||||
:members:
|
||||
|
||||
|
||||
``XLMWithLMHeadModel``
|
||||
XLMWithLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMWithLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``XLMForSequenceClassification``
|
||||
XLMForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``XLMForQuestionAnswering``
|
||||
XLMForQuestionAnsweringSimple
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMForQuestionAnsweringSimple
|
||||
:members:
|
||||
|
||||
|
||||
XLMForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMForQuestionAnswering
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLMModel``
|
||||
TFXLMModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLMWithLMHeadModel``
|
||||
TFXLMWithLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMWithLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLMForSequenceClassification``
|
||||
TFXLMForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLMForQuestionAnsweringSimple``
|
||||
TFXLMForQuestionAnsweringSimple
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
|
||||
|
||||
102
docs/source/model_doc/xlmroberta.rst
Normal file
102
docs/source/model_doc/xlmroberta.rst
Normal file
@@ -0,0 +1,102 @@
|
||||
XLM-RoBERTa
|
||||
------------------------------------------
|
||||
|
||||
The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
|
||||
by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
|
||||
Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
|
||||
It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for
|
||||
a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
|
||||
languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
|
||||
outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy
|
||||
on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
|
||||
low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model.
|
||||
We also present a detailed empirical evaluation of the key factors that are required to achieve these gains,
|
||||
including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and
|
||||
low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling
|
||||
without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE
|
||||
and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.*
|
||||
|
||||
Tips:
|
||||
|
||||
- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
|
||||
examples as well as the information relative to the inputs and outputs.
|
||||
|
||||
XLMRobertaConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMRobertaConfig
|
||||
:members:
|
||||
|
||||
|
||||
XLMRobertaTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMRobertaTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
XLMRobertaModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMRobertaModel
|
||||
:members:
|
||||
|
||||
|
||||
XLMRobertaForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMRobertaForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
XLMRobertaForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMRobertaForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
XLMRobertaForMultipleChoice
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMRobertaForMultipleChoice
|
||||
:members:
|
||||
|
||||
|
||||
XLMRobertaForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLMRobertaForTokenClassification
|
||||
:members:
|
||||
|
||||
|
||||
TFXLMRobertaModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMRobertaModel
|
||||
:members:
|
||||
|
||||
|
||||
TFXLMRobertaForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMRobertaForMaskedLM
|
||||
:members:
|
||||
|
||||
|
||||
TFXLMRobertaForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMRobertaForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
TFXLMRobertaForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLMRobertaForTokenClassification
|
||||
:members:
|
||||
@@ -1,70 +1,123 @@
|
||||
XLNet
|
||||
----------------------------------------------------
|
||||
|
||||
``XLNetConfig``
|
||||
Overview
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The XLNet model was proposed in `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_
|
||||
by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
|
||||
XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
|
||||
to learn bidirectional contexts by maximizing the expected likelihood over all permutations
|
||||
of the input sequence factorization order.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves
|
||||
better performance than pretraining approaches based on autoregressive language modeling. However, relying on
|
||||
corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a
|
||||
pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive
|
||||
pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over
|
||||
all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
|
||||
formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model,
|
||||
into pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by
|
||||
a large margin, including question answering, natural language inference, sentiment analysis, and document ranking.*
|
||||
|
||||
Tips:
|
||||
|
||||
- The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
|
||||
- Due to the difficulty of training a fully auto-regressive model over various factorization orders,
|
||||
XLNet is pretrained using only a sub-set of the output tokens as target which are selected
|
||||
with the `target_mapping` input.
|
||||
- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
|
||||
`target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
|
||||
- XLNet is one of the few models that has no sequence length limit.
|
||||
|
||||
|
||||
XLNetConfig
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetConfig
|
||||
:members:
|
||||
|
||||
|
||||
``XLNetTokenizer``
|
||||
XLNetTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetTokenizer
|
||||
:members:
|
||||
|
||||
|
||||
``XLNetModel``
|
||||
XLNetModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetModel
|
||||
:members:
|
||||
|
||||
|
||||
``XLNetLMHeadModel``
|
||||
XLNetLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``XLNetForSequenceClassification``
|
||||
XLNetForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``XLNetForQuestionAnswering``
|
||||
XLNetForTokenClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetForTokenClassification
|
||||
:members:
|
||||
|
||||
|
||||
XLNetForMultipleChoice
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetForMultipleChoice
|
||||
:members:
|
||||
|
||||
|
||||
XLNetForQuestionAnsweringSimple
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
|
||||
:members:
|
||||
|
||||
|
||||
XLNetForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.XLNetForQuestionAnswering
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLNetModel``
|
||||
TFXLNetModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLNetModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLNetLMHeadModel``
|
||||
TFXLNetLMHeadModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLNetLMHeadModel
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLNetForSequenceClassification``
|
||||
TFXLNetForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLNetForSequenceClassification
|
||||
:members:
|
||||
|
||||
|
||||
``TFXLNetForQuestionAnsweringSimple``
|
||||
TFXLNetForQuestionAnsweringSimple
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
|
||||
|
||||
45
docs/source/model_sharing.md
Normal file
45
docs/source/model_sharing.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Model upload and sharing
|
||||
|
||||
Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built into the library.
|
||||
|
||||
**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
|
||||
|
||||
```shell
|
||||
transformers-cli login
|
||||
# log in using the same credentials as on huggingface.co
|
||||
```
|
||||
Upload your model:
|
||||
```shell
|
||||
transformers-cli upload ./path/to/pretrained_model/
|
||||
|
||||
# ^^ Upload folder containing weights/tokenizer/config
|
||||
# saved via `.save_pretrained()`
|
||||
|
||||
transformers-cli upload ./config.json [--filename folder/foobar.json]
|
||||
|
||||
# ^^ Upload a single file
|
||||
# (you can optionally override its filename, which can be nested inside a folder)
|
||||
```
|
||||
|
||||
Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
|
||||
```python
|
||||
"username/pretrained_model"
|
||||
```
|
||||
|
||||
Anyone can load it from code:
|
||||
```python
|
||||
tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
|
||||
model = AutoModel.from_pretrained("username/pretrained_model")
|
||||
```
|
||||
|
||||
Finally, list all your files on S3:
|
||||
```shell
|
||||
transformers-cli s3 ls
|
||||
# List all your S3 objects.
|
||||
```
|
||||
|
||||
You can also delete files:
|
||||
|
||||
```shell
|
||||
transformers-cli s3 rm …
|
||||
```
|
||||
@@ -3,6 +3,7 @@ Pretrained models
|
||||
|
||||
Here is the full list of the currently provided pretrained models together with a short presentation of each model.
|
||||
|
||||
For a list that includes community-uploaded models, refer to `https://huggingface.co/models <https://huggingface.co/models>`__.
|
||||
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| Architecture | Shortcut name | Details of the model |
|
||||
@@ -79,6 +80,18 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
|
||||
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-finnish-cased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on cased Finnish text. |
|
||||
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-finnish-uncased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on uncased Finnish text. |
|
||||
| | | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__). |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``bert-base-dutch-cased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | Trained on cased Dutch text. |
|
||||
| | | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__). |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
|
||||
| | | | OpenAI GPT English model |
|
||||
@@ -146,6 +159,10 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__. |
|
||||
| | | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
||||
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``roberta-base-openai-detector`` | | 12-layer, 768-hidden, 12-heads, 125M parameters |
|
||||
| | | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. |
|
||||
| | | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__) |
|
||||
@@ -166,10 +183,6 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters |
|
||||
| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters |
|
||||
| | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. |
|
||||
| | | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__) |
|
||||
@@ -217,6 +230,43 @@ Here is the full list of the currently provided pretrained models together with
|
||||
| | | | ALBERT xxlarge model with no dropout, additional training data and longer training |
|
||||
| | | (see `details <https://github.com/google-research/ALBERT>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| T5 | ``t5-small`` | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads, |
|
||||
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``t5-base`` | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads, |
|
||||
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``t5-large`` | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, |
|
||||
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``t5-3B`` | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads, |
|
||||
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``t5-11B`` | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads, |
|
||||
| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| XLM-RoBERTa | ``xlm-roberta-base`` | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads, |
|
||||
| | | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``xlm-roberta-large`` | | ~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, |
|
||||
| | | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| FlauBERT | ``flaubert-small-cased`` | | 6-layer, 512-hidden, 8-heads, 54M parameters |
|
||||
| | | | FlauBERT small architecture |
|
||||
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``flaubert-base-uncased`` | | 12-layer, 768-hidden, 12-heads, 137M parameters |
|
||||
| | | | FlauBERT base architecture with uncased vocabulary |
|
||||
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``flaubert-base-cased`` | | 12-layer, 768-hidden, 12-heads, 138M parameters |
|
||||
| | | | FlauBERT base architecture with cased vocabulary |
|
||||
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
|
||||
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| | ``flaubert-large-cased`` | | 24-layer, 1024-hidden, 16-heads, 373M parameters |
|
||||
| | | | FlauBERT large architecture |
|
||||
| | | (see `details <https://github.com/getalp/Flaubert>`__) |
|
||||
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
|
||||
.. <https://huggingface.co/transformers/examples.html>`__
|
||||
|
||||
@@ -219,4 +219,97 @@ sequence = tokenizer.decode(generated)
|
||||
print(sequence)
|
||||
```
|
||||
|
||||
The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
|
||||
The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
|
||||
|
||||
### Model2Model example
|
||||
|
||||
Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import BertTokenizer, Model2Model
|
||||
|
||||
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
|
||||
import logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# Load pre-trained model tokenizer (vocabulary)
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
|
||||
# Encode the input to the encoder (the question)
|
||||
question = "Who was Jim Henson?"
|
||||
encoded_question = tokenizer.encode(question)
|
||||
|
||||
# Encode the input to the decoder (the answer)
|
||||
answer = "Jim Henson was a puppeteer"
|
||||
encoded_answer = tokenizer.encode(answer)
|
||||
|
||||
# Convert inputs to PyTorch tensors
|
||||
question_tensor = torch.tensor([encoded_question])
|
||||
answer_tensor = torch.tensor([encoded_answer])
|
||||
```
|
||||
|
||||
Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
|
||||
|
||||
```python
|
||||
# In order to compute the loss we need to provide language model
|
||||
# labels (the token ids that the model should have produced) to
|
||||
# the decoder.
|
||||
lm_labels = encoded_answer
|
||||
labels_tensor = torch.tensor([lm_labels])
|
||||
|
||||
# Load pre-trained model (weights)
|
||||
model = Model2Model.from_pretrained('bert-base-uncased')
|
||||
|
||||
# Set the model in evaluation mode to deactivate the DropOut modules
|
||||
# This is IMPORTANT to have reproducible results during evaluation!
|
||||
model.eval()
|
||||
|
||||
# If you have a GPU, put everything on cuda
|
||||
question_tensor = question_tensor.to('cuda')
|
||||
answer_tensor = answer_tensor.to('cuda')
|
||||
labels_tensor = labels_tensor.to('cuda')
|
||||
model.to('cuda')
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
with torch.no_grad():
|
||||
# See the models docstrings for the detail of the inputs
|
||||
outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
|
||||
# Transformers models always output tuples.
|
||||
# See the models docstrings for the detail of all the outputs
|
||||
# In our case, the first element is the value of the LM loss
|
||||
lm_loss = outputs[0]
|
||||
```
|
||||
|
||||
This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
|
||||
|
||||
```python
|
||||
# Let's re-use the previous question
|
||||
question = "Who was Jim Henson?"
|
||||
encoded_question = tokenizer.encode(question)
|
||||
question_tensor = torch.tensor([encoded_question])
|
||||
|
||||
# This time we try to generate the answer, so we start with an empty sequence
|
||||
answer = "[CLS]"
|
||||
encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
|
||||
answer_tensor = torch.tensor([encoded_answer])
|
||||
|
||||
# Load pre-trained model (weights)
|
||||
model = Model2Model.from_pretrained('fine-tuned-weights')
|
||||
model.eval()
|
||||
|
||||
# If you have a GPU, put everything on cuda
|
||||
question_tensor = question_tensor.to('cuda')
|
||||
answer_tensor = answer_tensor.to('cuda')
|
||||
model.to('cuda')
|
||||
|
||||
# Predict all tokens
|
||||
with torch.no_grad():
|
||||
outputs = model(question_tensor, answer_tensor)
|
||||
predictions = outputs[0]
|
||||
|
||||
# confirm we were able to predict 'jim'
|
||||
predicted_index = torch.argmax(predictions[0, -1]).item()
|
||||
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
|
||||
assert predicted_token == 'jim'
|
||||
```
|
||||
|
||||
@@ -10,7 +10,7 @@ Execute the following steps in a new virtual environment:
|
||||
```bash
|
||||
git clone https://github.com/huggingface/transformers
|
||||
cd transformers
|
||||
pip install [--editable] .
|
||||
pip install .
|
||||
pip install -r ./examples/requirements.txt
|
||||
```
|
||||
|
||||
@@ -24,6 +24,8 @@ pip install -r ./examples/requirements.txt
|
||||
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
||||
| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
|
||||
| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
|
||||
| [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language
|
||||
inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019.) |
|
||||
|
||||
## TensorFlow 2.0 Bert models on GLUE
|
||||
|
||||
@@ -43,7 +45,7 @@ Quick benchmarks from the script (no other modifications):
|
||||
| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
|
||||
| V100 | FP32 | 35s | 0.8646/0.8359/0.8464 |
|
||||
| V100 | AMP | 22s | 0.8646/0.8385/0.8411 |
|
||||
| 1080 Ti | FP32 | 55s | - |
|
||||
| 1080 Ti | FP32 | 55s | - |
|
||||
|
||||
Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
|
||||
|
||||
@@ -133,21 +135,21 @@ Fine-tuning the library models for sequence classification on the GLUE benchmark
|
||||
Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
|
||||
|
||||
GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
|
||||
uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
|
||||
batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
|
||||
uncased BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on single V100 GPUs with total train
|
||||
batch sizes between 16 and 64. Some of these tasks have a small dataset and training can lead to high variance in the results
|
||||
between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
|
||||
|
||||
| Task | Metric | Result |
|
||||
|-------|------------------------------|-------------|
|
||||
| CoLA | Matthew's corr | 48.87 |
|
||||
| SST-2 | Accuracy | 91.74 |
|
||||
| MRPC | F1/Accuracy | 90.70/86.27 |
|
||||
| STS-B | Person/Spearman corr. | 91.39/91.04 |
|
||||
| QQP | Accuracy/F1 | 90.79/87.66 |
|
||||
| MNLI | Matched acc./Mismatched acc. | 83.70/84.83 |
|
||||
| QNLI | Accuracy | 89.31 |
|
||||
| RTE | Accuracy | 71.43 |
|
||||
| WNLI | Accuracy | 43.66 |
|
||||
| CoLA | Matthew's corr | 49.23 |
|
||||
| SST-2 | Accuracy | 91.97 |
|
||||
| MRPC | F1/Accuracy | 89.47/85.29 |
|
||||
| STS-B | Pearson/Spearman corr. | 83.95/83.70 |
|
||||
| QQP | Accuracy/F1 | 88.40/84.31 |
|
||||
| MNLI | Matched acc./Mismatched acc. | 80.61/81.08 |
|
||||
| QNLI | Accuracy | 87.46 |
|
||||
| RTE | Accuracy | 61.73 |
|
||||
| WNLI | Accuracy | 45.07 |
|
||||
|
||||
Some of these results are significantly different from the ones reported on the test set
|
||||
of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the website.
|
||||
@@ -357,9 +359,9 @@ eval_loss = 0.44457291918821606
|
||||
|
||||
Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
|
||||
|
||||
#### Fine-tuning on SQuAD
|
||||
#### Fine-tuning BERT on SQuAD1.0
|
||||
|
||||
This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
|
||||
This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
|
||||
on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a
|
||||
$SQUAD_DIR directory.
|
||||
|
||||
@@ -367,6 +369,12 @@ $SQUAD_DIR directory.
|
||||
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
|
||||
* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
|
||||
|
||||
And for SQuAD2.0, you need to download:
|
||||
|
||||
- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
|
||||
- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
|
||||
- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
|
||||
|
||||
```bash
|
||||
export SQUAD_DIR=/path/to/SQUAD
|
||||
|
||||
@@ -396,12 +404,12 @@ exact_match = 81.22
|
||||
#### Distributed training
|
||||
|
||||
|
||||
Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
|
||||
Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach an F1 > 93 on SQuAD1.1:
|
||||
|
||||
```bash
|
||||
python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
|
||||
python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
|
||||
--model_type bert \
|
||||
--model_name_or_path bert-base-cased \
|
||||
--model_name_or_path bert-large-uncased-whole-word-masking \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_lower_case \
|
||||
@@ -411,9 +419,9 @@ python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 384 \
|
||||
--doc_stride 128 \
|
||||
--output_dir ../models/wwm_uncased_finetuned_squad/ \
|
||||
--per_gpu_train_batch_size 24 \
|
||||
--gradient_accumulation_steps 12
|
||||
--output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
|
||||
--per_gpu_eval_batch_size=3 \
|
||||
--per_gpu_train_batch_size=3
|
||||
```
|
||||
|
||||
Training with the previously defined hyper-parameters yields the following results:
|
||||
@@ -428,7 +436,9 @@ This fine-tuned model is available as a checkpoint under the reference
|
||||
|
||||
#### Fine-tuning XLNet on SQuAD
|
||||
|
||||
This example code fine-tunes XLNet on the SQuAD dataset. See above to download the data for SQuAD .
|
||||
This example code fine-tunes XLNet on both the SQuAD1.0 and SQuAD2.0 datasets. See above to download the data for SQuAD.
|
||||
|
||||
##### Command for SQuAD1.0:
|
||||
|
||||
```bash
|
||||
export SQUAD_DIR=/path/to/SQUAD
|
||||
@@ -451,7 +461,32 @@ python /data/home/hlu/transformers/examples/run_squad.py \
|
||||
--save_steps 5000
|
||||
```
|
||||
|
||||
Training with the previously defined hyper-parameters yields the following results:
|
||||
##### Command for SQuAD2.0:
|
||||
|
||||
```bash
|
||||
export SQUAD_DIR=/path/to/SQUAD
|
||||
|
||||
python run_squad.py \
|
||||
--model_type xlnet \
|
||||
--model_name_or_path xlnet-large-cased \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--version_2_with_negative \
|
||||
--train_file $SQUAD_DIR/train-v2.0.json \
|
||||
--predict_file $SQUAD_DIR/dev-v2.0.json \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 4 \
|
||||
--max_seq_length 384 \
|
||||
--doc_stride 128 \
|
||||
--output_dir ./wwm_cased_finetuned_squad/ \
|
||||
--per_gpu_eval_batch_size=2 \
|
||||
--per_gpu_train_batch_size=2 \
|
||||
--save_steps 5000
|
||||
```
|
||||
|
||||
A larger batch size may improve performance at the cost of more memory.
|
||||
|
||||
##### Results for SQuAD1.0 with the previously defined hyper-parameters:
|
||||
|
||||
```python
|
||||
{
|
||||
@@ -464,10 +499,28 @@ Training with the previously defined hyper-parameters yields the following resul
|
||||
}
|
||||
```
|
||||
|
||||
##### Results for SQuAD2.0 with the previously defined hyper-parameters:
|
||||
|
||||
```python
|
||||
{
|
||||
"exact": 80.4177545691906,
|
||||
"f1": 84.07154997729623,
|
||||
"total": 11873,
|
||||
"HasAns_exact": 76.73751686909581,
|
||||
"HasAns_f1": 84.05558584352873,
|
||||
"HasAns_total": 5928,
|
||||
"NoAns_exact": 84.0874684608915,
|
||||
"NoAns_f1": 84.0874684608915,
|
||||
"NoAns_total": 5945
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Named Entity Recognition
|
||||
|
||||
Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
|
||||
[`run_tf_ner.py`(https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py)] for Tensorflow 2.
|
||||
[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
|
||||
This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
|
||||
Details and results for the fine-tuning provided by @stefan-it.
|
||||
|
||||
@@ -683,3 +736,66 @@ Training with the previously defined hyper-parameters yields the following resul
|
||||
```bash
|
||||
acc = 0.7093812375249501
|
||||
```
|
||||
|
||||
## MM-IMDb
|
||||
|
||||
Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/mm-imdb/run_mmimdb.py).
|
||||
|
||||
[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata.
|
||||
|
||||
### Training on MM-IMDb
|
||||
|
||||
```
|
||||
python run_mmimdb.py \
|
||||
--data_dir /path/to/mmimdb/dataset/ \
|
||||
--model_type bert \
|
||||
--model_name_or_path bert-base-uncased \
|
||||
--output_dir /path/to/save/dir/ \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--max_seq_len 512 \
|
||||
--gradient_accumulation_steps 20 \
|
||||
--num_image_embeds 3 \
|
||||
--num_train_epochs 100 \
|
||||
--patience 5
|
||||
```
|
||||
|
||||
## Adversarial evaluation of model performances
|
||||
|
||||
Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
|
||||
|
||||
The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
|
||||
|
||||
This is an example of using test_hans.py:
|
||||
|
||||
```bash
|
||||
export HANS_DIR=path-to-hans
|
||||
export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
|
||||
export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
|
||||
|
||||
python examples/test_hans.py \
|
||||
--task_name hans \
|
||||
--model_type $MODEL_TYPE \
|
||||
--do_eval \
|
||||
--do_lower_case \
|
||||
--data_dir $HANS_DIR \
|
||||
--model_name_or_path $MODEL_PATH \
|
||||
--max_seq_length 128 \
|
||||
--output_dir $MODEL_PATH
|
||||
```
|
||||
|
||||
This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
|
||||
|
||||
The results of the BERT-base model trained on MNLI using batch size 8 and random seed 42 on the HANS dataset are as follows:
|
||||
|
||||
```bash
|
||||
Heuristic entailed results:
|
||||
lexical_overlap: 0.9702
|
||||
subsequence: 0.9942
|
||||
constituent: 0.9962
|
||||
|
||||
Heuristic non-entailed results:
|
||||
lexical_overlap: 0.199
|
||||
subsequence: 0.0396
|
||||
constituent: 0.118
|
||||
```
|
||||
|
||||
@@ -18,12 +18,14 @@
|
||||
# If checking the tensors placement
|
||||
# tf.debugging.set_log_device_placement(True)
|
||||
|
||||
from typing import List
|
||||
import timeit
|
||||
from transformers import is_tf_available, is_torch_available
|
||||
from time import time
|
||||
import argparse
|
||||
import csv
|
||||
import timeit
|
||||
from time import time
|
||||
from typing import List
|
||||
|
||||
from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available
|
||||
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
@@ -33,230 +35,231 @@ if is_torch_available():
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
|
||||
the Director of Hatcheries and Conditioning entered the room, in the
|
||||
input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
|
||||
the Director of Hatcheries and Conditioning entered the room, in the
|
||||
|
||||
|
||||
|
||||
scarcely breathing silence, the absent-minded, soliloquizing hum or
|
||||
whistle, of absorbed concentration. A troop of newly arrived students,
|
||||
very young, pink and callow, followed nervously, rather abjectly, at the
|
||||
Director's heels. Each of them carried a notebook, in which, whenever
|
||||
the great man spoke, he desperately scribbled. Straight from the
|
||||
horse's mouth. It was a rare privilege. The D. H. C. for Central London
|
||||
always made a point of personally conducting his new students round
|
||||
the various departments.
|
||||
scarcely breathing silence, the absent-minded, soliloquizing hum or
|
||||
whistle, of absorbed concentration. A troop of newly arrived students,
|
||||
very young, pink and callow, followed nervously, rather abjectly, at the
|
||||
Director's heels. Each of them carried a notebook, in which, whenever
|
||||
the great man spoke, he desperately scribbled. Straight from the
|
||||
horse's mouth. It was a rare privilege. The D. H. C. for Central London
|
||||
always made a point of personally conducting his new students round
|
||||
the various departments.
|
||||
|
||||
"Just to give you a general idea," he would explain to them. For of
|
||||
course some sort of general idea they must have, if they were to do
|
||||
their work intelligently-though as little of one, if they were to be good
|
||||
and happy members of society, as possible. For particulars, as every
|
||||
one knows, make for virtue and happiness; generalities are intellectu-
|
||||
ally necessary evils. Not philosophers but fret-sawyers and stamp col-
|
||||
lectors compose the backbone of society.
|
||||
"Just to give you a general idea," he would explain to them. For of
|
||||
course some sort of general idea they must have, if they were to do
|
||||
their work intelligently-though as little of one, if they were to be good
|
||||
and happy members of society, as possible. For particulars, as every
|
||||
one knows, make for virtue and happiness; generalities are intellectu-
|
||||
ally necessary evils. Not philosophers but fret-sawyers and stamp col-
|
||||
lectors compose the backbone of society.
|
||||
|
||||
"To-morrow," he would add, smiling at them with a slightly menacing
|
||||
geniality, "you'll be settling down to serious work. You won't have time
|
||||
for generalities. Meanwhile ..."
|
||||
"To-morrow," he would add, smiling at them with a slightly menacing
|
||||
geniality, "you'll be settling down to serious work. You won't have time
|
||||
for generalities. Meanwhile ..."
|
||||
|
||||
Meanwhile, it was a privilege. Straight from the horse's mouth into the
|
||||
notebook. The boys scribbled like mad.
|
||||
Meanwhile, it was a privilege. Straight from the horse's mouth into the
|
||||
notebook. The boys scribbled like mad.
|
||||
|
||||
Tall and rather thin but upright, the Director advanced into the room.
|
||||
He had a long chin and big rather prominent teeth, just covered, when
|
||||
he was not talking, by his full, floridly curved lips. Old, young? Thirty?
|
||||
Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
|
||||
arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
|
||||
Tall and rather thin but upright, the Director advanced into the room.
|
||||
He had a long chin and big rather prominent teeth, just covered, when
|
||||
he was not talking, by his full, floridly curved lips. Old, young? Thirty?
|
||||
Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
|
||||
arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
|
||||
|
||||
"I shall begin at the beginning," said the D.H.C. and the more zealous
|
||||
students recorded his intention in their notebooks: Begin at the begin-
|
||||
ning. "These," he waved his hand, "are the incubators." And opening
|
||||
an insulated door he showed them racks upon racks of numbered test-
|
||||
tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
|
||||
whereas the male gametes," and here he opened another door, "they
|
||||
have to be kept at thirty-five instead of thirty-seven. Full blood heat
|
||||
sterilizes." Rams wrapped in theremogene beget no lambs.
|
||||
"I shall begin at the beginning," said the D.H.C. and the more zealous
|
||||
students recorded his intention in their notebooks: Begin at the begin-
|
||||
ning. "These," he waved his hand, "are the incubators." And opening
|
||||
an insulated door he showed them racks upon racks of numbered test-
|
||||
tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
|
||||
whereas the male gametes," and here he opened another door, "they
|
||||
have to be kept at thirty-five instead of thirty-seven. Full blood heat
|
||||
sterilizes." Rams wrapped in theremogene beget no lambs.
|
||||
|
||||
Still leaning against the incubators he gave them, while the pencils
|
||||
scurried illegibly across the pages, a brief description of the modern
|
||||
Still leaning against the incubators he gave them, while the pencils
|
||||
scurried illegibly across the pages, a brief description of the modern
|
||||
|
||||
|
||||
|
||||
fertilizing process; spoke first, of course, of its surgical introduc-
|
||||
tion-"the operation undergone voluntarily for the good of Society, not
|
||||
to mention the fact that it carries a bonus amounting to six months'
|
||||
salary"; continued with some account of the technique for preserving
|
||||
the excised ovary alive and actively developing; passed on to a consid-
|
||||
eration of optimum temperature, salinity, viscosity; referred to the liq-
|
||||
uor in which the detached and ripened eggs were kept; and, leading
|
||||
his charges to the work tables, actually showed them how this liquor
|
||||
was drawn off from the test-tubes; how it was let out drop by drop
|
||||
onto the specially warmed slides of the microscopes; how the eggs
|
||||
which it contained were inspected for abnormalities, counted and
|
||||
transferred to a porous receptacle; how (and he now took them to
|
||||
watch the operation) this receptacle was immersed in a warm bouillon
|
||||
containing free-swimming spermatozoa-at a minimum concentration
|
||||
of one hundred thousand per cubic centimetre, he insisted; and how,
|
||||
after ten minutes, the container was lifted out of the liquor and its
|
||||
contents re-examined; how, if any of the eggs remained unfertilized, it
|
||||
was again immersed, and, if necessary, yet again; how the fertilized
|
||||
ova went back to the incubators; where the Alphas and Betas re-
|
||||
mained until definitely bottled; while the Gammas, Deltas and Epsilons
|
||||
were brought out again, after only thirty-six hours, to undergo Bo-
|
||||
kanovsky's Process.
|
||||
fertilizing process; spoke first, of course, of its surgical introduc-
|
||||
tion-"the operation undergone voluntarily for the good of Society, not
|
||||
to mention the fact that it carries a bonus amounting to six months'
|
||||
salary"; continued with some account of the technique for preserving
|
||||
the excised ovary alive and actively developing; passed on to a consid-
|
||||
eration of optimum temperature, salinity, viscosity; referred to the liq-
|
||||
uor in which the detached and ripened eggs were kept; and, leading
|
||||
his charges to the work tables, actually showed them how this liquor
|
||||
was drawn off from the test-tubes; how it was let out drop by drop
|
||||
onto the specially warmed slides of the microscopes; how the eggs
|
||||
which it contained were inspected for abnormalities, counted and
|
||||
transferred to a porous receptacle; how (and he now took them to
|
||||
watch the operation) this receptacle was immersed in a warm bouillon
|
||||
containing free-swimming spermatozoa-at a minimum concentration
|
||||
of one hundred thousand per cubic centimetre, he insisted; and how,
|
||||
after ten minutes, the container was lifted out of the liquor and its
|
||||
contents re-examined; how, if any of the eggs remained unfertilized, it
|
||||
was again immersed, and, if necessary, yet again; how the fertilized
|
||||
ova went back to the incubators; where the Alphas and Betas re-
|
||||
mained until definitely bottled; while the Gammas, Deltas and Epsilons
|
||||
were brought out again, after only thirty-six hours, to undergo Bo-
|
||||
kanovsky's Process.
|
||||
|
||||
"Bokanovsky's Process," repeated the Director, and the students un-
|
||||
derlined the words in their little notebooks.
|
||||
"Bokanovsky's Process," repeated the Director, and the students un-
|
||||
derlined the words in their little notebooks.
|
||||
|
||||
One egg, one embryo, one adult-normality. But a bokanovskified egg
|
||||
will bud, will proliferate, will divide. From eight to ninety-six buds, and
|
||||
every bud will grow into a perfectly formed embryo, and every embryo
|
||||
into a full-sized adult. Making ninety-six human beings grow where
|
||||
only one grew before. Progress.
|
||||
One egg, one embryo, one adult-normality. But a bokanovskified egg
|
||||
will bud, will proliferate, will divide. From eight to ninety-six buds, and
|
||||
every bud will grow into a perfectly formed embryo, and every embryo
|
||||
into a full-sized adult. Making ninety-six human beings grow where
|
||||
only one grew before. Progress.
|
||||
|
||||
"Essentially," the D.H.C. concluded, "bokanovskification consists of a
|
||||
series of arrests of development. We check the normal growth and,
|
||||
paradoxically enough, the egg responds by budding."
|
||||
"Essentially," the D.H.C. concluded, "bokanovskification consists of a
|
||||
series of arrests of development. We check the normal growth and,
|
||||
paradoxically enough, the egg responds by budding."
|
||||
|
||||
Responds by budding. The pencils were busy.
|
||||
Responds by budding. The pencils were busy.
|
||||
|
||||
He pointed. On a very slowly moving band a rack-full of test-tubes was
|
||||
entering a large metal box, another, rack-full was emerging. Machinery
|
||||
faintly purred. It took eight minutes for the tubes to go through, he
|
||||
He pointed. On a very slowly moving band a rack-full of test-tubes was
|
||||
entering a large metal box, another, rack-full was emerging. Machinery
|
||||
faintly purred. It took eight minutes for the tubes to go through, he
|
||||
|
||||
|
||||
|
||||
told them. Eight minutes of hard X-rays being about as much as an
|
||||
egg can stand. A few died; of the rest, the least susceptible divided
|
||||
into two; most put out four buds; some eight; all were returned to the
|
||||
incubators, where the buds began to develop; then, after two days,
|
||||
were suddenly chilled, chilled and checked. Two, four, eight, the buds
|
||||
in their turn budded; and having budded were dosed almost to death
|
||||
with alcohol; consequently burgeoned again and having budded-bud
|
||||
out of bud out of bud-were thereafter-further arrest being generally
|
||||
fatal-left to develop in peace. By which time the original egg was in a
|
||||
fair way to becoming anything from eight to ninety-six embryos- a
|
||||
prodigious improvement, you will agree, on nature. Identical twins-but
|
||||
not in piddling twos and threes as in the old viviparous days, when an
|
||||
egg would sometimes accidentally divide; actually by dozens, by
|
||||
scores at a time.
|
||||
told them. Eight minutes of hard X-rays being about as much as an
|
||||
egg can stand. A few died; of the rest, the least susceptible divided
|
||||
into two; most put out four buds; some eight; all were returned to the
|
||||
incubators, where the buds began to develop; then, after two days,
|
||||
were suddenly chilled, chilled and checked. Two, four, eight, the buds
|
||||
in their turn budded; and having budded were dosed almost to death
|
||||
with alcohol; consequently burgeoned again and having budded-bud
|
||||
out of bud out of bud-were thereafter-further arrest being generally
|
||||
fatal-left to develop in peace. By which time the original egg was in a
|
||||
fair way to becoming anything from eight to ninety-six embryos- a
|
||||
prodigious improvement, you will agree, on nature. Identical twins-but
|
||||
not in piddling twos and threes as in the old viviparous days, when an
|
||||
egg would sometimes accidentally divide; actually by dozens, by
|
||||
scores at a time.
|
||||
|
||||
"Scores," the Director repeated and flung out his arms, as though he
|
||||
were distributing largesse. "Scores."
|
||||
"Scores," the Director repeated and flung out his arms, as though he
|
||||
were distributing largesse. "Scores."
|
||||
|
||||
But one of the students was fool enough to ask where the advantage
|
||||
lay.
|
||||
But one of the students was fool enough to ask where the advantage
|
||||
lay.
|
||||
|
||||
"My good boy!" The Director wheeled sharply round on him. "Can't you
|
||||
see? Can't you see?" He raised a hand; his expression was solemn.
|
||||
"Bokanovsky's Process is one of the major instruments of social stabil-
|
||||
ity!"
|
||||
"My good boy!" The Director wheeled sharply round on him. "Can't you
|
||||
see? Can't you see?" He raised a hand; his expression was solemn.
|
||||
"Bokanovsky's Process is one of the major instruments of social stabil-
|
||||
ity!"
|
||||
|
||||
Major instruments of social stability.
|
||||
Major instruments of social stability.
|
||||
|
||||
Standard men and women; in uniform batches. The whole of a small
|
||||
factory staffed with the products of a single bokanovskified egg.
|
||||
Standard men and women; in uniform batches. The whole of a small
|
||||
factory staffed with the products of a single bokanovskified egg.
|
||||
|
||||
"Ninety-six identical twins working ninety-six identical machines!" The
|
||||
voice was almost tremulous with enthusiasm. "You really know where
|
||||
you are. For the first time in history." He quoted the planetary motto.
|
||||
"Community, Identity, Stability." Grand words. "If we could bo-
|
||||
kanovskify indefinitely the whole problem would be solved."
|
||||
"Ninety-six identical twins working ninety-six identical machines!" The
|
||||
voice was almost tremulous with enthusiasm. "You really know where
|
||||
you are. For the first time in history." He quoted the planetary motto.
|
||||
"Community, Identity, Stability." Grand words. "If we could bo-
|
||||
kanovskify indefinitely the whole problem would be solved."
|
||||
|
||||
Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
|
||||
lions of identical twins. The principle of mass production at last applied
|
||||
to biology.
|
||||
Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
|
||||
lions of identical twins. The principle of mass production at last applied
|
||||
to biology.
|
||||
|
||||
|
||||
|
||||
"But, alas," the Director shook his head, "we can't bokanovskify indefi-
|
||||
nitely."
|
||||
"But, alas," the Director shook his head, "we can't bokanovskify indefi-
|
||||
nitely."
|
||||
|
||||
Ninety-six seemed to be the limit; seventy-two a good average. From
|
||||
the same ovary and with gametes of the same male to manufacture as
|
||||
many batches of identical twins as possible-that was the best (sadly a
|
||||
second best) that they could do. And even that was difficult.
|
||||
Ninety-six seemed to be the limit; seventy-two a good average. From
|
||||
the same ovary and with gametes of the same male to manufacture as
|
||||
many batches of identical twins as possible-that was the best (sadly a
|
||||
second best) that they could do. And even that was difficult.
|
||||
|
||||
"For in nature it takes thirty years for two hundred eggs to reach ma-
|
||||
turity. But our business is to stabilize the population at this moment,
|
||||
here and now. Dribbling out twins over a quarter of a century-what
|
||||
would be the use of that?"
|
||||
"For in nature it takes thirty years for two hundred eggs to reach ma-
|
||||
turity. But our business is to stabilize the population at this moment,
|
||||
here and now. Dribbling out twins over a quarter of a century-what
|
||||
would be the use of that?"
|
||||
|
||||
Obviously, no use at all. But Podsnap's Technique had immensely ac-
|
||||
celerated the process of ripening. They could make sure of at least a
|
||||
hundred and fifty mature eggs within two years. Fertilize and bo-
|
||||
kanovskify-in other words, multiply by seventy-two-and you get an
|
||||
average of nearly eleven thousand brothers and sisters in a hundred
|
||||
and fifty batches of identical twins, all within two years of the same
|
||||
age.
|
||||
Obviously, no use at all. But Podsnap's Technique had immensely ac-
|
||||
celerated the process of ripening. They could make sure of at least a
|
||||
hundred and fifty mature eggs within two years. Fertilize and bo-
|
||||
kanovskify-in other words, multiply by seventy-two-and you get an
|
||||
average of nearly eleven thousand brothers and sisters in a hundred
|
||||
and fifty batches of identical twins, all within two years of the same
|
||||
age.
|
||||
|
||||
"And in exceptional cases we can make one ovary yield us over fifteen
|
||||
thousand adult individuals."
|
||||
"And in exceptional cases we can make one ovary yield us over fifteen
|
||||
thousand adult individuals."
|
||||
|
||||
Beckoning to a fair-haired, ruddy young man who happened to be
|
||||
passing at the moment. "Mr. Foster," he called. The ruddy young man
|
||||
approached. "Can you tell us the record for a single ovary, Mr. Foster?"
|
||||
Beckoning to a fair-haired, ruddy young man who happened to be
|
||||
passing at the moment. "Mr. Foster," he called. The ruddy young man
|
||||
approached. "Can you tell us the record for a single ovary, Mr. Foster?"
|
||||
|
||||
"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
|
||||
out hesitation. He spoke very quickly, had a vivacious blue eye, and
|
||||
took an evident pleasure in quoting figures. "Sixteen thousand and
|
||||
twelve; in one hundred and eighty-nine batches of identicals. But of
|
||||
course they've done much better," he rattled on, "in some of the tropi-
|
||||
cal Centres. Singapore has often produced over sixteen thousand five
|
||||
hundred; and Mombasa has actually touched the seventeen thousand
|
||||
mark. But then they have unfair advantages. You should see the way a
|
||||
negro ovary responds to pituitary! It's quite astonishing, when you're
|
||||
used to working with European material. Still," he added, with a laugh
|
||||
(but the light of combat was in his eyes and the lift of his chin was
|
||||
challenging), "still, we mean to beat them if we can. I'm working on a
|
||||
wonderful Delta-Minus ovary at this moment. Only just eighteen
|
||||
"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
|
||||
out hesitation. He spoke very quickly, had a vivacious blue eye, and
|
||||
took an evident pleasure in quoting figures. "Sixteen thousand and
|
||||
twelve; in one hundred and eighty-nine batches of identicals. But of
|
||||
course they've done much better," he rattled on, "in some of the tropi-
|
||||
cal Centres. Singapore has often produced over sixteen thousand five
|
||||
hundred; and Mombasa has actually touched the seventeen thousand
|
||||
mark. But then they have unfair advantages. You should see the way a
|
||||
negro ovary responds to pituitary! It's quite astonishing, when you're
|
||||
used to working with European material. Still," he added, with a laugh
|
||||
(but the light of combat was in his eyes and the lift of his chin was
|
||||
challenging), "still, we mean to beat them if we can. I'm working on a
|
||||
wonderful Delta-Minus ovary at this moment. Only just eighteen
|
||||
|
||||
|
||||
|
||||
months old. Over twelve thousand seven hundred children already, ei-
|
||||
ther decanted or in embryo. And still going strong. We'll beat them
|
||||
yet."
|
||||
months old. Over twelve thousand seven hundred children already, ei-
|
||||
ther decanted or in embryo. And still going strong. We'll beat them
|
||||
yet."
|
||||
|
||||
"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
|
||||
the shoulder. "Come along with us, and give these boys the benefit of
|
||||
your expert knowledge."
|
||||
"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
|
||||
the shoulder. "Come along with us, and give these boys the benefit of
|
||||
your expert knowledge."
|
||||
|
||||
Mr. Foster smiled modestly. "With pleasure." They went.
|
||||
In the Bottling Room all was harmonious bustle and ordered activity.
|
||||
Flaps of fresh sow's peritoneum ready cut to the proper size came
|
||||
shooting up in little lifts from the Organ Store in the sub-basement.
|
||||
Whizz and then, click! the lift-hatches hew open; the bottle-liner had
|
||||
only to reach out a hand, take the flap, insert, smooth-down, and be-
|
||||
fore the lined bottle had had time to travel out of reach along the end-
|
||||
less band, whizz, click! another flap of peritoneum had shot up from
|
||||
the depths, ready to be slipped into yet another bottle, the next of that
|
||||
slow interminable procession on the band.
|
||||
Mr. Foster smiled modestly. "With pleasure." They went.
|
||||
In the Bottling Room all was harmonious bustle and ordered activity.
|
||||
Flaps of fresh sow's peritoneum ready cut to the proper size came
|
||||
shooting up in little lifts from the Organ Store in the sub-basement.
|
||||
Whizz and then, click! the lift-hatches hew open; the bottle-liner had
|
||||
only to reach out a hand, take the flap, insert, smooth-down, and be-
|
||||
fore the lined bottle had had time to travel out of reach along the end-
|
||||
less band, whizz, click! another flap of peritoneum had shot up from
|
||||
the depths, ready to be slipped into yet another bottle, the next of that
|
||||
slow interminable procession on the band.
|
||||
|
||||
Next to the Liners stood the Matriculators. The procession advanced;
|
||||
one by one the eggs were transferred from their test-tubes to the
|
||||
larger containers; deftly the peritoneal lining was slit, the morula
|
||||
dropped into place, the saline solution poured in ... and already the
|
||||
bottle had passed, and it was the turn of the labellers. Heredity, date
|
||||
of fertilization, membership of Bokanovsky Group-details were trans-
|
||||
ferred from test-tube to bottle. No longer anonymous, but named,
|
||||
identified, the procession marched slowly on; on through an opening in
|
||||
the wall, slowly on into the Social Predestination Room.
|
||||
"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
|
||||
Next to the Liners stood the Matriculators. The procession advanced;
|
||||
one by one the eggs were transferred from their test-tubes to the
|
||||
larger containers; deftly the peritoneal lining was slit, the morula
|
||||
dropped into place, the saline solution poured in ... and already the
|
||||
bottle had passed, and it was the turn of the labellers. Heredity, date
|
||||
of fertilization, membership of Bokanovsky Group-details were trans-
|
||||
ferred from test-tube to bottle. No longer anonymous, but named,
|
||||
identified, the procession marched slowly on; on through an opening in
|
||||
the wall, slowly on into the Social Predestination Room.
|
||||
"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
|
||||
as they entered."""
|
||||
|
||||
|
||||
def create_setup_and_compute(model_names: List[str],
|
||||
gpu: bool = True,
|
||||
tensorflow: bool = False,
|
||||
average_over: int = 3,
|
||||
torchscript: bool = False,
|
||||
xla: bool = False,
|
||||
amp: bool = False,
|
||||
fp16: bool = False,
|
||||
save_to_csv: bool = False,
|
||||
csv_filename: str = f"results_{round(time())}.csv"):
|
||||
def create_setup_and_compute(
|
||||
model_names: List[str],
|
||||
gpu: bool = True,
|
||||
tensorflow: bool = False,
|
||||
average_over: int = 3,
|
||||
torchscript: bool = False,
|
||||
xla: bool = False,
|
||||
amp: bool = False,
|
||||
fp16: bool = False,
|
||||
save_to_csv: bool = False,
|
||||
csv_filename: str = f"results_{round(time())}.csv",
|
||||
):
|
||||
if xla:
|
||||
tf.config.optimizer.set_jit(True)
|
||||
if amp:
|
||||
@@ -266,7 +269,7 @@ def create_setup_and_compute(model_names: List[str],
|
||||
dictionary = {model_name: {} for model_name in model_names}
|
||||
results = _compute_tensorflow(model_names, dictionary, average_over, amp)
|
||||
else:
|
||||
device = 'cuda' if (gpu and torch.cuda.is_available()) else 'cpu'
|
||||
device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
|
||||
dictionary = {model_name: {} for model_name in model_names}
|
||||
results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
|
||||
|
||||
@@ -276,34 +279,52 @@ def create_setup_and_compute(model_names: List[str],
|
||||
for batch_size in results[model_name]["bs"]:
|
||||
print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
|
||||
for slice_size in results[model_name]["ss"]:
|
||||
result = results[model_name]['results'][batch_size][slice_size]
|
||||
result = results[model_name]["results"][batch_size][slice_size]
|
||||
if isinstance(result, str):
|
||||
print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
|
||||
f"{result}")
|
||||
print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
|
||||
else:
|
||||
print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
|
||||
f"{(round(1000 * result) / 1000)}"
|
||||
f"s")
|
||||
print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")
|
||||
|
||||
if save_to_csv:
|
||||
with open(csv_filename, mode='w') as csv_file:
|
||||
fieldnames = ['model',
|
||||
'1x8', '1x64', '1x128', '1x256', '1x512', '1x1024',
|
||||
'2x8', '2x64', '2x128', '2x256', '2x512', '2x1024',
|
||||
'4x8', '4x64', '4x128', '4x256', '4x512', '4x1024',
|
||||
'8x8', '8x64', '8x128', '8x256', '8x512', '8x1024',
|
||||
]
|
||||
with open(csv_filename, mode="w") as csv_file:
|
||||
fieldnames = [
|
||||
"model",
|
||||
"1x8",
|
||||
"1x64",
|
||||
"1x128",
|
||||
"1x256",
|
||||
"1x512",
|
||||
"1x1024",
|
||||
"2x8",
|
||||
"2x64",
|
||||
"2x128",
|
||||
"2x256",
|
||||
"2x512",
|
||||
"2x1024",
|
||||
"4x8",
|
||||
"4x64",
|
||||
"4x128",
|
||||
"4x256",
|
||||
"4x512",
|
||||
"4x1024",
|
||||
"8x8",
|
||||
"8x64",
|
||||
"8x128",
|
||||
"8x256",
|
||||
"8x512",
|
||||
"8x1024",
|
||||
]
|
||||
|
||||
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
|
||||
for model_name in model_names:
|
||||
model_results = {
|
||||
f'{bs}x{ss}': results[model_name]['results'][bs][ss]
|
||||
f"{bs}x{ss}": results[model_name]["results"][bs][ss]
|
||||
for bs in results[model_name]["results"]
|
||||
for ss in results[model_name]['results'][bs]
|
||||
for ss in results[model_name]["results"][bs]
|
||||
}
|
||||
writer.writerow({'model': model_name, **model_results})
|
||||
writer.writerow({"model": model_name, **model_results})
|
||||
|
||||
|
||||
def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
|
||||
@@ -343,7 +364,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript,
|
||||
|
||||
print("Going through model with sequence of shape", sequence.shape)
|
||||
runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
|
||||
average_time = sum(runtimes)/float(len(runtimes)) / 3.0
|
||||
average_time = sum(runtimes) / float(len(runtimes)) / 3.0
|
||||
dictionary[model_name]["results"][batch_size][slice_size] = average_time
|
||||
except RuntimeError as e:
|
||||
print("Doesn't fit on GPU.", e)
|
||||
@@ -379,7 +400,9 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
|
||||
if max_input_size is not None and slice_size > max_input_size:
|
||||
dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
|
||||
else:
|
||||
sequence = tf.stack([tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size)
|
||||
sequence = tf.stack(
|
||||
[tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
|
||||
)
|
||||
|
||||
try:
|
||||
print("Going through model with sequence of shape", sequence.shape)
|
||||
@@ -387,7 +410,7 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
|
||||
inference(sequence)
|
||||
|
||||
runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
|
||||
average_time = sum(runtimes)/float(len(runtimes)) / 3.0
|
||||
average_time = sum(runtimes) / float(len(runtimes)) / 3.0
|
||||
dictionary[model_name]["results"][batch_size][slice_size] = average_time
|
||||
except tf.errors.ResourceExhaustedError as e:
|
||||
print("Doesn't fit on GPU.", e)
|
||||
@@ -399,33 +422,64 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--models", required=False, type=str, default='all', help="Model checkpoints to be provided "
|
||||
"to the AutoModel classes. Leave "
|
||||
"blank to benchmark the base version "
|
||||
"of all available model "
|
||||
"architectures.")
|
||||
parser.add_argument("--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the "
|
||||
"models")
|
||||
parser.add_argument("--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available "
|
||||
"cuda devices")
|
||||
parser.add_argument("--torchscript", required=False, action="store_true", help="Pytorch only: trace the models "
|
||||
"using torchscript")
|
||||
parser.add_argument("--tensorflow", required=False, action="store_true", help="Benchmark the TensorFlow version "
|
||||
"of the models. Will run on GPU if "
|
||||
"the correct dependencies are "
|
||||
"installed")
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
required=False,
|
||||
type=str,
|
||||
default="all",
|
||||
help="Model checkpoints to be provided "
|
||||
"to the AutoModel classes. Leave "
|
||||
"blank to benchmark the base version "
|
||||
"of all available model "
|
||||
"architectures.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--torchscript",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Pytorch only: trace the models " "using torchscript",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tensorflow",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Benchmark the TensorFlow version "
|
||||
"of the models. Will run on GPU if "
|
||||
"the correct dependencies are "
|
||||
"installed",
|
||||
)
|
||||
parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
|
||||
parser.add_argument("--amp", required=False, action="store_true", help="TensorFlow only: use automatic mixed precision acceleration.")
|
||||
parser.add_argument("--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference.")
|
||||
parser.add_argument("--keras_predict", required=False, action="store_true", help="Whether to use model.predict "
|
||||
"instead of model() to do a "
|
||||
"forward pass.")
|
||||
parser.add_argument(
|
||||
"--amp",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="TensorFlow only: use automatic mixed precision acceleration.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--keras_predict",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
|
||||
)
|
||||
parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
|
||||
parser.add_argument("--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv.")
|
||||
parser.add_argument("--average_over", required=False, default=30, type=int, help="Times an experiment will be run.")
|
||||
parser.add_argument(
|
||||
"--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.models == 'all':
|
||||
if args.models == "all":
|
||||
args.models = [
|
||||
"gpt2",
|
||||
"bert-base-cased",
|
||||
@@ -436,7 +490,7 @@ def main():
|
||||
"distilbert-base-uncased",
|
||||
"distilgpt2",
|
||||
"roberta-base",
|
||||
"ctrl"
|
||||
"ctrl",
|
||||
]
|
||||
else:
|
||||
args.models = args.models.split()
|
||||
@@ -453,7 +507,7 @@ def main():
|
||||
fp16=args.fp16,
|
||||
save_to_csv=args.save_to_csv,
|
||||
csv_filename=args.csv_filename,
|
||||
average_over=args.average_over
|
||||
average_over=args.average_over,
|
||||
)
|
||||
else:
|
||||
raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
|
||||
@@ -467,11 +521,11 @@ def main():
|
||||
amp=args.amp,
|
||||
save_to_csv=args.save_to_csv,
|
||||
csv_filename=args.csv_filename,
|
||||
average_over=args.average_over
|
||||
average_over=args.average_over,
|
||||
)
|
||||
else:
|
||||
raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,47 +1,42 @@
|
||||
from pathlib import Path
|
||||
import tarfile
|
||||
import urllib.request
|
||||
|
||||
import torch
|
||||
|
||||
from transformers.tokenization_camembert import CamembertTokenizer
|
||||
from transformers.modeling_camembert import CamembertForMaskedLM
|
||||
from transformers.tokenization_camembert import CamembertTokenizer
|
||||
|
||||
|
||||
def fill_mask(masked_input, model, tokenizer, topk=5):
|
||||
# Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
|
||||
assert masked_input.count('<mask>') == 1
|
||||
assert masked_input.count("<mask>") == 1
|
||||
input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple
|
||||
masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
|
||||
logits = logits[0, masked_index, :]
|
||||
prob = logits.softmax(dim=0)
|
||||
values, indices = prob.topk(k=topk, dim=0)
|
||||
topk_predicted_token_bpe = ' '.join([tokenizer.convert_ids_to_tokens(indices[i].item())
|
||||
for i in range(len(indices))])
|
||||
topk_predicted_token_bpe = " ".join(
|
||||
[tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
|
||||
)
|
||||
masked_token = tokenizer.mask_token
|
||||
topk_filled_outputs = []
|
||||
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
|
||||
predicted_token = predicted_token_bpe.replace('\u2581', ' ')
|
||||
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
|
||||
predicted_token = predicted_token_bpe.replace("\u2581", " ")
|
||||
if " {0}".format(masked_token) in masked_input:
|
||||
topk_filled_outputs.append((
|
||||
masked_input.replace(
|
||||
' {0}'.format(masked_token), predicted_token
|
||||
),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
))
|
||||
topk_filled_outputs.append(
|
||||
(
|
||||
masked_input.replace(" {0}".format(masked_token), predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
)
|
||||
)
|
||||
else:
|
||||
topk_filled_outputs.append((
|
||||
masked_input.replace(masked_token, predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
))
|
||||
topk_filled_outputs.append(
|
||||
(masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
|
||||
)
|
||||
return topk_filled_outputs
|
||||
|
||||
|
||||
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
|
||||
model = CamembertForMaskedLM.from_pretrained('camembert-base')
|
||||
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
|
||||
model = CamembertForMaskedLM.from_pretrained("camembert-base")
|
||||
model.eval()
|
||||
|
||||
masked_input = "Le camembert est <mask> :)"
|
||||
|
||||
@@ -22,48 +22,54 @@
|
||||
--model_name openai-gpt \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
|
||||
--eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
|
||||
--train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
|
||||
--eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
|
||||
--output_dir ../log \
|
||||
--train_batch_size 16 \
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import csv
|
||||
import random
|
||||
import logging
|
||||
from tqdm import tqdm, trange
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
|
||||
AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
|
||||
get_linear_schedule_with_warmup)
|
||||
from transformers import (
|
||||
CONFIG_NAME,
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
OpenAIGPTDoubleHeadsModel,
|
||||
OpenAIGPTTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
|
||||
def load_rocstories_dataset(dataset_path):
|
||||
""" Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
|
||||
with open(dataset_path, encoding='utf_8') as f:
|
||||
with open(dataset_path, encoding="utf_8") as f:
|
||||
f = csv.reader(f)
|
||||
output = []
|
||||
next(f) # skip the first line
|
||||
next(f) # skip the first line
|
||||
for line in tqdm(f):
|
||||
output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
|
||||
output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
|
||||
return output
|
||||
|
||||
|
||||
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
|
||||
""" Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
|
||||
|
||||
@@ -75,61 +81,73 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
|
||||
n_batch = len(dataset)
|
||||
input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
|
||||
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
|
||||
lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
|
||||
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
|
||||
mc_labels = np.zeros((n_batch,), dtype=np.int64)
|
||||
for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
|
||||
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
|
||||
input_ids[i, 0, :len(with_cont1)] = with_cont1
|
||||
input_ids[i, 1, :len(with_cont2)] = with_cont2
|
||||
input_ids[i, 0, : len(with_cont1)] = with_cont1
|
||||
input_ids[i, 1, : len(with_cont2)] = with_cont2
|
||||
mc_token_ids[i, 0] = len(with_cont1) - 1
|
||||
mc_token_ids[i, 1] = len(with_cont2) - 1
|
||||
lm_labels[i, 0, :len(with_cont1)] = with_cont1
|
||||
lm_labels[i, 1, :len(with_cont2)] = with_cont2
|
||||
lm_labels[i, 0, : len(with_cont1)] = with_cont1
|
||||
lm_labels[i, 1, : len(with_cont2)] = with_cont2
|
||||
mc_labels[i] = mc_label
|
||||
all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
|
||||
tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
|
||||
return tensor_datasets
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model_name', type=str, default='openai-gpt',
|
||||
help='pretrained model name')
|
||||
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
parser.add_argument('--train_dataset', type=str, default='')
|
||||
parser.add_argument('--eval_dataset', type=str, default='')
|
||||
parser.add_argument('--seed', type=int, default=42)
|
||||
parser.add_argument('--num_train_epochs', type=int, default=3)
|
||||
parser.add_argument('--train_batch_size', type=int, default=8)
|
||||
parser.add_argument('--eval_batch_size', type=int, default=16)
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument('--max_grad_norm', type=int, default=1)
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training \
|
||||
steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||
help="Number of updates steps to accumulate before\
|
||||
performing a backward/update pass.")
|
||||
parser.add_argument('--learning_rate', type=float, default=6.25e-5)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
|
||||
parser.add_argument('--weight_decay', type=float, default=0.01)
|
||||
parser.add_argument('--lm_coef', type=float, default=0.9)
|
||||
parser.add_argument('--n_valid', type=int, default=374)
|
||||
parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument("--train_dataset", type=str, default="")
|
||||
parser.add_argument("--eval_dataset", type=str, default="")
|
||||
parser.add_argument("--seed", type=int, default=42)
|
||||
parser.add_argument("--num_train_epochs", type=int, default=3)
|
||||
parser.add_argument("--train_batch_size", type=int, default=8)
|
||||
parser.add_argument("--eval_batch_size", type=int, default=16)
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", type=int, default=1)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training \
|
||||
steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before\
|
||||
performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", type=float, default=6.25e-5)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
|
||||
parser.add_argument("--weight_decay", type=float, default=0.01)
|
||||
parser.add_argument("--lm_coef", type=float, default=0.9)
|
||||
parser.add_argument("--n_valid", type=int, default=374)
|
||||
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -152,7 +170,7 @@ def main():
|
||||
# Load tokenizer and model
|
||||
# This loading functions also add new tokens and embeddings called `special tokens`
|
||||
# These new embeddings will be fine-tuned on the RocStories dataset
|
||||
special_tokens = ['_start_', '_delimiter_', '_classify_']
|
||||
special_tokens = ["_start_", "_delimiter_", "_classify_"]
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
|
||||
tokenizer.add_tokens(special_tokens)
|
||||
special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
|
||||
@@ -161,8 +179,6 @@ def main():
|
||||
model.to(device)
|
||||
|
||||
# Load and encode the datasets
|
||||
if not args.train_dataset and not args.eval_dataset:
|
||||
roc_stories = cached_path(ROCSTORIES_URL)
|
||||
def tokenize_and_encode(obj):
|
||||
""" Tokenize and encode a nested object """
|
||||
if isinstance(obj, str):
|
||||
@@ -170,6 +186,7 @@ def main():
|
||||
elif isinstance(obj, int):
|
||||
return obj
|
||||
return list(tokenize_and_encode(o) for o in obj)
|
||||
|
||||
logger.info("Encoding dataset...")
|
||||
train_dataset = load_rocstories_dataset(args.train_dataset)
|
||||
eval_dataset = load_rocstories_dataset(args.eval_dataset)
|
||||
@@ -178,8 +195,11 @@ def main():
|
||||
|
||||
# Compute the max input length for the Transformer
|
||||
max_length = model.config.n_positions // 2 - 2
|
||||
input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \
|
||||
for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
|
||||
input_length = max(
|
||||
len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
|
||||
for dataset in encoded_datasets
|
||||
for story, cont1, cont2, _ in dataset
|
||||
)
|
||||
input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model
|
||||
|
||||
# Prepare inputs tensors and dataloaders
|
||||
@@ -198,20 +218,23 @@ def main():
|
||||
if args.do_train:
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
args.num_train_epochs = args.max_steps //\
|
||||
(len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
else:
|
||||
t_total = len(train_dataloader)\
|
||||
// args.gradient_accumulation_steps * args.num_train_epochs
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
{
|
||||
"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
if args.do_train:
|
||||
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
||||
@@ -230,14 +253,16 @@ def main():
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
tr_loss += loss.item()
|
||||
exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
|
||||
exp_average_loss = (
|
||||
loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
|
||||
)
|
||||
nb_tr_steps += 1
|
||||
tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
|
||||
|
||||
# Save a trained model
|
||||
if args.do_train:
|
||||
# Save a trained model, configuration and tokenizer
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
|
||||
model_to_save = model.module if hasattr(model, "module") else model # Only save the model itself
|
||||
|
||||
# If we save using the predefined names, we can load using `from_pretrained`
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
@@ -260,10 +285,12 @@ def main():
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, mc_token_ids, lm_labels, mc_labels = batch
|
||||
with torch.no_grad():
|
||||
_, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
|
||||
_, mc_loss, _, mc_logits = model(
|
||||
input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
|
||||
)
|
||||
|
||||
mc_logits = mc_logits.detach().cpu().numpy()
|
||||
mc_labels = mc_labels.to('cpu').numpy()
|
||||
mc_labels = mc_labels.to("cpu").numpy()
|
||||
tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
|
||||
|
||||
eval_loss += mc_loss.mean().item()
|
||||
@@ -274,10 +301,8 @@ def main():
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
train_loss = tr_loss/nb_tr_steps if args.do_train else None
|
||||
result = {'eval_loss': eval_loss,
|
||||
'eval_accuracy': eval_accuracy,
|
||||
'train_loss': train_loss}
|
||||
train_loss = tr_loss / nb_tr_steps if args.do_train else None
|
||||
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
@@ -286,5 +311,6 @@ def main():
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -16,54 +16,50 @@
|
||||
"""BERT finetuning runner.
|
||||
Finetuning the library models for multiple choice on SWAG (Bert).
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import csv
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import glob
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
BertConfig,
|
||||
BertForMultipleChoice,
|
||||
BertTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except:
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||
BertForMultipleChoice, BertTokenizer)
|
||||
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
|
||||
for conf in [BertConfig]), ())
|
||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
|
||||
"bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
|
||||
}
|
||||
|
||||
|
||||
class SwagExample(object):
|
||||
"""A single training/test example for the SWAG dataset."""
|
||||
def __init__(self,
|
||||
swag_id,
|
||||
context_sentence,
|
||||
start_ending,
|
||||
ending_0,
|
||||
ending_1,
|
||||
ending_2,
|
||||
ending_3,
|
||||
label = None):
|
||||
|
||||
def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
|
||||
self.swag_id = swag_id
|
||||
self.context_sentence = context_sentence
|
||||
self.start_ending = start_ending
|
||||
@@ -79,7 +75,7 @@ class SwagExample(object):
|
||||
return self.__repr__()
|
||||
|
||||
def __repr__(self):
|
||||
l = [
|
||||
attributes = [
|
||||
"swag_id: {}".format(self.swag_id),
|
||||
"context_sentence: {}".format(self.context_sentence),
|
||||
"start_ending: {}".format(self.start_ending),
|
||||
@@ -90,61 +86,48 @@ class SwagExample(object):
|
||||
]
|
||||
|
||||
if self.label is not None:
|
||||
l.append("label: {}".format(self.label))
|
||||
attributes.append("label: {}".format(self.label))
|
||||
|
||||
return ", ".join(attributes)
|
||||
|
||||
return ", ".join(l)
|
||||
|
||||
class InputFeatures(object):
|
||||
def __init__(self,
|
||||
example_id,
|
||||
choices_features,
|
||||
label
|
||||
|
||||
):
|
||||
def __init__(self, example_id, choices_features, label):
|
||||
self.example_id = example_id
|
||||
self.choices_features = [
|
||||
{
|
||||
'input_ids': input_ids,
|
||||
'input_mask': input_mask,
|
||||
'segment_ids': segment_ids
|
||||
}
|
||||
{"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
|
||||
for _, input_ids, input_mask, segment_ids in choices_features
|
||||
]
|
||||
self.label = label
|
||||
|
||||
def read_swag_examples(input_file, is_training=True):
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.reader(f)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
|
||||
if is_training and lines[0][-1] != 'label':
|
||||
raise ValueError(
|
||||
"For training, the input file must contain a label column."
|
||||
)
|
||||
def read_swag_examples(input_file, is_training=True):
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
lines = list(csv.reader(f))
|
||||
|
||||
if is_training and lines[0][-1] != "label":
|
||||
raise ValueError("For training, the input file must contain a label column.")
|
||||
|
||||
examples = [
|
||||
SwagExample(
|
||||
swag_id = line[2],
|
||||
context_sentence = line[4],
|
||||
start_ending = line[5], # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
ending_0 = line[7],
|
||||
ending_1 = line[8],
|
||||
ending_2 = line[9],
|
||||
ending_3 = line[10],
|
||||
label = int(line[11]) if is_training else None
|
||||
) for line in lines[1:] # we skip the line with the column names
|
||||
swag_id=line[2],
|
||||
context_sentence=line[4],
|
||||
start_ending=line[5], # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
ending_0=line[7],
|
||||
ending_1=line[8],
|
||||
ending_2=line[9],
|
||||
ending_3=line[10],
|
||||
label=int(line[11]) if is_training else None,
|
||||
)
|
||||
for line in lines[1:] # we skip the line with the column names
|
||||
]
|
||||
|
||||
return examples
|
||||
|
||||
def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
is_training):
|
||||
|
||||
def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
|
||||
# Swag is a multiple choice task. To perform this task using Bert,
|
||||
@@ -204,23 +187,18 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
logger.info("swag_id: {}".format(example.swag_id))
|
||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
||||
logger.info("choice: {}".format(choice_idx))
|
||||
logger.info("tokens: {}".format(' '.join(tokens)))
|
||||
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
|
||||
logger.info("tokens: {}".format(" ".join(tokens)))
|
||||
logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
|
||||
if is_training:
|
||||
logger.info("label: {}".format(label))
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
example_id = example.swag_id,
|
||||
choices_features = choices_features,
|
||||
label = label
|
||||
)
|
||||
)
|
||||
features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
|
||||
|
||||
return features
|
||||
|
||||
|
||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
||||
@@ -237,18 +215,14 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
else:
|
||||
tokens_b.pop()
|
||||
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
|
||||
def select_field(features, field):
|
||||
return [
|
||||
[
|
||||
choice[field]
|
||||
for choice in feature.choices_features
|
||||
]
|
||||
for feature in features
|
||||
]
|
||||
return [[choice[field] for choice in feature.choices_features] for feature in features]
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
@@ -258,24 +232,28 @@ def set_seed(args):
|
||||
if args.n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0]:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
input_file = args.predict_file if evaluate else args.train_file
|
||||
cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
|
||||
'dev' if evaluate else 'train',
|
||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||
str(args.max_seq_length)))
|
||||
cached_features_file = os.path.join(
|
||||
os.path.dirname(input_file),
|
||||
"cached_{}_{}_{}".format(
|
||||
"dev" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", input_file)
|
||||
examples = read_swag_examples(input_file)
|
||||
features = convert_examples_to_features(
|
||||
examples, tokenizer, args.max_seq_length, not evaluate)
|
||||
features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
@@ -285,21 +263,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Convert to Tensors and build dataset
|
||||
all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
|
||||
all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
|
||||
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
|
||||
if evaluate:
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
||||
all_label)
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
else:
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
||||
all_label)
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
|
||||
if output_examples:
|
||||
return dataset, examples, features
|
||||
return dataset
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
|
||||
""" Train the model """
|
||||
if args.local_rank in [-1, 0]:
|
||||
@@ -316,13 +294,18 @@ def train(args, train_dataset, model, tokenizer):
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
@@ -336,17 +319,21 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
@@ -354,17 +341,19 @@ def train(args, train_dataset, model, tokenizer):
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
#'token_type_ids': None if args.model_type == 'xlm' else batch[2],
|
||||
'token_type_ids': batch[2],
|
||||
'labels': batch[3]}
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2],
|
||||
"token_type_ids": batch[2],
|
||||
"labels": batch[3],
|
||||
}
|
||||
# if args.model_type in ['xlnet', 'xlm']:
|
||||
# inputs.update({'cls_index': batch[5],
|
||||
# 'p_mask': batch[6]})
|
||||
@@ -372,7 +361,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
@@ -393,23 +382,27 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
|
||||
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_vocabulary(output_dir)
|
||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
@@ -424,6 +417,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
|
||||
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
|
||||
|
||||
@@ -440,7 +434,6 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
|
||||
@@ -448,11 +441,13 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
model.eval()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
with torch.no_grad():
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
||||
'token_type_ids': batch[2],
|
||||
'labels': batch[3]}
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
||||
"token_type_ids": batch[2],
|
||||
"labels": batch[3],
|
||||
}
|
||||
|
||||
# if args.model_type in ['xlnet', 'xlm']:
|
||||
# inputs.update({'cls_index': batch[4],
|
||||
@@ -462,17 +457,16 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
eval_loss += tmp_eval_loss.mean().item()
|
||||
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = inputs['labels'].to('cpu').numpy()
|
||||
label_ids = inputs["labels"].to("cpu").numpy()
|
||||
tmp_eval_accuracy = accuracy(logits, label_ids)
|
||||
eval_accuracy += tmp_eval_accuracy
|
||||
|
||||
nb_eval_steps += 1
|
||||
nb_eval_examples += inputs['input_ids'].size(0)
|
||||
nb_eval_examples += inputs["input_ids"].size(0)
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
result = {'eval_loss': eval_loss,
|
||||
'eval_accuracy': eval_accuracy}
|
||||
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
@@ -483,92 +477,144 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--train_file", default=None, type=str, required=True,
|
||||
help="SWAG csv for training. E.g., train.csv")
|
||||
parser.add_argument("--predict_file", default=None, type=str, required=True,
|
||||
help="SWAG csv for predictions. E.g., val.csv or test.csv")
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--predict_file",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="SWAG csv for predictions. E.g., val.csv or test.csv",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||
parser.add_argument("--max_seq_length", default=384, type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.")
|
||||
parser.add_argument("--do_train", action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true',
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||
help="Rul evaluation during training at each logging step.")
|
||||
parser.add_argument("--do_lower_case", action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument('--logging_steps', type=int, default=50,
|
||||
help="Log every X updates steps.")
|
||||
parser.add_argument('--save_steps', type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||
parser.add_argument("--no_cuda", action='store_true',
|
||||
help="Whether not to use CUDA when available")
|
||||
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||
help="Overwrite the content of the output directory")
|
||||
parser.add_argument('--overwrite_cache', action='store_true',
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
parser.add_argument('--seed', type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="local_rank for distributed training on gpus")
|
||||
parser.add_argument('--fp16', action='store_true',
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -580,16 +626,24 @@ def main():
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
@@ -601,8 +655,12 @@ def main():
|
||||
args.model_type = args.model_type.lower()
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
|
||||
model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
@@ -617,7 +675,6 @@ def main():
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
|
||||
# Save the trained model and the tokenizer
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
# Create output directory if needed
|
||||
@@ -627,19 +684,20 @@ def main():
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
@@ -650,14 +708,16 @@ def main():
|
||||
checkpoints = [args.model_name_or_path]
|
||||
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
checkpoints = list(
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
|
||||
for checkpoint in checkpoints:
|
||||
# Reload the model
|
||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
model = model_class.from_pretrained(checkpoint)
|
||||
tokenizer = tokenizer_class.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
@@ -665,7 +725,7 @@ def main():
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
|
||||
result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
logger.info("Results: {}".format(results))
|
||||
|
||||
@@ -19,55 +19,48 @@
|
||||
|
||||
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
import math
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
|
||||
from transformers import TransfoXLCorpus, TransfoXLLMHeadModel
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
|
||||
parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
|
||||
help='pretrained model name')
|
||||
parser.add_argument('--split', type=str, default='test',
|
||||
choices=['all', 'valid', 'test'],
|
||||
help='which split to evaluate')
|
||||
parser.add_argument('--batch_size', type=int, default=10,
|
||||
help='batch size')
|
||||
parser.add_argument('--tgt_len', type=int, default=128,
|
||||
help='number of tokens to predict')
|
||||
parser.add_argument('--ext_len', type=int, default=0,
|
||||
help='length of the extended context')
|
||||
parser.add_argument('--mem_len', type=int, default=1600,
|
||||
help='length of the retained previous heads')
|
||||
parser.add_argument('--clamp_len', type=int, default=1000,
|
||||
help='max positional embedding index')
|
||||
parser.add_argument('--no_cuda', action='store_true',
|
||||
help='Do not use CUDA even though CUA is available')
|
||||
parser.add_argument('--work_dir', type=str, required=True,
|
||||
help='path to the work_dir')
|
||||
parser.add_argument('--no_log', action='store_true',
|
||||
help='do not log the eval result')
|
||||
parser.add_argument('--same_length', action='store_true',
|
||||
help='set same length attention with masking')
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
|
||||
parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
|
||||
parser.add_argument(
|
||||
"--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=10, help="batch size")
|
||||
parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
|
||||
parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
|
||||
parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
|
||||
parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
|
||||
parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
|
||||
parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
|
||||
parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
assert args.ext_len >= 0, 'extended context length must be non-negative'
|
||||
assert args.ext_len >= 0, "extended context length must be non-negative"
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -80,21 +73,20 @@ def main():
|
||||
# The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
|
||||
# and tokenizing the dataset
|
||||
# The pre-processed corpus is a conversion (using the conversion script)
|
||||
tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)
|
||||
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
|
||||
ntokens = len(corpus.vocab)
|
||||
|
||||
va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
|
||||
device=device, ext_len=args.ext_len)
|
||||
te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
|
||||
device=device, ext_len=args.ext_len)
|
||||
va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
|
||||
te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
|
||||
|
||||
# Load a pre-trained model
|
||||
model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
|
||||
model = model.to(device)
|
||||
|
||||
logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
|
||||
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
|
||||
logger.info(
|
||||
"Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
|
||||
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
|
||||
)
|
||||
)
|
||||
|
||||
model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
|
||||
if args.clamp_len > 0:
|
||||
@@ -108,7 +100,7 @@ def main():
|
||||
def evaluate(eval_iter):
|
||||
# Turn on evaluation mode which disables dropout.
|
||||
model.eval()
|
||||
total_len, total_loss = 0, 0.
|
||||
total_len, total_loss = 0, 0.0
|
||||
start_time = time.time()
|
||||
with torch.no_grad():
|
||||
mems = None
|
||||
@@ -119,35 +111,34 @@ def main():
|
||||
total_loss += seq_len * loss.item()
|
||||
total_len += seq_len
|
||||
total_time = time.time() - start_time
|
||||
logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
|
||||
total_time, 1000 * total_time / (idx+1)))
|
||||
logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
|
||||
return total_loss / total_len
|
||||
|
||||
# Run on test data.
|
||||
if args.split == 'all':
|
||||
if args.split == "all":
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = evaluate(va_iter)
|
||||
elif args.split == 'valid':
|
||||
elif args.split == "valid":
|
||||
valid_loss = evaluate(va_iter)
|
||||
test_loss = None
|
||||
elif args.split == 'test':
|
||||
elif args.split == "test":
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = None
|
||||
|
||||
def format_log(loss, split):
|
||||
log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
|
||||
split, loss, math.exp(loss))
|
||||
log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
|
||||
return log_str
|
||||
|
||||
log_str = ''
|
||||
log_str = ""
|
||||
if valid_loss is not None:
|
||||
log_str += format_log(valid_loss, 'valid')
|
||||
log_str += format_log(valid_loss, "valid")
|
||||
if test_loss is not None:
|
||||
log_str += format_log(test_loss, 'test')
|
||||
log_str += format_log(test_loss, "test")
|
||||
|
||||
logger.info('=' * 100)
|
||||
logger.info("=" * 100)
|
||||
logger.info(log_str)
|
||||
logger.info('=' * 100)
|
||||
logger.info("=" * 100)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -2,23 +2,25 @@
|
||||
|
||||
This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
|
||||
|
||||
**December 6th, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||
**January 20, 2020 - Bug fixing** We have recently discovered and fixed [a bug](https://github.com/huggingface/transformers/commit/48cbf267c988b56c71a2380f748a3e6092ccaed3) in the evaluation of our `run_*.py` scripts that caused the reported metrics to be over-estimated on average. We have updated all the metrics with the latest runs.
|
||||
|
||||
**November 19th, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
|
||||
**December 6, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||
|
||||
**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
|
||||
**November 19, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
|
||||
|
||||
**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
|
||||
**October 23, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
|
||||
|
||||
**September 19th, 2019 - Update:** We fixed bugs in the code and released an updated version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
|
||||
**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
|
||||
|
||||
**September 19, 2019 - Update:** We fixed bugs in the code and released an updated version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 99% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
|
||||
|
||||
|
||||
## What is Distil*
|
||||
|
||||
Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% fewer parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scale trained Transformer models into production.
|
||||
Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% fewer parameters than `bert-base-uncased`, runs 60% faster while preserving 99% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scale trained Transformer models into production.
|
||||
|
||||
We have applied the same method to other Transformer architectures and released the weights:
|
||||
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
|
||||
- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
|
||||
- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice as fast and 35% smaller.
|
||||
- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
|
||||
- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice as fast and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
|
||||
@@ -29,11 +31,11 @@ Here are the results on the dev sets of GLUE:
|
||||
|
||||
| Model | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2| STS-B| WNLI |
|
||||
| :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---: |
|
||||
| BERT-base | **77.6** | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
|
||||
| DistilBERT | **76.8** | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
|
||||
| BERT-base-uncased | **74.9** | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1 |
|
||||
| DistilBERT-base-uncased | **74.3** | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3 |
|
||||
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
||||
| RoBERTa-base (reported) | **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
|
||||
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
|
||||
| DistilRoBERTa<sup>1</sup> | **79.0**/**82.3**<sup>2</sup> | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1 |
|
||||
|
||||
<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly performed transfer learning on the pre-trained DistilRoBERTa.
|
||||
|
||||
|
||||
@@ -15,39 +15,36 @@
|
||||
""" The distiller to distil the student.
|
||||
Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
"""
|
||||
import os
|
||||
import math
|
||||
import psutil
|
||||
import os
|
||||
import time
|
||||
from tqdm import trange, tqdm
|
||||
import numpy as np
|
||||
|
||||
import psutil
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import BatchSampler, DataLoader, RandomSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from torch.utils.data import RandomSampler, BatchSampler, DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
|
||||
from lm_seqs_dataset import LmSeqsDataset
|
||||
from transformers import get_linear_schedule_with_warmup
|
||||
from utils import logger
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except:
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
from transformers import get_linear_schedule_with_warmup
|
||||
|
||||
from utils import logger
|
||||
from lm_seqs_dataset import LmSeqsDataset
|
||||
from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
|
||||
|
||||
class Distiller:
|
||||
def __init__(self,
|
||||
params: dict,
|
||||
dataset: LmSeqsDataset,
|
||||
token_probs: torch.tensor,
|
||||
student: nn.Module,
|
||||
teacher: nn.Module):
|
||||
logger.info('Initializing Distiller')
|
||||
def __init__(
|
||||
self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
|
||||
):
|
||||
logger.info("Initializing Distiller")
|
||||
self.params = params
|
||||
self.dump_path = params.dump_path
|
||||
self.multi_gpu = params.multi_gpu
|
||||
@@ -70,12 +67,10 @@ class Distiller:
|
||||
else:
|
||||
sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
|
||||
|
||||
self.dataloader = DataLoader(dataset=dataset,
|
||||
batch_sampler=sampler,
|
||||
collate_fn=dataset.batch_sequences)
|
||||
self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)
|
||||
|
||||
self.temperature = params.temperature
|
||||
assert self.temperature > 0.
|
||||
assert self.temperature > 0.0
|
||||
|
||||
self.alpha_ce = params.alpha_ce
|
||||
self.alpha_mlm = params.alpha_mlm
|
||||
@@ -85,18 +80,18 @@ class Distiller:
|
||||
|
||||
self.mlm = params.mlm
|
||||
if self.mlm:
|
||||
logger.info(f'Using MLM loss for LM step.')
|
||||
logger.info(f"Using MLM loss for LM step.")
|
||||
self.mlm_mask_prop = params.mlm_mask_prop
|
||||
assert 0.0 <= self.mlm_mask_prop <= 1.0
|
||||
assert params.word_mask + params.word_keep + params.word_rand == 1.0
|
||||
self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
|
||||
self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
|
||||
self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
|
||||
self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
|
||||
self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
|
||||
if self.fp16:
|
||||
self.pred_probs = self.pred_probs.half()
|
||||
self.token_probs = self.token_probs.half()
|
||||
else:
|
||||
logger.info(f'Using CLM loss for LM step.')
|
||||
logger.info(f"Using CLM loss for LM step.")
|
||||
|
||||
self.epoch = 0
|
||||
self.n_iter = 0
|
||||
@@ -107,38 +102,54 @@ class Distiller:
|
||||
self.last_loss_ce = 0
|
||||
self.last_loss_mlm = 0
|
||||
self.last_loss_clm = 0
|
||||
if self.alpha_mse > 0.: self.last_loss_mse = 0
|
||||
if self.alpha_cos > 0.: self.last_loss_cos = 0
|
||||
if self.alpha_mse > 0.0:
|
||||
self.last_loss_mse = 0
|
||||
if self.alpha_cos > 0.0:
|
||||
self.last_loss_cos = 0
|
||||
self.last_log = 0
|
||||
|
||||
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
|
||||
self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
||||
if self.alpha_mse > 0.:
|
||||
self.mse_loss_fct = nn.MSELoss(reduction='sum')
|
||||
if self.alpha_cos > 0.:
|
||||
self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
|
||||
self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
|
||||
self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
|
||||
if self.alpha_mse > 0.0:
|
||||
self.mse_loss_fct = nn.MSELoss(reduction="sum")
|
||||
if self.alpha_cos > 0.0:
|
||||
self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")
|
||||
|
||||
logger.info('--- Initializing model optimizer')
|
||||
logger.info("--- Initializing model optimizer")
|
||||
assert params.gradient_accumulation_steps >= 1
|
||||
self.num_steps_epoch = len(self.dataloader)
|
||||
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
|
||||
num_train_optimization_steps = (
|
||||
int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
|
||||
)
|
||||
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay},
|
||||
{'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0}
|
||||
{
|
||||
"params": [
|
||||
p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
|
||||
],
|
||||
"weight_decay": params.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
|
||||
],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad]))
|
||||
logger.info(
|
||||
"------ Number of trainable parameters (student): %i"
|
||||
% sum([p.numel() for p in self.student.parameters() if p.requires_grad])
|
||||
)
|
||||
logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
|
||||
self.optimizer = AdamW(optimizer_grouped_parameters,
|
||||
lr=params.learning_rate,
|
||||
eps=params.adam_epsilon,
|
||||
betas=(0.9, 0.98))
|
||||
self.optimizer = AdamW(
|
||||
optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
|
||||
)
|
||||
|
||||
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
|
||||
self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
|
||||
num_warmup_steps=warmup_steps,
|
||||
num_training_steps=num_train_optimization_steps)
|
||||
self.scheduler = get_linear_schedule_with_warmup(
|
||||
self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
|
||||
)
|
||||
|
||||
if self.fp16:
|
||||
try:
|
||||
@@ -146,33 +157,36 @@ class Distiller:
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
|
||||
self.student, self.optimizer = amp.initialize(self.student,
|
||||
self.optimizer,
|
||||
opt_level=self.params.fp16_opt_level)
|
||||
self.student, self.optimizer = amp.initialize(
|
||||
self.student, self.optimizer, opt_level=self.params.fp16_opt_level
|
||||
)
|
||||
self.teacher = self.teacher.half()
|
||||
|
||||
if self.multi_gpu:
|
||||
if self.fp16:
|
||||
from apex.parallel import DistributedDataParallel
|
||||
|
||||
logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
|
||||
self.student = DistributedDataParallel(self.student)
|
||||
else:
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
|
||||
logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
|
||||
self.student = DistributedDataParallel(self.student,
|
||||
device_ids=[params.local_rank],
|
||||
output_device=params.local_rank,
|
||||
find_unused_parameters=True)
|
||||
self.student = DistributedDataParallel(
|
||||
self.student,
|
||||
device_ids=[params.local_rank],
|
||||
output_device=params.local_rank,
|
||||
find_unused_parameters=True,
|
||||
)
|
||||
|
||||
self.is_master = params.is_master
|
||||
if self.is_master:
|
||||
logger.info('--- Initializing Tensorboard')
|
||||
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
|
||||
self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
|
||||
self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
|
||||
logger.info("--- Initializing Tensorboard")
|
||||
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
|
||||
self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
|
||||
self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
|
||||
|
||||
def prepare_batch_mlm(self,
|
||||
batch):
|
||||
def prepare_batch_mlm(self, batch):
|
||||
"""
|
||||
Prepare the batch: from the token_ids and the lengths, compute the attention mask and the masked label for MLM.
|
||||
|
||||
@@ -186,13 +200,13 @@ class Distiller:
|
||||
-------
|
||||
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
|
||||
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
|
||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -1 where there is nothing to predict.
|
||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -100 where there is nothing to predict.
|
||||
"""
|
||||
token_ids, lengths = batch
|
||||
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
|
||||
assert token_ids.size(0) == lengths.size(0)
|
||||
|
||||
attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
|
||||
attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
|
||||
|
||||
bs, max_seq_len = token_ids.size()
|
||||
mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
|
||||
@@ -200,11 +214,13 @@ class Distiller:
|
||||
x_prob = self.token_probs[token_ids.flatten()]
|
||||
n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
|
||||
tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
|
||||
pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
|
||||
pred_mask = torch.zeros(
|
||||
bs * max_seq_len, dtype=torch.bool, device=token_ids.device
|
||||
) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
|
||||
pred_mask[tgt_ids] = 1
|
||||
pred_mask = pred_mask.view(bs, max_seq_len)
|
||||
|
||||
pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0
|
||||
pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0
|
||||
|
||||
# mask a number of words == 0 [8] (faster with fp16)
|
||||
if self.fp16:
|
||||
@@ -213,26 +229,29 @@ class Distiller:
|
||||
pred_mask = pred_mask.view(-1)
|
||||
n2 = max(n1 % 8, 8 * (n1 // 8))
|
||||
if n2 != n1:
|
||||
pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0
|
||||
pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0
|
||||
pred_mask = pred_mask.view(bs, max_seq_len)
|
||||
assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
|
||||
|
||||
_token_ids_real = token_ids[pred_mask]
|
||||
_token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
|
||||
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
|
||||
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"])
|
||||
probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
|
||||
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
|
||||
_token_ids = (
|
||||
_token_ids_mask * (probs == 0).long()
|
||||
+ _token_ids_real * (probs == 1).long()
|
||||
+ _token_ids_rand * (probs == 2).long()
|
||||
)
|
||||
token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
|
||||
|
||||
mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
|
||||
# sanity checks
|
||||
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
|
||||
|
||||
return token_ids, attn_mask, mlm_labels
|
||||
|
||||
def prepare_batch_clm(self,
|
||||
batch):
|
||||
def prepare_batch_clm(self, batch):
|
||||
"""
|
||||
Prepare the batch: from the token_ids and the lengths, compute the attention mask and the labels for CLM.
|
||||
|
||||
@@ -246,24 +265,22 @@ class Distiller:
|
||||
-------
|
||||
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
|
||||
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
|
||||
clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -1 where there is nothing to predict.
|
||||
clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
|
||||
"""
|
||||
token_ids, lengths = batch
|
||||
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
|
||||
assert token_ids.size(0) == lengths.size(0)
|
||||
|
||||
attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
|
||||
attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
|
||||
clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
|
||||
clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
|
||||
# sanity checks
|
||||
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
|
||||
|
||||
return token_ids, attn_mask, clm_labels
|
||||
|
||||
def round_batch(self,
|
||||
x: torch.tensor,
|
||||
lengths: torch.tensor):
|
||||
def round_batch(self, x: torch.tensor, lengths: torch.tensor):
|
||||
"""
|
||||
For float16 only.
|
||||
Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
|
||||
@@ -299,9 +316,9 @@ class Distiller:
|
||||
pad = 8 - (ml1 % 8)
|
||||
ml2 = ml1 + pad
|
||||
if self.mlm:
|
||||
pad_id = self.params.special_tok_ids['pad_token']
|
||||
pad_id = self.params.special_tok_ids["pad_token"]
|
||||
else:
|
||||
pad_id = self.params.special_tok_ids['unk_token']
|
||||
pad_id = self.params.special_tok_ids["unk_token"]
|
||||
padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
|
||||
x = torch.cat([x, padding_tensor], 1)
|
||||
assert x.size() == (bs2, ml2)
|
||||
@@ -314,20 +331,22 @@ class Distiller:
|
||||
"""
|
||||
The real training loop.
|
||||
"""
|
||||
if self.is_master: logger.info('Starting training')
|
||||
if self.is_master:
|
||||
logger.info("Starting training")
|
||||
self.last_log = time.time()
|
||||
self.student.train()
|
||||
self.teacher.eval()
|
||||
|
||||
for _ in range(self.params.n_epoch):
|
||||
if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
|
||||
if self.is_master:
|
||||
logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}")
|
||||
if self.multi_gpu:
|
||||
torch.distributed.barrier()
|
||||
|
||||
iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
|
||||
for batch in iter_bar:
|
||||
if self.params.n_gpu > 0:
|
||||
batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
|
||||
batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch)
|
||||
|
||||
if self.mlm:
|
||||
token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
|
||||
@@ -336,22 +355,21 @@ class Distiller:
|
||||
self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
|
||||
|
||||
iter_bar.update()
|
||||
iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
|
||||
'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'})
|
||||
iter_bar.set_postfix(
|
||||
{"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"}
|
||||
)
|
||||
iter_bar.close()
|
||||
|
||||
if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
|
||||
if self.is_master:
|
||||
logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}")
|
||||
self.end_epoch()
|
||||
|
||||
if self.is_master:
|
||||
logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
|
||||
self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
|
||||
logger.info('Training is finished')
|
||||
logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
|
||||
self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
|
||||
logger.info("Training is finished")
|
||||
|
||||
def step(self,
|
||||
input_ids: torch.tensor,
|
||||
attention_mask: torch.tensor,
|
||||
lm_labels: torch.tensor):
|
||||
def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
|
||||
"""
|
||||
One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
|
||||
and possibly a parameter update (depending on the gradient accumulation).
|
||||
@@ -363,78 +381,91 @@ class Distiller:
|
||||
lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
|
||||
"""
|
||||
if self.mlm:
|
||||
s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
||||
s_logits, s_hidden_states = self.student(
|
||||
input_ids=input_ids, attention_mask=attention_mask
|
||||
) # (bs, seq_length, voc_size)
|
||||
with torch.no_grad():
|
||||
t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
|
||||
t_logits, t_hidden_states = self.teacher(
|
||||
input_ids=input_ids, attention_mask=attention_mask
|
||||
) # (bs, seq_length, voc_size)
|
||||
else:
|
||||
s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size)
|
||||
s_logits, _, s_hidden_states = self.student(
|
||||
input_ids=input_ids, attention_mask=None
|
||||
) # (bs, seq_length, voc_size)
|
||||
with torch.no_grad():
|
||||
t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size)
|
||||
t_logits, _, t_hidden_states = self.teacher(
|
||||
input_ids=input_ids, attention_mask=None
|
||||
) # (bs, seq_length, voc_size)
|
||||
assert s_logits.size() == t_logits.size()
|
||||
|
||||
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
||||
#https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
|
||||
# https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
||||
# https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
|
||||
if self.params.restrict_ce_to_mask:
|
||||
mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
else:
|
||||
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
|
||||
t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
|
||||
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
|
||||
t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
|
||||
assert t_logits_slct.size() == s_logits_slct.size()
|
||||
|
||||
loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
|
||||
F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
|
||||
loss = self.alpha_ce*loss_ce
|
||||
loss_ce = (
|
||||
self.ce_loss_fct(
|
||||
F.log_softmax(s_logits_slct / self.temperature, dim=-1),
|
||||
F.softmax(t_logits_slct / self.temperature, dim=-1),
|
||||
)
|
||||
* (self.temperature) ** 2
|
||||
)
|
||||
loss = self.alpha_ce * loss_ce
|
||||
|
||||
if self.alpha_mlm > 0.:
|
||||
if self.alpha_mlm > 0.0:
|
||||
loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
|
||||
loss += self.alpha_mlm * loss_mlm
|
||||
if self.alpha_clm > 0.:
|
||||
if self.alpha_clm > 0.0:
|
||||
shift_logits = s_logits[..., :-1, :].contiguous()
|
||||
shift_labels = lm_labels[..., 1:].contiguous()
|
||||
loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||
shift_labels.view(-1))
|
||||
loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
loss += self.alpha_clm * loss_clm
|
||||
|
||||
if self.alpha_mse > 0.:
|
||||
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
|
||||
if self.alpha_mse > 0.0:
|
||||
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
|
||||
0
|
||||
) # Reproducing batchmean reduction
|
||||
loss += self.alpha_mse * loss_mse
|
||||
if self.alpha_cos > 0.:
|
||||
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
|
||||
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
|
||||
mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim)
|
||||
if self.alpha_cos > 0.0:
|
||||
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
|
||||
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
|
||||
mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim)
|
||||
assert s_hidden_states.size() == t_hidden_states.size()
|
||||
dim = s_hidden_states.size(-1)
|
||||
|
||||
s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim)
|
||||
s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
|
||||
t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim)
|
||||
t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
|
||||
|
||||
target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
|
||||
|
||||
s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim)
|
||||
s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
|
||||
t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim)
|
||||
t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
|
||||
|
||||
target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
|
||||
loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
|
||||
loss += self.alpha_cos * loss_cos
|
||||
|
||||
self.total_loss_epoch += loss.item()
|
||||
self.last_loss = loss.item()
|
||||
self.last_loss_ce = loss_ce.item()
|
||||
if self.alpha_mlm > 0.:
|
||||
if self.alpha_mlm > 0.0:
|
||||
self.last_loss_mlm = loss_mlm.item()
|
||||
if self.alpha_clm > 0.:
|
||||
if self.alpha_clm > 0.0:
|
||||
self.last_loss_clm = loss_clm.item()
|
||||
if self.alpha_mse > 0.:
|
||||
if self.alpha_mse > 0.0:
|
||||
self.last_loss_mse = loss_mse.item()
|
||||
if self.alpha_cos > 0.:
|
||||
if self.alpha_cos > 0.0:
|
||||
self.last_loss_cos = loss_cos.item()
|
||||
|
||||
self.optimize(loss)
|
||||
|
||||
self.n_sequences_epoch += input_ids.size(0)
|
||||
|
||||
def optimize(self,
|
||||
loss):
|
||||
def optimize(self, loss):
|
||||
"""
|
||||
Normalization on the loss (gradient accumulation or distributed training), followed by
|
||||
backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
|
||||
@@ -442,7 +473,7 @@ class Distiller:
|
||||
"""
|
||||
# Check for NaN
|
||||
if (loss != loss).data.any():
|
||||
logger.error('NaN detected')
|
||||
logger.error("NaN detected")
|
||||
exit()
|
||||
|
||||
if self.multi_gpu:
|
||||
@@ -452,6 +483,7 @@ class Distiller:
|
||||
|
||||
if self.fp16:
|
||||
from apex import amp
|
||||
|
||||
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
@@ -488,53 +520,84 @@ class Distiller:
|
||||
return
|
||||
|
||||
for param_name, param in self.student.named_parameters():
|
||||
self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter
|
||||
)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter
|
||||
)
|
||||
if param.grad is None:
|
||||
continue
|
||||
self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter
|
||||
)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter
|
||||
)
|
||||
|
||||
self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="losses/cum_avg_loss_epoch",
|
||||
scalar_value=self.total_loss_epoch / self.n_iter,
|
||||
global_step=self.n_total_iter,
|
||||
)
|
||||
self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
|
||||
if self.alpha_mlm > 0.:
|
||||
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
|
||||
if self.alpha_clm > 0.:
|
||||
self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
|
||||
if self.alpha_mse > 0.:
|
||||
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
|
||||
if self.alpha_cos > 0.:
|
||||
self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
|
||||
|
||||
self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter
|
||||
)
|
||||
if self.alpha_mlm > 0.0:
|
||||
self.tensorboard.add_scalar(
|
||||
tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter
|
||||
)
|
||||
if self.alpha_clm > 0.0:
|
||||
self.tensorboard.add_scalar(
|
||||
tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter
|
||||
)
|
||||
if self.alpha_mse > 0.0:
|
||||
self.tensorboard.add_scalar(
|
||||
tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter
|
||||
)
|
||||
if self.alpha_cos > 0.0:
|
||||
self.tensorboard.add_scalar(
|
||||
tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter
|
||||
)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter
|
||||
)
|
||||
|
||||
self.tensorboard.add_scalar(
|
||||
tag="global/memory_usage",
|
||||
scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000,
|
||||
global_step=self.n_total_iter,
|
||||
)
|
||||
self.tensorboard.add_scalar(
|
||||
tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter
|
||||
)
|
||||
|
||||
def end_epoch(self):
|
||||
"""
|
||||
Finally arrived at the end of epoch (full pass on dataset).
|
||||
Do some tensorboard logging and checkpoint saving.
|
||||
"""
|
||||
logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.')
|
||||
logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.")
|
||||
|
||||
if self.is_master:
|
||||
self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth')
|
||||
self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.epoch)
|
||||
self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth")
|
||||
self.tensorboard.add_scalar(
|
||||
tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch
|
||||
)
|
||||
|
||||
self.epoch += 1
|
||||
self.n_sequences_epoch = 0
|
||||
self.n_iter = 0
|
||||
self.total_loss_epoch = 0
|
||||
|
||||
def save_checkpoint(self,
|
||||
checkpoint_name: str = 'checkpoint.pth'):
|
||||
def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"):
|
||||
"""
|
||||
Save the current state. Only by the master process.
|
||||
"""
|
||||
if not self.is_master:
|
||||
return
|
||||
mdl_to_save = self.student.module if hasattr(self.student, 'module') else self.student
|
||||
mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student
|
||||
mdl_to_save.config.save_pretrained(self.dump_path)
|
||||
state_dict = mdl_to_save.state_dict()
|
||||
torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))
|
||||
|
||||
@@ -17,18 +17,20 @@
|
||||
import bisect
|
||||
import copy
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
import numpy as np
|
||||
from torch.utils.data.sampler import BatchSampler, Sampler
|
||||
|
||||
from utils import logger
|
||||
|
||||
|
||||
def _quantize(x, bins):
|
||||
bins = copy.deepcopy(bins)
|
||||
bins = sorted(bins)
|
||||
quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
|
||||
return quantized
|
||||
|
||||
|
||||
def create_lengths_groups(lengths, k=0):
|
||||
bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
|
||||
groups = _quantize(lengths, bins)
|
||||
@@ -39,6 +41,7 @@ def create_lengths_groups(lengths, k=0):
|
||||
logger.info("Count of instances per bin: {}".format(counts))
|
||||
return groups
|
||||
|
||||
|
||||
class GroupedBatchSampler(BatchSampler):
|
||||
"""
|
||||
Wraps another sampler to yield a mini-batch of indices.
|
||||
@@ -53,11 +56,11 @@ class GroupedBatchSampler(BatchSampler):
|
||||
0, i.e. they must be in the range [0, num_groups).
|
||||
batch_size (int): Size of mini-batch.
|
||||
"""
|
||||
|
||||
def __init__(self, sampler, group_ids, batch_size):
|
||||
if not isinstance(sampler, Sampler):
|
||||
raise ValueError(
|
||||
"sampler should be an instance of "
|
||||
"torch.utils.data.Sampler, but got sampler={}".format(sampler)
|
||||
"sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
|
||||
)
|
||||
self.sampler = sampler
|
||||
self.group_ids = group_ids
|
||||
@@ -73,7 +76,7 @@ class GroupedBatchSampler(BatchSampler):
|
||||
buffer_per_group[group_id].append(idx)
|
||||
samples_per_group[group_id].append(idx)
|
||||
if len(buffer_per_group[group_id]) == self.batch_size:
|
||||
yield buffer_per_group[group_id] #TODO
|
||||
yield buffer_per_group[group_id] # TODO
|
||||
num_batches += 1
|
||||
del buffer_per_group[group_id]
|
||||
assert len(buffer_per_group[group_id]) < self.batch_size
|
||||
@@ -90,8 +93,8 @@ class GroupedBatchSampler(BatchSampler):
|
||||
for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
|
||||
batch_idx.extend(idxs)
|
||||
if len(batch_idx) >= self.batch_size:
|
||||
yield batch_idx[:self.batch_size]
|
||||
batch_idx = batch_idx[self.batch_size:]
|
||||
yield batch_idx[: self.batch_size]
|
||||
batch_idx = batch_idx[self.batch_size :]
|
||||
num_remaining -= 1
|
||||
if len(batch_idx) > 0:
|
||||
yield batch_idx
|
||||
|
||||
@@ -15,12 +15,13 @@
|
||||
""" Dataset to distilled models
|
||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
"""
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
import numpy as np
|
||||
from utils import logger
|
||||
|
||||
|
||||
class LmSeqsDataset(Dataset):
|
||||
"""Custom Dataset wrapping language modeling sequences.
|
||||
|
||||
@@ -32,9 +33,7 @@ class LmSeqsDataset(Dataset):
|
||||
data: `List[np.array[int]]
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
params,
|
||||
data):
|
||||
def __init__(self, params, data):
|
||||
self.params = params
|
||||
|
||||
self.token_ids = np.array(data)
|
||||
@@ -43,6 +42,7 @@ class LmSeqsDataset(Dataset):
|
||||
self.check()
|
||||
self.remove_long_sequences()
|
||||
self.remove_empty_sequences()
|
||||
self.remove_unknown_sequences()
|
||||
self.check()
|
||||
self.print_statistics()
|
||||
|
||||
@@ -57,7 +57,7 @@ class LmSeqsDataset(Dataset):
|
||||
Some sanity checks
|
||||
"""
|
||||
assert len(self.token_ids) == len(self.lengths)
|
||||
assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
|
||||
assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
|
||||
|
||||
def remove_long_sequences(self):
|
||||
"""
|
||||
@@ -65,17 +65,17 @@ class LmSeqsDataset(Dataset):
|
||||
"""
|
||||
max_len = self.params.max_model_input_size
|
||||
indices = self.lengths > max_len
|
||||
logger.info(f'Splitting {sum(indices)} too long sequences.')
|
||||
logger.info(f"Splitting {sum(indices)} too long sequences.")
|
||||
|
||||
def divide_chunks(l, n):
|
||||
return [l[i:i + n] for i in range(0, len(l), n)]
|
||||
return [l[i : i + n] for i in range(0, len(l), n)]
|
||||
|
||||
new_tok_ids = []
|
||||
new_lengths = []
|
||||
if self.params.mlm:
|
||||
cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
|
||||
cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"]
|
||||
else:
|
||||
cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
|
||||
cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"]
|
||||
|
||||
for seq_, len_ in zip(self.token_ids, self.lengths):
|
||||
assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
|
||||
@@ -84,7 +84,7 @@ class LmSeqsDataset(Dataset):
|
||||
new_lengths.append(len_)
|
||||
else:
|
||||
sub_seqs = []
|
||||
for sub_s in divide_chunks(seq_, max_len-2):
|
||||
for sub_s in divide_chunks(seq_, max_len - 2):
|
||||
if sub_s[0] != cls_id:
|
||||
sub_s = np.insert(sub_s, 0, cls_id)
|
||||
if sub_s[-1] != sep_id:
|
||||
@@ -108,7 +108,23 @@ class LmSeqsDataset(Dataset):
|
||||
self.token_ids = self.token_ids[indices]
|
||||
self.lengths = self.lengths[indices]
|
||||
new_size = len(self)
|
||||
logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')
|
||||
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
|
||||
|
||||
def remove_unknown_sequences(self):
|
||||
"""
|
||||
Remove sequences with a (too) high level of unknown tokens.
|
||||
"""
|
||||
if "unk_token" not in self.params.special_tok_ids:
|
||||
return
|
||||
else:
|
||||
unk_token_id = self.params.special_tok_ids["unk_token"]
|
||||
init_size = len(self)
|
||||
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
|
||||
indices = (unk_occs / self.lengths) < 0.5
|
||||
self.token_ids = self.token_ids[indices]
|
||||
self.lengths = self.lengths[indices]
|
||||
new_size = len(self)
|
||||
logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
|
||||
|
||||
def print_statistics(self):
|
||||
"""
|
||||
@@ -116,7 +132,7 @@ class LmSeqsDataset(Dataset):
|
||||
"""
|
||||
if not self.params.is_master:
|
||||
return
|
||||
logger.info(f'{len(self)} sequences')
|
||||
logger.info(f"{len(self)} sequences")
|
||||
# data_len = sum(self.lengths)
|
||||
# nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
|
||||
# logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
|
||||
@@ -125,8 +141,7 @@ class LmSeqsDataset(Dataset):
|
||||
# nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
|
||||
# logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
|
||||
|
||||
def batch_sequences(self,
|
||||
batch):
|
||||
def batch_sequences(self, batch):
|
||||
"""
|
||||
Do the padding and transform into torch.tensor.
|
||||
"""
|
||||
@@ -139,13 +154,13 @@ class LmSeqsDataset(Dataset):
|
||||
|
||||
# Pad token ids
|
||||
if self.params.mlm:
|
||||
pad_idx = self.params.special_tok_ids['pad_token']
|
||||
pad_idx = self.params.special_tok_ids["pad_token"]
|
||||
else:
|
||||
pad_idx = self.params.special_tok_ids['unk_token']
|
||||
tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
|
||||
pad_idx = self.params.special_tok_ids["unk_token"]
|
||||
tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
|
||||
assert len(tk_) == len(token_ids)
|
||||
assert all(len(t) == max_seq_len_ for t in tk_)
|
||||
|
||||
tk_t = torch.tensor(tk_) # (bs, max_seq_len_)
|
||||
tk_t = torch.tensor(tk_) # (bs, max_seq_len_)
|
||||
lg_t = torch.tensor(lengths) # (bs)
|
||||
return tk_t, lg_t
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
transformers
|
||||
|
||||
gitpython==3.0.2
|
||||
tensorboard>=1.14.0
|
||||
tensorboardX==1.8
|
||||
psutil==5.6.3
|
||||
scipy==1.3.1
|
||||
transformers
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -16,75 +16,75 @@
|
||||
Preprocessing script before distillation.
|
||||
"""
|
||||
import argparse
|
||||
import logging
|
||||
import pickle
|
||||
import random
|
||||
import time
|
||||
import numpy as np
|
||||
from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
|
||||
import logging
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
import numpy as np
|
||||
|
||||
from transformers import BertTokenizer, GPT2Tokenizer, RobertaTokenizer
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
|
||||
parser.add_argument('--file_path', type=str, default='data/dump.txt',
|
||||
help='The path to the data.')
|
||||
parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
|
||||
parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
|
||||
help="The tokenizer to use.")
|
||||
parser.add_argument('--dump_file', type=str, default='data/dump',
|
||||
help='The dump file prefix.')
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
|
||||
)
|
||||
parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
|
||||
parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
|
||||
parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
|
||||
parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
|
||||
if args.tokenizer_type == 'bert':
|
||||
logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
|
||||
if args.tokenizer_type == "bert":
|
||||
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
|
||||
bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
|
||||
sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
|
||||
elif args.tokenizer_type == 'roberta':
|
||||
bos = tokenizer.special_tokens_map["cls_token"] # `[CLS]`
|
||||
sep = tokenizer.special_tokens_map["sep_token"] # `[SEP]`
|
||||
elif args.tokenizer_type == "roberta":
|
||||
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
|
||||
bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
|
||||
sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
|
||||
elif args.tokenizer_type == 'gpt2':
|
||||
bos = tokenizer.special_tokens_map["cls_token"] # `<s>`
|
||||
sep = tokenizer.special_tokens_map["sep_token"] # `</s>`
|
||||
elif args.tokenizer_type == "gpt2":
|
||||
tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
|
||||
bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
|
||||
sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`
|
||||
bos = tokenizer.special_tokens_map["bos_token"] # `<|endoftext|>`
|
||||
sep = tokenizer.special_tokens_map["eos_token"] # `<|endoftext|>`
|
||||
|
||||
logger.info(f'Loading text from {args.file_path}')
|
||||
with open(args.file_path, 'r', encoding='utf8') as fp:
|
||||
logger.info(f"Loading text from {args.file_path}")
|
||||
with open(args.file_path, "r", encoding="utf8") as fp:
|
||||
data = fp.readlines()
|
||||
|
||||
|
||||
logger.info(f'Start encoding')
|
||||
logger.info(f'{len(data)} examples to process.')
|
||||
logger.info(f"Start encoding")
|
||||
logger.info(f"{len(data)} examples to process.")
|
||||
|
||||
rslt = []
|
||||
iter = 0
|
||||
interval = 10000
|
||||
start = time.time()
|
||||
for text in data:
|
||||
text = f'{bos} {text.strip()} {sep}'
|
||||
text = f"{bos} {text.strip()} {sep}"
|
||||
token_ids = tokenizer.encode(text, add_special_tokens=False)
|
||||
rslt.append(token_ids)
|
||||
|
||||
iter += 1
|
||||
if iter % interval == 0:
|
||||
end = time.time()
|
||||
logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
|
||||
logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
|
||||
start = time.time()
|
||||
logger.info('Finished binarization')
|
||||
logger.info(f'{len(data)} examples processed.')
|
||||
logger.info("Finished binarization")
|
||||
logger.info(f"{len(data)} examples processed.")
|
||||
|
||||
|
||||
dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
|
||||
dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
|
||||
rslt_ = [np.uint16(d) for d in rslt]
|
||||
random.shuffle(rslt_)
|
||||
logger.info(f'Dump to {dp_file}')
|
||||
with open(dp_file, 'wb') as handle:
|
||||
logger.info(f"Dump to {dp_file}")
|
||||
with open(dp_file, "wb") as handle:
|
||||
pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
|
||||
@@ -16,74 +16,87 @@
|
||||
Preprocessing script before training the distilled model.
|
||||
Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
|
||||
"""
|
||||
from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
|
||||
import torch
|
||||
import argparse
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
|
||||
import torch
|
||||
|
||||
from transformers import GPT2LMHeadModel, RobertaForMaskedLM
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation"
|
||||
)
|
||||
parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
|
||||
parser.add_argument("--model_name", default='roberta-large', type=str)
|
||||
parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
|
||||
parser.add_argument("--vocab_transform", action='store_true')
|
||||
parser.add_argument("--model_name", default="roberta-large", type=str)
|
||||
parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str)
|
||||
parser.add_argument("--vocab_transform", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
if args.model_type == 'roberta':
|
||||
if args.model_type == "roberta":
|
||||
model = RobertaForMaskedLM.from_pretrained(args.model_name)
|
||||
prefix = 'roberta'
|
||||
elif args.model_type == 'gpt2':
|
||||
prefix = "roberta"
|
||||
elif args.model_type == "gpt2":
|
||||
model = GPT2LMHeadModel.from_pretrained(args.model_name)
|
||||
prefix = 'transformer'
|
||||
prefix = "transformer"
|
||||
|
||||
state_dict = model.state_dict()
|
||||
compressed_sd = {}
|
||||
|
||||
### Embeddings ###
|
||||
if args.model_type == 'gpt2':
|
||||
for param_name in ['wte.weight', 'wpe.weight']:
|
||||
compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
|
||||
# Embeddings #
|
||||
if args.model_type == "gpt2":
|
||||
for param_name in ["wte.weight", "wpe.weight"]:
|
||||
compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"]
|
||||
else:
|
||||
for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
|
||||
param_name = f'{prefix}.embeddings.{w}.weight'
|
||||
for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]:
|
||||
param_name = f"{prefix}.embeddings.{w}.weight"
|
||||
compressed_sd[param_name] = state_dict[param_name]
|
||||
for w in ['weight', 'bias']:
|
||||
param_name = f'{prefix}.embeddings.LayerNorm.{w}'
|
||||
for w in ["weight", "bias"]:
|
||||
param_name = f"{prefix}.embeddings.LayerNorm.{w}"
|
||||
compressed_sd[param_name] = state_dict[param_name]
|
||||
|
||||
### Transformer Blocks ###
|
||||
# Transformer Blocks #
|
||||
std_idx = 0
|
||||
for teacher_idx in [0, 2, 4, 7, 9, 11]:
|
||||
if args.model_type == 'gpt2':
|
||||
for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
|
||||
for w in ['weight', 'bias']:
|
||||
compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
|
||||
state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
|
||||
compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
|
||||
if args.model_type == "gpt2":
|
||||
for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]:
|
||||
for w in ["weight", "bias"]:
|
||||
compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[
|
||||
f"{prefix}.h.{teacher_idx}.{layer}.{w}"
|
||||
]
|
||||
compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"]
|
||||
else:
|
||||
for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
|
||||
'attention.output.dense', 'attention.output.LayerNorm',
|
||||
'intermediate.dense', 'output.dense', 'output.LayerNorm']:
|
||||
for w in ['weight', 'bias']:
|
||||
compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
|
||||
for layer in [
|
||||
"attention.self.query",
|
||||
"attention.self.key",
|
||||
"attention.self.value",
|
||||
"attention.output.dense",
|
||||
"attention.output.LayerNorm",
|
||||
"intermediate.dense",
|
||||
"output.dense",
|
||||
"output.LayerNorm",
|
||||
]:
|
||||
for w in ["weight", "bias"]:
|
||||
compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}"
|
||||
]
|
||||
std_idx += 1
|
||||
|
||||
### Language Modeling Head ###s
|
||||
if args.model_type == 'roberta':
|
||||
for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
|
||||
compressed_sd[f'{layer}'] = state_dict[f'{layer}']
|
||||
# Language Modeling Head ###s
|
||||
if args.model_type == "roberta":
|
||||
for layer in ["lm_head.decoder.weight", "lm_head.bias"]:
|
||||
compressed_sd[f"{layer}"] = state_dict[f"{layer}"]
|
||||
if args.vocab_transform:
|
||||
for w in ['weight', 'bias']:
|
||||
compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
|
||||
compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
|
||||
elif args.model_type == 'gpt2':
|
||||
for w in ['weight', 'bias']:
|
||||
compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
|
||||
compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
|
||||
for w in ["weight", "bias"]:
|
||||
compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"]
|
||||
compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"]
|
||||
elif args.model_type == "gpt2":
|
||||
for w in ["weight", "bias"]:
|
||||
compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
|
||||
compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
|
||||
|
||||
print(f'N layers selected for distillation: {std_idx}')
|
||||
print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
|
||||
print(f"N layers selected for distillation: {std_idx}")
|
||||
print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
|
||||
|
||||
print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
|
||||
print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
|
||||
torch.save(compressed_sd, args.dump_checkpoint)
|
||||
|
||||
@@ -16,67 +16,77 @@
|
||||
Preprocessing script before training DistilBERT.
|
||||
Specific to BERT -> DistilBERT.
|
||||
"""
|
||||
from transformers import BertForMaskedLM, RobertaForMaskedLM
|
||||
import torch
|
||||
import argparse
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
|
||||
import torch
|
||||
|
||||
from transformers import BertForMaskedLM
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation"
|
||||
)
|
||||
parser.add_argument("--model_type", default="bert", choices=["bert"])
|
||||
parser.add_argument("--model_name", default='bert-base-uncased', type=str)
|
||||
parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
|
||||
parser.add_argument("--vocab_transform", action='store_true')
|
||||
parser.add_argument("--model_name", default="bert-base-uncased", type=str)
|
||||
parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str)
|
||||
parser.add_argument("--vocab_transform", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
if args.model_type == 'bert':
|
||||
if args.model_type == "bert":
|
||||
model = BertForMaskedLM.from_pretrained(args.model_name)
|
||||
prefix = 'bert'
|
||||
prefix = "bert"
|
||||
else:
|
||||
raise ValueError(f'args.model_type should be "bert".')
|
||||
|
||||
state_dict = model.state_dict()
|
||||
compressed_sd = {}
|
||||
|
||||
for w in ['word_embeddings', 'position_embeddings']:
|
||||
compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
|
||||
state_dict[f'{prefix}.embeddings.{w}.weight']
|
||||
for w in ['weight', 'bias']:
|
||||
compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
|
||||
state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
|
||||
for w in ["word_embeddings", "position_embeddings"]:
|
||||
compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"]
|
||||
for w in ["weight", "bias"]:
|
||||
compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"]
|
||||
|
||||
std_idx = 0
|
||||
for teacher_idx in [0, 2, 4, 7, 9, 11]:
|
||||
for w in ['weight', 'bias']:
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']
|
||||
for w in ["weight", "bias"]:
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}"
|
||||
]
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}"
|
||||
]
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}"
|
||||
]
|
||||
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}"
|
||||
]
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}"
|
||||
]
|
||||
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
|
||||
compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
|
||||
state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}"
|
||||
]
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}"
|
||||
]
|
||||
compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[
|
||||
f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}"
|
||||
]
|
||||
std_idx += 1
|
||||
|
||||
compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
|
||||
compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
|
||||
compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
|
||||
compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
|
||||
if args.vocab_transform:
|
||||
for w in ['weight', 'bias']:
|
||||
compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
|
||||
compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
|
||||
for w in ["weight", "bias"]:
|
||||
compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
|
||||
compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]
|
||||
|
||||
print(f'N layers selected for distillation: {std_idx}')
|
||||
print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
|
||||
print(f"N layers selected for distillation: {std_idx}")
|
||||
print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
|
||||
|
||||
print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
|
||||
print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
|
||||
torch.save(compressed_sd, args.dump_checkpoint)
|
||||
|
||||
@@ -15,37 +15,42 @@
|
||||
"""
|
||||
Preprocessing script before training the distilled model.
|
||||
"""
|
||||
from collections import Counter
|
||||
import argparse
|
||||
import pickle
|
||||
import logging
|
||||
import pickle
|
||||
from collections import Counter
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
|
||||
parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle",
|
||||
help="The binarized dataset.")
|
||||
parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle",
|
||||
help="The dump file.")
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file."
|
||||
)
|
||||
parser.add_argument("--vocab_size", default=30522, type=int)
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.info(f'Loading data from {args.data_file}')
|
||||
with open(args.data_file, 'rb') as fp:
|
||||
logger.info(f"Loading data from {args.data_file}")
|
||||
with open(args.data_file, "rb") as fp:
|
||||
data = pickle.load(fp)
|
||||
|
||||
logger.info('Counting occurences for MLM.')
|
||||
logger.info("Counting occurences for MLM.")
|
||||
counter = Counter()
|
||||
for tk_ids in data:
|
||||
counter.update(tk_ids)
|
||||
counts = [0]*args.vocab_size
|
||||
counts = [0] * args.vocab_size
|
||||
for k, v in counter.items():
|
||||
counts[k] = v
|
||||
|
||||
logger.info(f'Dump to {args.token_counts_dump}')
|
||||
with open(args.token_counts_dump, 'wb') as handle:
|
||||
logger.info(f"Dump to {args.token_counts_dump}")
|
||||
with open(args.token_counts_dump, "wb") as handle:
|
||||
pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
@@ -16,272 +16,304 @@
|
||||
Training the distilled model.
|
||||
Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
|
||||
"""
|
||||
import os
|
||||
import argparse
|
||||
import pickle
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import shutil
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from transformers import BertConfig, BertForMaskedLM, BertTokenizer
|
||||
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
|
||||
from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer
|
||||
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
|
||||
|
||||
from distiller import Distiller
|
||||
from utils import git_log, logger, init_gpu_params, set_seed
|
||||
from lm_seqs_dataset import LmSeqsDataset
|
||||
from transformers import (
|
||||
BertConfig,
|
||||
BertForMaskedLM,
|
||||
BertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForMaskedLM,
|
||||
DistilBertTokenizer,
|
||||
GPT2Config,
|
||||
GPT2LMHeadModel,
|
||||
GPT2Tokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForMaskedLM,
|
||||
RobertaTokenizer,
|
||||
)
|
||||
from utils import git_log, init_gpu_params, logger, set_seed
|
||||
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
|
||||
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||
'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
|
||||
"distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
|
||||
"roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||
"bert": (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||
"gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
|
||||
}
|
||||
|
||||
|
||||
def sanity_checks(args):
|
||||
"""
|
||||
A bunch of args sanity checks to perform even starting...
|
||||
"""
|
||||
assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.)
|
||||
assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.)
|
||||
assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0)
|
||||
assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0)
|
||||
if args.mlm:
|
||||
assert os.path.isfile(args.token_counts)
|
||||
assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert'])
|
||||
assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"])
|
||||
else:
|
||||
assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2'])
|
||||
assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"])
|
||||
|
||||
assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert')
|
||||
assert args.teacher_type == args.student_type or (
|
||||
args.student_type == "distilbert" and args.teacher_type == "bert"
|
||||
)
|
||||
assert os.path.isfile(args.student_config)
|
||||
if args.student_pretrained_weights is not None:
|
||||
assert os.path.isfile(args.student_pretrained_weights)
|
||||
|
||||
if args.freeze_token_type_embds: assert args.student_type in ['roberta']
|
||||
if args.freeze_token_type_embds:
|
||||
assert args.student_type in ["roberta"]
|
||||
|
||||
assert args.alpha_ce >= 0.0
|
||||
assert args.alpha_mlm >= 0.0
|
||||
assert args.alpha_clm >= 0.0
|
||||
assert args.alpha_mse >= 0.0
|
||||
assert args.alpha_cos >= 0.0
|
||||
assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0
|
||||
|
||||
assert args.alpha_ce >= 0.
|
||||
assert args.alpha_mlm >= 0.
|
||||
assert args.alpha_clm >= 0.
|
||||
assert args.alpha_mse >= 0.
|
||||
assert args.alpha_cos >= 0.
|
||||
assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.
|
||||
|
||||
def freeze_pos_embeddings(student, args):
|
||||
if args.student_type == 'roberta':
|
||||
if args.student_type == "roberta":
|
||||
student.roberta.embeddings.position_embeddings.weight.requires_grad = False
|
||||
elif args.student_type == 'gpt2':
|
||||
elif args.student_type == "gpt2":
|
||||
student.transformer.wpe.weight.requires_grad = False
|
||||
|
||||
|
||||
def freeze_token_type_embeddings(student, args):
|
||||
if args.student_type == 'roberta':
|
||||
if args.student_type == "roberta":
|
||||
student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Training")
|
||||
parser.add_argument("--force", action='store_true',
|
||||
help="Overwrite dump_path if it already exists.")
|
||||
parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.")
|
||||
|
||||
parser.add_argument("--dump_path", type=str, required=True,
|
||||
help="The output directory (log, checkpoints, parameters, etc.)")
|
||||
parser.add_argument("--data_file", type=str, required=True,
|
||||
help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
|
||||
parser.add_argument(
|
||||
"--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_file",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.",
|
||||
)
|
||||
|
||||
parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
|
||||
help="The student type (DistilBERT, RoBERTa).")
|
||||
parser.add_argument("--student_config", type=str, required=True,
|
||||
help="Path to the student configuration.")
|
||||
parser.add_argument("--student_pretrained_weights", default=None, type=str,
|
||||
help="Load student initialization checkpoint.")
|
||||
parser.add_argument(
|
||||
"--student_type",
|
||||
type=str,
|
||||
choices=["distilbert", "roberta", "gpt2"],
|
||||
required=True,
|
||||
help="The student type (DistilBERT, RoBERTa).",
|
||||
)
|
||||
parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.")
|
||||
parser.add_argument(
|
||||
"--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint."
|
||||
)
|
||||
|
||||
parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
|
||||
help="Teacher type (BERT, RoBERTa).")
|
||||
parser.add_argument("--teacher_name", type=str, required=True,
|
||||
help="The teacher model.")
|
||||
parser.add_argument(
|
||||
"--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)."
|
||||
)
|
||||
parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.")
|
||||
|
||||
parser.add_argument("--temperature", default=2., type=float,
|
||||
help="Temperature for the softmax temperature.")
|
||||
parser.add_argument("--alpha_ce", default=0.5, type=float,
|
||||
help="Linear weight for the distillation loss. Must be >=0.")
|
||||
parser.add_argument("--alpha_mlm", default=0.0, type=float,
|
||||
help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
|
||||
parser.add_argument("--alpha_clm", default=0.5, type=float,
|
||||
help="Linear weight for the CLM loss. Must be >=0.")
|
||||
parser.add_argument("--alpha_mse", default=0.0, type=float,
|
||||
help="Linear weight of the MSE loss. Must be >=0.")
|
||||
parser.add_argument("--alpha_cos", default=0.0, type=float,
|
||||
help="Linear weight of the cosine embedding loss. Must be >=0.")
|
||||
parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.")
|
||||
parser.add_argument(
|
||||
"--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alpha_mlm",
|
||||
default=0.0,
|
||||
type=float,
|
||||
help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.",
|
||||
)
|
||||
parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.")
|
||||
parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.")
|
||||
parser.add_argument(
|
||||
"--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0."
|
||||
)
|
||||
|
||||
parser.add_argument("--mlm", action="store_true",
|
||||
help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
|
||||
parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
|
||||
help="Proportion of tokens for which we need to make a prediction.")
|
||||
parser.add_argument("--word_mask", default=0.8, type=float,
|
||||
help="Proportion of tokens to mask out.")
|
||||
parser.add_argument("--word_keep", default=0.1, type=float,
|
||||
help="Proportion of tokens to keep.")
|
||||
parser.add_argument("--word_rand", default=0.1, type=float,
|
||||
help="Proportion of tokens to randomly replace.")
|
||||
parser.add_argument("--mlm_smoothing", default=0.7, type=float,
|
||||
help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
|
||||
parser.add_argument("--token_counts", type=str,
|
||||
help="The token counts in the data_file for MLM.")
|
||||
parser.add_argument(
|
||||
"--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mlm_mask_prop",
|
||||
default=0.15,
|
||||
type=float,
|
||||
help="Proportion of tokens for which we need to make a prediction.",
|
||||
)
|
||||
parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.")
|
||||
parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.")
|
||||
parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.")
|
||||
parser.add_argument(
|
||||
"--mlm_smoothing",
|
||||
default=0.7,
|
||||
type=float,
|
||||
help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).",
|
||||
)
|
||||
parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.")
|
||||
|
||||
parser.add_argument("--restrict_ce_to_mask", action='store_true',
|
||||
help="If true, compute the distilation loss only the [MLM] prediction distribution.")
|
||||
parser.add_argument("--freeze_pos_embs", action="store_true",
|
||||
help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
|
||||
parser.add_argument("--freeze_token_type_embds", action="store_true",
|
||||
help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")
|
||||
parser.add_argument(
|
||||
"--restrict_ce_to_mask",
|
||||
action="store_true",
|
||||
help="If true, compute the distilation loss only the [MLM] prediction distribution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--freeze_pos_embs",
|
||||
action="store_true",
|
||||
help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--freeze_token_type_embds",
|
||||
action="store_true",
|
||||
help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.",
|
||||
)
|
||||
|
||||
parser.add_argument("--n_epoch", type=int, default=3,
|
||||
help="Number of pass on the whole dataset.")
|
||||
parser.add_argument("--batch_size", type=int, default=5,
|
||||
help="Batch size (for each process).")
|
||||
parser.add_argument("--group_by_size", action='store_false',
|
||||
help="If true, group sequences that have similar length into the same batch. Default is true.")
|
||||
parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.")
|
||||
parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).")
|
||||
parser.add_argument(
|
||||
"--group_by_size",
|
||||
action="store_false",
|
||||
help="If true, group sequences that have similar length into the same batch. Default is true.",
|
||||
)
|
||||
|
||||
parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
|
||||
help="Gradient accumulation for larger training batches.")
|
||||
parser.add_argument("--warmup_prop", default=0.05, type=float,
|
||||
help="Linear warmup proportion.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight deay if we apply some.")
|
||||
parser.add_argument("--learning_rate", default=5e-4, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-6, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=5.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--initializer_range", default=0.02, type=float,
|
||||
help="Random initialization range.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=50,
|
||||
help="Gradient accumulation for larger training batches.",
|
||||
)
|
||||
parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
|
||||
parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.")
|
||||
|
||||
parser.add_argument('--fp16', action='store_true',
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument("--n_gpu", type=int, default=1,
|
||||
help="Number of GPUs in the node.")
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="Distributed training - Local rank")
|
||||
parser.add_argument("--seed", type=int, default=56,
|
||||
help="Random seed")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.")
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank")
|
||||
parser.add_argument("--seed", type=int, default=56, help="Random seed")
|
||||
|
||||
parser.add_argument("--log_interval", type=int, default=500,
|
||||
help="Tensorboard logging interval.")
|
||||
parser.add_argument("--checkpoint_interval", type=int, default=4000,
|
||||
help="Checkpoint interval.")
|
||||
parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.")
|
||||
parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.")
|
||||
args = parser.parse_args()
|
||||
sanity_checks(args)
|
||||
|
||||
|
||||
## ARGS ##
|
||||
# ARGS #
|
||||
init_gpu_params(args)
|
||||
set_seed(args)
|
||||
if args.is_master:
|
||||
if os.path.exists(args.dump_path):
|
||||
if not args.force:
|
||||
raise ValueError(f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it'
|
||||
'Use `--force` if you want to overwrite it')
|
||||
raise ValueError(
|
||||
f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it"
|
||||
"Use `--force` if you want to overwrite it"
|
||||
)
|
||||
else:
|
||||
shutil.rmtree(args.dump_path)
|
||||
|
||||
if not os.path.exists(args.dump_path):
|
||||
os.makedirs(args.dump_path)
|
||||
logger.info(f'Experiment will be dumped and logged in {args.dump_path}')
|
||||
logger.info(f"Experiment will be dumped and logged in {args.dump_path}")
|
||||
|
||||
|
||||
### SAVE PARAMS ###
|
||||
logger.info(f'Param: {args}')
|
||||
with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
|
||||
# SAVE PARAMS #
|
||||
logger.info(f"Param: {args}")
|
||||
with open(os.path.join(args.dump_path, "parameters.json"), "w") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
git_log(args.dump_path)
|
||||
|
||||
student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
|
||||
teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]
|
||||
|
||||
### TOKENIZER ###
|
||||
# TOKENIZER #
|
||||
tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
|
||||
special_tok_ids = {}
|
||||
for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
|
||||
idx = tokenizer.all_special_tokens.index(tok_symbol)
|
||||
special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
|
||||
logger.info(f'Special tokens {special_tok_ids}')
|
||||
logger.info(f"Special tokens {special_tok_ids}")
|
||||
args.special_tok_ids = special_tok_ids
|
||||
args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]
|
||||
|
||||
|
||||
## DATA LOADER ##
|
||||
logger.info(f'Loading data from {args.data_file}')
|
||||
with open(args.data_file, 'rb') as fp:
|
||||
# DATA LOADER #
|
||||
logger.info(f"Loading data from {args.data_file}")
|
||||
with open(args.data_file, "rb") as fp:
|
||||
data = pickle.load(fp)
|
||||
|
||||
|
||||
if args.mlm:
|
||||
logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
|
||||
with open(args.token_counts, 'rb') as fp:
|
||||
logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)")
|
||||
with open(args.token_counts, "rb") as fp:
|
||||
counts = pickle.load(fp)
|
||||
|
||||
|
||||
token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
|
||||
for idx in special_tok_ids.values():
|
||||
token_probs[idx] = 0. # do not predict special tokens
|
||||
token_probs[idx] = 0.0 # do not predict special tokens
|
||||
token_probs = torch.from_numpy(token_probs)
|
||||
else:
|
||||
token_probs = None
|
||||
|
||||
|
||||
train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
|
||||
logger.info(f'Data loader created.')
|
||||
logger.info(f"Data loader created.")
|
||||
|
||||
|
||||
## STUDENT ##
|
||||
logger.info(f'Loading student config from {args.student_config}')
|
||||
# STUDENT #
|
||||
logger.info(f"Loading student config from {args.student_config}")
|
||||
stu_architecture_config = student_config_class.from_pretrained(args.student_config)
|
||||
stu_architecture_config.output_hidden_states = True
|
||||
|
||||
if args.student_pretrained_weights is not None:
|
||||
logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
|
||||
student = student_model_class.from_pretrained(args.student_pretrained_weights,
|
||||
config=stu_architecture_config)
|
||||
logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}")
|
||||
student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config)
|
||||
else:
|
||||
student = student_model_class(stu_architecture_config)
|
||||
|
||||
|
||||
if args.n_gpu > 0:
|
||||
student.to(f'cuda:{args.local_rank}')
|
||||
logger.info(f'Student loaded.')
|
||||
student.to(f"cuda:{args.local_rank}")
|
||||
logger.info(f"Student loaded.")
|
||||
|
||||
|
||||
## TEACHER ##
|
||||
# TEACHER #
|
||||
teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
|
||||
if args.n_gpu > 0:
|
||||
teacher.to(f'cuda:{args.local_rank}')
|
||||
logger.info(f'Teacher loaded from {args.teacher_name}.')
|
||||
teacher.to(f"cuda:{args.local_rank}")
|
||||
logger.info(f"Teacher loaded from {args.teacher_name}.")
|
||||
|
||||
|
||||
## FREEZING ##
|
||||
# FREEZING #
|
||||
if args.freeze_pos_embs:
|
||||
freeze_pos_embeddings(student, args)
|
||||
if args.freeze_token_type_embds:
|
||||
freeze_token_type_embeddings(student, args)
|
||||
|
||||
|
||||
## SANITY CHECKS ##
|
||||
# SANITY CHECKS #
|
||||
assert student.config.vocab_size == teacher.config.vocab_size
|
||||
assert student.config.hidden_size == teacher.config.hidden_size
|
||||
assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
|
||||
if args.mlm:
|
||||
assert token_probs.size(0) == stu_architecture_config.vocab_size
|
||||
|
||||
|
||||
## DISTILLER ##
|
||||
# DISTILLER #
|
||||
torch.cuda.empty_cache()
|
||||
distiller = Distiller(params=args,
|
||||
dataset=train_lm_seq_dataset,
|
||||
token_probs=token_probs,
|
||||
student=student,
|
||||
teacher=teacher)
|
||||
distiller = Distiller(
|
||||
params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher
|
||||
)
|
||||
distiller.train()
|
||||
logger.info("Let's go get some drinks.")
|
||||
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"activation": "gelu",
|
||||
"attention_dropout": 0.1,
|
||||
"dim": 768,
|
||||
"dropout": 0.1,
|
||||
"hidden_dim": 3072,
|
||||
"initializer_range": 0.02,
|
||||
"max_position_embeddings": 512,
|
||||
"n_heads": 12,
|
||||
"n_layers": 6,
|
||||
"sinusoidal_pos_embds": true,
|
||||
"tie_weights_": true,
|
||||
"vocab_size": 119547
|
||||
}
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"vocab_size": 50265,
|
||||
"hidden_size": 768,
|
||||
"num_hidden_layers": 6,
|
||||
"num_attention_heads": 12,
|
||||
"intermediate_size": 3072,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"max_position_embeddings": 514,
|
||||
"type_vocab_size": 1,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_eps": 0.00001
|
||||
}
|
||||
@@ -15,17 +15,21 @@
|
||||
""" Utils to train DistilBERT
|
||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
"""
|
||||
import git
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
import logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
import git
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -35,12 +39,12 @@ def git_log(folder_path: str):
|
||||
"""
|
||||
repo = git.Repo(search_parent_directories=True)
|
||||
repo_infos = {
|
||||
'repo_id': str(repo),
|
||||
'repo_sha': str(repo.head.object.hexsha),
|
||||
'repo_branch': str(repo.active_branch)
|
||||
"repo_id": str(repo),
|
||||
"repo_sha": str(repo.head.object.hexsha),
|
||||
"repo_branch": str(repo.active_branch),
|
||||
}
|
||||
|
||||
with open(os.path.join(folder_path, 'git_log.json'), 'w') as f:
|
||||
with open(os.path.join(folder_path, "git_log.json"), "w") as f:
|
||||
json.dump(repo_infos, f, indent=4)
|
||||
|
||||
|
||||
@@ -57,21 +61,21 @@ def init_gpu_params(params):
|
||||
|
||||
assert torch.cuda.is_available()
|
||||
|
||||
logger.info('Initializing GPUs')
|
||||
logger.info("Initializing GPUs")
|
||||
if params.n_gpu > 1:
|
||||
assert params.local_rank != -1
|
||||
|
||||
params.world_size = int(os.environ['WORLD_SIZE'])
|
||||
params.n_gpu_per_node = int(os.environ['N_GPU_NODE'])
|
||||
params.global_rank = int(os.environ['RANK'])
|
||||
params.world_size = int(os.environ["WORLD_SIZE"])
|
||||
params.n_gpu_per_node = int(os.environ["N_GPU_NODE"])
|
||||
params.global_rank = int(os.environ["RANK"])
|
||||
|
||||
# number of nodes / node ID
|
||||
params.n_nodes = params.world_size // params.n_gpu_per_node
|
||||
params.node_id = params.global_rank // params.n_gpu_per_node
|
||||
params.multi_gpu = True
|
||||
|
||||
assert params.n_nodes == int(os.environ['N_NODES'])
|
||||
assert params.node_id == int(os.environ['NODE_RANK'])
|
||||
assert params.n_nodes == int(os.environ["N_NODES"])
|
||||
assert params.node_id == int(os.environ["NODE_RANK"])
|
||||
|
||||
# local job (single GPU)
|
||||
else:
|
||||
@@ -114,8 +118,7 @@ def init_gpu_params(params):
|
||||
if params.multi_gpu:
|
||||
logger.info("Initializing PyTorch distributed")
|
||||
torch.distributed.init_process_group(
|
||||
init_method='env://',
|
||||
backend='nccl',
|
||||
init_method="env://", backend="nccl",
|
||||
)
|
||||
|
||||
|
||||
|
||||
221
examples/hans/hans_processors.py
Normal file
221
examples/hans/hans_processors.py
Normal file
@@ -0,0 +1,221 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" HANS processors and helpers """
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from transformers.file_utils import is_tf_available
|
||||
from utils_hans import DataProcessor, InputExample, InputFeatures
|
||||
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def hans_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: HANS task name (a key of ``glue_processors``). Required when ``examples`` is a
            ``tf.data.Dataset``, because the task processor is used to decode each element.
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    Raises:
        ValueError: If ``examples`` is a ``tf.data.Dataset`` and ``task`` is ``None``.
        KeyError: If ``output_mode`` is neither ``classification`` nor ``regression``.
        AssertionError: If an encoded example does not have exactly ``max_length`` entries after padding.

    """
    is_tf_dataset = is_tf_available() and isinstance(examples, tf.data.Dataset)

    processor = None
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s", label_list, task)
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s", output_mode, task)

    # The processor is the only way to turn tensor dicts into InputExamples, so fail
    # early with a clear message instead of an UnboundLocalError inside the loop.
    if is_tf_dataset and processor is None:
        raise ValueError("`task` must be specified when `examples` is a tf.data.Dataset")

    label_map = {label: i for i, label in enumerate(label_list)}

    # Attention-mask values for real tokens and for padding positions.
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d", ex_index)
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)

        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [real_mask_value] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([pad_mask_value] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([pad_mask_value] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length
        )
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length
        )

        if output_mode == "classification":
            # Labels that are not in the label list fall back to class index 0.
            label = label_map[example.label] if example.label in label_map else 0
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)
        pairID = str(example.pairID)

        if ex_index < 10:
            logger.info("*** Example ***")
            logger.info("text_a: %s", example.text_a)
            logger.info("text_b: %s", example.text_b)
            logger.info("guid: %s", example.guid)
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s", " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s", " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)", example.label, label)

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
                pairID=pairID,
            )
        )

    if is_tf_available() and is_tf_dataset:

        def gen():
            # NOTE: pairID is not part of the generated TensorFlow dataset elements.
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    return features
|
||||
|
||||
|
||||
class HansProcessor(DataProcessor):
    """Processor for the HANS data set."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        idx = tensor_dict["idx"].numpy()
        premise = tensor_dict["premise"].numpy().decode("utf-8")
        hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(idx, premise, hypothesis, label)

    def get_train_examples(self, data_dir):
        """See base class."""
        train_path = os.path.join(data_dir, "heuristics_train_set.txt")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        dev_path = os.path.join(data_dir, "heuristics_evaluation_set.txt")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for row_idx, row in enumerate(lines):
            # The first row is skipped (presumably the column header of the TSV file).
            if row_idx == 0:
                continue
            guid = "%s-%s" % (set_type, row[0])
            premise, hypothesis = row[5], row[6]
            # Drop a leading "ex" prefix from the pair identifier when present.
            raw_pair_id = row[7]
            pair_id = raw_pair_id[2:] if raw_pair_id.startswith("ex") else raw_pair_id
            examples.append(
                InputExample(guid=guid, text_a=premise, text_b=hypothesis, label=row[-1], pairID=pair_id)
            )
        return examples
|
||||
|
||||
|
||||
# Task registries keyed by task name; "hans" is the only task defined here.

# Number of labels per task.
glue_tasks_num_labels = dict(hans=3)

# Processor class used to read each task's data.
glue_processors = dict(hans=HansProcessor)

# Output mode ("classification" or "regression") of each task.
glue_output_modes = dict(hans="classification")
|
||||
643
examples/hans/test_hans.py
Normal file
643
examples/hans/test_hans.py
Normal file
@@ -0,0 +1,643 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Finetuning the library models for sequence classification on HANS (Bert, XLM, XLNet, RoBERTa)."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from hans_processors import glue_output_modes as output_modes
|
||||
from hans_processors import glue_processors as processors
|
||||
from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AlbertConfig,
|
||||
AlbertForSequenceClassification,
|
||||
AlbertTokenizer,
|
||||
BertConfig,
|
||||
BertForSequenceClassification,
|
||||
BertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForSequenceClassification,
|
||||
DistilBertTokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForSequenceClassification,
|
||||
RobertaTokenizer,
|
||||
XLMConfig,
|
||||
XLMForSequenceClassification,
|
||||
XLMTokenizer,
|
||||
XLNetConfig,
|
||||
XLNetForSequenceClassification,
|
||||
XLNetTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Flat tuple of every key found in the ``pretrained_config_archive_map`` of the listed config classes.
# NOTE(review): AlbertConfig is registered in MODEL_CLASSES below but is not listed here -- confirm
# whether that omission is intentional.
ALL_MODELS = tuple(
    archive_key
    for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
    for archive_key in conf.pretrained_config_archive_map.keys()
)

# Maps a model type name to its (config class, sequence-classification model class, tokenizer class).
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForSequenceClassification, BertTokenizer),
    xlnet=(XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    xlm=(XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    roberta=(RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    distilbert=(DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    albert=(AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
)
|
||||
|
||||
|
||||
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random number generators with ``args.seed``.

    When ``args.n_gpu`` is greater than zero, all CUDA devices are seeded as well.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
    """ Train the model

    Args:
        args: Run configuration namespace. ``args.train_batch_size`` is set here, and
            ``args.num_train_epochs`` is overwritten when ``args.max_steps > 0``.
        train_dataset: Dataset of training batches fed to a ``DataLoader``.
        model: Model to optimize; it is wrapped for apex fp16 / DataParallel / DistributedDataParallel
            according to ``args``.
        tokenizer: Tokenizer forwarded to ``evaluate`` when evaluating during training.

    Returns:
        Tuple ``(global_step, average_loss)``: the number of optimizer updates performed and the
        accumulated training loss divided by that number (``0.0`` if no update was performed).

    Raises:
        ImportError: If ``args.fp16`` is set but apex is not installed.
    """
    is_main_process = args.local_rank in [-1, 0]
    if is_main_process:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        # Floor at 1 so a dataloader with fewer batches than the accumulation factor
        # does not cause a ZeroDivisionError.
        updates_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=not is_main_process)
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not is_main_process)
        for step, batch in enumerate(epoch_iterator):
            # Re-enable train mode on every step: evaluate() may have switched the model to eval mode.
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if is_main_process and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)

                if is_main_process and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            # Stop as soon as max_steps updates have been performed; a strict ">" comparison
            # would run one update beyond the schedule length given to the scheduler.
            if args.max_steps > 0 and global_step >= args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step >= args.max_steps:
            train_iterator.close()
            break

    if is_main_process:
        tb_writer.close()

    # Guard against a run in which no optimizer update happened.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
    """Run the model over the evaluation set(s) and write the predictions to disk.

    For every evaluated task a ``hans_predictions.txt`` file is written into the task's output
    directory, starting with the header line ``pairID,gld_label`` followed by one
    ``ex<pairID>,<predicted label>`` line per example.

    Args:
        args: Run configuration namespace. ``args.eval_batch_size`` is set here.
        model: Model to evaluate; wrapped in ``DataParallel`` when ``args.n_gpu > 1`` and not already wrapped.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Free-form string that only appears in the "Running evaluation" log line.

    Returns:
        The ``results`` dict. NOTE: as written nothing is ever stored in it, so it is always empty.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset, label_list = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval: do not wrap again when the model is already a DataParallel instance
        # (e.g. when called from train(), or on the second task of this loop).
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        pair_ids = None
        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet"] else None
                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs["labels"].detach().cpu().numpy()
            batch_pair_ids = batch[4].detach().cpu().numpy()
            if preds is None:
                preds = batch_logits
                out_label_ids = batch_labels
                pair_ids = batch_pair_ids
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)
                pair_ids = np.append(pair_ids, batch_pair_ids, axis=0)

        if preds is None:
            # Empty evaluation set: avoid dividing by zero and write a header-only file.
            logger.warning("No evaluation batches for task %s; writing an empty predictions file", eval_task)
            preds, pair_ids = [], []
        else:
            eval_loss = eval_loss / nb_eval_steps
            logger.info("  Eval loss = %f", eval_loss)
            if args.output_mode == "classification":
                preds = np.argmax(preds, axis=1)
            elif args.output_mode == "regression":
                preds = np.squeeze(preds)

        output_eval_file = os.path.join(eval_output_dir, "hans_predictions.txt")
        with open(output_eval_file, "w") as writer:
            writer.write("pairID,gld_label\n")
            for pid, pred in zip(pair_ids, preds):
                writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n")

    return results
|
||||
|
||||
|
||||
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Load (or build and cache) the features for ``task`` and wrap them in a TensorDataset.

    Args:
        args: Parsed command-line namespace. Reads ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``model_type``, ``max_seq_length`` and ``overwrite_cache``.
        task: Key into ``processors`` / ``output_modes``.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
        evaluate: If True use the dev examples, otherwise the train examples.

    Returns:
        tuple: ``(dataset, label_list)`` where ``dataset`` is a ``TensorDataset`` of
        (input_ids, attention_mask, token_type_ids, labels, pair_ids).

    Raises:
        ValueError: If the task's output mode is neither "classification" nor "regression".
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )

    label_list = processor.get_labels()
    # The swap is applied before the cache check so that the returned label_list matches
    # the label indices stored in the features whether they are rebuilt or read from cache.
    if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Release the other processes now that the cache has been written
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    else:
        # Previously this fell through and failed later with an unbound local variable.
        raise ValueError("Unsupported output mode: %s" % output_mode)
    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
    return dataset, label_list
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, then optionally train and/or evaluate.

    The steps are: parse arguments, refuse to clobber a non-empty output directory,
    optionally attach a remote debugger, select the device / distributed backend,
    configure logging, seed the RNGs, load config/tokenizer/model, run training when
    ``--do_train`` is given (saving the result to ``--output_dir``), and run
    evaluation over one or all checkpoints when ``--do_eval`` is given.

    Returns:
        dict: Evaluation results keyed by ``"<metric>_<global_step>"``; empty when
        evaluation was not requested.

    Raises:
        ValueError: If the output directory is non-empty while training without
            ``--overwrite_output_dir``, or if the task name is unknown.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    # Refuse to silently overwrite the results of a previous training run.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # With --no_cuda the GPU count must be 0; otherwise downstream code that checks
        # n_gpu would still seed CUDA and wrap the model in DataParallel on a GPU host.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # First process is done downloading: release the waiting processes
        torch.distributed.barrier()

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory below output_dir that contains a weights file is a checkpoint
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The step suffix is only meaningful when several checkpoints are compared
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
|
||||
|
||||
|
||||
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -14,11 +14,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import csv
|
||||
import sys
|
||||
import copy
|
||||
import csv
|
||||
import json
|
||||
|
||||
|
||||
class InputExample(object):
|
||||
"""
|
||||
A single training/test example for simple sequence classification.
|
||||
@@ -32,11 +32,13 @@ class InputExample(object):
|
||||
label: (Optional) string. The label of the example. This should be
|
||||
specified for train and dev examples, but not for test examples.
|
||||
"""
|
||||
def __init__(self, guid, text_a, text_b=None, label=None):
|
||||
|
||||
def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
|
||||
self.guid = guid
|
||||
self.text_a = text_a
|
||||
self.text_b = text_b
|
||||
self.label = label
|
||||
self.pairID = pairID
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_json_string())
|
||||
@@ -64,11 +66,12 @@ class InputFeatures(object):
|
||||
label: Label corresponding to the input
|
||||
"""
|
||||
|
||||
def __init__(self, input_ids, attention_mask, token_type_ids, label):
|
||||
def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
|
||||
self.input_ids = input_ids
|
||||
self.attention_mask = attention_mask
|
||||
self.token_type_ids = token_type_ids
|
||||
self.label = label
|
||||
self.pairID = pairID
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_json_string())
|
||||
@@ -107,13 +110,6 @@ class DataProcessor(object):
|
||||
"""Gets the list of labels for this data set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def tfds_map(self, example):
|
||||
"""Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are.
|
||||
This method converts examples to the correct format."""
|
||||
if len(self.get_labels()) > 1:
|
||||
example.label = self.get_labels()[int(example.label)]
|
||||
return example
|
||||
|
||||
@classmethod
|
||||
def _read_tsv(cls, input_file, quotechar=None):
|
||||
"""Reads a tab separated value file."""
|
||||
@@ -121,7 +117,5 @@ class DataProcessor(object):
|
||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
return lines
|
||||
614
examples/mm-imdb/run_mmimdb.py
Normal file
614
examples/mm-imdb/run_mmimdb.py
Normal file
@@ -0,0 +1,614 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) Facebook, Inc. and its affiliates.
|
||||
# Copyright (c) HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset."""
|
||||
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from sklearn.metrics import f1_score
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AlbertConfig,
|
||||
AlbertModel,
|
||||
AlbertTokenizer,
|
||||
BertConfig,
|
||||
BertModel,
|
||||
BertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertModel,
|
||||
DistilBertTokenizer,
|
||||
MMBTConfig,
|
||||
MMBTForClassification,
|
||||
RobertaConfig,
|
||||
RobertaModel,
|
||||
RobertaTokenizer,
|
||||
XLMConfig,
|
||||
XLMModel,
|
||||
XLMTokenizer,
|
||||
XLNetConfig,
|
||||
XLNetModel,
|
||||
XLNetTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Flat tuple of every pretrained shortcut name declared by the supported config classes.
# It is interpolated into the --model_name_or_path help text. AlbertConfig is included so
# the list stays consistent with the "albert" entry of MODEL_CLASSES below.
ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig, AlbertConfig)
    ),
    (),
)

# Maps the --model_type value to its (config class, model class, tokenizer class) triple.
MODEL_CLASSES = {
    "bert": (BertConfig, BertModel, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
}
|
||||
|
||||
|
||||
def set_seed(args):
    """Seed the Python, NumPy and PyTorch random generators from ``args.seed``.

    When ``args.n_gpu`` is positive the CUDA generators are seeded as well.
    """
    seed = args.seed
    # Same three CPU-side generators, seeded in the same order as before.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer, criterion):
    """Train the model.

    Args:
        args: Parsed command-line namespace; ``train_batch_size`` and (when
            ``max_steps`` > 0) ``num_train_epochs`` are written back onto it.
        train_dataset: Dataset consumed through a ``DataLoader`` using ``collate_fn``.
        model: Model to optimise; called with input_ids, input_modal, attention_mask,
            modal_start_tokens and modal_end_tokens.
        tokenizer: Only forwarded to ``evaluate``.
        criterion: Callable ``criterion(logits, labels)`` returning the loss.

    Returns:
        tuple: ``(global_step, average_loss)`` where the average is the accumulated
        training loss divided by the number of optimizer steps (or the raw
        accumulated loss when no optimizer step was performed).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=collate_fn,
        num_workers=args.num_workers,
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1, n_no_improve = 0, 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
            }
            outputs = model(**inputs)
            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss = criterion(logits, labels)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            # Only step the optimizer once enough micro-batches have been accumulated
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, criterion)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    # Mean training loss since the previous logging step
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        # End-of-epoch evaluation drives early stopping on micro-F1 (single-process runs only)
        if args.local_rank == -1:
            results = evaluate(args, model, tokenizer, criterion)
            if results["micro_f1"] > best_f1:
                best_f1 = results["micro_f1"]
                n_no_improve = 0
            else:
                n_no_improve += 1

            if n_no_improve > args.patience:
                train_iterator.close()
                break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against ZeroDivisionError when no optimizer step ran (e.g. fewer batches
    # than gradient_accumulation_steps, or an empty dataset).
    return global_step, tr_loss / max(global_step, 1)
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, criterion, prefix=""):
    """Evaluate ``model`` on the dev split and write the metrics to ``eval_results.txt``.

    Args:
        args: Parsed command-line namespace; ``eval_batch_size`` is written back onto it.
        model: Model to evaluate (may already be wrapped in ``DataParallel``).
        tokenizer: Forwarded to ``load_examples``.
        criterion: Callable ``criterion(logits, labels)`` returning the loss.
        prefix: Optional sub-directory of ``args.output_dir`` for the results file;
            also used in log messages.

    Returns:
        dict: ``{"loss", "macro_f1", "micro_f1"}``. Predictions are obtained by
        thresholding ``sigmoid(logits)`` at 0.5.
    """
    eval_output_dir = args.output_dir
    eval_dataset = load_examples(args, tokenizer, evaluate=True)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn
    )

    # multi-gpu eval: train() passes in a model that is already wrapped, so avoid
    # nesting one DataParallel inside another.
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    pred_chunks = []
    label_chunks = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
            }
            outputs = model(**inputs)
            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
            tmp_eval_loss = criterion(logits, labels)
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Collect per-batch arrays and join them once after the loop instead of
        # re-copying the growing arrays on every batch.
        pred_chunks.append(torch.sigmoid(logits).detach().cpu().numpy() > 0.5)
        label_chunks.append(labels.detach().cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    preds = np.concatenate(pred_chunks, axis=0)
    out_label_ids = np.concatenate(label_chunks, axis=0)
    result = {
        "loss": eval_loss,
        "macro_f1": f1_score(out_label_ids, preds, average="macro"),
        "micro_f1": f1_score(out_label_ids, preds, average="micro"),
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # The prefix sub-directory is not guaranteed to exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
|
||||
|
||||
|
||||
def load_examples(args, tokenizer, evaluate=False):
    """Build the ``JsonlDataset`` for the dev split (``evaluate=True``) or the train split.

    The file is ``dev.jsonl`` or ``train.jsonl`` inside ``args.data_dir``.
    """
    split_file = "dev.jsonl" if evaluate else "train.jsonl"
    jsonl_path = os.path.join(args.data_dir, split_file)
    image_transforms = get_image_transforms()
    label_names = get_mmimdb_labels()
    # Length passed to the dataset: total sequence length minus the image embeddings
    # and two further positions (presumably special tokens -- TODO confirm in JsonlDataset).
    text_length = args.max_seq_length - args.num_image_embeds - 2
    return JsonlDataset(jsonl_path, tokenizer, image_transforms, label_names, text_length)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: train and/or evaluate an MMBT classifier.

    Parses the command-line arguments, sets up the device, logging and the
    random seed, builds an ``MMBTForClassification`` model from a pretrained
    transformer plus an ``ImageEncoder``, and then:

    * with ``--do_train``: trains using ``BCEWithLogitsLoss`` whose
      ``pos_weight`` is the inverse relative frequency of each label in the
      training set, then saves the weights, tokenizer and arguments to
      ``--output_dir`` and reloads them;
    * with ``--do_eval``: evaluates the weights stored in ``--output_dir``, or
      in every directory below it that contains ``WEIGHTS_NAME`` when
      ``--eval_all_checkpoints`` is given.

    Returns:
        dict: evaluation metrics keyed as ``<metric>_<global_step>``; the step
        part is the empty string when a single checkpoint is evaluated. Empty
        when no evaluation was run.

    Raises:
        ValueError: if ``--output_dir`` exists, is not empty, training was
            requested and ``--overwrite_output_dir`` was not given.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder"
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    # Refuse to clobber the results of a previous training run unless asked to.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # With --no_cuda the run stays on the CPU, so no GPU may be reported
        # even if CUDA devices are physically present.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1

    args.device = device

    # Setup logging: only the main process (local_rank -1 or 0) logs at INFO level.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    # Setup model
    labels = get_mmimdb_labels()
    num_labels = len(labels)
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    transformer_config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    transformer = model_class.from_pretrained(
        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
    )
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
    model = MMBTForClassification(config, transformer, img_encoder)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Unweighted fallback so that an evaluation-only run (--do_eval without
    # --do_train) has a loss function instead of failing on an undefined name.
    # Training replaces it below with the frequency-weighted loss, so the eval
    # loss of an evaluation-only run is not weighted by label frequency.
    criterion = nn.BCEWithLogitsLoss()

    # Training
    if args.do_train:
        train_dataset = load_examples(args, tokenizer, evaluate=False)
        label_frequences = train_dataset.get_label_frequencies()
        label_frequences = [label_frequences[l] for l in labels]
        # pos_weight = 1 / (relative frequency of the label in the training set)
        label_weights = (
            torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)
        ) ** -1
        criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME))
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory under output_dir that holds a weights file.
            checkpoints = [
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            ]
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
            model = MMBTForClassification(config, transformer, img_encoder)
            # `checkpoint` is a directory; the state dict is the WEIGHTS_NAME file inside it.
            model.load_state_dict(torch.load(os.path.join(checkpoint, WEIGHTS_NAME)))
            model.to(args.device)
            result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
            result = {k + "_{}".format(global_step): v for k, v in result.items()}
            results.update(result)

    return results
|
||||
|
||||
|
||||
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
143
examples/mm-imdb/utils_mmimdb.py
Normal file
143
examples/mm-imdb/utils_mmimdb.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) Facebook, Inc. and its affiliates.
|
||||
# Copyright (c) HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
from collections import Counter
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision
|
||||
import torchvision.transforms as transforms
|
||||
from PIL import Image
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
# Maps a number of image embeddings N to the 2-D output size handed to
# nn.AdaptiveAvgPool2d in ImageEncoder; the two factors of each entry multiply to N.
POOLING_BREAKDOWN = {
    1: (1, 1),
    2: (2, 1),
    3: (3, 1),
    4: (2, 2),
    5: (5, 1),
    6: (3, 2),
    7: (7, 1),
    8: (4, 2),
    9: (3, 3),
}
|
||||
|
||||
|
||||
class ImageEncoder(nn.Module):
    """Encode images into a short sequence of feature vectors.

    A pretrained torchvision ResNet-152 with its last two child modules removed
    produces a feature map, which is adaptively average-pooled to the grid
    ``POOLING_BREAKDOWN[args.num_image_embeds]`` and flattened into a sequence.
    """

    def __init__(self, args):
        super().__init__()
        resnet = torchvision.models.resnet152(pretrained=True)
        # Keep everything except the last two children of the network.
        backbone_layers = list(resnet.children())[:-2]
        self.model = nn.Sequential(*backbone_layers)
        pooled_size = POOLING_BREAKDOWN[args.num_image_embeds]
        self.pool = nn.AdaptiveAvgPool2d(pooled_size)

    def forward(self, x):
        """Return the pooled features with the spatial positions as dimension 1.

        Shapes as noted in the original code:
        Bx3x224x224 -> Bx2048x7x7 -> Bx2048xN -> BxNx2048
        """
        feature_map = self.model(x)
        pooled = self.pool(feature_map)
        # Merge the two spatial dimensions into one, then put it before the channels.
        flat = torch.flatten(pooled, start_dim=2)
        sequence = flat.transpose(1, 2).contiguous()
        return sequence  # BxNx2048
|
||||
|
||||
|
||||
class JsonlDataset(Dataset):
    """Dataset of text / image / multi-label examples stored as JSON lines.

    Every non-blank line of the file is parsed as one JSON object. The keys
    read from each object are ``"text"``, ``"label"`` (an iterable of label
    names) and ``"img"`` (an image path relative to the file's directory).
    """

    def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length):
        """Load all rows of ``data_path`` into memory.

        Args:
            data_path: path of the ``.jsonl`` file.
            tokenizer: object whose ``encode(text, add_special_tokens=True)``
                returns the token ids of a text.
            transforms: callable applied to each loaded RGB PIL image.
            labels: list of label names; a label's position is its class index.
            max_seq_length: maximum number of text tokens kept per example
                (not counting the first and last encoded token).
        """
        # Use a context manager so the file handle is closed as soon as the
        # rows are read; blank lines are skipped instead of crashing json.loads.
        with open(data_path) as jsonl_file:
            self.data = [json.loads(line) for line in jsonl_file if line.strip()]
        self.data_dir = os.path.dirname(data_path)
        self.tokenizer = tokenizer
        self.labels = labels
        self.n_classes = len(labels)
        self.max_seq_length = max_seq_length

        self.transforms = transforms

    def __len__(self):
        """Return the number of loaded rows."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the example at ``index`` as a dict of tensors.

        The first and last encoded token are split off and returned as
        ``"image_start_token"`` / ``"image_end_token"``; the tokens in between
        are truncated to ``max_seq_length`` and returned as ``"sentence"``.
        ``"label"`` is a multi-hot float vector of length ``n_classes`` and
        ``"image"`` is the transformed RGB image.
        """
        sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True))
        start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
        sentence = sentence[: self.max_seq_length]

        # Multi-hot target: 1 at the index of every label name of this row.
        label = torch.zeros(self.n_classes)
        label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1

        image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB")
        image = self.transforms(image)

        return {
            "image_start_token": start_token,
            "image_end_token": end_token,
            "sentence": sentence,
            "image": image,
            "label": label,
        }

    def get_label_frequencies(self):
        """Return a ``Counter`` mapping each label name to its number of occurrences."""
        label_freqs = Counter()
        for row in self.data:
            label_freqs.update(row["label"])
        return label_freqs
|
||||
|
||||
|
||||
def collate_fn(batch):
    """Collate dataset rows into padded batch tensors.

    Args:
        batch: list of dicts with the keys ``"sentence"``, ``"image"``,
            ``"label"``, ``"image_start_token"`` and ``"image_end_token"``.

    Returns:
        Tuple ``(text, mask, image, image_start_token, image_end_token, label)``.
        ``text`` and ``mask`` are long tensors of shape
        ``(len(batch), longest sentence)``: ``text`` is zero-padded on the
        right and ``mask`` is 1 on token positions and 0 on padding. The
        remaining entries are the per-row values stacked along a new first
        dimension.
    """
    seq_lengths = [len(example["sentence"]) for example in batch]
    batch_size = len(batch)
    longest = max(seq_lengths)

    # Token ids, zero-padded on the right up to the longest sentence.
    text_tensor = torch.zeros(batch_size, longest, dtype=torch.long)
    for row_idx, example in enumerate(batch):
        text_tensor[row_idx, : seq_lengths[row_idx]] = example["sentence"]

    # Position j of row i is a real token exactly when j < length of row i.
    positions = torch.arange(longest).unsqueeze(0)
    mask_tensor = (positions < torch.tensor(seq_lengths).unsqueeze(1)).long()

    def stack_field(key):
        # Stack one field of every row along a new leading batch dimension.
        return torch.stack([example[key] for example in batch])

    img_tensor = stack_field("image")
    tgt_tensor = stack_field("label")
    img_start_token = stack_field("image_start_token")
    img_end_token = stack_field("image_end_token")

    return text_tensor, mask_tensor, img_tensor, img_start_token, img_end_token, tgt_tensor
|
||||
|
||||
|
||||
def get_mmimdb_labels():
    """Return a new list with the 23 genre label names in their fixed order."""
    genres = [
        "Crime", "Drama", "Thriller", "Action", "Comedy", "Romance",
        "Documentary", "Short", "Mystery", "History", "Family", "Adventure",
        "Fantasy", "Sci-Fi", "Western", "Horror", "Sport", "War",
        "Music", "Musical", "Animation", "Biography", "Film-Noir",
    ]
    return genres
|
||||
|
||||
|
||||
def get_image_transforms():
    """Build the image preprocessing pipeline.

    The composed steps are: resize to 256, center-crop to 224, convert to a
    tensor, then normalize each channel with the fixed mean / std below.
    """
    channel_mean = [0.46777044, 0.44531429, 0.40661017]
    channel_std = [0.12221994, 0.12145835, 0.14380469]
    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)
|
||||
@@ -15,7 +15,7 @@ Please check out the repo under uber-research for more information: https://gith
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/transformers && cd transformers
|
||||
pip install [--editable] .
|
||||
pip install .
|
||||
pip install nltk torchtext # additional requirements.
|
||||
cd examples/pplm
|
||||
```
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import torch
|
||||
|
||||
|
||||
class ClassificationHead(torch.nn.Module):
|
||||
"""Classification Head for transformer encoders"""
|
||||
|
||||
def __init__(self, class_size, embed_size):
|
||||
super(ClassificationHead, self).__init__()
|
||||
super().__init__()
|
||||
self.class_size = class_size
|
||||
self.embed_size = embed_size
|
||||
# self.mlp1 = torch.nn.Linear(embed_size, embed_size)
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
#! /usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||
# Copyright (c) 2019 Uber Technologies, Inc.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
#http://www.apache.org/licenses/LICENSE-2.0
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Example command with bag of words:
|
||||
@@ -34,10 +34,11 @@ import torch.nn.functional as F
|
||||
from torch.autograd import Variable
|
||||
from tqdm import trange
|
||||
|
||||
from pplm_classification_head import ClassificationHead
|
||||
from transformers import GPT2Tokenizer
|
||||
from transformers.file_utils import cached_path
|
||||
from transformers.modeling_gpt2 import GPT2LMHeadModel
|
||||
from pplm_classification_head import ClassificationHead
|
||||
|
||||
|
||||
PPLM_BOW = 1
|
||||
PPLM_DISCRIM = 2
|
||||
@@ -46,13 +47,13 @@ SMALL_CONST = 1e-15
|
||||
BIG_CONST = 1e10
|
||||
|
||||
BAG_OF_WORDS_ARCHIVE_MAP = {
|
||||
'legal': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
|
||||
'military': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
|
||||
'politics': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
|
||||
'religion': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
|
||||
'science': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
|
||||
'space': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
|
||||
'technology': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
|
||||
"legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
|
||||
"military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
|
||||
"politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
|
||||
"religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
|
||||
"science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
|
||||
"space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
|
||||
"technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
|
||||
}
|
||||
|
||||
DISCRIMINATOR_MODELS_PARAMS = {
|
||||
@@ -75,10 +76,10 @@ DISCRIMINATOR_MODELS_PARAMS = {
|
||||
}
|
||||
|
||||
|
||||
def to_var(x, requires_grad=False, volatile=False, device='cuda'):
|
||||
if torch.cuda.is_available() and device == 'cuda':
|
||||
def to_var(x, requires_grad=False, volatile=False, device="cuda"):
|
||||
if torch.cuda.is_available() and device == "cuda":
|
||||
x = x.cuda()
|
||||
elif device != 'cuda':
|
||||
elif device != "cuda":
|
||||
x = x.to(device)
|
||||
return Variable(x, requires_grad=requires_grad, volatile=volatile)
|
||||
|
||||
@@ -95,49 +96,39 @@ def top_k_filter(logits, k, probs=False):
|
||||
values = torch.topk(logits, k)[0]
|
||||
batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
|
||||
if probs:
|
||||
return torch.where(logits < batch_mins,
|
||||
torch.ones_like(logits) * 0.0, logits)
|
||||
return torch.where(logits < batch_mins,
|
||||
torch.ones_like(logits) * -BIG_CONST,
|
||||
logits)
|
||||
return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits)
|
||||
return torch.where(logits < batch_mins, torch.ones_like(logits) * -BIG_CONST, logits)
|
||||
|
||||
|
||||
def perturb_past(
|
||||
past,
|
||||
model,
|
||||
last,
|
||||
unpert_past=None,
|
||||
unpert_logits=None,
|
||||
accumulated_hidden=None,
|
||||
grad_norms=None,
|
||||
stepsize=0.01,
|
||||
one_hot_bows_vectors=None,
|
||||
classifier=None,
|
||||
class_label=None,
|
||||
loss_type=0,
|
||||
num_iterations=3,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
kl_scale=0.01,
|
||||
device='cuda',
|
||||
past,
|
||||
model,
|
||||
last,
|
||||
unpert_past=None,
|
||||
unpert_logits=None,
|
||||
accumulated_hidden=None,
|
||||
grad_norms=None,
|
||||
stepsize=0.01,
|
||||
one_hot_bows_vectors=None,
|
||||
classifier=None,
|
||||
class_label=None,
|
||||
loss_type=0,
|
||||
num_iterations=3,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
kl_scale=0.01,
|
||||
device="cuda",
|
||||
):
|
||||
# Generate initial perturbed past
|
||||
grad_accumulator = [
|
||||
(np.zeros(p.shape).astype("float32"))
|
||||
for p in past
|
||||
]
|
||||
grad_accumulator = [(np.zeros(p.shape).astype("float32")) for p in past]
|
||||
|
||||
if accumulated_hidden is None:
|
||||
accumulated_hidden = 0
|
||||
|
||||
if decay:
|
||||
decay_mask = torch.arange(
|
||||
0.,
|
||||
1.0 + SMALL_CONST,
|
||||
1.0 / (window_length)
|
||||
)[1:]
|
||||
decay_mask = torch.arange(0.0, 1.0 + SMALL_CONST, 1.0 / (window_length))[1:]
|
||||
else:
|
||||
decay_mask = 1.0
|
||||
|
||||
@@ -146,26 +137,17 @@ def perturb_past(
|
||||
_, _, _, curr_length, _ = past[0].shape
|
||||
|
||||
if curr_length > window_length and window_length > 0:
|
||||
ones_key_val_shape = (
|
||||
tuple(past[0].shape[:-2])
|
||||
+ tuple([window_length])
|
||||
+ tuple(past[0].shape[-1:])
|
||||
)
|
||||
ones_key_val_shape = tuple(past[0].shape[:-2]) + tuple([window_length]) + tuple(past[0].shape[-1:])
|
||||
|
||||
zeros_key_val_shape = (
|
||||
tuple(past[0].shape[:-2])
|
||||
+ tuple([curr_length - window_length])
|
||||
+ tuple(past[0].shape[-1:])
|
||||
tuple(past[0].shape[:-2]) + tuple([curr_length - window_length]) + tuple(past[0].shape[-1:])
|
||||
)
|
||||
|
||||
ones_mask = torch.ones(ones_key_val_shape)
|
||||
ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
|
||||
ones_mask = ones_mask.permute(0, 1, 2, 4, 3)
|
||||
|
||||
window_mask = torch.cat(
|
||||
(ones_mask, torch.zeros(zeros_key_val_shape)),
|
||||
dim=-2
|
||||
).to(device)
|
||||
window_mask = torch.cat((ones_mask, torch.zeros(zeros_key_val_shape)), dim=-2).to(device)
|
||||
else:
|
||||
window_mask = torch.ones_like(past[0]).to(device)
|
||||
|
||||
@@ -175,8 +157,7 @@ def perturb_past(
|
||||
for i in range(num_iterations):
|
||||
print("Iteration ", i + 1)
|
||||
curr_perturbation = [
|
||||
to_var(torch.from_numpy(p_), requires_grad=True, device=device)
|
||||
for p_ in grad_accumulator
|
||||
to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator
|
||||
]
|
||||
|
||||
# Compute hidden using perturbed past
|
||||
@@ -184,10 +165,7 @@ def perturb_past(
|
||||
_, _, _, curr_length, _ = curr_perturbation[0].shape
|
||||
all_logits, _, all_hidden = model(last, past=perturbed_past)
|
||||
hidden = all_hidden[-1]
|
||||
new_accumulated_hidden = accumulated_hidden + torch.sum(
|
||||
hidden,
|
||||
dim=1
|
||||
).detach()
|
||||
new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach()
|
||||
# TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
|
||||
logits = all_logits[:, -1, :]
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
@@ -210,20 +188,13 @@ def perturb_past(
|
||||
wte = model.resize_token_embeddings()
|
||||
for _ in range(horizon_length):
|
||||
inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
|
||||
_, curr_unpert_past, curr_all_hidden = model(
|
||||
past=curr_unpert_past,
|
||||
inputs_embeds=inputs_embeds
|
||||
)
|
||||
_, curr_unpert_past, curr_all_hidden = model(past=curr_unpert_past, inputs_embeds=inputs_embeds)
|
||||
curr_hidden = curr_all_hidden[-1]
|
||||
new_accumulated_hidden = new_accumulated_hidden + torch.sum(
|
||||
curr_hidden, dim=1)
|
||||
new_accumulated_hidden = new_accumulated_hidden + torch.sum(curr_hidden, dim=1)
|
||||
|
||||
prediction = classifier(new_accumulated_hidden /
|
||||
(curr_length + 1 + horizon_length))
|
||||
prediction = classifier(new_accumulated_hidden / (curr_length + 1 + horizon_length))
|
||||
|
||||
label = torch.tensor(prediction.shape[0] * [class_label],
|
||||
device=device,
|
||||
dtype=torch.long)
|
||||
label = torch.tensor(prediction.shape[0] * [class_label], device=device, dtype=torch.long)
|
||||
discrim_loss = ce_loss(prediction, label)
|
||||
print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
|
||||
loss += discrim_loss
|
||||
@@ -232,21 +203,15 @@ def perturb_past(
|
||||
kl_loss = 0.0
|
||||
if kl_scale > 0.0:
|
||||
unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
|
||||
unpert_probs = (
|
||||
unpert_probs + SMALL_CONST *
|
||||
(unpert_probs <= SMALL_CONST).float().to(device).detach()
|
||||
)
|
||||
correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(
|
||||
device).detach()
|
||||
unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
|
||||
correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
|
||||
corrected_probs = probs + correction.detach()
|
||||
kl_loss = kl_scale * (
|
||||
(corrected_probs * (corrected_probs / unpert_probs).log()).sum()
|
||||
)
|
||||
print(' kl_loss', kl_loss.data.cpu().numpy())
|
||||
kl_loss = kl_scale * ((corrected_probs * (corrected_probs / unpert_probs).log()).sum())
|
||||
print(" kl_loss", kl_loss.data.cpu().numpy())
|
||||
loss += kl_loss
|
||||
|
||||
loss_per_iter.append(loss.data.cpu().numpy())
|
||||
print(' pplm_loss', (loss - kl_loss).data.cpu().numpy())
|
||||
print(" pplm_loss", (loss - kl_loss).data.cpu().numpy())
|
||||
|
||||
# compute gradients
|
||||
loss.backward()
|
||||
@@ -259,15 +224,12 @@ def perturb_past(
|
||||
]
|
||||
else:
|
||||
grad_norms = [
|
||||
(torch.norm(p_.grad * window_mask) + SMALL_CONST)
|
||||
for index, p_ in enumerate(curr_perturbation)
|
||||
(torch.norm(p_.grad * window_mask) + SMALL_CONST) for index, p_ in enumerate(curr_perturbation)
|
||||
]
|
||||
|
||||
# normalize gradients
|
||||
grad = [
|
||||
-stepsize *
|
||||
(p_.grad * window_mask / grad_norms[
|
||||
index] ** gamma).data.cpu().numpy()
|
||||
-stepsize * (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy()
|
||||
for index, p_ in enumerate(curr_perturbation)
|
||||
]
|
||||
|
||||
@@ -285,36 +247,27 @@ def perturb_past(
|
||||
past = new_past
|
||||
|
||||
# apply the accumulated perturbations to the past
|
||||
grad_accumulator = [
|
||||
to_var(torch.from_numpy(p_), requires_grad=True, device=device)
|
||||
for p_ in grad_accumulator
|
||||
]
|
||||
grad_accumulator = [to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator]
|
||||
pert_past = list(map(add, past, grad_accumulator))
|
||||
|
||||
return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter
|
||||
|
||||
|
||||
def get_classifier(
|
||||
name: Optional[str], class_label: Union[str, int],
|
||||
device: str
|
||||
name: Optional[str], class_label: Union[str, int], device: str
|
||||
) -> Tuple[Optional[ClassificationHead], Optional[int]]:
|
||||
if name is None:
|
||||
return None, None
|
||||
|
||||
params = DISCRIMINATOR_MODELS_PARAMS[name]
|
||||
classifier = ClassificationHead(
|
||||
class_size=params['class_size'],
|
||||
embed_size=params['embed_size']
|
||||
).to(device)
|
||||
classifier = ClassificationHead(class_size=params["class_size"], embed_size=params["embed_size"]).to(device)
|
||||
if "url" in params:
|
||||
resolved_archive_file = cached_path(params["url"])
|
||||
elif "path" in params:
|
||||
resolved_archive_file = params["path"]
|
||||
else:
|
||||
raise ValueError("Either url or path have to be specified "
|
||||
"in the discriminator model parameters")
|
||||
classifier.load_state_dict(
|
||||
torch.load(resolved_archive_file, map_location=device))
|
||||
raise ValueError("Either url or path have to be specified " "in the discriminator model parameters")
|
||||
classifier.load_state_dict(torch.load(resolved_archive_file, map_location=device))
|
||||
classifier.eval()
|
||||
|
||||
if isinstance(class_label, str):
|
||||
@@ -341,8 +294,7 @@ def get_classifier(
|
||||
return classifier, label_id
|
||||
|
||||
|
||||
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \
|
||||
List[List[List[int]]]:
|
||||
def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]:
|
||||
bow_indices = []
|
||||
for id_or_path in bag_of_words_ids_or_paths:
|
||||
if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
|
||||
@@ -351,13 +303,11 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) ->
|
||||
filepath = id_or_path
|
||||
with open(filepath, "r") as f:
|
||||
words = f.read().strip().split("\n")
|
||||
bow_indices.append(
|
||||
[tokenizer.encode(word.strip(), add_prefix_space=True) for word in
|
||||
words])
|
||||
bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
|
||||
return bow_indices
|
||||
|
||||
|
||||
def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
|
||||
def build_bows_one_hot_vectors(bow_indices, tokenizer, device="cuda"):
|
||||
if bow_indices is None:
|
||||
return None
|
||||
|
||||
@@ -373,39 +323,35 @@ def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'):
|
||||
|
||||
|
||||
def full_text_generation(
|
||||
model,
|
||||
tokenizer,
|
||||
context=None,
|
||||
num_samples=1,
|
||||
device="cuda",
|
||||
bag_of_words=None,
|
||||
discrim=None,
|
||||
class_label=None,
|
||||
length=100,
|
||||
stepsize=0.02,
|
||||
temperature=1.0,
|
||||
top_k=10,
|
||||
sample=False,
|
||||
num_iterations=3,
|
||||
grad_length=10000,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
gm_scale=0.9,
|
||||
kl_scale=0.01,
|
||||
**kwargs
|
||||
model,
|
||||
tokenizer,
|
||||
context=None,
|
||||
num_samples=1,
|
||||
device="cuda",
|
||||
bag_of_words=None,
|
||||
discrim=None,
|
||||
class_label=None,
|
||||
length=100,
|
||||
stepsize=0.02,
|
||||
temperature=1.0,
|
||||
top_k=10,
|
||||
sample=False,
|
||||
num_iterations=3,
|
||||
grad_length=10000,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
gm_scale=0.9,
|
||||
kl_scale=0.01,
|
||||
repetition_penalty=1.0,
|
||||
**kwargs
|
||||
):
|
||||
classifier, class_id = get_classifier(
|
||||
discrim,
|
||||
class_label,
|
||||
device
|
||||
)
|
||||
classifier, class_id = get_classifier(discrim, class_label, device)
|
||||
|
||||
bow_indices = []
|
||||
if bag_of_words:
|
||||
bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
|
||||
tokenizer)
|
||||
bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
|
||||
|
||||
if bag_of_words and classifier:
|
||||
print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
|
||||
@@ -429,9 +375,10 @@ def full_text_generation(
|
||||
device=device,
|
||||
length=length,
|
||||
sample=sample,
|
||||
perturb=False
|
||||
perturb=False,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
if device == 'cuda':
|
||||
if device == "cuda":
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
pert_gen_tok_texts = []
|
||||
@@ -462,42 +409,44 @@ def full_text_generation(
|
||||
gamma=gamma,
|
||||
gm_scale=gm_scale,
|
||||
kl_scale=kl_scale,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
pert_gen_tok_texts.append(pert_gen_tok_text)
|
||||
if classifier is not None:
|
||||
discrim_losses.append(discrim_loss.data.cpu().numpy())
|
||||
losses_in_time.append(loss_in_time)
|
||||
|
||||
if device == 'cuda':
|
||||
if device == "cuda":
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
|
||||
|
||||
|
||||
def generate_text_pplm(
|
||||
model,
|
||||
tokenizer,
|
||||
context=None,
|
||||
past=None,
|
||||
device="cuda",
|
||||
perturb=True,
|
||||
bow_indices=None,
|
||||
classifier=None,
|
||||
class_label=None,
|
||||
loss_type=0,
|
||||
length=100,
|
||||
stepsize=0.02,
|
||||
temperature=1.0,
|
||||
top_k=10,
|
||||
sample=False,
|
||||
num_iterations=3,
|
||||
grad_length=10000,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
gm_scale=0.9,
|
||||
kl_scale=0.01,
|
||||
model,
|
||||
tokenizer,
|
||||
context=None,
|
||||
past=None,
|
||||
device="cuda",
|
||||
perturb=True,
|
||||
bow_indices=None,
|
||||
classifier=None,
|
||||
class_label=None,
|
||||
loss_type=0,
|
||||
length=100,
|
||||
stepsize=0.02,
|
||||
temperature=1.0,
|
||||
top_k=10,
|
||||
sample=False,
|
||||
num_iterations=3,
|
||||
grad_length=10000,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
gm_scale=0.9,
|
||||
kl_scale=0.01,
|
||||
repetition_penalty=1.0,
|
||||
):
|
||||
output_so_far = None
|
||||
if context:
|
||||
@@ -507,8 +456,7 @@ def generate_text_pplm(
|
||||
output_so_far = context_t
|
||||
|
||||
# collect one hot vectors for bags of words
|
||||
one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer,
|
||||
device)
|
||||
one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, device)
|
||||
|
||||
grad_norms = None
|
||||
last = None
|
||||
@@ -570,18 +518,21 @@ def generate_text_pplm(
|
||||
|
||||
pert_logits, past, pert_all_hidden = model(last, past=pert_past)
|
||||
pert_logits = pert_logits[:, -1, :] / temperature # + SMALL_CONST
|
||||
|
||||
for token_idx in set(output_so_far[0].tolist()):
|
||||
if pert_logits[0, token_idx] < 0:
|
||||
pert_logits[0, token_idx] *= repetition_penalty
|
||||
else:
|
||||
pert_logits[0, token_idx] /= repetition_penalty
|
||||
|
||||
pert_probs = F.softmax(pert_logits, dim=-1)
|
||||
|
||||
if classifier is not None:
|
||||
ce_loss = torch.nn.CrossEntropyLoss()
|
||||
prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
|
||||
label = torch.tensor([class_label], device=device,
|
||||
dtype=torch.long)
|
||||
label = torch.tensor([class_label], device=device, dtype=torch.long)
|
||||
unpert_discrim_loss = ce_loss(prediction, label)
|
||||
print(
|
||||
"unperturbed discrim loss",
|
||||
unpert_discrim_loss.data.cpu().numpy()
|
||||
)
|
||||
print("unperturbed discrim loss", unpert_discrim_loss.data.cpu().numpy())
|
||||
else:
|
||||
unpert_discrim_loss = 0
|
||||
|
||||
@@ -590,10 +541,8 @@ def generate_text_pplm(
|
||||
|
||||
unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
|
||||
|
||||
pert_probs = ((pert_probs ** gm_scale) * (
|
||||
unpert_probs ** (1 - gm_scale))) # + SMALL_CONST
|
||||
pert_probs = top_k_filter(pert_probs, k=top_k,
|
||||
probs=True) # + SMALL_CONST
|
||||
pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale)) # + SMALL_CONST
|
||||
pert_probs = top_k_filter(pert_probs, k=top_k, probs=True) # + SMALL_CONST
|
||||
|
||||
# rescale
|
||||
if torch.sum(pert_probs) <= 1:
|
||||
@@ -611,10 +560,7 @@ def generate_text_pplm(
|
||||
_, last = torch.topk(pert_probs, k=1, dim=-1)
|
||||
|
||||
# update context/output_so_far appending the new token
|
||||
output_so_far = (
|
||||
last if output_so_far is None
|
||||
else torch.cat((output_so_far, last), dim=1)
|
||||
)
|
||||
output_so_far = last if output_so_far is None else torch.cat((output_so_far, last), dim=1)
|
||||
|
||||
print(tokenizer.decode(output_so_far.tolist()[0]))
|
||||
|
||||
@@ -623,44 +569,43 @@ def generate_text_pplm(
|
||||
|
||||
def set_generic_model_params(discrim_weights, discrim_meta):
|
||||
if discrim_weights is None:
|
||||
raise ValueError('When using a generic discriminator, '
|
||||
'discrim_weights need to be specified')
|
||||
raise ValueError("When using a generic discriminator, " "discrim_weights need to be specified")
|
||||
if discrim_meta is None:
|
||||
raise ValueError('When using a generic discriminator, '
|
||||
'discrim_meta need to be specified')
|
||||
raise ValueError("When using a generic discriminator, " "discrim_meta need to be specified")
|
||||
|
||||
with open(discrim_meta, 'r') as discrim_meta_file:
|
||||
with open(discrim_meta, "r") as discrim_meta_file:
|
||||
meta = json.load(discrim_meta_file)
|
||||
meta['path'] = discrim_weights
|
||||
DISCRIMINATOR_MODELS_PARAMS['generic'] = meta
|
||||
meta["path"] = discrim_weights
|
||||
DISCRIMINATOR_MODELS_PARAMS["generic"] = meta
|
||||
|
||||
|
||||
def run_pplm_example(
|
||||
pretrained_model="gpt2-medium",
|
||||
cond_text="",
|
||||
uncond=False,
|
||||
num_samples=1,
|
||||
bag_of_words=None,
|
||||
discrim=None,
|
||||
discrim_weights=None,
|
||||
discrim_meta=None,
|
||||
class_label=-1,
|
||||
length=100,
|
||||
stepsize=0.02,
|
||||
temperature=1.0,
|
||||
top_k=10,
|
||||
sample=False,
|
||||
num_iterations=3,
|
||||
grad_length=10000,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
gm_scale=0.9,
|
||||
kl_scale=0.01,
|
||||
seed=0,
|
||||
no_cuda=False,
|
||||
colorama=False
|
||||
pretrained_model="gpt2-medium",
|
||||
cond_text="",
|
||||
uncond=False,
|
||||
num_samples=1,
|
||||
bag_of_words=None,
|
||||
discrim=None,
|
||||
discrim_weights=None,
|
||||
discrim_meta=None,
|
||||
class_label=-1,
|
||||
length=100,
|
||||
stepsize=0.02,
|
||||
temperature=1.0,
|
||||
top_k=10,
|
||||
sample=False,
|
||||
num_iterations=3,
|
||||
grad_length=10000,
|
||||
horizon_length=1,
|
||||
window_length=0,
|
||||
decay=False,
|
||||
gamma=1.5,
|
||||
gm_scale=0.9,
|
||||
kl_scale=0.01,
|
||||
seed=0,
|
||||
no_cuda=False,
|
||||
colorama=False,
|
||||
repetition_penalty=1.0,
|
||||
):
|
||||
# set Random seed
|
||||
torch.manual_seed(seed)
|
||||
@@ -669,21 +614,15 @@ def run_pplm_example(
|
||||
# set the device
|
||||
device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
|
||||
|
||||
if discrim == 'generic':
|
||||
if discrim == "generic":
|
||||
set_generic_model_params(discrim_weights, discrim_meta)
|
||||
|
||||
if discrim is not None:
|
||||
pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
|
||||
"pretrained_model"
|
||||
]
|
||||
print("discrim = {}, pretrained_model set "
|
||||
"to discriminator's = {}".format(discrim, pretrained_model))
|
||||
pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
|
||||
print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model))
|
||||
|
||||
# load pretrained model
|
||||
model = GPT2LMHeadModel.from_pretrained(
|
||||
pretrained_model,
|
||||
output_hidden_states=True
|
||||
)
|
||||
model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True)
|
||||
model.to(device)
|
||||
model.eval()
|
||||
|
||||
@@ -696,9 +635,7 @@ def run_pplm_example(
|
||||
|
||||
# figure out conditioning text
|
||||
if uncond:
|
||||
tokenized_cond_text = tokenizer.encode(
|
||||
[tokenizer.bos_token]
|
||||
)
|
||||
tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
|
||||
else:
|
||||
raw_text = cond_text
|
||||
while not raw_text:
|
||||
@@ -736,6 +673,7 @@ def run_pplm_example(
|
||||
gamma=gamma,
|
||||
gm_scale=gm_scale,
|
||||
kl_scale=kl_scale,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
|
||||
# untokenize unperturbed text
|
||||
@@ -750,8 +688,7 @@ def run_pplm_example(
|
||||
|
||||
bow_word_ids = set()
|
||||
if bag_of_words and colorama:
|
||||
bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
|
||||
tokenizer)
|
||||
bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
|
||||
for single_bow_list in bow_indices:
|
||||
# filtering all words in the list composed of more than 1 token
|
||||
filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
|
||||
@@ -765,13 +702,11 @@ def run_pplm_example(
|
||||
if colorama:
|
||||
import colorama
|
||||
|
||||
pert_gen_text = ''
|
||||
pert_gen_text = ""
|
||||
for word_id in pert_gen_tok_text.tolist()[0]:
|
||||
if word_id in bow_word_ids:
|
||||
pert_gen_text += '{}{}{}'.format(
|
||||
colorama.Fore.RED,
|
||||
tokenizer.decode([word_id]),
|
||||
colorama.Style.RESET_ALL
|
||||
pert_gen_text += "{}{}{}".format(
|
||||
colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL
|
||||
)
|
||||
else:
|
||||
pert_gen_text += tokenizer.decode([word_id])
|
||||
@@ -781,18 +716,16 @@ def run_pplm_example(
|
||||
print("= Perturbed generated text {} =".format(i + 1))
|
||||
print(pert_gen_text)
|
||||
print()
|
||||
except:
|
||||
pass
|
||||
except Exception as exc:
|
||||
print("Ignoring error while generating perturbed text:", exc)
|
||||
|
||||
# keep the prefix, perturbed seq, original seq for each index
|
||||
generated_texts.append(
|
||||
(tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)
|
||||
)
|
||||
generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))
|
||||
|
||||
return
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--pretrained_model",
|
||||
@@ -801,19 +734,10 @@ if __name__ == '__main__':
|
||||
default="gpt2-medium",
|
||||
help="pretrained model name or path to local checkpoint",
|
||||
)
|
||||
parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
|
||||
parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
|
||||
parser.add_argument(
|
||||
"--cond_text", type=str, default="The lake",
|
||||
help="Prefix texts to condition on"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--uncond", action="store_true",
|
||||
help="Generate from end-of-text as prefix"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_samples",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of samples to generate from the modified latents",
|
||||
"--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bag_of_words",
|
||||
@@ -821,8 +745,8 @@ if __name__ == '__main__':
|
||||
type=str,
|
||||
default=None,
|
||||
help="Bags of words used for PPLM-BoW. "
|
||||
"Either a BOW id (see list in code) or a filepath. "
|
||||
"Multiple BoWs separated by ;",
|
||||
"Either a BOW id (see list in code) or a filepath. "
|
||||
"Multiple BoWs separated by ;",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--discrim",
|
||||
@@ -832,48 +756,39 @@ if __name__ == '__main__':
|
||||
choices=("clickbait", "sentiment", "toxicity", "generic"),
|
||||
help="Discriminator to use",
|
||||
)
|
||||
parser.add_argument('--discrim_weights', type=str, default=None,
|
||||
help='Weights for the generic discriminator')
|
||||
parser.add_argument('--discrim_meta', type=str, default=None,
|
||||
help='Meta information for the generic discriminator')
|
||||
parser.add_argument("--discrim_weights", type=str, default=None, help="Weights for the generic discriminator")
|
||||
parser.add_argument(
|
||||
"--class_label",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="Class label used for the discriminator",
|
||||
"--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--class_label", type=int, default=-1, help="Class label used for the discriminator",
|
||||
)
|
||||
parser.add_argument("--length", type=int, default=100)
|
||||
parser.add_argument("--stepsize", type=float, default=0.02)
|
||||
parser.add_argument("--temperature", type=float, default=1.0)
|
||||
parser.add_argument("--top_k", type=int, default=10)
|
||||
parser.add_argument(
|
||||
"--sample", action="store_true",
|
||||
help="Generate from end-of-text as prefix"
|
||||
)
|
||||
parser.add_argument("--sample", action="store_true", help="Generate from end-of-text as prefix")
|
||||
parser.add_argument("--num_iterations", type=int, default=3)
|
||||
parser.add_argument("--grad_length", type=int, default=10000)
|
||||
parser.add_argument(
|
||||
"--window_length",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Length of past which is being optimized; "
|
||||
"0 corresponds to infinite window length",
|
||||
help="Length of past which is being optimized; " "0 corresponds to infinite window length",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--horizon_length",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Length of future to optimize over",
|
||||
"--horizon_length", type=int, default=1, help="Length of future to optimize over",
|
||||
)
|
||||
parser.add_argument("--decay", action="store_true",
|
||||
help="whether to decay or not")
|
||||
parser.add_argument("--decay", action="store_true", help="whether to decay or not")
|
||||
parser.add_argument("--gamma", type=float, default=1.5)
|
||||
parser.add_argument("--gm_scale", type=float, default=0.9)
|
||||
parser.add_argument("--kl_scale", type=float, default=0.01)
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="no cuda")
|
||||
parser.add_argument("--colorama", action="store_true",
|
||||
help="colors keywords")
|
||||
parser.add_argument("--colorama", action="store_true", help="colors keywords")
|
||||
parser.add_argument(
|
||||
"--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
run_pplm_example(**vars(args))
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
#! /usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
#Copyright (c) 2019 Uber Technologies, Inc.
|
||||
# Copyright (c) 2019 Uber Technologies, Inc.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
#http://www.apache.org/licenses/LICENSE-2.0
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
@@ -24,7 +24,6 @@ import time
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.optim
|
||||
import torch.optim as optim
|
||||
import torch.utils.data as data
|
||||
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
||||
@@ -32,8 +31,9 @@ from torchtext import data as torchtext_data
|
||||
from torchtext import datasets
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||
from pplm_classification_head import ClassificationHead
|
||||
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||
|
||||
|
||||
torch.manual_seed(0)
|
||||
np.random.seed(0)
|
||||
@@ -42,26 +42,15 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha
|
||||
max_length_seq = 100
|
||||
|
||||
|
||||
|
||||
|
||||
class Discriminator(torch.nn.Module):
|
||||
"""Transformer encoder followed by a Classification Head"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
class_size,
|
||||
pretrained_model="gpt2-medium",
|
||||
cached_mode=False,
|
||||
device='cpu'
|
||||
):
|
||||
super(Discriminator, self).__init__()
|
||||
def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
|
||||
super().__init__()
|
||||
self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
|
||||
self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
|
||||
self.embed_size = self.encoder.transformer.config.hidden_size
|
||||
self.classifier_head = ClassificationHead(
|
||||
class_size=class_size,
|
||||
embed_size=self.embed_size
|
||||
)
|
||||
self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size)
|
||||
self.cached_mode = cached_mode
|
||||
self.device = device
|
||||
|
||||
@@ -74,14 +63,10 @@ class Discriminator(torch.nn.Module):
|
||||
self.classifier_head.train()
|
||||
|
||||
def avg_representation(self, x):
|
||||
mask = x.ne(0).unsqueeze(2).repeat(
|
||||
1, 1, self.embed_size
|
||||
).float().to(self.device).detach()
|
||||
mask = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size).float().to(self.device).detach()
|
||||
hidden, _ = self.encoder.transformer(x)
|
||||
masked_hidden = hidden * mask
|
||||
avg_hidden = torch.sum(masked_hidden, dim=1) / (
|
||||
torch.sum(mask, dim=1).detach() + EPSILON
|
||||
)
|
||||
avg_hidden = torch.sum(masked_hidden, dim=1) / (torch.sum(mask, dim=1).detach() + EPSILON)
|
||||
return avg_hidden
|
||||
|
||||
def forward(self, x):
|
||||
@@ -117,10 +102,7 @@ def collate_fn(data):
|
||||
def pad_sequences(sequences):
|
||||
lengths = [len(seq) for seq in sequences]
|
||||
|
||||
padded_sequences = torch.zeros(
|
||||
len(sequences),
|
||||
max(lengths)
|
||||
).long() # padding value = 0
|
||||
padded_sequences = torch.zeros(len(sequences), max(lengths)).long() # padding value = 0
|
||||
|
||||
for i, seq in enumerate(sequences):
|
||||
end = lengths[i]
|
||||
@@ -149,8 +131,7 @@ def cached_collate_fn(data):
|
||||
return x_batch, y_batch
|
||||
|
||||
|
||||
def train_epoch(data_loader, discriminator, optimizer,
|
||||
epoch=0, log_interval=10, device='cpu'):
|
||||
def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10, device="cpu"):
|
||||
samples_so_far = 0
|
||||
discriminator.train_custom()
|
||||
for batch_idx, (input_t, target_t) in enumerate(data_loader):
|
||||
@@ -169,13 +150,15 @@ def train_epoch(data_loader, discriminator, optimizer,
|
||||
print(
|
||||
"Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
|
||||
epoch + 1,
|
||||
samples_so_far, len(data_loader.dataset),
|
||||
100 * samples_so_far / len(data_loader.dataset), loss.item()
|
||||
samples_so_far,
|
||||
len(data_loader.dataset),
|
||||
100 * samples_so_far / len(data_loader.dataset),
|
||||
loss.item(),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def evaluate_performance(data_loader, discriminator, device='cpu'):
|
||||
def evaluate_performance(data_loader, discriminator, device="cpu"):
|
||||
discriminator.eval()
|
||||
test_loss = 0
|
||||
correct = 0
|
||||
@@ -194,13 +177,12 @@ def evaluate_performance(data_loader, discriminator, device='cpu'):
|
||||
print(
|
||||
"Performance on test set: "
|
||||
"Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
|
||||
test_loss, correct, len(data_loader.dataset),
|
||||
100. * correct / len(data_loader.dataset)
|
||||
test_loss, correct, len(data_loader.dataset), 100.0 * correct / len(data_loader.dataset)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def predict(input_sentence, model, classes, cached=False, device='cpu'):
|
||||
def predict(input_sentence, model, classes, cached=False, device="cpu"):
|
||||
input_t = model.tokenizer.encode(input_sentence)
|
||||
input_t = torch.tensor([input_t], dtype=torch.long, device=device)
|
||||
if cached:
|
||||
@@ -208,17 +190,14 @@ def predict(input_sentence, model, classes, cached=False, device='cpu'):
|
||||
|
||||
log_probs = model(input_t).data.cpu().numpy().flatten().tolist()
|
||||
print("Input sentence:", input_sentence)
|
||||
print("Predictions:", ", ".join(
|
||||
"{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in
|
||||
zip(classes, log_probs)
|
||||
))
|
||||
print(
|
||||
"Predictions:",
|
||||
", ".join("{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in zip(classes, log_probs)),
|
||||
)
|
||||
|
||||
|
||||
def get_cached_data_loader(dataset, batch_size, discriminator,
|
||||
shuffle=False, device='cpu'):
|
||||
data_loader = torch.utils.data.DataLoader(dataset=dataset,
|
||||
batch_size=batch_size,
|
||||
collate_fn=collate_fn)
|
||||
def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, device="cpu"):
|
||||
data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn)
|
||||
|
||||
xs = []
|
||||
ys = []
|
||||
@@ -231,50 +210,44 @@ def get_cached_data_loader(dataset, batch_size, discriminator,
|
||||
ys += y.cpu().numpy().tolist()
|
||||
|
||||
data_loader = torch.utils.data.DataLoader(
|
||||
dataset=Dataset(xs, ys),
|
||||
batch_size=batch_size,
|
||||
shuffle=shuffle,
|
||||
collate_fn=cached_collate_fn)
|
||||
dataset=Dataset(xs, ys), batch_size=batch_size, shuffle=shuffle, collate_fn=cached_collate_fn
|
||||
)
|
||||
|
||||
return data_loader
|
||||
|
||||
|
||||
def train_discriminator(
|
||||
dataset, dataset_fp=None, pretrained_model="gpt2-medium",
|
||||
epochs=10, batch_size=64, log_interval=10,
|
||||
save_model=False, cached=False, no_cuda=False):
|
||||
dataset,
|
||||
dataset_fp=None,
|
||||
pretrained_model="gpt2-medium",
|
||||
epochs=10,
|
||||
batch_size=64,
|
||||
log_interval=10,
|
||||
save_model=False,
|
||||
cached=False,
|
||||
no_cuda=False,
|
||||
):
|
||||
device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
|
||||
|
||||
print("Preprocessing {} dataset...".format(dataset))
|
||||
start = time.time()
|
||||
|
||||
if dataset == "SST":
|
||||
idx2class = ["positive", "negative", "very positive", "very negative",
|
||||
"neutral"]
|
||||
idx2class = ["positive", "negative", "very positive", "very negative", "neutral"]
|
||||
class2idx = {c: i for i, c in enumerate(idx2class)}
|
||||
|
||||
discriminator = Discriminator(
|
||||
class_size=len(idx2class),
|
||||
pretrained_model=pretrained_model,
|
||||
cached_mode=cached,
|
||||
device=device
|
||||
class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
|
||||
).to(device)
|
||||
|
||||
text = torchtext_data.Field()
|
||||
label = torchtext_data.Field(sequential=False)
|
||||
train_data, val_data, test_data = datasets.SST.splits(
|
||||
text,
|
||||
label,
|
||||
fine_grained=True,
|
||||
train_subtrees=True,
|
||||
)
|
||||
train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
|
||||
|
||||
x = []
|
||||
y = []
|
||||
for i in trange(len(train_data), ascii=True):
|
||||
seq = TreebankWordDetokenizer().detokenize(
|
||||
vars(train_data[i])["text"]
|
||||
)
|
||||
seq = TreebankWordDetokenizer().detokenize(vars(train_data[i])["text"])
|
||||
seq = discriminator.tokenizer.encode(seq)
|
||||
seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
|
||||
x.append(seq)
|
||||
@@ -284,9 +257,7 @@ def train_discriminator(
|
||||
test_x = []
|
||||
test_y = []
|
||||
for i in trange(len(test_data), ascii=True):
|
||||
seq = TreebankWordDetokenizer().detokenize(
|
||||
vars(test_data[i])["text"]
|
||||
)
|
||||
seq = TreebankWordDetokenizer().detokenize(vars(test_data[i])["text"])
|
||||
seq = discriminator.tokenizer.encode(seq)
|
||||
seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
|
||||
test_x.append(seq)
|
||||
@@ -306,10 +277,7 @@ def train_discriminator(
|
||||
class2idx = {c: i for i, c in enumerate(idx2class)}
|
||||
|
||||
discriminator = Discriminator(
|
||||
class_size=len(idx2class),
|
||||
pretrained_model=pretrained_model,
|
||||
cached_mode=cached,
|
||||
device=device
|
||||
class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
|
||||
).to(device)
|
||||
|
||||
with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
|
||||
@@ -317,10 +285,8 @@ def train_discriminator(
|
||||
for i, line in enumerate(f):
|
||||
try:
|
||||
data.append(eval(line))
|
||||
except:
|
||||
print("Error evaluating line {}: {}".format(
|
||||
i, line
|
||||
))
|
||||
except Exception:
|
||||
print("Error evaluating line {}: {}".format(i, line))
|
||||
continue
|
||||
x = []
|
||||
y = []
|
||||
@@ -331,27 +297,20 @@ def train_discriminator(
|
||||
seq = discriminator.tokenizer.encode(d["text"])
|
||||
|
||||
if len(seq) < max_length_seq:
|
||||
seq = torch.tensor(
|
||||
[50256] + seq, device=device, dtype=torch.long
|
||||
)
|
||||
seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
|
||||
else:
|
||||
print("Line {} is longer than maximum length {}".format(
|
||||
i, max_length_seq
|
||||
))
|
||||
print("Line {} is longer than maximum length {}".format(i, max_length_seq))
|
||||
continue
|
||||
x.append(seq)
|
||||
y.append(d["label"])
|
||||
except:
|
||||
print("Error evaluating / tokenizing"
|
||||
" line {}, skipping it".format(i))
|
||||
except Exception:
|
||||
print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
|
||||
pass
|
||||
|
||||
full_dataset = Dataset(x, y)
|
||||
train_size = int(0.9 * len(full_dataset))
|
||||
test_size = len(full_dataset) - train_size
|
||||
train_dataset, test_dataset = torch.utils.data.random_split(
|
||||
full_dataset, [train_size, test_size]
|
||||
)
|
||||
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
|
||||
|
||||
discriminator_meta = {
|
||||
"class_size": len(idx2class),
|
||||
@@ -366,10 +325,7 @@ def train_discriminator(
|
||||
class2idx = {c: i for i, c in enumerate(idx2class)}
|
||||
|
||||
discriminator = Discriminator(
|
||||
class_size=len(idx2class),
|
||||
pretrained_model=pretrained_model,
|
||||
cached_mode=cached,
|
||||
device=device
|
||||
class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
|
||||
).to(device)
|
||||
|
||||
x = []
|
||||
@@ -381,27 +337,20 @@ def train_discriminator(
|
||||
seq = discriminator.tokenizer.encode(d["text"])
|
||||
|
||||
if len(seq) < max_length_seq:
|
||||
seq = torch.tensor(
|
||||
[50256] + seq, device=device, dtype=torch.long
|
||||
)
|
||||
seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
|
||||
else:
|
||||
print("Line {} is longer than maximum length {}".format(
|
||||
i, max_length_seq
|
||||
))
|
||||
print("Line {} is longer than maximum length {}".format(i, max_length_seq))
|
||||
continue
|
||||
x.append(seq)
|
||||
y.append(int(np.sum(d["label"]) > 0))
|
||||
except:
|
||||
print("Error evaluating / tokenizing"
|
||||
" line {}, skipping it".format(i))
|
||||
except Exception:
|
||||
print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
|
||||
pass
|
||||
|
||||
full_dataset = Dataset(x, y)
|
||||
train_size = int(0.9 * len(full_dataset))
|
||||
test_size = len(full_dataset) - train_size
|
||||
train_dataset, test_dataset = torch.utils.data.random_split(
|
||||
full_dataset, [train_size, test_size]
|
||||
)
|
||||
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
|
||||
|
||||
discriminator_meta = {
|
||||
"class_size": len(idx2class),
|
||||
@@ -416,8 +365,7 @@ def train_discriminator(
|
||||
# class \t text
|
||||
|
||||
if dataset_fp is None:
|
||||
raise ValueError("When generic dataset is selected, "
|
||||
"dataset_fp needs to be specified aswell.")
|
||||
raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.")
|
||||
|
||||
classes = set()
|
||||
with open(dataset_fp) as f:
|
||||
@@ -430,10 +378,7 @@ def train_discriminator(
|
||||
class2idx = {c: i for i, c in enumerate(idx2class)}
|
||||
|
||||
discriminator = Discriminator(
|
||||
class_size=len(idx2class),
|
||||
pretrained_model=pretrained_model,
|
||||
cached_mode=cached,
|
||||
device=device
|
||||
class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
|
||||
).to(device)
|
||||
|
||||
x = []
|
||||
@@ -447,34 +392,24 @@ def train_discriminator(
|
||||
|
||||
try:
|
||||
seq = discriminator.tokenizer.encode(text)
|
||||
if (len(seq) < max_length_seq):
|
||||
seq = torch.tensor(
|
||||
[50256] + seq,
|
||||
device=device,
|
||||
dtype=torch.long
|
||||
)
|
||||
if len(seq) < max_length_seq:
|
||||
seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
|
||||
|
||||
else:
|
||||
print(
|
||||
"Line {} is longer than maximum length {}".format(
|
||||
i, max_length_seq
|
||||
))
|
||||
print("Line {} is longer than maximum length {}".format(i, max_length_seq))
|
||||
continue
|
||||
|
||||
x.append(seq)
|
||||
y.append(class2idx[label])
|
||||
|
||||
except:
|
||||
except Exception:
|
||||
print("Error tokenizing line {}, skipping it".format(i))
|
||||
pass
|
||||
|
||||
full_dataset = Dataset(x, y)
|
||||
train_size = int(0.9 * len(full_dataset))
|
||||
test_size = len(full_dataset) - train_size
|
||||
train_dataset, test_dataset = torch.utils.data.random_split(
|
||||
full_dataset,
|
||||
[train_size, test_size]
|
||||
)
|
||||
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
|
||||
|
||||
discriminator_meta = {
|
||||
"class_size": len(idx2class),
|
||||
@@ -485,9 +420,7 @@ def train_discriminator(
|
||||
}
|
||||
|
||||
end = time.time()
|
||||
print("Preprocessed {} data points".format(
|
||||
len(train_dataset) + len(test_dataset))
|
||||
)
|
||||
print("Preprocessed {} data points".format(len(train_dataset) + len(test_dataset)))
|
||||
print("Data preprocessing took: {:.3f}s".format(end - start))
|
||||
|
||||
if cached:
|
||||
@@ -495,30 +428,21 @@ def train_discriminator(
|
||||
|
||||
start = time.time()
|
||||
|
||||
train_loader = get_cached_data_loader(
|
||||
train_dataset, batch_size, discriminator,
|
||||
shuffle=True, device=device
|
||||
)
|
||||
train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device)
|
||||
|
||||
test_loader = get_cached_data_loader(
|
||||
test_dataset, batch_size, discriminator, device=device
|
||||
)
|
||||
test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device)
|
||||
|
||||
end = time.time()
|
||||
print("Building representation cache took: {:.3f}s".format(end - start))
|
||||
|
||||
else:
|
||||
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
|
||||
batch_size=batch_size,
|
||||
shuffle=True,
|
||||
collate_fn=collate_fn)
|
||||
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
|
||||
batch_size=batch_size,
|
||||
collate_fn=collate_fn)
|
||||
train_loader = torch.utils.data.DataLoader(
|
||||
dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
|
||||
)
|
||||
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn)
|
||||
|
||||
if save_model:
|
||||
with open("{}_classifier_head_meta.json".format(dataset),
|
||||
"w") as meta_file:
|
||||
with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file:
|
||||
json.dump(discriminator_meta, meta_file)
|
||||
|
||||
optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)
|
||||
@@ -533,56 +457,61 @@ def train_discriminator(
|
||||
optimizer=optimizer,
|
||||
epoch=epoch,
|
||||
log_interval=log_interval,
|
||||
device=device
|
||||
)
|
||||
evaluate_performance(
|
||||
data_loader=test_loader,
|
||||
discriminator=discriminator,
|
||||
device=device
|
||||
device=device,
|
||||
)
|
||||
evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device)
|
||||
|
||||
end = time.time()
|
||||
print("Epoch took: {:.3f}s".format(end - start))
|
||||
|
||||
print("\nExample prediction")
|
||||
predict(example_sentence, discriminator, idx2class,
|
||||
cached=cached, device=device)
|
||||
predict(example_sentence, discriminator, idx2class, cached=cached, device=device)
|
||||
|
||||
if save_model:
|
||||
# torch.save(discriminator.state_dict(),
|
||||
# "{}_discriminator_{}.pt".format(
|
||||
# args.dataset, epoch + 1
|
||||
# ))
|
||||
torch.save(discriminator.get_classifier().state_dict(),
|
||||
"{}_classifier_head_epoch_{}.pt".format(dataset,
|
||||
epoch + 1))
|
||||
torch.save(
|
||||
discriminator.get_classifier().state_dict(),
|
||||
"{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Train a discriminator on top of GPT-2 representations")
|
||||
parser.add_argument("--dataset", type=str, default="SST",
|
||||
choices=("SST", "clickbait", "toxic", "generic"),
|
||||
help="dataset to train the discriminator on."
|
||||
"In case of generic, the dataset is expected"
|
||||
"to be a TSBV file with structure: class \\t text")
|
||||
parser.add_argument("--dataset_fp", type=str, default="",
|
||||
help="File path of the dataset to use. "
|
||||
"Needed only in case of generic datadset")
|
||||
parser.add_argument("--pretrained_model", type=str, default="gpt2-medium",
|
||||
help="Pretrained model to use as encoder")
|
||||
parser.add_argument("--epochs", type=int, default=10, metavar="N",
|
||||
help="Number of training epochs")
|
||||
parser.add_argument("--batch_size", type=int, default=64, metavar="N",
|
||||
help="input batch size for training (default: 64)")
|
||||
parser.add_argument("--log_interval", type=int, default=10, metavar="N",
|
||||
help="how many batches to wait before logging training status")
|
||||
parser.add_argument("--save_model", action="store_true",
|
||||
help="whether to save the model")
|
||||
parser.add_argument("--cached", action="store_true",
|
||||
help="whether to cache the input representations")
|
||||
parser.add_argument("--no_cuda", action="store_true",
|
||||
help="use to turn off cuda")
|
||||
parser = argparse.ArgumentParser(description="Train a discriminator on top of GPT-2 representations")
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default="SST",
|
||||
choices=("SST", "clickbait", "toxic", "generic"),
|
||||
help="dataset to train the discriminator on."
|
||||
"In case of generic, the dataset is expected"
|
||||
"to be a TSBV file with structure: class \\t text",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_fp",
|
||||
type=str,
|
||||
default="",
|
||||
help="File path of the dataset to use. " "Needed only in case of generic datadset",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
|
||||
)
|
||||
parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
|
||||
parser.add_argument(
|
||||
"--batch_size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log_interval",
|
||||
type=int,
|
||||
default=10,
|
||||
metavar="N",
|
||||
help="how many batches to wait before logging training status",
|
||||
)
|
||||
parser.add_argument("--save_model", action="store_true", help="whether to save the model")
|
||||
parser.add_argument("--cached", action="store_true", help="whether to cache the input representations")
|
||||
parser.add_argument("--no_cuda", action="store_true", help="use to turn off cuda")
|
||||
args = parser.parse_args()
|
||||
|
||||
train_discriminator(**(vars(args)))
|
||||
|
||||
@@ -19,30 +19,23 @@
|
||||
Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650)
|
||||
which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1
|
||||
"""
|
||||
import os
|
||||
import argparse
|
||||
import logging
|
||||
from datetime import timedelta, datetime
|
||||
from tqdm import tqdm
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
|
||||
from torch.utils.data import DataLoader, SequentialSampler, Subset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from torch.nn import CrossEntropyLoss, MSELoss
|
||||
|
||||
from transformers import (WEIGHTS_NAME,
|
||||
BertConfig, BertForSequenceClassification, BertTokenizer,
|
||||
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
||||
XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
|
||||
|
||||
from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
|
||||
from tqdm import tqdm
|
||||
|
||||
from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed
|
||||
from transformers import glue_compute_metrics as compute_metrics
|
||||
from transformers import glue_output_modes as output_modes
|
||||
from transformers import glue_processors as processors
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -63,7 +56,9 @@ def print_2d_tensor(tensor):
|
||||
logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
|
||||
|
||||
|
||||
def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
|
||||
def compute_heads_importance(
|
||||
args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
|
||||
):
|
||||
""" This method shows how to compute:
|
||||
- head attention entropy
|
||||
- head importance scores according to http://arxiv.org/abs/1905.10650
|
||||
@@ -85,8 +80,14 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
|
||||
input_ids, input_mask, segment_ids, label_ids = batch
|
||||
|
||||
# Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
|
||||
outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask)
|
||||
loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1] # Loss and logits are the first, attention the last
|
||||
outputs = model(
|
||||
input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
|
||||
)
|
||||
loss, logits, all_attentions = (
|
||||
outputs[0],
|
||||
outputs[1],
|
||||
outputs[-1],
|
||||
) # Loss and logits are the first, attention the last
|
||||
loss.backward() # Backpropagate to populate the gradients in the head mask
|
||||
|
||||
if compute_entropy:
|
||||
@@ -113,15 +114,15 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
|
||||
# Layerwise importance normalization
|
||||
if not args.dont_normalize_importance_by_layer:
|
||||
exponent = 2
|
||||
norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
|
||||
norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
|
||||
head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
|
||||
|
||||
if not args.dont_normalize_global_importance:
|
||||
head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
|
||||
|
||||
# Print/save matrices
|
||||
np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
|
||||
np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
|
||||
np.save(os.path.join(args.output_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy())
|
||||
np.save(os.path.join(args.output_dir, "head_importance.npy"), head_importance.detach().cpu().numpy())
|
||||
|
||||
logger.info("Attention entropies")
|
||||
print_2d_tensor(attn_entropy)
|
||||
@@ -129,7 +130,9 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
|
||||
print_2d_tensor(head_importance)
|
||||
logger.info("Head ranked by importance scores")
|
||||
head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
|
||||
head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
|
||||
head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
|
||||
head_importance.numel(), device=args.device
|
||||
)
|
||||
head_ranks = head_ranks.view_as(head_importance)
|
||||
print_2d_tensor(head_ranks)
|
||||
|
||||
@@ -150,9 +153,9 @@ def mask_heads(args, model, eval_dataloader):
|
||||
|
||||
current_score = original_score
|
||||
while current_score >= original_score * args.masking_threshold:
|
||||
head_mask = new_head_mask.clone() # save current head mask
|
||||
head_mask = new_head_mask.clone() # save current head mask
|
||||
# heads from least important to most - keep only not-masked heads
|
||||
head_importance[head_mask == 0.0] = float('Inf')
|
||||
head_importance[head_mask == 0.0] = float("Inf")
|
||||
current_heads_to_mask = head_importance.view(-1).sort()[1]
|
||||
|
||||
if len(current_heads_to_mask) <= num_to_mask:
|
||||
@@ -167,14 +170,21 @@ def mask_heads(args, model, eval_dataloader):
|
||||
print_2d_tensor(new_head_mask)
|
||||
|
||||
# Compute metric and head importance again
|
||||
_, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
|
||||
_, head_importance, preds, labels = compute_heads_importance(
|
||||
args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
|
||||
)
|
||||
preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
|
||||
current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
|
||||
logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
|
||||
logger.info(
|
||||
"Masking: current score: %f, remaning heads %d (%.1f percents)",
|
||||
current_score,
|
||||
new_head_mask.sum(),
|
||||
new_head_mask.sum() / new_head_mask.numel() * 100,
|
||||
)
|
||||
|
||||
logger.info("Final head mask")
|
||||
print_2d_tensor(head_mask)
|
||||
np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
|
||||
np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())
|
||||
|
||||
return head_mask
|
||||
|
||||
@@ -186,8 +196,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
|
||||
# Try pruning and test time speedup
|
||||
# Pruning is like masking but we actually remove the masked weights
|
||||
before_time = datetime.now()
|
||||
_, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
|
||||
compute_entropy=False, compute_importance=False, head_mask=head_mask)
|
||||
_, _, preds, labels = compute_heads_importance(
|
||||
args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
|
||||
)
|
||||
preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
|
||||
score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
|
||||
original_time = datetime.now() - before_time
|
||||
@@ -199,73 +210,127 @@ def prune_heads(args, model, eval_dataloader, head_mask):
|
||||
pruned_num_params = sum(p.numel() for p in model.parameters())
|
||||
|
||||
before_time = datetime.now()
|
||||
_, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
|
||||
compute_entropy=False, compute_importance=False, head_mask=None)
|
||||
_, _, preds, labels = compute_heads_importance(
|
||||
args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
|
||||
)
|
||||
preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
|
||||
score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
|
||||
new_time = datetime.now() - before_time
|
||||
|
||||
logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
|
||||
logger.info(
|
||||
"Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
|
||||
original_num_params,
|
||||
pruned_num_params,
|
||||
pruned_num_params / original_num_params * 100,
|
||||
)
|
||||
logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
|
||||
logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
|
||||
logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time / new_time * 100)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
## Required parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
|
||||
ALL_MODELS))
|
||||
parser.add_argument("--task_name", default=None, type=str, required=True,
|
||||
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task_name",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name_or_path")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name_or_path")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--data_subset", type=int, default=-1,
|
||||
help="If > 0: limit the data to a subset of data_subset instances.")
|
||||
parser.add_argument("--overwrite_output_dir", action='store_true',
|
||||
help="Whether to overwrite data in output directory")
|
||||
parser.add_argument('--overwrite_cache', action='store_true',
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained config name or path if not the same as model_name_or_path",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name_or_path",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
|
||||
parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
|
||||
help="Don't normalize importance score by layers")
|
||||
parser.add_argument("--dont_normalize_global_importance", action='store_true',
|
||||
help="Don't normalize all importance scores between 0 and 1")
|
||||
parser.add_argument(
|
||||
"--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dont_normalize_global_importance",
|
||||
action="store_true",
|
||||
help="Don't normalize all importance scores between 0 and 1",
|
||||
)
|
||||
|
||||
parser.add_argument("--try_masking", action='store_true',
|
||||
help="Whether to try to mask head until a threshold of accuracy.")
|
||||
parser.add_argument("--masking_threshold", default=0.9, type=float,
|
||||
help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).")
|
||||
parser.add_argument("--masking_amount", default=0.1, type=float,
|
||||
help="Amount to heads to masking at each masking step.")
|
||||
parser.add_argument("--metric_name", default="acc", type=str,
|
||||
help="Metric to use for head masking.")
|
||||
parser.add_argument(
|
||||
"--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--masking_threshold",
|
||||
default=0.9,
|
||||
type=float,
|
||||
help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step."
|
||||
)
|
||||
parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
|
||||
|
||||
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. \n"
|
||||
"Sequences longer than this will be truncated, sequences shorter padded.")
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. \n"
|
||||
"Sequences longer than this will be truncated, sequences shorter padded.",
|
||||
)
|
||||
parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
|
||||
|
||||
parser.add_argument("--seed", type=int, default=42)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
|
||||
parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -278,10 +343,10 @@ def main():
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
args.device = torch.device("cuda", args.local_rank)
|
||||
args.n_gpu = 1
|
||||
torch.distributed.init_process_group(backend='nccl') # Initializes the distributed backend
|
||||
torch.distributed.init_process_group(backend="nccl") # Initializes the distributed backend
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
|
||||
|
||||
# Set seeds
|
||||
@@ -306,17 +371,23 @@ def main():
|
||||
args.model_type = key # take the first match in model types
|
||||
break
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
output_attentions=True,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
model = model_class.from_pretrained(args.model_name_or_path,
|
||||
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
config = config_class.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
output_attentions=True,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
@@ -324,14 +395,14 @@ def main():
|
||||
# Distributed and parallel training
|
||||
model.to(args.device)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
elif args.n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Print/save training arguments
|
||||
torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
|
||||
torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
# Prepare dataset for the GLUE task
|
||||
@@ -341,11 +412,9 @@ def main():
|
||||
eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
|
||||
|
||||
|
||||
# Compute head entropy and importance score
|
||||
compute_heads_importance(args, model, eval_dataloader)
|
||||
|
||||
|
||||
# Try head masking (set heads to zero until the score goes under a threshole)
|
||||
# and head pruning (remove masked heads and see the effect on the network)
|
||||
if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
|
||||
@@ -353,5 +422,5 @@ def main():
|
||||
prune_heads(args, model, eval_dataloader, head_mask)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -16,42 +16,44 @@
|
||||
# limitations under the License.
|
||||
""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from tqdm import trange
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig
|
||||
|
||||
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
|
||||
from transformers import XLNetLMHeadModel, XLNetTokenizer
|
||||
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
|
||||
from transformers import CTRLLMHeadModel, CTRLTokenizer
|
||||
from transformers import XLMWithLMHeadModel, XLMTokenizer
|
||||
from transformers import (
|
||||
CTRLLMHeadModel,
|
||||
CTRLTokenizer,
|
||||
GPT2LMHeadModel,
|
||||
GPT2Tokenizer,
|
||||
OpenAIGPTLMHeadModel,
|
||||
OpenAIGPTTokenizer,
|
||||
TransfoXLLMHeadModel,
|
||||
TransfoXLTokenizer,
|
||||
XLMTokenizer,
|
||||
XLMWithLMHeadModel,
|
||||
XLNetLMHeadModel,
|
||||
XLNetTokenizer,
|
||||
)
|
||||
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
|
||||
|
||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig)), ())
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
|
||||
'ctrl': (CTRLLMHeadModel, CTRLTokenizer),
|
||||
'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||
'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
|
||||
'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
|
||||
'xlm': (XLMWithLMHeadModel, XLMTokenizer),
|
||||
"gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
|
||||
"ctrl": (CTRLLMHeadModel, CTRLTokenizer),
|
||||
"openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||
"xlnet": (XLNetLMHeadModel, XLNetTokenizer),
|
||||
"transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
|
||||
"xlm": (XLMWithLMHeadModel, XLMTokenizer),
|
||||
}
|
||||
|
||||
# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
|
||||
@@ -76,105 +78,111 @@ def set_seed(args):
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
|
||||
""" Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
|
||||
Args:
|
||||
logits: logits distribution shape (batch size x vocabulary size)
|
||||
top_k > 0: keep only top k tokens with highest probability (top-k filtering).
|
||||
top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
|
||||
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
|
||||
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
|
||||
"""
|
||||
top_k = min(top_k, logits.size(-1)) # Safety check
|
||||
if top_k > 0:
|
||||
# Remove all tokens with a probability less than the last token of the top-k
|
||||
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
|
||||
logits[indices_to_remove] = filter_value
|
||||
|
||||
if top_p > 0.0:
|
||||
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
|
||||
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
|
||||
|
||||
# Remove tokens with cumulative probability above the threshold
|
||||
sorted_indices_to_remove = cumulative_probs > top_p
|
||||
# Shift the indices to the right to keep also the first token above the threshold
|
||||
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
|
||||
sorted_indices_to_remove[..., 0] = 0
|
||||
|
||||
# scatter sorted tensors to original indexing
|
||||
indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
|
||||
logits[indices_to_remove] = filter_value
|
||||
return logits
|
||||
#
|
||||
# Functions to prepare models' input
|
||||
#
|
||||
|
||||
|
||||
def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,
|
||||
is_xlnet=False, is_xlm_mlm=False, xlm_mask_token=None, xlm_lang=None, device='cpu'):
|
||||
context = torch.tensor(context, dtype=torch.long, device=device)
|
||||
context = context.unsqueeze(0).repeat(num_samples, 1)
|
||||
generated = context
|
||||
with torch.no_grad():
|
||||
for _ in trange(length):
|
||||
def prepare_ctrl_input(args, _, tokenizer, prompt_text):
|
||||
if args.temperature > 0.7:
|
||||
logger.info("CTRL typically works better with lower temperatures (and lower top_k).")
|
||||
|
||||
inputs = {'input_ids': generated}
|
||||
if is_xlnet:
|
||||
# XLNet is a direct (predict same token, not next token) and bi-directional model by default
|
||||
# => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
|
||||
input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
|
||||
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device)
|
||||
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
|
||||
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
|
||||
target_mapping[0, 0, -1] = 1.0 # predict last token
|
||||
inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
|
||||
encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
|
||||
if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()):
|
||||
logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
|
||||
return prompt_text
|
||||
|
||||
if is_xlm_mlm and xlm_mask_token:
|
||||
# XLM MLM models are direct models (predict same token, not next token)
|
||||
# => need one additional dummy token in the input (will be masked and guessed)
|
||||
input_ids = torch.cat((generated, torch.full((1, 1), xlm_mask_token, dtype=torch.long, device=device)), dim=1)
|
||||
inputs = {'input_ids': input_ids}
|
||||
|
||||
if xlm_lang is not None:
|
||||
inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)
|
||||
def prepare_xlm_input(args, model, tokenizer, prompt_text):
|
||||
# kwargs = {"language": None, "mask_token_id": None}
|
||||
|
||||
outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
|
||||
next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)
|
||||
# Set the language
|
||||
use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb
|
||||
if hasattr(model.config, "lang2id") and use_lang_emb:
|
||||
available_languages = model.config.lang2id.keys()
|
||||
if args.xlm_language in available_languages:
|
||||
language = args.xlm_language
|
||||
else:
|
||||
language = None
|
||||
while language not in available_languages:
|
||||
language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ")
|
||||
# kwargs["language"] = tokenizer.lang2id[language]
|
||||
|
||||
# repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
|
||||
for i in range(num_samples):
|
||||
for _ in set(generated[i].tolist()):
|
||||
next_token_logits[i, _] /= repetition_penalty
|
||||
|
||||
filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
|
||||
if temperature == 0: # greedy sampling:
|
||||
next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
|
||||
else:
|
||||
next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
|
||||
generated = torch.cat((generated, next_token), dim=1)
|
||||
return generated
|
||||
# TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers
|
||||
# XLM masked-language modeling (MLM) models need masked token
|
||||
# is_xlm_mlm = "mlm" in args.model_name_or_path
|
||||
# if is_xlm_mlm:
|
||||
# kwargs["mask_token_id"] = tokenizer.mask_token_id
|
||||
|
||||
return prompt_text
|
||||
|
||||
|
||||
def prepare_xlnet_input(args, _, tokenizer, prompt_text):
    """Prepend padding text to the prompt for XLNet.

    Args:
        args: parsed command-line arguments; only `args.padding_text` is read.
        _: unused (the model slot of the shared preprocessing signature).
        tokenizer: unused here.
        prompt_text: the user prompt.

    Returns:
        The prompt prefixed with `args.padding_text`, or with the module-level
        `PADDING_TEXT` when no padding text was supplied.
    """
    # Return the bare string: the caller assigns the result to `prompt_text` and
    # passes it straight to `tokenizer.encode`, exactly as for the CTRL and XLM
    # preprocessing functions. Returning a `(text, {})` tuple breaks that call.
    prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text
    return prompt_text
|
||||
|
||||
|
||||
def prepare_transfoxl_input(args, _, tokenizer, prompt_text):
    """Prepend padding text to the prompt for Transformer-XL.

    Args:
        args: parsed command-line arguments; only `args.padding_text` is read.
        _: unused (the model slot of the shared preprocessing signature).
        tokenizer: unused here.
        prompt_text: the user prompt.

    Returns:
        The prompt prefixed with `args.padding_text`, or with the module-level
        `PADDING_TEXT` when no padding text was supplied.
    """
    # Return the bare string: the caller assigns the result to `prompt_text` and
    # passes it straight to `tokenizer.encode`, exactly as for the CTRL and XLM
    # preprocessing functions. Returning a `(text, {})` tuple breaks that call.
    prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text
    return prompt_text
|
||||
|
||||
|
||||
# Dispatch table: model type -> function that prepares the prompt text before it
# is tokenized. Each function is called as f(args, model, tokenizer, prompt_text).
# Model types without an entry are used without any prompt preprocessing.
PREPROCESSING_FUNCTIONS = {
    "ctrl": prepare_ctrl_input,
    "xlm": prepare_xlm_input,
    "xlnet": prepare_xlnet_input,
    "transfo-xl": prepare_transfoxl_input,
}
|
||||
|
||||
|
||||
def adjust_length_to_model(length, max_sequence_length):
    """Clamp the requested generation length to what the model supports.

    A negative `length` means "as long as possible": it becomes the model's
    maximum sequence length when that is positive, otherwise the module-level
    `MAX_LENGTH` fallback (avoids an infinite loop). A non-negative `length` is
    capped at a positive `max_sequence_length` and otherwise returned unchanged.
    """
    model_is_bounded = max_sequence_length > 0

    if length < 0:
        # No explicit length requested: fall back to a finite bound.
        return max_sequence_length if model_is_bounded else MAX_LENGTH

    if model_is_bounded and length > max_sequence_length:
        # No generation bigger than model size
        return max_sequence_length

    return length
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
|
||||
parser.add_argument("--prompt", type=str, default="")
|
||||
parser.add_argument("--padding_text", type=str, default="")
|
||||
parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.")
|
||||
parser.add_argument("--length", type=int, default=20)
|
||||
parser.add_argument("--num_samples", type=int, default=1)
|
||||
parser.add_argument("--temperature", type=float, default=1.0,
|
||||
help="temperature of 0 implies greedy sampling")
|
||||
parser.add_argument("--repetition_penalty", type=float, default=1.0,
|
||||
help="primarily useful for CTRL model; in that case, use 1.2")
|
||||
parser.add_argument("--top_k", type=int, default=0)
|
||||
parser.add_argument("--top_p", type=float, default=0.9)
|
||||
parser.add_argument("--no_cuda", action='store_true',
|
||||
help="Avoid using CUDA when available")
|
||||
parser.add_argument('--seed', type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument('--stop_token', type=str, default=None,
|
||||
help="Token at which text generation is stopped")
|
||||
parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
|
||||
|
||||
parser.add_argument(
|
||||
"--temperature",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
|
||||
)
|
||||
parser.add_argument("--k", type=int, default=0)
|
||||
parser.add_argument("--p", type=float, default=0.9)
|
||||
|
||||
parser.add_argument("--padding_text", type=str, default="", help="Padding text for Transfo-XL and XLNet.")
|
||||
parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
|
||||
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
args = parser.parse_args()
|
||||
|
||||
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
@@ -182,79 +190,49 @@ def main():
|
||||
|
||||
set_seed(args)
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
# Initialize the model and tokenizer
|
||||
try:
|
||||
args.model_type = args.model_type.lower()
|
||||
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
except KeyError:
|
||||
raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")
|
||||
|
||||
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
|
||||
model = model_class.from_pretrained(args.model_name_or_path)
|
||||
model.to(args.device)
|
||||
model.eval()
|
||||
|
||||
if args.length < 0 and model.config.max_position_embeddings > 0:
|
||||
args.length = model.config.max_position_embeddings
|
||||
elif 0 < model.config.max_position_embeddings < args.length:
|
||||
args.length = model.config.max_position_embeddings # No generation bigger than model size
|
||||
elif args.length < 0:
|
||||
args.length = MAX_LENGTH # avoid infinite loop
|
||||
|
||||
args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)
|
||||
logger.info(args)
|
||||
if args.model_type in ["ctrl"]:
|
||||
if args.temperature > 0.7:
|
||||
logger.info('CTRL typically works better with lower temperatures (and lower top_k).')
|
||||
|
||||
while True:
|
||||
xlm_lang = None
|
||||
# XLM Language usage detailed in the issues #1414
|
||||
if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id') and hasattr(model.config, 'use_lang_emb') \
|
||||
and model.config.use_lang_emb:
|
||||
if args.xlm_lang:
|
||||
language = args.xlm_lang
|
||||
else:
|
||||
language = None
|
||||
while language not in tokenizer.lang2id.keys():
|
||||
language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ")
|
||||
xlm_lang = tokenizer.lang2id[language]
|
||||
prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")
|
||||
|
||||
# XLM masked-language modeling (MLM) models need masked token (see details in sample_sequence)
|
||||
is_xlm_mlm = args.model_type in ["xlm"] and 'mlm' in args.model_name_or_path
|
||||
if is_xlm_mlm:
|
||||
xlm_mask_token = tokenizer.mask_token_id
|
||||
else:
|
||||
xlm_mask_token = None
|
||||
# Different models need different input formatting and/or extra arguments
|
||||
requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys()
|
||||
if requires_preprocessing:
|
||||
prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
|
||||
prompt_text = prepare_input(args, model, tokenizer, prompt_text)
|
||||
encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
|
||||
encoded_prompt = encoded_prompt.to(args.device)
|
||||
|
||||
raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
|
||||
if args.model_type in ["transfo-xl", "xlnet"]:
|
||||
# Models with memory likes to have a long prompt for short inputs.
|
||||
raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
|
||||
context_tokens = tokenizer.encode(raw_text, add_special_tokens=False)
|
||||
if args.model_type == "ctrl":
|
||||
if not any(context_tokens[0] == x for x in tokenizer.control_codes.values()):
|
||||
logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
|
||||
out = sample_sequence(
|
||||
model=model,
|
||||
context=context_tokens,
|
||||
num_samples=args.num_samples,
|
||||
length=args.length,
|
||||
temperature=args.temperature,
|
||||
top_k=args.top_k,
|
||||
top_p=args.top_p,
|
||||
repetition_penalty=args.repetition_penalty,
|
||||
is_xlnet=bool(args.model_type == "xlnet"),
|
||||
is_xlm_mlm=is_xlm_mlm,
|
||||
xlm_mask_token=xlm_mask_token,
|
||||
xlm_lang=xlm_lang,
|
||||
device=args.device,
|
||||
)
|
||||
out = out[:, len(context_tokens):].tolist()
|
||||
for o in out:
|
||||
text = tokenizer.decode(o, clean_up_tokenization_spaces=True)
|
||||
text = text[: text.find(args.stop_token) if args.stop_token else None]
|
||||
output_sequences = model.generate(
|
||||
input_ids=encoded_prompt,
|
||||
max_length=args.length,
|
||||
temperature=args.temperature,
|
||||
top_k=args.k,
|
||||
top_p=args.p,
|
||||
repetition_penalty=args.repetition_penalty,
|
||||
do_sample=True,
|
||||
)
|
||||
|
||||
print(text)
|
||||
# Batch size == 1. to add more examples please use num_return_sequences > 1
|
||||
generated_sequence = output_sequences[0].tolist()
|
||||
text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
|
||||
text = text[: text.find(args.stop_token) if args.stop_token else None]
|
||||
|
||||
print(text)
|
||||
|
||||
if args.prompt:
|
||||
break
|
||||
return text
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -13,66 +13,91 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
|
||||
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa)."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AlbertConfig,
|
||||
AlbertForSequenceClassification,
|
||||
AlbertTokenizer,
|
||||
BertConfig,
|
||||
BertForSequenceClassification,
|
||||
BertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForSequenceClassification,
|
||||
DistilBertTokenizer,
|
||||
FlaubertConfig,
|
||||
FlaubertForSequenceClassification,
|
||||
FlaubertTokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForSequenceClassification,
|
||||
RobertaTokenizer,
|
||||
XLMConfig,
|
||||
XLMForSequenceClassification,
|
||||
XLMRobertaConfig,
|
||||
XLMRobertaForSequenceClassification,
|
||||
XLMRobertaTokenizer,
|
||||
XLMTokenizer,
|
||||
XLNetConfig,
|
||||
XLNetForSequenceClassification,
|
||||
XLNetTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
from transformers import glue_compute_metrics as compute_metrics
|
||||
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||
from transformers import glue_output_modes as output_modes
|
||||
from transformers import glue_processors as processors
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except:
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||
BertForSequenceClassification, BertTokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForSequenceClassification,
|
||||
RobertaTokenizer,
|
||||
XLMConfig, XLMForSequenceClassification,
|
||||
XLMTokenizer, XLNetConfig,
|
||||
XLNetForSequenceClassification,
|
||||
XLNetTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForSequenceClassification,
|
||||
DistilBertTokenizer,
|
||||
AlbertConfig,
|
||||
AlbertForSequenceClassification,
|
||||
AlbertTokenizer,
|
||||
)
|
||||
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||
|
||||
from transformers import glue_compute_metrics as compute_metrics
|
||||
from transformers import glue_output_modes as output_modes
|
||||
from transformers import glue_processors as processors
|
||||
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig,
|
||||
RobertaConfig, DistilBertConfig)), ())
|
||||
ALL_MODELS = sum(
|
||||
(
|
||||
tuple(conf.pretrained_config_archive_map.keys())
|
||||
for conf in (
|
||||
BertConfig,
|
||||
XLNetConfig,
|
||||
XLMConfig,
|
||||
RobertaConfig,
|
||||
DistilBertConfig,
|
||||
AlbertConfig,
|
||||
XLMRobertaConfig,
|
||||
FlaubertConfig,
|
||||
)
|
||||
),
|
||||
(),
|
||||
)
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
|
||||
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
||||
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
||||
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
|
||||
'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
|
||||
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
|
||||
"xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
|
||||
"xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||
"roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
|
||||
"distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
|
||||
"albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
|
||||
"xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
|
||||
"flaubert": (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
|
||||
}
|
||||
|
||||
|
||||
@@ -100,14 +125,28 @@ def train(args, train_dataset, model, tokenizer):
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
|
||||
os.path.join(args.model_name_or_path, "scheduler.pt")
|
||||
):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
@@ -121,40 +160,66 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 0
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
# set global_step to global_step of last saved checkpoint from model path
|
||||
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
|
||||
)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
continue
|
||||
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'labels': batch[3]}
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if args.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
|
||||
) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
@@ -178,32 +243,42 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
logs = {}
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
eval_key = 'eval_{}'.format(key)
|
||||
eval_key = "eval_{}".format(key)
|
||||
logs[eval_key] = value
|
||||
|
||||
loss_scalar = (tr_loss - logging_loss) / args.logging_steps
|
||||
learning_rate_scalar = scheduler.get_lr()[0]
|
||||
logs['learning_rate'] = learning_rate_scalar
|
||||
logs['loss'] = loss_scalar
|
||||
logs["learning_rate"] = learning_rate_scalar
|
||||
logs["loss"] = loss_scalar
|
||||
logging_loss = tr_loss
|
||||
|
||||
for key, value in logs.items():
|
||||
tb_writer.add_scalar(key, value, global_step)
|
||||
print(json.dumps({**logs, **{'step': global_step}}))
|
||||
print(json.dumps({**logs, **{"step": global_step}}))
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
@@ -220,7 +295,7 @@ def train(args, train_dataset, model, tokenizer):
|
||||
def evaluate(args, model, tokenizer, prefix=""):
|
||||
# Loop to handle MNLI double evaluation (matched, mis-matched)
|
||||
eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
|
||||
eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
|
||||
eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
|
||||
|
||||
results = {}
|
||||
for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
|
||||
@@ -251,11 +326,11 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'labels': batch[3]}
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if args.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
|
||||
) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
|
||||
outputs = model(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
|
||||
@@ -263,10 +338,10 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
nb_eval_steps += 1
|
||||
if preds is None:
|
||||
preds = logits.detach().cpu().numpy()
|
||||
out_label_ids = inputs['labels'].detach().cpu().numpy()
|
||||
out_label_ids = inputs["labels"].detach().cpu().numpy()
|
||||
else:
|
||||
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
|
||||
out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
|
||||
out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
if args.output_mode == "classification":
|
||||
@@ -293,29 +368,36 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
processor = processors[task]()
|
||||
output_mode = output_modes[task]
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
|
||||
'dev' if evaluate else 'train',
|
||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||
str(args.max_seq_length),
|
||||
str(task)))
|
||||
cached_features_file = os.path.join(
|
||||
args.data_dir,
|
||||
"cached_{}_{}_{}_{}".format(
|
||||
"dev" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
str(task),
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||
label_list = processor.get_labels()
|
||||
if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
|
||||
if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
|
||||
# HACK(label indices are swapped in RoBERTa pretrained model)
|
||||
label_list[1], label_list[2] = label_list[2], label_list[1]
|
||||
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||
features = convert_examples_to_features(examples,
|
||||
tokenizer,
|
||||
label_list=label_list,
|
||||
max_length=args.max_seq_length,
|
||||
output_mode=output_mode,
|
||||
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
|
||||
label_list[1], label_list[2] = label_list[2], label_list[1]
|
||||
examples = (
|
||||
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples,
|
||||
tokenizer,
|
||||
label_list=label_list,
|
||||
max_length=args.max_seq_length,
|
||||
output_mode=output_mode,
|
||||
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
@@ -332,7 +414,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
elif output_mode == "regression":
|
||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
||||
|
||||
|
||||
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||
return dataset
|
||||
|
||||
@@ -340,91 +422,152 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument("--task_name", default=None, type=str, required=True,
|
||||
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task_name",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.")
|
||||
parser.add_argument("--do_train", action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true',
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||
help="Rul evaluation during training at each logging step.")
|
||||
parser.add_argument("--do_lower_case", action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.")
|
||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument('--logging_steps', type=int, default=50,
|
||||
help="Log every X updates steps.")
|
||||
parser.add_argument('--save_steps', type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||
parser.add_argument("--no_cuda", action='store_true',
|
||||
help="Avoid using CUDA when available")
|
||||
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||
help="Overwrite the content of the output directory")
|
||||
parser.add_argument('--overwrite_cache', action='store_true',
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
parser.add_argument('--seed', type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument('--fp16', action='store_true',
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="For distributed training: local_rank")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -436,16 +579,24 @@ def main():
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
@@ -465,17 +616,23 @@ def main():
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
model = model_class.from_pretrained(args.model_name_or_path,
|
||||
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
config = config_class.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
@@ -484,14 +641,12 @@ def main():
|
||||
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
|
||||
# Training
|
||||
if args.do_train:
|
||||
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
|
||||
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
# Create output directory if needed
|
||||
@@ -501,36 +656,39 @@ def main():
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
checkpoints = list(
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
for checkpoint in checkpoints:
|
||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
|
||||
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
|
||||
|
||||
model = model_class.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
result = evaluate(args, model, tokenizer, prefix=prefix)
|
||||
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
|
||||
result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
return results
|
||||
|
||||
@@ -19,7 +19,6 @@ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while B
|
||||
using a masked language modeling (MLM) loss.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
@@ -29,50 +28,72 @@ import pickle
|
||||
import random
|
||||
import re
|
||||
import shutil
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
BertConfig,
|
||||
BertForMaskedLM,
|
||||
BertTokenizer,
|
||||
CamembertConfig,
|
||||
CamembertForMaskedLM,
|
||||
CamembertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForMaskedLM,
|
||||
DistilBertTokenizer,
|
||||
GPT2Config,
|
||||
GPT2LMHeadModel,
|
||||
GPT2Tokenizer,
|
||||
OpenAIGPTConfig,
|
||||
OpenAIGPTLMHeadModel,
|
||||
OpenAIGPTTokenizer,
|
||||
PreTrainedModel,
|
||||
PreTrainedTokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForMaskedLM,
|
||||
RobertaTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except:
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
|
||||
BertConfig, BertForMaskedLM, BertTokenizer,
|
||||
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
|
||||
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
|
||||
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
|
||||
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer,
|
||||
CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
|
||||
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
|
||||
'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer)
|
||||
"gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
|
||||
"openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
|
||||
"bert": (BertConfig, BertForMaskedLM, BertTokenizer),
|
||||
"roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
|
||||
"distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
|
||||
"camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
|
||||
}
|
||||
|
||||
|
||||
class TextDataset(Dataset):
|
||||
def __init__(self, tokenizer, args, file_path='train', block_size=512):
|
||||
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
|
||||
assert os.path.isfile(file_path)
|
||||
directory, filename = os.path.split(file_path)
|
||||
cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename)
|
||||
cached_features_file = os.path.join(
|
||||
directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
|
||||
)
|
||||
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
with open(cached_features_file, 'rb') as handle:
|
||||
with open(cached_features_file, "rb") as handle:
|
||||
self.examples = pickle.load(handle)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", directory)
|
||||
@@ -83,14 +104,14 @@ class TextDataset(Dataset):
|
||||
|
||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||
|
||||
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
||||
self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
|
||||
for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size
|
||||
self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
|
||||
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
||||
# If your dataset is small, first you should look for a bigger one :-) and second you
|
||||
# can change this behavior by adding (model specific) padding.
|
||||
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
with open(cached_features_file, 'wb') as handle:
|
||||
with open(cached_features_file, "wb") as handle:
|
||||
pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
def __len__(self):
|
||||
@@ -100,9 +121,32 @@ class TextDataset(Dataset):
|
||||
return torch.tensor(self.examples[item])
|
||||
|
||||
|
||||
class LineByLineTextDataset(Dataset):
    """Dataset that turns each non-blank line of a text file into one example.

    The file is read in full, split into lines, and the remaining lines are
    encoded in a single ``tokenizer.batch_encode_plus`` call. The resulting
    ``"input_ids"`` lists are kept in ``self.examples``; nothing is cached
    on disk.

    Args:
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        args: Parsed command-line arguments. Accepted for signature parity
            with ``TextDataset``; not read by this class.
        file_path: Path to an existing UTF-8 text file.
        block_size: Forwarded to the tokenizer as ``max_length``
            (presumably caps the encoded length of each line; confirm
            against the tokenizer version in use).

    Raises:
        AssertionError: If ``file_path`` is not an existing file.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            # Drop empty lines AND whitespace-only lines: a line made only of
            # spaces/tabs would otherwise become an example with no real text.
            lines = [line for line in f.read().splitlines() if line and not line.isspace()]

        self.examples = tokenizer.batch_encode_plus(lines, max_length=block_size)["input_ids"]

    def __len__(self):
        """Return the number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``i``-th encoded line as a ``torch.Tensor``."""
        return torch.tensor(self.examples[i])
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False):
|
||||
dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
|
||||
return dataset
|
||||
file_path = args.eval_data_file if evaluate else args.train_data_file
|
||||
if args.line_by_line:
|
||||
return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
|
||||
else:
|
||||
return TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
@@ -113,28 +157,35 @@ def set_seed(args):
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
|
||||
def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
|
||||
ordering_and_checkpoint_path = []
|
||||
|
||||
glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
|
||||
|
||||
for path in glob_checkpoints:
|
||||
if use_mtime:
|
||||
ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
|
||||
else:
|
||||
regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
|
||||
if regex_match and regex_match.groups():
|
||||
ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
|
||||
|
||||
checkpoints_sorted = sorted(ordering_and_checkpoint_path)
|
||||
checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
|
||||
return checkpoints_sorted
|
||||
|
||||
|
||||
def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
|
||||
if not args.save_total_limit:
|
||||
return
|
||||
if args.save_total_limit <= 0:
|
||||
return
|
||||
|
||||
# Check if we should delete older checkpoint(s)
|
||||
glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
|
||||
if len(glob_checkpoints) <= args.save_total_limit:
|
||||
checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
|
||||
if len(checkpoints_sorted) <= args.save_total_limit:
|
||||
return
|
||||
|
||||
ordering_and_checkpoint_path = []
|
||||
for path in glob_checkpoints:
|
||||
if use_mtime:
|
||||
ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
|
||||
else:
|
||||
regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
|
||||
if regex_match and regex_match.groups():
|
||||
ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
|
||||
|
||||
checkpoints_sorted = sorted(ordering_and_checkpoint_path)
|
||||
checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
|
||||
number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
|
||||
checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
|
||||
for checkpoint in checkpoints_to_be_deleted:
|
||||
@@ -142,15 +193,20 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
|
||||
shutil.rmtree(checkpoint)
|
||||
|
||||
|
||||
def mask_tokens(inputs, tokenizer, args):
|
||||
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
""" Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
|
||||
labels = inputs.clone()
|
||||
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
||||
probability_matrix = torch.full(labels.shape, args.mlm_probability)
|
||||
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
|
||||
special_tokens_mask = [
|
||||
tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
|
||||
]
|
||||
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
|
||||
if tokenizer._pad_token is not None:
|
||||
padding_mask = labels.eq(tokenizer.pad_token_id)
|
||||
probability_matrix.masked_fill_(padding_mask, value=0.0)
|
||||
masked_indices = torch.bernoulli(probability_matrix).bool()
|
||||
labels[~masked_indices] = -1 # We only compute loss on masked tokens
|
||||
labels[~masked_indices] = -100 # We only compute loss on masked tokens
|
||||
|
||||
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
|
||||
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
|
||||
@@ -165,14 +221,22 @@ def mask_tokens(inputs, tokenizer, args):
|
||||
return inputs, labels
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
|
||||
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
|
||||
""" Train the model """
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer = SummaryWriter()
|
||||
|
||||
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
||||
|
||||
def collate(examples: List[torch.Tensor]):
|
||||
if tokenizer._pad_token is None:
|
||||
return pad_sequence(examples, batch_first=True)
|
||||
return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
|
||||
|
||||
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
|
||||
)
|
||||
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
@@ -181,19 +245,28 @@ def train(args, train_dataset, model, tokenizer):
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')):
|
||||
if (
|
||||
args.model_name_or_path
|
||||
and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
|
||||
and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
|
||||
):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
@@ -208,17 +281,21 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
@@ -226,29 +303,35 @@ def train(args, train_dataset, model, tokenizer):
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
# set global_step to global_step of last saved checkpoint from model path
|
||||
global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
if args.model_name_or_path and os.path.exists(args.model_name_or_path):
|
||||
try:
|
||||
# set global_step to global_step of last saved checkpoint from model path
|
||||
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
|
||||
global_step = int(checkpoint_suffix)
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
except ValueError:
|
||||
logger.info(" Starting fine-tuning.")
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
|
||||
model_to_resize = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
|
||||
model_to_resize.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
model.zero_grad()
|
||||
train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
set_seed(args) # Added here for reproducibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
|
||||
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
@@ -285,31 +368,34 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
|
||||
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
checkpoint_prefix = 'checkpoint'
|
||||
checkpoint_prefix = "checkpoint"
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
_rotate_checkpoints(args, checkpoint_prefix)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
@@ -325,19 +411,27 @@ def train(args, train_dataset, model, tokenizer):
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
|
||||
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
|
||||
# Loop to handle MNLI double evaluation (matched, mis-matched)
|
||||
eval_output_dir = args.output_dir
|
||||
|
||||
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
|
||||
|
||||
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
|
||||
os.makedirs(eval_output_dir)
|
||||
if args.local_rank in [-1, 0]:
|
||||
os.makedirs(eval_output_dir, exist_ok=True)
|
||||
|
||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||
# Note that DistributedSampler samples randomly
|
||||
|
||||
def collate(examples: List[torch.Tensor]):
|
||||
if tokenizer._pad_token is None:
|
||||
return pad_sequence(examples, batch_first=True)
|
||||
return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
|
||||
|
||||
eval_sampler = SequentialSampler(eval_dataset)
|
||||
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
eval_dataloader = DataLoader(
|
||||
eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
|
||||
)
|
||||
|
||||
# multi-gpu evaluate
|
||||
if args.n_gpu > 1:
|
||||
@@ -365,9 +459,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
perplexity = torch.exp(torch.tensor(eval_loss))
|
||||
|
||||
result = {
|
||||
"perplexity": perplexity
|
||||
}
|
||||
result = {"perplexity": perplexity}
|
||||
|
||||
output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
@@ -382,108 +474,179 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--train_data_file", default=None, type=str, required=True,
|
||||
help="The input training data file (a text file).")
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--eval_data_file", default=None, type=str,
|
||||
help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--eval_data_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--line_by_line",
|
||||
action="store_true",
|
||||
help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
|
||||
)
|
||||
|
||||
parser.add_argument("--model_type", default="bert", type=str,
|
||||
help="The model architecture to be fine-tuned.")
|
||||
parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
|
||||
help="The model checkpoint for weights initialization.")
|
||||
parser.add_argument(
|
||||
"--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
|
||||
)
|
||||
|
||||
parser.add_argument("--mlm", action='store_true',
|
||||
help="Train with masked-language modeling loss instead of language modeling.")
|
||||
parser.add_argument("--mlm_probability", type=float, default=0.15,
|
||||
help="Ratio of tokens to mask for masked language modeling loss")
|
||||
parser.add_argument(
|
||||
"--config_name",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--block_size",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="Optional input sequence length after tokenization."
|
||||
"The training dataset will be truncated in block of this size for training."
|
||||
"Default to the model max input length for single sentence inputs (take into account special tokens).",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
|
||||
)
|
||||
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Optional pretrained config name or path if not the same as model_name_or_path")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
|
||||
parser.add_argument("--block_size", default=-1, type=int,
|
||||
help="Optional input sequence length after tokenization."
|
||||
"The training dataset will be truncated in block of this size for training."
|
||||
"Default to the model max input length for single sentence inputs (take into account special tokens).")
|
||||
parser.add_argument("--do_train", action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true',
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||
help="Run evaluation during training at each logging step.")
|
||||
parser.add_argument("--do_lower_case", action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
|
||||
help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.")
|
||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--num_train_epochs", default=1.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--save_total_limit",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument('--logging_steps', type=int, default=50,
|
||||
help="Log every X updates steps.")
|
||||
parser.add_argument('--save_steps', type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument('--save_total_limit', type=int, default=None,
|
||||
help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default')
|
||||
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
|
||||
parser.add_argument("--no_cuda", action='store_true',
|
||||
help="Avoid using CUDA when available")
|
||||
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||
help="Overwrite the content of the output directory")
|
||||
parser.add_argument('--overwrite_cache', action='store_true',
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
parser.add_argument('--seed', type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
|
||||
parser.add_argument('--fp16', action='store_true',
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="For distributed training: local_rank")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
|
||||
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
|
||||
"flag (masked language modeling).")
|
||||
raise ValueError(
|
||||
"BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
|
||||
"flag (masked language modeling)."
|
||||
)
|
||||
if args.eval_data_file is None and args.do_eval:
|
||||
raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
|
||||
"or remove the --do_eval argument.")
|
||||
raise ValueError(
|
||||
"Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
|
||||
"or remove the --do_eval argument."
|
||||
)
|
||||
if args.should_continue:
|
||||
sorted_checkpoints = _sorted_checkpoints(args)
|
||||
if len(sorted_checkpoints) == 0:
|
||||
raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
|
||||
else:
|
||||
args.model_name_or_path = sorted_checkpoints[-1]
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -495,16 +658,24 @@ def main():
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
@@ -514,18 +685,41 @@ def main():
|
||||
torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab
|
||||
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
|
||||
if args.config_name:
|
||||
config = config_class.from_pretrained(args.config_name, cache_dir=args.cache_dir)
|
||||
elif args.model_name_or_path:
|
||||
config = config_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
|
||||
else:
|
||||
config = config_class()
|
||||
|
||||
if args.tokenizer_name:
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
|
||||
elif args.model_name_or_path:
|
||||
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
|
||||
else:
|
||||
raise ValueError(
|
||||
"You are instantiating a new {} tokenizer. This is not supported, but you can do it from another script, save it,"
|
||||
"and load it from here, using --tokenizer_name".format(tokenizer_class.__name__)
|
||||
)
|
||||
|
||||
if args.block_size <= 0:
|
||||
args.block_size = tokenizer.max_len_single_sentence # Our input block size will be the max possible for the model
|
||||
args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
|
||||
model = model_class.from_pretrained(args.model_name_or_path,
|
||||
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
args.block_size = tokenizer.max_len_single_sentence
|
||||
# Our input block size will be the max possible for the model
|
||||
else:
|
||||
args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
|
||||
|
||||
if args.model_name_or_path:
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir,
|
||||
)
|
||||
else:
|
||||
logger.info("Training new model from scratch")
|
||||
model = model_class(config=config)
|
||||
|
||||
model.to(args.device)
|
||||
|
||||
if args.local_rank == 0:
|
||||
@@ -546,45 +740,47 @@ def main():
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
|
||||
# Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
# Create output directory if needed
|
||||
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||
os.makedirs(args.output_dir)
|
||||
if args.local_rank in [-1, 0]:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
checkpoints = list(
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
for checkpoint in checkpoints:
|
||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
|
||||
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
|
||||
|
||||
model = model_class.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
result = evaluate(args, model, tokenizer, prefix=prefix)
|
||||
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
|
||||
result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
return results
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
@@ -23,48 +22,50 @@ import logging
|
||||
import os
|
||||
import random
|
||||
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
BertConfig,
|
||||
BertForMultipleChoice,
|
||||
BertTokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForMultipleChoice,
|
||||
RobertaTokenizer,
|
||||
XLNetConfig,
|
||||
XLNetForMultipleChoice,
|
||||
XLNetTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
from utils_multiple_choice import convert_examples_to_features, processors
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except:
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||
BertForMultipleChoice, BertTokenizer,
|
||||
XLNetConfig, XLNetForMultipleChoice,
|
||||
XLNetTokenizer, RobertaConfig,
|
||||
RobertaForMultipleChoice, RobertaTokenizer)
|
||||
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||
|
||||
from utils_multiple_choice import (convert_examples_to_features, processors)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ())
|
||||
ALL_MODELS = sum(
|
||||
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()
|
||||
)
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
|
||||
'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
|
||||
'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer)
|
||||
"bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
|
||||
"xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
|
||||
"roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
|
||||
}
|
||||
|
||||
|
||||
def select_field(features, field):
|
||||
return [
|
||||
[
|
||||
choice[field]
|
||||
for choice in feature.choices_features
|
||||
]
|
||||
for feature in features
|
||||
]
|
||||
return [[choice[field] for choice in feature.choices_features] for feature in features]
|
||||
|
||||
|
||||
def simple_accuracy(preds, labels):
|
||||
@@ -95,13 +96,18 @@ def train(args, train_dataset, model, tokenizer):
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
@@ -115,41 +121,49 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 0
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
best_dev_acc, best_dev_loss = 0.0, 99999999999.0
|
||||
best_dev_acc = 0.0
|
||||
best_steps = 0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
|
||||
'labels': batch[3]}
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2]
|
||||
if args.model_type in ["bert", "xlnet"]
|
||||
else None, # XLM don't use segment_ids
|
||||
"labels": batch[3],
|
||||
}
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
@@ -171,33 +185,45 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
if results["eval_acc"] > best_dev_acc:
|
||||
best_dev_acc = results["eval_acc"]
|
||||
best_dev_loss = results["eval_loss"]
|
||||
best_steps = global_step
|
||||
if args.do_test:
|
||||
results_test = evaluate(args, model, tokenizer, test=True)
|
||||
for key, value in results_test.items():
|
||||
tb_writer.add_scalar('test_{}'.format(key), value, global_step)
|
||||
logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step))
|
||||
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
||||
logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step))
|
||||
tb_writer.add_scalar("test_{}".format(key), value, global_step)
|
||||
logger.info(
|
||||
"test acc: %s, loss: %s, global steps: %s",
|
||||
str(results_test["eval_acc"]),
|
||||
str(results_test["eval_loss"]),
|
||||
str(global_step),
|
||||
)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logger.info(
|
||||
"Average loss: %s at global step: %s",
|
||||
str((tr_loss - logging_loss) / args.logging_steps),
|
||||
str(global_step),
|
||||
)
|
||||
logging_loss = tr_loss
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_vocabulary(output_dir)
|
||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
@@ -246,10 +272,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
|
||||
'labels': batch[3]}
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2]
|
||||
if args.model_type in ["bert", "xlnet"]
|
||||
else None, # XLM don't use segment_ids
|
||||
"labels": batch[3],
|
||||
}
|
||||
outputs = model(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
|
||||
@@ -257,10 +287,10 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
|
||||
nb_eval_steps += 1
|
||||
if preds is None:
|
||||
preds = logits.detach().cpu().numpy()
|
||||
out_label_ids = inputs['labels'].detach().cpu().numpy()
|
||||
out_label_ids = inputs["labels"].detach().cpu().numpy()
|
||||
else:
|
||||
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
|
||||
out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
|
||||
out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
preds = np.argmax(preds, axis=1)
|
||||
@@ -273,8 +303,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
|
||||
writer.write("model =%s\n" % str(args.model_name_or_path))
|
||||
writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps *
|
||||
(torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
|
||||
writer.write(
|
||||
"total batch size=%d\n"
|
||||
% (
|
||||
args.per_gpu_train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
|
||||
)
|
||||
)
|
||||
writer.write("train num epochs=%d\n" % args.num_train_epochs)
|
||||
writer.write("fp16 =%s\n" % args.fp16)
|
||||
writer.write("max seq length =%d\n" % args.max_seq_length)
|
||||
@@ -291,17 +327,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
|
||||
processor = processors[task]()
|
||||
# Load data features from cache or dataset file
|
||||
if evaluate:
|
||||
cached_mode = 'dev'
|
||||
cached_mode = "dev"
|
||||
elif test:
|
||||
cached_mode = 'test'
|
||||
cached_mode = "test"
|
||||
else:
|
||||
cached_mode = 'train'
|
||||
assert (evaluate == True and test == True) == False
|
||||
cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
|
||||
cached_mode,
|
||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||
str(args.max_seq_length),
|
||||
str(task)))
|
||||
cached_mode = "train"
|
||||
assert not (evaluate and test)
|
||||
cached_features_file = os.path.join(
|
||||
args.data_dir,
|
||||
"cached_{}_{}_{}_{}".format(
|
||||
cached_mode,
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
str(task),
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
@@ -320,8 +360,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
|
||||
label_list,
|
||||
args.max_seq_length,
|
||||
tokenizer,
|
||||
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
||||
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
|
||||
pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
|
||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
@@ -331,9 +371,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Convert to Tensors and build dataset
|
||||
all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
|
||||
all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
|
||||
all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
|
||||
@@ -343,92 +383,151 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument("--task_name", default=None, type=str, required=True,
|
||||
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task_name",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.")
|
||||
parser.add_argument("--do_train", action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true',
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
|
||||
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||
help="Run evaluation during training at each logging step.")
|
||||
parser.add_argument("--do_lower_case", action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.")
|
||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument('--logging_steps', type=int, default=50,
|
||||
help="Log every X updates steps.")
|
||||
parser.add_argument('--save_steps', type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||
parser.add_argument("--no_cuda", action='store_true',
|
||||
help="Avoid using CUDA when available")
|
||||
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||
help="Overwrite the content of the output directory")
|
||||
parser.add_argument('--overwrite_cache', action='store_true',
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
parser.add_argument('--seed', type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument('--fp16', action='store_true',
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="For distributed training: local_rank")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -440,16 +539,24 @@ def main():
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
@@ -468,17 +575,23 @@ def main():
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
model = model_class.from_pretrained(args.model_name_or_path,
|
||||
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
config = config_class.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
@@ -494,7 +607,6 @@ def main():
|
||||
global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
|
||||
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
# Create output directory if needed
|
||||
@@ -504,19 +616,20 @@ def main():
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
@@ -524,17 +637,19 @@ def main():
|
||||
args.output_dir = args.model_name_or_path
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
checkpoints = list(
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
for checkpoint in checkpoints:
|
||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
|
||||
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
|
||||
|
||||
model = model_class.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
result = evaluate(args, model, tokenizer, prefix=prefix)
|
||||
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
|
||||
result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
if args.do_test and args.local_rank in [-1, 0]:
|
||||
@@ -546,13 +661,13 @@ def main():
|
||||
# logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
for checkpoint in checkpoints:
|
||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
|
||||
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
|
||||
|
||||
model = model_class.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
|
||||
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
|
||||
result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
if best_steps:
|
||||
logger.info("best steps of eval acc is the following checkpoints: %s", best_steps)
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
@@ -25,31 +24,57 @@ import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from seqeval.metrics import precision_score, recall_score, f1_score
|
||||
from tensorboardX import SummaryWriter
|
||||
from seqeval.metrics import f1_score, precision_score, recall_score
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
BertConfig,
|
||||
BertForTokenClassification,
|
||||
BertTokenizer,
|
||||
CamembertConfig,
|
||||
CamembertForTokenClassification,
|
||||
CamembertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForTokenClassification,
|
||||
DistilBertTokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForTokenClassification,
|
||||
RobertaTokenizer,
|
||||
XLMRobertaConfig,
|
||||
XLMRobertaForTokenClassification,
|
||||
XLMRobertaTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
||||
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||
from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
|
||||
from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
|
||||
from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
|
||||
from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALL_MODELS = sum(
|
||||
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
|
||||
())
|
||||
(
|
||||
tuple(conf.pretrained_config_archive_map.keys())
|
||||
for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig)
|
||||
),
|
||||
(),
|
||||
)
|
||||
|
||||
MODEL_CLASSES = {
|
||||
"bert": (BertConfig, BertForTokenClassification, BertTokenizer),
|
||||
"roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
|
||||
"distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
|
||||
"camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
|
||||
"xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
|
||||
}
|
||||
|
||||
|
||||
@@ -79,12 +104,25 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
|
||||
os.path.join(args.model_name_or_path, "scheduler.pt")
|
||||
):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
@@ -98,36 +136,61 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size * args.gradient_accumulation_steps * (
|
||||
torch.distributed.get_world_size() if args.local_rank != -1 else 1))
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 0
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
# set global_step to gobal_step of last saved checkpoint from model path
|
||||
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
continue
|
||||
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"labels": batch[3]}
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if args.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if args.model_type in ["bert", "xlnet"] else None
|
||||
) # XLM and RoBERTa don"t use segment_ids
|
||||
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
|
||||
@@ -157,7 +220,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
@@ -170,11 +235,19 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
@@ -213,11 +286,11 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"labels": batch[3]}
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if args.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if args.model_type in ["bert", "xlnet"] else None
|
||||
) # XLM and RoBERTa don"t use segment_ids
|
||||
outputs = model(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
|
||||
@@ -251,7 +324,7 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
|
||||
"loss": eval_loss,
|
||||
"precision": precision_score(out_label_list, preds_list),
|
||||
"recall": recall_score(out_label_list, preds_list),
|
||||
"f1": f1_score(out_label_list, preds_list)
|
||||
"f1": f1_score(out_label_list, preds_list),
|
||||
}
|
||||
|
||||
logger.info("***** Eval results %s *****", prefix)
|
||||
@@ -266,29 +339,36 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length)))
|
||||
cached_features_file = os.path.join(
|
||||
args.data_dir,
|
||||
"cached_{}_{}_{}".format(
|
||||
mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||
examples = read_examples_from_file(args.data_dir, mode)
|
||||
features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
|
||||
cls_token_at_end=bool(args.model_type in ["xlnet"]),
|
||||
# xlnet has a cls token at the end
|
||||
cls_token=tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
|
||||
sep_token=tokenizer.sep_token,
|
||||
sep_token_extra=bool(args.model_type in ["roberta"]),
|
||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||
pad_on_left=bool(args.model_type in ["xlnet"]),
|
||||
# pad on the left for xlnet
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
||||
pad_token_label_id=pad_token_label_id
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples,
|
||||
labels,
|
||||
args.max_seq_length,
|
||||
tokenizer,
|
||||
cls_token_at_end=bool(args.model_type in ["xlnet"]),
|
||||
# xlnet has a cls token at the end
|
||||
cls_token=tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
|
||||
sep_token=tokenizer.sep_token,
|
||||
sep_token_extra=bool(args.model_type in ["roberta"]),
|
||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||
pad_on_left=bool(args.model_type in ["xlnet"]),
|
||||
# pad on the left for xlnet
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
|
||||
pad_token_label_id=pad_token_label_id,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
@@ -309,96 +389,152 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||
help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.")
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--labels", default="", type=str,
|
||||
help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.")
|
||||
parser.add_argument("--do_train", action="store_true",
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true",
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_predict", action="store_true",
|
||||
help="Whether to run predictions on the test set.")
|
||||
parser.add_argument("--evaluate_during_training", action="store_true",
|
||||
help="Whether to run evaluation during training at each logging step.")
|
||||
parser.add_argument("--do_lower_case", action="store_true",
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--labels",
|
||||
default="",
|
||||
type=str,
|
||||
help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training",
|
||||
action="store_true",
|
||||
help="Whether to run evaluation during training at each logging step.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.")
|
||||
parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument("--logging_steps", type=int, default=50,
|
||||
help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument("--eval_all_checkpoints", action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||
parser.add_argument("--no_cuda", action="store_true",
|
||||
help="Avoid using CUDA when available")
|
||||
parser.add_argument("--overwrite_output_dir", action="store_true",
|
||||
help="Overwrite the content of the output directory")
|
||||
parser.add_argument("--overwrite_cache", action="store_true",
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
parser.add_argument("--seed", type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument("--fp16", action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument("--fp16_opt_level", type=str, default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="For distributed training: local_rank")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(
|
||||
args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir))
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -415,11 +551,19 @@ def main():
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
@@ -436,16 +580,22 @@ def main():
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
model = model_class.from_pretrained(args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
config = config_class.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
@@ -469,7 +619,9 @@ def main():
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
@@ -482,7 +634,9 @@ def main():
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
|
||||
checkpoints = list(
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
for checkpoint in checkpoints:
|
||||
@@ -529,4 +683,3 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
@@ -15,55 +15,75 @@
|
||||
# limitations under the License.
|
||||
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult
|
||||
from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import glob
|
||||
import timeit
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
AlbertConfig,
|
||||
AlbertForQuestionAnswering,
|
||||
AlbertTokenizer,
|
||||
BertConfig,
|
||||
BertForQuestionAnswering,
|
||||
BertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForQuestionAnswering,
|
||||
DistilBertTokenizer,
|
||||
RobertaConfig,
|
||||
RobertaForQuestionAnswering,
|
||||
RobertaTokenizer,
|
||||
XLMConfig,
|
||||
XLMForQuestionAnswering,
|
||||
XLMTokenizer,
|
||||
XLNetConfig,
|
||||
XLNetForQuestionAnswering,
|
||||
XLNetTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
squad_convert_examples_to_features,
|
||||
)
|
||||
from transformers.data.metrics.squad_metrics import (
|
||||
compute_predictions_log_probs,
|
||||
compute_predictions_logits,
|
||||
squad_evaluate,
|
||||
)
|
||||
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except:
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (WEIGHTS_NAME, BertConfig,
|
||||
BertForQuestionAnswering, BertTokenizer,
|
||||
XLMConfig, XLMForQuestionAnswering,
|
||||
XLMTokenizer, XLNetConfig,
|
||||
XLNetForQuestionAnswering,
|
||||
XLNetTokenizer,
|
||||
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer,
|
||||
AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer,
|
||||
XLMConfig, XLMForQuestionAnswering, XLMTokenizer,
|
||||
)
|
||||
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
|
||||
for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
|
||||
ALL_MODELS = sum(
|
||||
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)),
|
||||
(),
|
||||
)
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
||||
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
|
||||
'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
|
||||
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
|
||||
"bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
|
||||
"roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
|
||||
"xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
|
||||
"xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
|
||||
"distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
|
||||
"albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
|
||||
}
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
@@ -71,9 +91,11 @@ def set_seed(args):
|
||||
if args.n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def to_list(tensor):
    """Convert a tensor into a plain (nested) Python list.

    The tensor is first detached, then moved to the CPU, and finally
    converted with ``tolist()``.
    """
    host_copy = tensor.detach().cpu()
    return host_copy.tolist()
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
|
||||
""" Train the model """
|
||||
if args.local_rank in [-1, 0]:
|
||||
@@ -90,20 +112,33 @@ def train(args, train_dataset, model, tokenizer):
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
|
||||
os.path.join(args.model_name_or_path, "scheduler.pt")
|
||||
):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
|
||||
|
||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||
|
||||
# multi-gpu training (should be after apex fp16 initialization)
|
||||
@@ -112,50 +147,84 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 1
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
try:
|
||||
# set global_step to global_step of last saved checkpoint from model path
|
||||
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
|
||||
global_step = int(checkpoint_suffix)
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
except ValueError:
|
||||
logger.info(" Starting fine-tuning.")
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
# Added here for reproducibility
|
||||
set_seed(args)
|
||||
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
continue
|
||||
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
inputs = {
|
||||
'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'start_positions': batch[3],
|
||||
'end_positions': batch[4]
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2],
|
||||
"start_positions": batch[3],
|
||||
"end_positions": batch[4],
|
||||
}
|
||||
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
|
||||
|
||||
if args.model_type in ['xlnet', 'xlm']:
|
||||
inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
|
||||
if args.model_type in ["xlm", "roberta", "distilbert"]:
|
||||
del inputs["token_type_ids"]
|
||||
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
|
||||
if args.version_2_with_negative:
|
||||
inputs.update({"is_impossible": batch[7]})
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
# model outputs are always tuple in transformers (see doc)
|
||||
loss = outputs[0]
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
@@ -179,24 +248,32 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
# Log metrics
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
# Only evaluate when single GPU otherwise metrics may not average well
|
||||
if args.local_rank == -1 and args.evaluate_during_training:
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
|
||||
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
# Save model checkpoint
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
# Take care of distributed/parallel training
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
@@ -223,7 +300,7 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# multi-gpu evaluate
|
||||
if args.n_gpu > 1:
|
||||
if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Eval!
|
||||
@@ -240,18 +317,19 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {
|
||||
'input_ids': batch[0],
|
||||
'attention_mask': batch[1]
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2],
|
||||
}
|
||||
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
||||
|
||||
if args.model_type in ["xlm", "roberta", "distilbert"]:
|
||||
del inputs["token_type_ids"]
|
||||
|
||||
example_indices = batch[3]
|
||||
|
||||
|
||||
# XLNet and XLM use more arguments for their predictions
|
||||
if args.model_type in ['xlnet', 'xlm']:
|
||||
inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
|
||||
|
||||
outputs = model(**inputs)
|
||||
|
||||
@@ -271,17 +349,17 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
cls_logits = output[4]
|
||||
|
||||
result = SquadResult(
|
||||
unique_id, start_logits, end_logits,
|
||||
start_top_index=start_top_index,
|
||||
end_top_index=end_top_index,
|
||||
cls_logits=cls_logits
|
||||
unique_id,
|
||||
start_logits,
|
||||
end_logits,
|
||||
start_top_index=start_top_index,
|
||||
end_top_index=end_top_index,
|
||||
cls_logits=cls_logits,
|
||||
)
|
||||
|
||||
else:
|
||||
start_logits, end_logits = output
|
||||
result = SquadResult(
|
||||
unique_id, start_logits, end_logits
|
||||
)
|
||||
result = SquadResult(unique_id, start_logits, end_logits)
|
||||
|
||||
all_results.append(result)
|
||||
|
||||
@@ -298,42 +376,72 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
output_null_log_odds_file = None
|
||||
|
||||
# XLNet and XLM use a more complex post-processing procedure
|
||||
if args.model_type in ['xlnet', 'xlm']:
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
|
||||
end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
|
||||
|
||||
predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
|
||||
args.max_answer_length, output_prediction_file,
|
||||
output_nbest_file, output_null_log_odds_file,
|
||||
start_n_top, end_n_top,
|
||||
args.version_2_with_negative, tokenizer, args.verbose_logging)
|
||||
predictions = compute_predictions_log_probs(
|
||||
examples,
|
||||
features,
|
||||
all_results,
|
||||
args.n_best_size,
|
||||
args.max_answer_length,
|
||||
output_prediction_file,
|
||||
output_nbest_file,
|
||||
output_null_log_odds_file,
|
||||
start_n_top,
|
||||
end_n_top,
|
||||
args.version_2_with_negative,
|
||||
tokenizer,
|
||||
args.verbose_logging,
|
||||
)
|
||||
else:
|
||||
predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
|
||||
args.max_answer_length, args.do_lower_case, output_prediction_file,
|
||||
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
|
||||
args.version_2_with_negative, args.null_score_diff_threshold)
|
||||
predictions = compute_predictions_logits(
|
||||
examples,
|
||||
features,
|
||||
all_results,
|
||||
args.n_best_size,
|
||||
args.max_answer_length,
|
||||
args.do_lower_case,
|
||||
output_prediction_file,
|
||||
output_nbest_file,
|
||||
output_null_log_odds_file,
|
||||
args.verbose_logging,
|
||||
args.version_2_with_negative,
|
||||
args.null_score_diff_threshold,
|
||||
tokenizer,
|
||||
)
|
||||
|
||||
# Compute the F1 and exact scores.
|
||||
results = squad_evaluate(examples, predictions)
|
||||
return results
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0] and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
# Make sure only the first process in distributed training processes the dataset, and the others will use the cache
|
||||
torch.distributed.barrier()
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
input_dir = args.data_dir if args.data_dir else "."
|
||||
cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
|
||||
'dev' if evaluate else 'train',
|
||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||
str(args.max_seq_length))
|
||||
cached_features_file = os.path.join(
|
||||
input_dir,
|
||||
"cached_{}_{}_{}".format(
|
||||
"dev" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
),
|
||||
)
|
||||
|
||||
# Init features and dataset from cache if it exists
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features_and_dataset = torch.load(cached_features_file)
|
||||
features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
|
||||
features, dataset, examples = (
|
||||
features_and_dataset["features"],
|
||||
features_and_dataset["dataset"],
|
||||
features_and_dataset["examples"],
|
||||
)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", input_dir)
|
||||
|
||||
@@ -350,28 +458,29 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
|
||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||
else:
|
||||
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
|
||||
|
||||
if evaluate:
|
||||
examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
|
||||
else:
|
||||
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
|
||||
|
||||
features, dataset = squad_convert_examples_to_features(
|
||||
features, dataset = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=not evaluate,
|
||||
return_dataset='pt'
|
||||
return_dataset="pt",
|
||||
threads=args.threads,
|
||||
)
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save({"features": features, "dataset": dataset}, cached_features_file)
|
||||
torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
|
||||
|
||||
if args.local_rank == 0 and not evaluate:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
|
||||
# Make sure only the first process in distributed training processes the dataset, and the others will use the cache
|
||||
torch.distributed.barrier()
|
||||
|
||||
if output_examples:
|
||||
return dataset, examples, features
|
||||
@@ -381,115 +490,211 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str,
|
||||
help="The input data dir. Should contain the .json files for the task." +
|
||||
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||
parser.add_argument("--train_file", default=None, type=str,
|
||||
help="The input training file. If a data dir is specified, will look for the file there" +
|
||||
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||
parser.add_argument("--predict_file", default=None, type=str,
|
||||
help="The input evaluation file. If a data dir is specified, will look for the file there" +
|
||||
"If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input data dir. Should contain the .json files for the task."
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input training file. If a data dir is specified, will look for the file there"
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--predict_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input evaluation file. If a data dir is specified, will look for the file there"
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3",
|
||||
)
|
||||
|
||||
parser.add_argument('--version_2_with_negative', action='store_true',
|
||||
help='If true, the SQuAD examples contain some that do not have an answer.')
|
||||
parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
|
||||
help="If null_score - best_non_null is greater than the threshold predict null.")
|
||||
parser.add_argument(
|
||||
"--version_2_with_negative",
|
||||
action="store_true",
|
||||
help="If true, the SQuAD examples contain some that do not have an answer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--null_score_diff_threshold",
|
||||
type=float,
|
||||
default=0.0,
|
||||
help="If null_score - best_non_null is greater than the threshold predict null.",
|
||||
)
|
||||
|
||||
parser.add_argument("--max_seq_length", default=384, type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.")
|
||||
parser.add_argument("--doc_stride", default=128, type=int,
|
||||
help="When splitting up a long document into chunks, how much stride to take between chunks.")
|
||||
parser.add_argument("--max_query_length", default=64, type=int,
|
||||
help="The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length.")
|
||||
parser.add_argument("--do_train", action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true',
|
||||
help="Whether to run eval on the dev set.")
|
||||
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||
help="Rul evaluation during training at each logging step.")
|
||||
parser.add_argument("--do_lower_case", action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--doc_stride",
|
||||
default=128,
|
||||
type=int,
|
||||
help="When splitting up a long document into chunks, how much stride to take between chunks.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_query_length",
|
||||
default=64,
|
||||
type=int,
|
||||
help="The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--n_best_size", default=20, type=int,
|
||||
help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
|
||||
parser.add_argument("--max_answer_length", default=30, type=int,
|
||||
help="The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another.")
|
||||
parser.add_argument("--verbose_logging", action='store_true',
|
||||
help="If true, all of the warnings related to data processing will be printed. "
|
||||
"A number of warnings are expected for a normal SQuAD evaluation.")
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument(
|
||||
"--n_best_size",
|
||||
default=20,
|
||||
type=int,
|
||||
help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_answer_length",
|
||||
default=30,
|
||||
type=int,
|
||||
help="The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose_logging",
|
||||
action="store_true",
|
||||
help="If true, all of the warnings related to data processing will be printed. "
|
||||
"A number of warnings are expected for a normal SQuAD evaluation.",
|
||||
)
|
||||
|
||||
parser.add_argument('--logging_steps', type=int, default=50,
|
||||
help="Log every X updates steps.")
|
||||
parser.add_argument('--save_steps', type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||
parser.add_argument("--no_cuda", action='store_true',
|
||||
help="Whether not to use CUDA when available")
|
||||
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||
help="Overwrite the content of the output directory")
|
||||
parser.add_argument('--overwrite_cache', action='store_true',
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
parser.add_argument('--seed', type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="local_rank for distributed training on gpus")
|
||||
parser.add_argument('--fp16', action='store_true',
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
|
||||
parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
|
||||
args = parser.parse_args()
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||
if args.doc_stride >= args.max_seq_length - args.max_query_length:
|
||||
logger.warning(
|
||||
"WARNING - You've set a doc stride which may be superior to the document length in some "
|
||||
"examples. This could result in errors when building features from the examples. Please reduce the doc "
|
||||
"stride or increase the maximum length to ensure the features are correctly built."
|
||||
)
|
||||
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -501,38 +706,54 @@ def main():
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
if args.local_rank not in [-1, 0]:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
# Make sure only the first process in distributed training will download model & vocab
|
||||
torch.distributed.barrier()
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
model = model_class.from_pretrained(args.model_name_or_path,
|
||||
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
config = config_class.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
# Make sure only the first process in distributed training will download model & vocab
|
||||
torch.distributed.barrier()
|
||||
|
||||
model.to(args.device)
|
||||
|
||||
@@ -544,7 +765,8 @@ def main():
|
||||
if args.fp16:
|
||||
try:
|
||||
import apex
|
||||
apex.amp.register_half_function(torch, 'einsum')
|
||||
|
||||
apex.amp.register_half_function(torch, "einsum")
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
|
||||
@@ -554,7 +776,6 @@ def main():
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
|
||||
# Save the trained model and the tokenizer
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
# Create output directory if needed
|
||||
@@ -564,28 +785,30 @@ def main():
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
# Take care of distributed/parallel training
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir, force_download=True)
|
||||
model = model_class.from_pretrained(args.output_dir) # , force_download=True)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
|
||||
if args.do_train:
|
||||
logger.info("Loading checkpoints saved during training for evaluation")
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
checkpoints = list(
|
||||
os.path.dirname(c)
|
||||
for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs
|
||||
else:
|
||||
logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
|
||||
@@ -595,14 +818,14 @@ def main():
|
||||
|
||||
for checkpoint in checkpoints:
|
||||
# Reload the model
|
||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||
model = model_class.from_pretrained(checkpoint, force_download=True)
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
model = model_class.from_pretrained(checkpoint) # , force_download=True)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
|
||||
result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
logger.info("Results: {}".format(results))
|
||||
|
||||
@@ -1,7 +1,17 @@
|
||||
import os
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow_datasets
|
||||
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors
|
||||
|
||||
from transformers import (
|
||||
BertConfig,
|
||||
BertForSequenceClassification,
|
||||
BertTokenizer,
|
||||
TFBertForSequenceClassification,
|
||||
glue_convert_examples_to_features,
|
||||
glue_processors,
|
||||
)
|
||||
|
||||
|
||||
# script parameters
|
||||
BATCH_SIZE = 32
|
||||
@@ -16,7 +26,7 @@ if TASK == "sst-2":
|
||||
TFDS_TASK = "sst2"
|
||||
elif TASK == "sts-b":
|
||||
TFDS_TASK = "stsb"
|
||||
else:
|
||||
else:
|
||||
TFDS_TASK = TASK
|
||||
|
||||
num_labels = len(glue_processors[TASK]().get_labels())
|
||||
@@ -27,29 +37,29 @@ tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
|
||||
|
||||
# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
|
||||
config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
||||
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
|
||||
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", config=config)
|
||||
|
||||
# Load dataset via TensorFlow Datasets
|
||||
data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
|
||||
train_examples = info.splits['train'].num_examples
|
||||
data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True)
|
||||
train_examples = info.splits["train"].num_examples
|
||||
|
||||
# MNLI expects either validation_matched or validation_mismatched
|
||||
valid_examples = info.splits['validation'].num_examples
|
||||
valid_examples = info.splits["validation"].num_examples
|
||||
|
||||
# Prepare dataset for GLUE as a tf.data.Dataset instance
|
||||
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
|
||||
train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK)
|
||||
|
||||
# MNLI expects either validation_matched or validation_mismatched
|
||||
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
|
||||
valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK)
|
||||
train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
|
||||
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
|
||||
|
||||
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
|
||||
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
|
||||
if USE_AMP:
|
||||
# loss scaling is currently required when using mixed precision
|
||||
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
|
||||
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
|
||||
|
||||
|
||||
if num_labels == 1:
|
||||
@@ -57,37 +67,39 @@ if num_labels == 1:
|
||||
else:
|
||||
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||
|
||||
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
|
||||
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
|
||||
model.compile(optimizer=opt, loss=loss, metrics=[metric])
|
||||
|
||||
# Train and evaluate using tf.keras.Model.fit()
|
||||
train_steps = train_examples//BATCH_SIZE
|
||||
valid_steps = valid_examples//EVAL_BATCH_SIZE
|
||||
train_steps = train_examples // BATCH_SIZE
|
||||
valid_steps = valid_examples // EVAL_BATCH_SIZE
|
||||
|
||||
history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
|
||||
validation_data=valid_dataset, validation_steps=valid_steps)
|
||||
history = model.fit(
|
||||
train_dataset,
|
||||
epochs=EPOCHS,
|
||||
steps_per_epoch=train_steps,
|
||||
validation_data=valid_dataset,
|
||||
validation_steps=valid_steps,
|
||||
)
|
||||
|
||||
# Save TF2 model
|
||||
os.makedirs('./save/', exist_ok=True)
|
||||
model.save_pretrained('./save/')
|
||||
os.makedirs("./save/", exist_ok=True)
|
||||
model.save_pretrained("./save/")
|
||||
|
||||
if TASK == "mrpc":
|
||||
# Load the TensorFlow model in PyTorch for inspection
|
||||
# This is to demo the interoperability between the two frameworks, you don't have to
|
||||
# This is to demo the interoperability between the two frameworks, you don't have to
|
||||
# do this in real life (you can run the inference on the TF model).
|
||||
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
|
||||
pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True)
|
||||
|
||||
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
|
||||
sentence_0 = 'This research was consistent with his findings.'
|
||||
sentence_1 = 'His findings were compatible with this research.'
|
||||
sentence_2 = 'His findings were not compatible with this research.'
|
||||
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
|
||||
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
|
||||
|
||||
del inputs_1["special_tokens_mask"]
|
||||
del inputs_2["special_tokens_mask"]
|
||||
sentence_0 = "This research was consistent with his findings."
|
||||
sentence_1 = "His findings were compatible with this research."
|
||||
sentence_2 = "His findings were not compatible with this research."
|
||||
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt")
|
||||
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt")
|
||||
|
||||
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
|
||||
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
|
||||
print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
|
||||
print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
|
||||
print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
|
||||
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
|
||||
|
||||
@@ -1,209 +1,190 @@
|
||||
# coding=utf-8
|
||||
import datetime
|
||||
import os
|
||||
import math
|
||||
import glob
|
||||
import re
|
||||
import tensorflow as tf
|
||||
import collections
|
||||
import datetime
|
||||
import glob
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from absl import app, flags, logging
|
||||
from seqeval import metrics
|
||||
import _pickle as pickle
|
||||
from absl import logging
|
||||
from transformers import TF2_WEIGHTS_NAME, BertConfig, BertTokenizer, TFBertForTokenClassification
|
||||
from transformers import RobertaConfig, RobertaTokenizer, TFRobertaForTokenClassification
|
||||
from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForTokenClassification
|
||||
from transformers import create_optimizer, GradientAccumulator
|
||||
|
||||
from transformers import (
|
||||
TF2_WEIGHTS_NAME,
|
||||
BertConfig,
|
||||
BertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertTokenizer,
|
||||
GradientAccumulator,
|
||||
RobertaConfig,
|
||||
RobertaTokenizer,
|
||||
TFBertForTokenClassification,
|
||||
TFDistilBertForTokenClassification,
|
||||
TFRobertaForTokenClassification,
|
||||
create_optimizer,
|
||||
)
|
||||
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
|
||||
from fastprogress import master_bar, progress_bar
|
||||
from absl import flags
|
||||
from absl import app
|
||||
|
||||
|
||||
try:
|
||||
from fastprogress import master_bar, progress_bar
|
||||
except ImportError:
|
||||
from fastprogress.fastprogress import master_bar, progress_bar
|
||||
|
||||
|
||||
ALL_MODELS = sum(
|
||||
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
|
||||
())
|
||||
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), ()
|
||||
)
|
||||
|
||||
MODEL_CLASSES = {
|
||||
"bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
|
||||
"roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
|
||||
"distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer)
|
||||
"distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
|
||||
}
|
||||
|
||||
|
||||
flags.DEFINE_string(
|
||||
"data_dir", None,
|
||||
"The input data dir. Should contain the .conll files (or other data files) "
|
||||
"for the task.")
|
||||
"data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task."
|
||||
)
|
||||
|
||||
flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
|
||||
flags.DEFINE_string(
|
||||
"model_type", None,
|
||||
"Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
"model_name_or_path",
|
||||
None,
|
||||
"Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
|
||||
flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.")
|
||||
|
||||
flags.DEFINE_string(
|
||||
"model_name_or_path", None,
|
||||
"Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
"labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
|
||||
)
|
||||
|
||||
flags.DEFINE_string(
|
||||
"output_dir", None,
|
||||
"The output directory where the model checkpoints will be written.")
|
||||
flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name")
|
||||
|
||||
flags.DEFINE_string(
|
||||
"labels", "",
|
||||
"Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
|
||||
flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name")
|
||||
|
||||
flags.DEFINE_string(
|
||||
"config_name", "",
|
||||
"Pretrained config name or path if not the same as model_name")
|
||||
|
||||
flags.DEFINE_string(
|
||||
"tokenizer_name", "",
|
||||
"Pretrained tokenizer name or path if not the same as model_name")
|
||||
|
||||
flags.DEFINE_string(
|
||||
"cache_dir", "",
|
||||
"Where do you want to store the pre-trained models downloaded from s3")
|
||||
flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"max_seq_length", 128,
|
||||
"max_seq_length",
|
||||
128,
|
||||
"The maximum total input sentence length after tokenization. "
|
||||
"Sequences longer than this will be truncated, sequences shorter "
|
||||
"will be padded.")
|
||||
"will be padded.",
|
||||
)
|
||||
|
||||
flags.DEFINE_string(
|
||||
"tpu", None,
|
||||
"tpu",
|
||||
None,
|
||||
"The Cloud TPU to use for training. This should be either the name "
|
||||
"used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
|
||||
"url.")
|
||||
"url.",
|
||||
)
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"num_tpu_cores", 8,
|
||||
"Total number of TPU cores to use.")
|
||||
flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.")
|
||||
|
||||
flags.DEFINE_boolean("do_train", False, "Whether to run training.")
|
||||
|
||||
flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.")
|
||||
|
||||
flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"do_train", False,
|
||||
"Whether to run training.")
|
||||
"evaluate_during_training", False, "Whether to run evaluation during training at each logging step."
|
||||
)
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"do_eval", False,
|
||||
"Whether to run eval on the dev set.")
|
||||
flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"do_predict", False,
|
||||
"Whether to run predictions on the test set.")
|
||||
flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"evaluate_during_training", False,
|
||||
"Whether to run evaluation during training at each logging step.")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"do_lower_case", False,
|
||||
"Set this flag if you are using an uncased model.")
|
||||
flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"per_device_train_batch_size", 8,
|
||||
"Batch size per GPU/CPU/TPU for training.")
|
||||
"gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass."
|
||||
)
|
||||
|
||||
flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
|
||||
|
||||
flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.")
|
||||
|
||||
flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.")
|
||||
|
||||
flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.")
|
||||
|
||||
flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"per_device_eval_batch_size", 8,
|
||||
"Batch size per GPU/CPU/TPU for evaluation.")
|
||||
"max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs."
|
||||
)
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"gradient_accumulation_steps", 1,
|
||||
"Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.")
|
||||
|
||||
flags.DEFINE_float(
|
||||
"learning_rate", 5e-5,
|
||||
"The initial learning rate for Adam.")
|
||||
flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.")
|
||||
|
||||
flags.DEFINE_float(
|
||||
"weight_decay", 0.0,
|
||||
"Weight decay if we apply some.")
|
||||
|
||||
flags.DEFINE_float(
|
||||
"adam_epsilon", 1e-8,
|
||||
"Epsilon for Adam optimizer.")
|
||||
|
||||
flags.DEFINE_float(
|
||||
"max_grad_norm", 1.0,
|
||||
"Max gradient norm.")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"num_train_epochs", 3,
|
||||
"Total number of training epochs to perform.")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"max_steps", -1,
|
||||
"If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"warmup_steps", 0,
|
||||
"Linear warmup over warmup_steps.")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"logging_steps", 50,
|
||||
"Log every X updates steps.")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"save_steps", 50,
|
||||
"Save checkpoint every X updates steps.")
|
||||
flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"eval_all_checkpoints", False,
|
||||
"Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||
"eval_all_checkpoints",
|
||||
False,
|
||||
"Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"no_cuda", False,
|
||||
"Avoid using CUDA when available")
|
||||
flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"overwrite_output_dir", False,
|
||||
"Overwrite the content of the output directory")
|
||||
flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"overwrite_cache", False,
|
||||
"Overwrite the cached training and evaluation sets")
|
||||
flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets")
|
||||
|
||||
flags.DEFINE_integer(
|
||||
"seed", 42,
|
||||
"random seed for initialization")
|
||||
flags.DEFINE_integer("seed", 42, "random seed for initialization")
|
||||
|
||||
flags.DEFINE_boolean(
|
||||
"fp16", False,
|
||||
"Whether to use 16-bit (mixed) precision instead of 32-bit")
|
||||
flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit")
|
||||
|
||||
flags.DEFINE_string(
|
||||
"gpus", "0",
|
||||
"gpus",
|
||||
"0",
|
||||
"Comma separated list of gpus devices. If only one, switch to single "
|
||||
"gpu strategy, if None takes all the gpus available.")
|
||||
"gpu strategy, if None takes all the gpus available.",
|
||||
)
|
||||
|
||||
|
||||
def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id):
|
||||
if args['max_steps'] > 0:
|
||||
num_train_steps = args['max_steps'] * args['gradient_accumulation_steps']
|
||||
args['num_train_epochs'] = 1
|
||||
def train(
|
||||
args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
|
||||
):
|
||||
if args["max_steps"] > 0:
|
||||
num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
|
||||
args["num_train_epochs"] = 1
|
||||
else:
|
||||
num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs']
|
||||
num_train_steps = (
|
||||
math.ceil(num_train_examples / train_batch_size)
|
||||
// args["gradient_accumulation_steps"]
|
||||
* args["num_train_epochs"]
|
||||
)
|
||||
|
||||
writer = tf.summary.create_file_writer("/tmp/mylogs")
|
||||
|
||||
with strategy.scope():
|
||||
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
|
||||
optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps'])
|
||||
optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])
|
||||
|
||||
if args['fp16']:
|
||||
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
|
||||
if args["fp16"]:
|
||||
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
|
||||
|
||||
loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
|
||||
loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
|
||||
gradient_accumulator = GradientAccumulator()
|
||||
|
||||
|
||||
logging.info("***** Running training *****")
|
||||
logging.info(" Num examples = %d", num_train_examples)
|
||||
logging.info(" Num Epochs = %d", args['num_train_epochs'])
|
||||
logging.info(" Instantaneous batch size per device = %d", args['per_device_train_batch_size'])
|
||||
logging.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
train_batch_size * args['gradient_accumulation_steps'])
|
||||
logging.info(" Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
|
||||
logging.info(" Num Epochs = %d", args["num_train_epochs"])
|
||||
logging.info(" Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
|
||||
logging.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
train_batch_size * args["gradient_accumulation_steps"],
|
||||
)
|
||||
logging.info(" Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
|
||||
logging.info(" Total training steps = %d", num_train_steps)
|
||||
|
||||
model.summary()
|
||||
@@ -214,26 +195,28 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
|
||||
|
||||
for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
|
||||
if gradient is not None:
|
||||
scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps'])
|
||||
scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
|
||||
grads_and_vars.append((scaled_gradient, variable))
|
||||
else:
|
||||
grads_and_vars.append((gradient, variable))
|
||||
|
||||
optimizer.apply_gradients(grads_and_vars, args['max_grad_norm'])
|
||||
optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
|
||||
gradient_accumulator.reset()
|
||||
|
||||
@tf.function
|
||||
def train_step(train_features, train_labels):
|
||||
def step_fn(train_features, train_labels):
|
||||
inputs = {'attention_mask': train_features['input_mask'], 'training': True}
|
||||
inputs = {"attention_mask": train_features["input_mask"], "training": True}
|
||||
|
||||
if args['model_type'] != "distilbert":
|
||||
inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
|
||||
if args["model_type"] != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
|
||||
)
|
||||
|
||||
with tf.GradientTape() as tape:
|
||||
logits = model(train_features['input_ids'], **inputs)[0]
|
||||
logits = model(train_features["input_ids"], **inputs)[0]
|
||||
logits = tf.reshape(logits, (-1, len(labels) + 1))
|
||||
active_loss = tf.reshape(train_features['input_mask'], (-1,))
|
||||
active_loss = tf.reshape(train_features["input_mask"], (-1,))
|
||||
active_logits = tf.boolean_mask(logits, active_loss)
|
||||
train_labels = tf.reshape(train_labels, (-1,))
|
||||
active_labels = tf.boolean_mask(train_labels, active_loss)
|
||||
@@ -251,34 +234,40 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
|
||||
return mean_loss
|
||||
|
||||
current_time = datetime.datetime.now()
|
||||
train_iterator = master_bar(range(args['num_train_epochs']))
|
||||
train_iterator = master_bar(range(args["num_train_epochs"]))
|
||||
global_step = 0
|
||||
logging_loss = 0.0
|
||||
|
||||
for epoch in train_iterator:
|
||||
epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1)
|
||||
epoch_iterator = progress_bar(
|
||||
train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
|
||||
)
|
||||
step = 1
|
||||
|
||||
with strategy.scope():
|
||||
for train_features, train_labels in epoch_iterator:
|
||||
loss = train_step(train_features, train_labels)
|
||||
|
||||
if step % args['gradient_accumulation_steps'] == 0:
|
||||
if step % args["gradient_accumulation_steps"] == 0:
|
||||
strategy.experimental_run_v2(apply_gradients)
|
||||
|
||||
loss_metric(loss)
|
||||
|
||||
global_step += 1
|
||||
|
||||
if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
|
||||
if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
|
||||
# Log metrics
|
||||
if args['n_device'] == 1 and args['evaluate_during_training']: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
|
||||
if (
|
||||
args["n_device"] == 1 and args["evaluate_during_training"]
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
y_true, y_pred, eval_loss = evaluate(
|
||||
args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
|
||||
)
|
||||
report = metrics.classification_report(y_true, y_pred, digits=4)
|
||||
|
||||
|
||||
logging.info("Eval at step " + str(global_step) + "\n" + report)
|
||||
logging.info("eval_loss: " + str(eval_loss))
|
||||
|
||||
|
||||
precision = metrics.precision_score(y_true, y_pred)
|
||||
recall = metrics.recall_score(y_true, y_pred)
|
||||
f1 = metrics.f1_score(y_true, y_pred)
|
||||
@@ -288,33 +277,35 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
|
||||
tf.summary.scalar("precision", precision, global_step)
|
||||
tf.summary.scalar("recall", recall, global_step)
|
||||
tf.summary.scalar("f1", f1, global_step)
|
||||
|
||||
|
||||
lr = optimizer.learning_rate
|
||||
learning_rate = lr(step)
|
||||
|
||||
with writer.as_default():
|
||||
tf.summary.scalar("lr", learning_rate, global_step)
|
||||
tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step)
|
||||
|
||||
tf.summary.scalar(
|
||||
"loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
|
||||
)
|
||||
|
||||
logging_loss = loss_metric.result()
|
||||
|
||||
with writer.as_default():
|
||||
tf.summary.scalar("loss", loss_metric.result(), step=step)
|
||||
|
||||
if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
|
||||
if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step))
|
||||
output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))
|
||||
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
|
||||
model.save_pretrained(output_dir)
|
||||
logging.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
train_iterator.child.comment = f'loss : {loss_metric.result()}'
|
||||
|
||||
train_iterator.child.comment = f"loss : {loss_metric.result()}"
|
||||
step += 1
|
||||
|
||||
train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}')
|
||||
train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")
|
||||
|
||||
loss_metric.reset_states()
|
||||
|
||||
@@ -322,13 +313,15 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l
|
||||
|
||||
|
||||
def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
|
||||
eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
|
||||
eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode)
|
||||
eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
|
||||
eval_dataset, size = load_and_cache_examples(
|
||||
args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode
|
||||
)
|
||||
eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
|
||||
preds = None
|
||||
num_eval_steps = math.ceil(size / eval_batch_size)
|
||||
master = master_bar(range(1))
|
||||
eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1)
|
||||
eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1)
|
||||
loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
|
||||
loss = 0.0
|
||||
|
||||
@@ -337,15 +330,17 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode)
|
||||
logging.info(" Batch size = %d", eval_batch_size)
|
||||
|
||||
for eval_features, eval_labels in eval_iterator:
|
||||
inputs = {'attention_mask': eval_features['input_mask'], 'training': False}
|
||||
inputs = {"attention_mask": eval_features["input_mask"], "training": False}
|
||||
|
||||
if args['model_type'] != "distilbert":
|
||||
inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None
|
||||
if args["model_type"] != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
|
||||
)
|
||||
|
||||
with strategy.scope():
|
||||
logits = model(eval_features['input_ids'], **inputs)[0]
|
||||
logits = model(eval_features["input_ids"], **inputs)[0]
|
||||
tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
|
||||
active_loss = tf.reshape(eval_features['input_mask'], (-1,))
|
||||
active_loss = tf.reshape(eval_features["input_mask"], (-1,))
|
||||
active_logits = tf.boolean_mask(tmp_logits, active_loss)
|
||||
tmp_eval_labels = tf.reshape(eval_labels, (-1,))
|
||||
active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
|
||||
@@ -384,11 +379,11 @@ def load_cache(cached_file, max_seq_length):
|
||||
def _decode_record(record):
|
||||
example = tf.io.parse_single_example(record, name_to_features)
|
||||
features = {}
|
||||
features['input_ids'] = example['input_ids']
|
||||
features['input_mask'] = example['input_mask']
|
||||
features['segment_ids'] = example['segment_ids']
|
||||
features["input_ids"] = example["input_ids"]
|
||||
features["input_mask"] = example["input_mask"]
|
||||
features["segment_ids"] = example["segment_ids"]
|
||||
|
||||
return features, example['label_ids']
|
||||
return features, example["label_ids"]
|
||||
|
||||
d = tf.data.TFRecordDataset(cached_file)
|
||||
d = d.map(_decode_record, num_parallel_calls=4)
|
||||
@@ -422,39 +417,46 @@ def save_cache(features, cached_features_file):
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
|
||||
drop_remainder = True if args['tpu'] or mode == 'train' else False
|
||||
drop_remainder = True if args["tpu"] or mode == "train" else False
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(args['data_dir'], "cached_{}_{}_{}.tf_record".format(mode,
|
||||
list(filter(None, args['model_name_or_path'].split("/"))).pop(),
|
||||
str(args['max_seq_length'])))
|
||||
if os.path.exists(cached_features_file) and not args['overwrite_cache']:
|
||||
cached_features_file = os.path.join(
|
||||
args["data_dir"],
|
||||
"cached_{}_{}_{}.tf_record".format(
|
||||
mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"])
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
|
||||
logging.info("Loading features from cached file %s", cached_features_file)
|
||||
dataset, size = load_cache(cached_features_file, args['max_seq_length'])
|
||||
dataset, size = load_cache(cached_features_file, args["max_seq_length"])
|
||||
else:
|
||||
logging.info("Creating features from dataset file at %s", args['data_dir'])
|
||||
examples = read_examples_from_file(args['data_dir'], mode)
|
||||
features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer,
|
||||
cls_token_at_end=bool(args['model_type'] in ["xlnet"]),
|
||||
# xlnet has a cls token at the end
|
||||
cls_token=tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0,
|
||||
sep_token=tokenizer.sep_token,
|
||||
sep_token_extra=bool(args['model_type'] in ["roberta"]),
|
||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||
pad_on_left=bool(args['model_type'] in ["xlnet"]),
|
||||
# pad on the left for xlnet
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0,
|
||||
pad_token_label_id=pad_token_label_id
|
||||
)
|
||||
logging.info("Creating features from dataset file at %s", args["data_dir"])
|
||||
examples = read_examples_from_file(args["data_dir"], mode)
|
||||
features = convert_examples_to_features(
|
||||
examples,
|
||||
labels,
|
||||
args["max_seq_length"],
|
||||
tokenizer,
|
||||
cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
|
||||
# xlnet has a cls token at the end
|
||||
cls_token=tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
|
||||
sep_token=tokenizer.sep_token,
|
||||
sep_token_extra=bool(args["model_type"] in ["roberta"]),
|
||||
# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
|
||||
pad_on_left=bool(args["model_type"] in ["xlnet"]),
|
||||
# pad on the left for xlnet
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
|
||||
pad_token_label_id=pad_token_label_id,
|
||||
)
|
||||
logging.info("Saving features into cached file %s", cached_features_file)
|
||||
save_cache(features, cached_features_file)
|
||||
dataset, size = load_cache(cached_features_file, args['max_seq_length'])
|
||||
dataset, size = load_cache(cached_features_file, args["max_seq_length"])
|
||||
|
||||
if mode == 'train':
|
||||
if mode == "train":
|
||||
dataset = dataset.repeat()
|
||||
dataset = dataset.shuffle(buffer_size=8192, seed=args['seed'])
|
||||
dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"])
|
||||
|
||||
dataset = dataset.batch(batch_size, drop_remainder)
|
||||
dataset = dataset.prefetch(buffer_size=batch_size)
|
||||
@@ -466,98 +468,134 @@ def main(_):
|
||||
logging.set_verbosity(logging.INFO)
|
||||
args = flags.FLAGS.flag_values_dict()
|
||||
|
||||
if os.path.exists(args['output_dir']) and os.listdir(
|
||||
args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
|
||||
if (
|
||||
os.path.exists(args["output_dir"])
|
||||
and os.listdir(args["output_dir"])
|
||||
and args["do_train"]
|
||||
and not args["overwrite_output_dir"]
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args['output_dir']))
|
||||
args["output_dir"]
|
||||
)
|
||||
)
|
||||
|
||||
if args['fp16']:
|
||||
if args["fp16"]:
|
||||
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
|
||||
|
||||
if args['tpu']:
|
||||
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu'])
|
||||
if args["tpu"]:
|
||||
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
|
||||
tf.config.experimental_connect_to_cluster(resolver)
|
||||
tf.tpu.experimental.initialize_tpu_system(resolver)
|
||||
strategy = tf.distribute.experimental.TPUStrategy(resolver)
|
||||
args['n_device'] = args['num_tpu_cores']
|
||||
elif len(args['gpus'].split(',')) > 1:
|
||||
args['n_device'] = len([f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
|
||||
strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args['gpus'].split(',')])
|
||||
elif args['no_cuda']:
|
||||
args['n_device'] = 1
|
||||
args["n_device"] = args["num_tpu_cores"]
|
||||
elif len(args["gpus"].split(",")) > 1:
|
||||
args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
|
||||
strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
|
||||
elif args["no_cuda"]:
|
||||
args["n_device"] = 1
|
||||
strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
|
||||
else:
|
||||
args['n_device'] = len(args['gpus'].split(','))
|
||||
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0])
|
||||
args["n_device"] = len(args["gpus"].split(","))
|
||||
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0])
|
||||
|
||||
logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s",
|
||||
args['n_device'], bool(args['n_device'] > 1), args['fp16'])
|
||||
logging.warning(
|
||||
"n_device: %s, distributed training: %s, 16-bits training: %s",
|
||||
args["n_device"],
|
||||
bool(args["n_device"] > 1),
|
||||
args["fp16"],
|
||||
)
|
||||
|
||||
labels = get_labels(args['labels'])
|
||||
labels = get_labels(args["labels"])
|
||||
num_labels = len(labels) + 1
|
||||
pad_token_label_id = 0
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
|
||||
config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
|
||||
num_labels=num_labels,
|
||||
cache_dir=args['cache_dir'] if args['cache_dir'] else None)
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]]
|
||||
config = config_class.from_pretrained(
|
||||
args["config_name"] if args["config_name"] else args["model_name_or_path"],
|
||||
num_labels=num_labels,
|
||||
cache_dir=args["cache_dir"] if args["cache_dir"] else None,
|
||||
)
|
||||
|
||||
logging.info("Training/evaluation parameters %s", args)
|
||||
|
||||
# Training
|
||||
if args['do_train']:
|
||||
tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
|
||||
do_lower_case=args['do_lower_case'],
|
||||
cache_dir=args['cache_dir'] if args['cache_dir'] else None)
|
||||
if args["do_train"]:
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
|
||||
do_lower_case=args["do_lower_case"],
|
||||
cache_dir=args["cache_dir"] if args["cache_dir"] else None,
|
||||
)
|
||||
|
||||
with strategy.scope():
|
||||
model = model_class.from_pretrained(args['model_name_or_path'],
|
||||
from_pt=bool(".bin" in args['model_name_or_path']),
|
||||
config=config,
|
||||
cache_dir=args['cache_dir'] if args['cache_dir'] else None)
|
||||
model = model_class.from_pretrained(
|
||||
args["model_name_or_path"],
|
||||
from_pt=bool(".bin" in args["model_name_or_path"]),
|
||||
config=config,
|
||||
cache_dir=args["cache_dir"] if args["cache_dir"] else None,
|
||||
)
|
||||
model.layers[-1].activation = tf.keras.activations.softmax
|
||||
|
||||
train_batch_size = args['per_device_train_batch_size'] * args['n_device']
|
||||
train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train")
|
||||
train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
|
||||
train_dataset, num_train_examples = load_and_cache_examples(
|
||||
args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train"
|
||||
)
|
||||
train_dataset = strategy.experimental_distribute_dataset(train_dataset)
|
||||
train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id)
|
||||
train(
|
||||
args,
|
||||
strategy,
|
||||
train_dataset,
|
||||
tokenizer,
|
||||
model,
|
||||
num_train_examples,
|
||||
labels,
|
||||
train_batch_size,
|
||||
pad_token_label_id,
|
||||
)
|
||||
|
||||
if not os.path.exists(args['output_dir']):
|
||||
os.makedirs(args['output_dir'])
|
||||
if not os.path.exists(args["output_dir"]):
|
||||
os.makedirs(args["output_dir"])
|
||||
|
||||
logging.info("Saving model to %s", args['output_dir'])
|
||||
logging.info("Saving model to %s", args["output_dir"])
|
||||
|
||||
model.save_pretrained(args['output_dir'])
|
||||
tokenizer.save_pretrained(args['output_dir'])
|
||||
model.save_pretrained(args["output_dir"])
|
||||
tokenizer.save_pretrained(args["output_dir"])
|
||||
|
||||
# Evaluation
|
||||
if args['do_eval']:
|
||||
tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
|
||||
if args["do_eval"]:
|
||||
tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
|
||||
checkpoints = []
|
||||
results = []
|
||||
|
||||
if args['eval_all_checkpoints']:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1)))
|
||||
|
||||
if args["eval_all_checkpoints"]:
|
||||
checkpoints = list(
|
||||
os.path.dirname(c)
|
||||
for c in sorted(
|
||||
glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
|
||||
key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
|
||||
)
|
||||
)
|
||||
|
||||
logging.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
|
||||
if len(checkpoints) == 0:
|
||||
checkpoints.append(args['output_dir'])
|
||||
|
||||
checkpoints.append(args["output_dir"])
|
||||
|
||||
for checkpoint in checkpoints:
|
||||
global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
|
||||
|
||||
with strategy.scope():
|
||||
model = model_class.from_pretrained(checkpoint)
|
||||
|
||||
y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
|
||||
y_true, y_pred, eval_loss = evaluate(
|
||||
args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
|
||||
)
|
||||
report = metrics.classification_report(y_true, y_pred, digits=4)
|
||||
|
||||
if global_step:
|
||||
results.append({global_step + "_report": report, global_step + "_loss": eval_loss})
|
||||
|
||||
output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
|
||||
|
||||
output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")
|
||||
|
||||
with tf.io.gfile.GFile(output_eval_file, "w") as writer:
|
||||
for res in results:
|
||||
for key, val in res.items():
|
||||
@@ -572,26 +610,28 @@ def main(_):
|
||||
writer.write(report)
|
||||
writer.write("\n")
|
||||
|
||||
if args['do_predict']:
|
||||
tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
|
||||
model = model_class.from_pretrained(args['output_dir'])
|
||||
eval_batch_size = args['per_device_eval_batch_size'] * args['n_device']
|
||||
predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test")
|
||||
if args["do_predict"]:
|
||||
tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
|
||||
model = model_class.from_pretrained(args["output_dir"])
|
||||
eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
|
||||
predict_dataset, _ = load_and_cache_examples(
|
||||
args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test"
|
||||
)
|
||||
y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
|
||||
output_test_results_file = os.path.join(args['output_dir'], "test_results.txt")
|
||||
output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt")
|
||||
output_test_results_file = os.path.join(args["output_dir"], "test_results.txt")
|
||||
output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt")
|
||||
report = metrics.classification_report(y_true, y_pred, digits=4)
|
||||
|
||||
with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
|
||||
report = metrics.classification_report(y_true, y_pred, digits=4)
|
||||
|
||||
|
||||
logging.info("\n" + report)
|
||||
|
||||
|
||||
writer.write(report)
|
||||
writer.write("\n\nloss = " + str(pred_loss))
|
||||
|
||||
with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
|
||||
with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f:
|
||||
with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f:
|
||||
example_id = 0
|
||||
|
||||
for line in f:
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
""" Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
|
||||
Adapted from `examples/run_glue.py`"""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
@@ -26,38 +25,46 @@ import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (WEIGHTS_NAME,
|
||||
BertConfig, BertForSequenceClassification, BertTokenizer,
|
||||
XLMConfig, XLMForSequenceClassification, XLMTokenizer,
|
||||
DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
||||
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AdamW,
|
||||
BertConfig,
|
||||
BertForSequenceClassification,
|
||||
BertTokenizer,
|
||||
DistilBertConfig,
|
||||
DistilBertForSequenceClassification,
|
||||
DistilBertTokenizer,
|
||||
XLMConfig,
|
||||
XLMForSequenceClassification,
|
||||
XLMTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||
from transformers import xnli_compute_metrics as compute_metrics
|
||||
from transformers import xnli_output_modes as output_modes
|
||||
from transformers import xnli_processors as processors
|
||||
|
||||
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ())
|
||||
ALL_MODELS = sum(
|
||||
(tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()
|
||||
)
|
||||
|
||||
MODEL_CLASSES = {
|
||||
'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
|
||||
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
|
||||
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
|
||||
"xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
|
||||
"distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
|
||||
}
|
||||
|
||||
|
||||
@@ -85,13 +92,27 @@ def train(args, train_dataset, model, tokenizer):
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
|
||||
os.path.join(args.model_name_or_path, "scheduler.pt")
|
||||
):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
@@ -105,40 +126,65 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||
output_device=args.local_rank,
|
||||
find_unused_parameters=True)
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 0
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
# set global_step to gobal_step of last saved checkpoint from model path
|
||||
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
continue
|
||||
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'labels': batch[3]}
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if args.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if args.model_type in ["bert"] else None
|
||||
) # XLM and DistilBERT don't use segment_ids
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
@@ -162,24 +208,34 @@ def train(args, train_dataset, model, tokenizer):
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
|
||||
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
|
||||
tb_writer.add_scalar("eval_{}".format(key), value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
|
||||
output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
@@ -226,11 +282,11 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {'input_ids': batch[0],
|
||||
'attention_mask': batch[1],
|
||||
'labels': batch[3]}
|
||||
if args.model_type != 'distilbert':
|
||||
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if args.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if args.model_type in ["bert"] else None
|
||||
) # XLM and DistilBERT don't use segment_ids
|
||||
outputs = model(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
|
||||
@@ -238,16 +294,16 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
nb_eval_steps += 1
|
||||
if preds is None:
|
||||
preds = logits.detach().cpu().numpy()
|
||||
out_label_ids = inputs['labels'].detach().cpu().numpy()
|
||||
out_label_ids = inputs["labels"].detach().cpu().numpy()
|
||||
else:
|
||||
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
|
||||
out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
|
||||
out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
if args.output_mode == "classification":
|
||||
preds = np.argmax(preds, axis=1)
|
||||
else:
|
||||
raise ValueError('No other `output_mode` for XNLI.')
|
||||
raise ValueError("No other `output_mode` for XNLI.")
|
||||
result = compute_metrics(eval_task, preds, out_label_ids)
|
||||
results.update(result)
|
||||
|
||||
@@ -268,27 +324,34 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
processor = processors[task](language=args.language, train_language=args.train_language)
|
||||
output_mode = output_modes[task]
|
||||
# Load data features from cache or dataset file
|
||||
cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
|
||||
'test' if evaluate else 'train',
|
||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||
str(args.max_seq_length),
|
||||
str(task),
|
||||
str(args.train_language if (not evaluate and args.train_language is not None) else args.language)))
|
||||
cached_features_file = os.path.join(
|
||||
args.data_dir,
|
||||
"cached_{}_{}_{}_{}_{}".format(
|
||||
"test" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
str(task),
|
||||
str(args.train_language if (not evaluate and args.train_language is not None) else args.language),
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||
label_list = processor.get_labels()
|
||||
examples = processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||
features = convert_examples_to_features(examples,
|
||||
tokenizer,
|
||||
label_list=label_list,
|
||||
max_length=args.max_seq_length,
|
||||
output_mode=output_mode,
|
||||
pad_on_left=False,
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=0,
|
||||
examples = (
|
||||
processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples,
|
||||
tokenizer,
|
||||
label_list=label_list,
|
||||
max_length=args.max_seq_length,
|
||||
output_mode=output_mode,
|
||||
pad_on_left=False,
|
||||
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
|
||||
pad_token_segment_id=0,
|
||||
)
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
@@ -304,7 +367,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
if output_mode == "classification":
|
||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
else:
|
||||
raise ValueError('No other `output_mode` for XNLI.')
|
||||
raise ValueError("No other `output_mode` for XNLI.")
|
||||
|
||||
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
|
||||
return dataset
|
||||
@@ -313,93 +376,153 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
## Required parameters
|
||||
parser.add_argument("--data_dir", default=None, type=str, required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
|
||||
parser.add_argument("--model_type", default=None, type=str, required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
|
||||
parser.add_argument("--language", default=None, type=str, required=True,
|
||||
help="Evaluation language. Also train language if `train_language` is set to None.")
|
||||
parser.add_argument("--train_language", default=None, type=str,
|
||||
help="Train language if is different of the evaluation language.")
|
||||
parser.add_argument("--output_dir", default=None, type=str, required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--language",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Evaluation language. Also train language if `train_language` is set to None.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_language", default=None, type=str, help="Train language if is different of the evaluation language."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--config_name", default="", type=str,
|
||||
help="Pretrained config name or path if not the same as model_name")
|
||||
parser.add_argument("--tokenizer_name", default="", type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name")
|
||||
parser.add_argument("--cache_dir", default="", type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--max_seq_length", default=128, type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.")
|
||||
parser.add_argument("--do_train", action='store_true',
|
||||
help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action='store_true',
|
||||
help="Whether to run eval on the test set.")
|
||||
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||
help="Rul evaluation during training at each logging step.")
|
||||
parser.add_argument("--do_lower_case", action='store_true',
|
||||
help="Set this flag if you are using an uncased model.")
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.",
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
|
||||
help="Batch size per GPU/CPU for evaluation.")
|
||||
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float,
|
||||
help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float,
|
||||
help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
|
||||
help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float,
|
||||
help="Max gradient norm.")
|
||||
parser.add_argument("--num_train_epochs", default=3.0, type=float,
|
||||
help="Total number of training epochs to perform.")
|
||||
parser.add_argument("--max_steps", default=-1, type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int,
|
||||
help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument('--logging_steps', type=int, default=50,
|
||||
help="Log every X updates steps.")
|
||||
parser.add_argument('--save_steps', type=int, default=50,
|
||||
help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument("--eval_all_checkpoints", action='store_true',
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
|
||||
parser.add_argument("--no_cuda", action='store_true',
|
||||
help="Avoid using CUDA when available")
|
||||
parser.add_argument('--overwrite_output_dir', action='store_true',
|
||||
help="Overwrite the content of the output directory")
|
||||
parser.add_argument('--overwrite_cache', action='store_true',
|
||||
help="Overwrite the cached training and evaluation sets")
|
||||
parser.add_argument('--seed', type=int, default=42,
|
||||
help="random seed for initialization")
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument('--fp16', action='store_true',
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
|
||||
parser.add_argument('--fp16_opt_level', type=str, default='O1',
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html")
|
||||
parser.add_argument("--local_rank", type=int, default=-1,
|
||||
help="For distributed training: local_rank")
|
||||
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
|
||||
if (
|
||||
os.path.exists(args.output_dir)
|
||||
and os.listdir(args.output_dir)
|
||||
and args.do_train
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
|
||||
args.output_dir
|
||||
)
|
||||
)
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
@@ -411,22 +534,30 @@ def main():
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend='nccl')
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
|
||||
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
|
||||
# Prepare XNLI task
|
||||
args.task_name = 'xnli'
|
||||
args.task_name = "xnli"
|
||||
if args.task_name not in processors:
|
||||
raise ValueError("Task not found: %s" % (args.task_name))
|
||||
processor = processors[args.task_name](language=args.language, train_language=args.train_language)
|
||||
@@ -440,17 +571,23 @@ def main():
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
|
||||
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
model = model_class.from_pretrained(args.model_name_or_path,
|
||||
from_tf=bool('.ckpt' in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None)
|
||||
config = config_class.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=args.task_name,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = tokenizer_class.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
model = model_class.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
@@ -459,14 +596,12 @@ def main():
|
||||
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
|
||||
# Training
|
||||
if args.do_train:
|
||||
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
|
||||
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
# Create output directory if needed
|
||||
@@ -476,36 +611,39 @@ def main():
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = model_class.from_pretrained(args.output_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
|
||||
model.to(args.device)
|
||||
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
|
||||
checkpoints = list(
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
)
|
||||
logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
for checkpoint in checkpoints:
|
||||
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
|
||||
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
|
||||
|
||||
model = model_class.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
result = evaluate(args, model, tokenizer, prefix=prefix)
|
||||
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
|
||||
result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
|
||||
results.update(result)
|
||||
|
||||
return results
|
||||
|
||||
@@ -10,7 +10,7 @@ The model is loaded with the pre-trained weights for the abstractive summarizati
|
||||
|
||||
```
|
||||
git clone https://github.com/huggingface/transformers && cd transformers
|
||||
pip install [--editable] .
|
||||
pip install .
|
||||
pip install nltk py-rouge
|
||||
cd examples/summarization
|
||||
```
|
||||
|
||||
@@ -14,9 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" BertAbs configuration """
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
@@ -33,6 +31,8 @@ class BertAbsConfig(PretrainedConfig):
|
||||
r""" Class to store the configuration of the BertAbs model.
|
||||
|
||||
Arguments:
|
||||
vocab_size: int
|
||||
Number of tokens in the vocabulary.
|
||||
max_pos: int
|
||||
The maximum sequence length that this model will be used with.
|
||||
enc_layer: int
|
||||
@@ -62,10 +62,11 @@ class BertAbsConfig(PretrainedConfig):
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP
|
||||
model_type = "bertabs"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=30522,
|
||||
vocab_size=30522,
|
||||
max_pos=512,
|
||||
enc_layers=6,
|
||||
enc_hidden_size=512,
|
||||
@@ -79,41 +80,19 @@ class BertAbsConfig(PretrainedConfig):
|
||||
dec_dropout=0.2,
|
||||
**kwargs,
|
||||
):
|
||||
super(BertAbsConfig, self).__init__(**kwargs)
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if self._input_is_path_to_json(vocab_size_or_config_json_file):
|
||||
path_to_json = vocab_size_or_config_json_file
|
||||
with open(path_to_json, "r", encoding="utf-8") as reader:
|
||||
json_config = json.loads(reader.read())
|
||||
for key, value in json_config.items():
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.max_pos = max_pos
|
||||
self.vocab_size = vocab_size
|
||||
self.max_pos = max_pos
|
||||
|
||||
self.enc_layers = enc_layers
|
||||
self.enc_hidden_size = enc_hidden_size
|
||||
self.enc_heads = enc_heads
|
||||
self.enc_ff_size = enc_ff_size
|
||||
self.enc_dropout = enc_dropout
|
||||
self.enc_layers = enc_layers
|
||||
self.enc_hidden_size = enc_hidden_size
|
||||
self.enc_heads = enc_heads
|
||||
self.enc_ff_size = enc_ff_size
|
||||
self.enc_dropout = enc_dropout
|
||||
|
||||
self.dec_layers = dec_layers
|
||||
self.dec_hidden_size = dec_hidden_size
|
||||
self.dec_heads = dec_heads
|
||||
self.dec_ff_size = dec_ff_size
|
||||
self.dec_dropout = dec_dropout
|
||||
else:
|
||||
raise ValueError(
|
||||
"First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
|
||||
def _input_is_path_to_json(self, first_argument):
|
||||
""" Checks whether the first argument passed to config
|
||||
is the path to a JSON file that contains the config.
|
||||
"""
|
||||
is_python_2 = sys.version_info[0] == 2
|
||||
if is_python_2:
|
||||
return isinstance(first_argument, unicode)
|
||||
else:
|
||||
return isinstance(first_argument, str)
|
||||
self.dec_layers = dec_layers
|
||||
self.dec_hidden_size = dec_hidden_size
|
||||
self.dec_heads = dec_heads
|
||||
self.dec_ff_size = dec_ff_size
|
||||
self.dec_dropout = dec_dropout
|
||||
|
||||
@@ -20,13 +20,13 @@ the model within the original codebase to be able to only save its `state_dict`.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from collections import namedtuple
|
||||
import logging
|
||||
from collections import namedtuple
|
||||
|
||||
import torch
|
||||
|
||||
from models.model_builder import AbsSummarizer # The authors' implementation
|
||||
from model_bertabs import BertAbsSummarizer
|
||||
|
||||
from models.model_builder import AbsSummarizer # The authors' implementation
|
||||
from transformers import BertTokenizer
|
||||
|
||||
|
||||
@@ -34,12 +34,30 @@ logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
SAMPLE_TEXT = 'Hello world! cécé herlolip'
|
||||
SAMPLE_TEXT = "Hello world! cécé herlolip"
|
||||
|
||||
|
||||
BertAbsConfig = namedtuple(
|
||||
"BertAbsConfig",
|
||||
["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"],
|
||||
[
|
||||
"temp_dir",
|
||||
"large",
|
||||
"use_bert_emb",
|
||||
"finetune_bert",
|
||||
"encoder",
|
||||
"share_emb",
|
||||
"max_pos",
|
||||
"enc_layers",
|
||||
"enc_hidden_size",
|
||||
"enc_heads",
|
||||
"enc_ff_size",
|
||||
"enc_dropout",
|
||||
"dec_layers",
|
||||
"dec_hidden_size",
|
||||
"dec_heads",
|
||||
"dec_ff_size",
|
||||
"dec_dropout",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@@ -119,7 +137,9 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
|
||||
output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
|
||||
output_original_generator = original.generator(output_original_model)
|
||||
|
||||
output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0]
|
||||
output_converted_model = new_model(
|
||||
encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask
|
||||
)[0]
|
||||
output_converted_generator = new_model.generator(output_converted_model)
|
||||
|
||||
maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
|
||||
@@ -136,28 +156,21 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
|
||||
# The model has been saved with torch.save(model) and this is bound to the exact
|
||||
# directory structure. We save the state_dict instead.
|
||||
logging.info("saving the model's state dictionary")
|
||||
torch.save(new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin")
|
||||
torch.save(
|
||||
new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--bertabs_checkpoint_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path the official PyTorch dump.",
|
||||
"--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_folder_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the output PyTorch model.",
|
||||
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
convert_bertabs_checkpoints(
|
||||
args.bertabs_checkpoint_path,
|
||||
args.pytorch_dump_folder_path,
|
||||
args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
|
||||
)
|
||||
|
||||
@@ -27,9 +27,8 @@ import torch
|
||||
from torch import nn
|
||||
from torch.nn.init import xavier_uniform_
|
||||
|
||||
from transformers import BertModel, BertConfig, PreTrainedModel
|
||||
|
||||
from configuration_bertabs import BertAbsConfig
|
||||
from transformers import BertConfig, BertModel, PreTrainedModel
|
||||
|
||||
|
||||
MAX_SIZE = 5000
|
||||
@@ -48,7 +47,7 @@ class BertAbsPreTrainedModel(PreTrainedModel):
|
||||
|
||||
class BertAbs(BertAbsPreTrainedModel):
|
||||
def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
|
||||
super(BertAbs, self).__init__(args)
|
||||
super().__init__(args)
|
||||
self.args = args
|
||||
self.bert = Bert()
|
||||
|
||||
@@ -56,40 +55,22 @@ class BertAbs(BertAbsPreTrainedModel):
|
||||
load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False
|
||||
if load_bert_pretrained_extractive:
|
||||
self.bert.model.load_state_dict(
|
||||
dict(
|
||||
[
|
||||
(n[11:], p)
|
||||
for n, p in bert_extractive_checkpoint.items()
|
||||
if n.startswith("bert.model")
|
||||
]
|
||||
),
|
||||
dict([(n[11:], p) for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")]),
|
||||
strict=True,
|
||||
)
|
||||
|
||||
self.vocab_size = self.bert.model.config.vocab_size
|
||||
|
||||
if args.max_pos > 512:
|
||||
my_pos_embeddings = nn.Embedding(
|
||||
args.max_pos, self.bert.model.config.hidden_size
|
||||
)
|
||||
my_pos_embeddings.weight.data[
|
||||
:512
|
||||
] = self.bert.model.embeddings.position_embeddings.weight.data
|
||||
my_pos_embeddings.weight.data[
|
||||
512:
|
||||
] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
|
||||
my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
|
||||
my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
|
||||
my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][
|
||||
None, :
|
||||
].repeat(
|
||||
args.max_pos - 512, 1
|
||||
)
|
||||
].repeat(args.max_pos - 512, 1)
|
||||
self.bert.model.embeddings.position_embeddings = my_pos_embeddings
|
||||
tgt_embeddings = nn.Embedding(
|
||||
self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0
|
||||
)
|
||||
tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0)
|
||||
|
||||
tgt_embeddings.weight = copy.deepcopy(
|
||||
self.bert.model.embeddings.word_embeddings.weight
|
||||
)
|
||||
tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight)
|
||||
|
||||
self.decoder = TransformerDecoder(
|
||||
self.args.dec_layers,
|
||||
@@ -102,9 +83,7 @@ class BertAbs(BertAbsPreTrainedModel):
|
||||
)
|
||||
|
||||
gen_func = nn.LogSoftmax(dim=-1)
|
||||
self.generator = nn.Sequential(
|
||||
nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func
|
||||
)
|
||||
self.generator = nn.Sequential(nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func)
|
||||
self.generator[0].weight = self.decoder.embeddings.weight
|
||||
|
||||
load_from_checkpoints = False if checkpoint is None else True
|
||||
@@ -127,25 +106,14 @@ class BertAbs(BertAbsPreTrainedModel):
|
||||
p.data.zero_()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
encoder_input_ids,
|
||||
decoder_input_ids,
|
||||
token_type_ids,
|
||||
encoder_attention_mask,
|
||||
decoder_attention_mask,
|
||||
self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask,
|
||||
):
|
||||
encoder_output = self.bert(
|
||||
input_ids=encoder_input_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=encoder_attention_mask,
|
||||
input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask,
|
||||
)
|
||||
encoder_hidden_states = encoder_output[0]
|
||||
dec_state = self.decoder.init_decoder_state(
|
||||
encoder_input_ids, encoder_hidden_states
|
||||
)
|
||||
decoder_outputs, _ = self.decoder(
|
||||
decoder_input_ids[:, :-1], encoder_hidden_states, dec_state
|
||||
)
|
||||
dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)
|
||||
decoder_outputs, _ = self.decoder(decoder_input_ids[:, :-1], encoder_hidden_states, dec_state)
|
||||
return decoder_outputs
|
||||
|
||||
|
||||
@@ -154,7 +122,7 @@ class Bert(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super(Bert, self).__init__()
|
||||
super().__init__()
|
||||
config = BertConfig.from_pretrained("bert-base-uncased")
|
||||
self.model = BertModel(config)
|
||||
|
||||
@@ -162,10 +130,7 @@ class Bert(nn.Module):
|
||||
self.eval()
|
||||
with torch.no_grad():
|
||||
encoder_outputs, _ = self.model(
|
||||
input_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=attention_mask,
|
||||
**kwargs
|
||||
input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, **kwargs
|
||||
)
|
||||
return encoder_outputs
|
||||
|
||||
@@ -186,7 +151,7 @@ class TransformerDecoder(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
|
||||
super(TransformerDecoder, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
# Basic attributes.
|
||||
self.decoder_type = "transformer"
|
||||
@@ -196,10 +161,7 @@ class TransformerDecoder(nn.Module):
|
||||
|
||||
# Build TransformerDecoder.
|
||||
self.transformer_layers = nn.ModuleList(
|
||||
[
|
||||
TransformerDecoderLayer(d_model, heads, d_ff, dropout)
|
||||
for _ in range(num_layers)
|
||||
]
|
||||
[TransformerDecoderLayer(d_model, heads, d_ff, dropout) for _ in range(num_layers)]
|
||||
)
|
||||
|
||||
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||
@@ -236,20 +198,14 @@ class TransformerDecoder(nn.Module):
|
||||
# Decoder padding mask
|
||||
tgt_words = tgt
|
||||
tgt_batch, tgt_len = tgt_words.size()
|
||||
tgt_pad_mask = (
|
||||
tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
|
||||
)
|
||||
tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len)
|
||||
|
||||
# Encoder padding mask
|
||||
if memory_mask is not None:
|
||||
src_len = memory_mask.size(-1)
|
||||
src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len)
|
||||
else:
|
||||
src_pad_mask = (
|
||||
src_words.data.eq(padding_idx)
|
||||
.unsqueeze(1)
|
||||
.expand(src_batch, tgt_len, src_len)
|
||||
)
|
||||
src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1).expand(src_batch, tgt_len, src_len)
|
||||
|
||||
# Pass through the embeddings
|
||||
emb = self.embeddings(input_ids)
|
||||
@@ -271,9 +227,7 @@ class TransformerDecoder(nn.Module):
|
||||
src_pad_mask,
|
||||
tgt_pad_mask,
|
||||
previous_input=prev_layer_input,
|
||||
layer_cache=state.cache["layer_{}".format(i)]
|
||||
if state.cache is not None
|
||||
else None,
|
||||
layer_cache=state.cache["layer_{}".format(i)] if state.cache is not None else None,
|
||||
step=step,
|
||||
)
|
||||
if state.cache is None:
|
||||
@@ -303,13 +257,11 @@ class PositionalEncoding(nn.Module):
|
||||
def __init__(self, dropout, dim, max_len=5000):
|
||||
pe = torch.zeros(max_len, dim)
|
||||
position = torch.arange(0, max_len).unsqueeze(1)
|
||||
div_term = torch.exp(
|
||||
(torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))
|
||||
)
|
||||
div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)))
|
||||
pe[:, 0::2] = torch.sin(position.float() * div_term)
|
||||
pe[:, 1::2] = torch.cos(position.float() * div_term)
|
||||
pe = pe.unsqueeze(0)
|
||||
super(PositionalEncoding, self).__init__()
|
||||
super().__init__()
|
||||
self.register_buffer("pe", pe)
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
self.dim = dim
|
||||
@@ -341,7 +293,7 @@ class TransformerDecoderLayer(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, heads, d_ff, dropout):
|
||||
super(TransformerDecoderLayer, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
|
||||
|
||||
@@ -356,14 +308,7 @@ class TransformerDecoderLayer(nn.Module):
|
||||
self.register_buffer("mask", mask)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
inputs,
|
||||
memory_bank,
|
||||
src_pad_mask,
|
||||
tgt_pad_mask,
|
||||
previous_input=None,
|
||||
layer_cache=None,
|
||||
step=None,
|
||||
self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
@@ -380,34 +325,20 @@ class TransformerDecoderLayer(nn.Module):
|
||||
* all_input `[batch_size x current_step x model_dim]`
|
||||
|
||||
"""
|
||||
dec_mask = torch.gt(
|
||||
tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0
|
||||
)
|
||||
dec_mask = torch.gt(tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0)
|
||||
input_norm = self.layer_norm_1(inputs)
|
||||
all_input = input_norm
|
||||
if previous_input is not None:
|
||||
all_input = torch.cat((previous_input, input_norm), dim=1)
|
||||
dec_mask = None
|
||||
|
||||
query = self.self_attn(
|
||||
all_input,
|
||||
all_input,
|
||||
input_norm,
|
||||
mask=dec_mask,
|
||||
layer_cache=layer_cache,
|
||||
type="self",
|
||||
)
|
||||
query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
|
||||
|
||||
query = self.drop(query) + inputs
|
||||
|
||||
query_norm = self.layer_norm_2(query)
|
||||
mid = self.context_attn(
|
||||
memory_bank,
|
||||
memory_bank,
|
||||
query_norm,
|
||||
mask=src_pad_mask,
|
||||
layer_cache=layer_cache,
|
||||
type="context",
|
||||
memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context",
|
||||
)
|
||||
output = self.feed_forward(self.drop(mid) + query)
|
||||
|
||||
@@ -479,7 +410,7 @@ class MultiHeadedAttention(nn.Module):
|
||||
self.dim_per_head = model_dim // head_count
|
||||
self.model_dim = model_dim
|
||||
|
||||
super(MultiHeadedAttention, self).__init__()
|
||||
super().__init__()
|
||||
self.head_count = head_count
|
||||
|
||||
self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
|
||||
@@ -492,14 +423,7 @@ class MultiHeadedAttention(nn.Module):
|
||||
self.final_linear = nn.Linear(model_dim, model_dim)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
key,
|
||||
value,
|
||||
query,
|
||||
mask=None,
|
||||
layer_cache=None,
|
||||
type=None,
|
||||
predefined_graph_1=None,
|
||||
self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None,
|
||||
):
|
||||
"""
|
||||
Compute the context vector and the attention vectors.
|
||||
@@ -522,8 +446,6 @@ class MultiHeadedAttention(nn.Module):
|
||||
batch_size = key.size(0)
|
||||
dim_per_head = self.dim_per_head
|
||||
head_count = self.head_count
|
||||
key_len = key.size(1)
|
||||
query_len = query.size(1)
|
||||
|
||||
def shape(x):
|
||||
""" projection """
|
||||
@@ -531,11 +453,7 @@ class MultiHeadedAttention(nn.Module):
|
||||
|
||||
def unshape(x):
|
||||
""" compute context """
|
||||
return (
|
||||
x.transpose(1, 2)
|
||||
.contiguous()
|
||||
.view(batch_size, -1, head_count * dim_per_head)
|
||||
)
|
||||
return x.transpose(1, 2).contiguous().view(batch_size, -1, head_count * dim_per_head)
|
||||
|
||||
# 1) Project key, value, and query.
|
||||
if layer_cache is not None:
|
||||
@@ -554,9 +472,7 @@ class MultiHeadedAttention(nn.Module):
|
||||
if layer_cache["self_keys"] is not None:
|
||||
key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2)
|
||||
if layer_cache["self_values"] is not None:
|
||||
value = torch.cat(
|
||||
(layer_cache["self_values"].to(device), value), dim=2
|
||||
)
|
||||
value = torch.cat((layer_cache["self_values"].to(device), value), dim=2)
|
||||
layer_cache["self_keys"] = key
|
||||
layer_cache["self_values"] = value
|
||||
elif type == "context":
|
||||
@@ -586,9 +502,6 @@ class MultiHeadedAttention(nn.Module):
|
||||
|
||||
query = shape(query)
|
||||
|
||||
key_len = key.size(2)
|
||||
query_len = query.size(2)
|
||||
|
||||
# 2) Calculate and scale scores.
|
||||
query = query / math.sqrt(dim_per_head)
|
||||
scores = torch.matmul(query, key.transpose(2, 3))
|
||||
@@ -601,7 +514,7 @@ class MultiHeadedAttention(nn.Module):
|
||||
|
||||
attn = self.softmax(scores)
|
||||
|
||||
if not predefined_graph_1 is None:
|
||||
if predefined_graph_1 is not None:
|
||||
attn_masked = attn[:, -1] * predefined_graph_1
|
||||
attn_masked = attn_masked / (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)
|
||||
|
||||
@@ -637,13 +550,9 @@ class DecoderState(object):
|
||||
sizes = e.size()
|
||||
br = sizes[1]
|
||||
if len(sizes) == 3:
|
||||
sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[
|
||||
:, :, idx
|
||||
]
|
||||
sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[:, :, idx]
|
||||
else:
|
||||
sent_states = e.view(
|
||||
sizes[0], beam_size, br // beam_size, sizes[2], sizes[3]
|
||||
)[:, :, idx]
|
||||
sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2], sizes[3])[:, :, idx]
|
||||
|
||||
sent_states.data.copy_(sent_states.data.index_select(1, positions))
|
||||
|
||||
@@ -716,11 +625,7 @@ class TransformerDecoderState(DecoderState):
|
||||
|
||||
|
||||
def gelu(x):
|
||||
return (
|
||||
0.5
|
||||
* x
|
||||
* (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
)
|
||||
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
|
||||
|
||||
class PositionwiseFeedForward(nn.Module):
|
||||
@@ -734,7 +639,7 @@ class PositionwiseFeedForward(nn.Module):
|
||||
"""
|
||||
|
||||
def __init__(self, d_model, d_ff, dropout=0.1):
|
||||
super(PositionwiseFeedForward, self).__init__()
|
||||
super().__init__()
|
||||
self.w_1 = nn.Linear(d_model, d_ff)
|
||||
self.w_2 = nn.Linear(d_ff, d_model)
|
||||
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||
@@ -758,9 +663,7 @@ class PositionwiseFeedForward(nn.Module):
|
||||
def build_predictor(args, tokenizer, symbols, model, logger=None):
|
||||
# we should be able to refactor the global scorer a lot
|
||||
scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu")
|
||||
translator = Translator(
|
||||
args, model, tokenizer, symbols, global_scorer=scorer, logger=logger
|
||||
)
|
||||
translator = Translator(args, model, tokenizer, symbols, global_scorer=scorer, logger=logger)
|
||||
return translator
|
||||
|
||||
|
||||
@@ -891,9 +794,7 @@ class Translator(object):
|
||||
Shouldn't need the original dataset.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
return self._fast_translate_batch(
|
||||
batch, self.max_length, min_length=self.min_length
|
||||
)
|
||||
return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
|
||||
|
||||
# Where the beam search lives
|
||||
# I have no idea why it is being called from the method above
|
||||
@@ -912,26 +813,18 @@ class Translator(object):
|
||||
mask_src = batch.mask_src
|
||||
|
||||
src_features = self.model.bert(src, segs, mask_src)
|
||||
dec_states = self.model.decoder.init_decoder_state(
|
||||
src, src_features, with_cache=True
|
||||
)
|
||||
dec_states = self.model.decoder.init_decoder_state(src, src_features, with_cache=True)
|
||||
device = src_features.device
|
||||
|
||||
# Tile states and memory beam_size times.
|
||||
dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim))
|
||||
src_features = tile(src_features, beam_size, dim=0)
|
||||
batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
|
||||
beam_offset = torch.arange(
|
||||
0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device
|
||||
)
|
||||
alive_seq = torch.full(
|
||||
[batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device
|
||||
)
|
||||
beam_offset = torch.arange(0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device)
|
||||
alive_seq = torch.full([batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device)
|
||||
|
||||
# Give full probability to the first beam on the first step.
|
||||
topk_log_probs = torch.tensor(
|
||||
[0.0] + [float("-inf")] * (beam_size - 1), device=device
|
||||
).repeat(batch_size)
|
||||
topk_log_probs = torch.tensor([0.0] + [float("-inf")] * (beam_size - 1), device=device).repeat(batch_size)
|
||||
|
||||
# Structure that holds finished hypotheses.
|
||||
hypotheses = [[] for _ in range(batch_size)] # noqa: F812
|
||||
@@ -948,9 +841,7 @@ class Translator(object):
|
||||
# Decoder forward.
|
||||
decoder_input = decoder_input.transpose(0, 1)
|
||||
|
||||
dec_out, dec_states = self.model.decoder(
|
||||
decoder_input, src_features, dec_states, step=step
|
||||
)
|
||||
dec_out, dec_states = self.model.decoder(decoder_input, src_features, dec_states, step=step)
|
||||
|
||||
# Generator forward.
|
||||
log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0))
|
||||
@@ -978,10 +869,7 @@ class Translator(object):
|
||||
words = " ".join(words).replace(" ##", "").split()
|
||||
if len(words) <= 3:
|
||||
continue
|
||||
trigrams = [
|
||||
(words[i - 1], words[i], words[i + 1])
|
||||
for i in range(1, len(words) - 1)
|
||||
]
|
||||
trigrams = [(words[i - 1], words[i], words[i + 1]) for i in range(1, len(words) - 1)]
|
||||
trigram = tuple(trigrams[-1])
|
||||
if trigram in trigrams[:-1]:
|
||||
fail = True
|
||||
@@ -999,15 +887,11 @@ class Translator(object):
|
||||
topk_ids = topk_ids.fmod(vocab_size)
|
||||
|
||||
# Map beam_index to batch_index in the flat representation.
|
||||
batch_index = topk_beam_index + beam_offset[
|
||||
: topk_beam_index.size(0)
|
||||
].unsqueeze(1)
|
||||
batch_index = topk_beam_index + beam_offset[: topk_beam_index.size(0)].unsqueeze(1)
|
||||
select_indices = batch_index.view(-1)
|
||||
|
||||
# Append last prediction.
|
||||
alive_seq = torch.cat(
|
||||
[alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1
|
||||
)
|
||||
alive_seq = torch.cat([alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1)
|
||||
|
||||
is_finished = topk_ids.eq(self.end_token)
|
||||
if step + 1 == max_length:
|
||||
@@ -1040,15 +924,11 @@ class Translator(object):
|
||||
topk_log_probs = topk_log_probs.index_select(0, non_finished)
|
||||
batch_index = batch_index.index_select(0, non_finished)
|
||||
batch_offset = batch_offset.index_select(0, non_finished)
|
||||
alive_seq = predictions.index_select(0, non_finished).view(
|
||||
-1, alive_seq.size(-1)
|
||||
)
|
||||
alive_seq = predictions.index_select(0, non_finished).view(-1, alive_seq.size(-1))
|
||||
# Reorder states.
|
||||
select_indices = batch_index.view(-1)
|
||||
src_features = src_features.index_select(0, select_indices)
|
||||
dec_states.map_batch_fn(
|
||||
lambda state, dim: state.index_select(dim, select_indices)
|
||||
)
|
||||
dec_states.map_batch_fn(lambda state, dim: state.index_select(dim, select_indices))
|
||||
|
||||
return results
|
||||
|
||||
@@ -1089,14 +969,7 @@ def tile(x, count, dim=0):
|
||||
out_size = list(x.size())
|
||||
out_size[0] *= count
|
||||
batch = x.size(0)
|
||||
x = (
|
||||
x.view(batch, -1)
|
||||
.transpose(0, 1)
|
||||
.repeat(count, 1)
|
||||
.transpose(0, 1)
|
||||
.contiguous()
|
||||
.view(*out_size)
|
||||
)
|
||||
x = x.view(batch, -1).transpose(0, 1).repeat(count, 1).transpose(0, 1).contiguous().view(*out_size)
|
||||
if dim != 0:
|
||||
x = x.permute(perm).contiguous()
|
||||
return x
|
||||
@@ -1107,6 +980,7 @@ def tile(x, count, dim=0):
|
||||
# a finetuning script.
|
||||
#
|
||||
|
||||
|
||||
class BertSumOptimizer(object):
|
||||
""" Specific optimizer for BertSum.
|
||||
|
||||
@@ -1126,16 +1000,10 @@ class BertSumOptimizer(object):
|
||||
|
||||
self.optimizers = {
|
||||
"encoder": torch.optim.Adam(
|
||||
model.encoder.parameters(),
|
||||
lr=lr["encoder"],
|
||||
betas=(beta_1, beta_2),
|
||||
eps=eps,
|
||||
model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps,
|
||||
),
|
||||
"decoder": torch.optim.Adam(
|
||||
model.decoder.parameters(),
|
||||
lr=lr["decoder"],
|
||||
betas=(beta_1, beta_2),
|
||||
eps=eps,
|
||||
model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps,
|
||||
),
|
||||
}
|
||||
|
||||
@@ -1143,9 +1011,7 @@ class BertSumOptimizer(object):
|
||||
self.current_learning_rates = {}
|
||||
|
||||
def _update_rate(self, stack):
|
||||
return self.lr[stack] * min(
|
||||
self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5)
|
||||
)
|
||||
return self.lr[stack] * min(self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5))
|
||||
|
||||
def zero_grad(self):
|
||||
self.optimizer_decoder.zero_grad()
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
# progress bars in model download and training scripts
|
||||
tqdm
|
||||
# Accessing files from S3 directly.
|
||||
boto3
|
||||
# Used for downloading models over HTTP
|
||||
requests
|
||||
transformers
|
||||
|
||||
# For ROUGE
|
||||
nltk
|
||||
py-rouge
|
||||
|
||||
@@ -1,33 +1,30 @@
|
||||
#! /usr/bin/python3
|
||||
import argparse
|
||||
from collections import namedtuple
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from collections import namedtuple
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, SequentialSampler
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import BertTokenizer
|
||||
|
||||
from modeling_bertabs import BertAbs, build_predictor
|
||||
|
||||
from transformers import BertTokenizer
|
||||
from utils_summarization import (
|
||||
SummarizationDataset,
|
||||
encode_for_summarization,
|
||||
build_mask,
|
||||
fit_to_block_size,
|
||||
compute_token_type_ids,
|
||||
encode_for_summarization,
|
||||
fit_to_block_size,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
|
||||
|
||||
Batch = namedtuple(
|
||||
"Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]
|
||||
)
|
||||
Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"])
|
||||
|
||||
|
||||
def evaluate(args):
|
||||
@@ -48,13 +45,14 @@ def evaluate(args):
|
||||
|
||||
import rouge
|
||||
import nltk
|
||||
nltk.download('punkt')
|
||||
|
||||
nltk.download("punkt")
|
||||
rouge_evaluator = rouge.Rouge(
|
||||
metrics=['rouge-n', 'rouge-l'],
|
||||
metrics=["rouge-n", "rouge-l"],
|
||||
max_n=2,
|
||||
limit_length=True,
|
||||
length_limit=args.beam_size,
|
||||
length_limit_type='words',
|
||||
length_limit_type="words",
|
||||
apply_avg=True,
|
||||
apply_best=False,
|
||||
alpha=0.5, # Default F1_score
|
||||
@@ -161,15 +159,15 @@ Recall >> {:.3f}
|
||||
F1 >> {:.3f}
|
||||
Precision >> {:.3f}
|
||||
Recall >> {:.3f}""".format(
|
||||
scores['rouge-1']['f'],
|
||||
scores['rouge-1']['p'],
|
||||
scores['rouge-1']['r'],
|
||||
scores['rouge-2']['f'],
|
||||
scores['rouge-2']['p'],
|
||||
scores['rouge-2']['r'],
|
||||
scores['rouge-l']['f'],
|
||||
scores['rouge-l']['p'],
|
||||
scores['rouge-l']['r'],
|
||||
scores["rouge-1"]["f"],
|
||||
scores["rouge-1"]["p"],
|
||||
scores["rouge-1"]["r"],
|
||||
scores["rouge-2"]["f"],
|
||||
scores["rouge-2"]["p"],
|
||||
scores["rouge-2"]["r"],
|
||||
scores["rouge-l"]["f"],
|
||||
scores["rouge-l"]["p"],
|
||||
scores["rouge-l"]["r"],
|
||||
)
|
||||
|
||||
|
||||
@@ -186,10 +184,11 @@ def save_rouge_scores(str_scores):
|
||||
def build_data_iterator(args, tokenizer):
|
||||
dataset = load_and_cache_examples(args, tokenizer)
|
||||
sampler = SequentialSampler(dataset)
|
||||
collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device)
|
||||
iterator = DataLoader(
|
||||
dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,
|
||||
)
|
||||
|
||||
def collate_fn(data):
|
||||
return collate(data, tokenizer, block_size=512, device=args.device)
|
||||
|
||||
iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
|
||||
|
||||
return iterator
|
||||
|
||||
@@ -210,14 +209,9 @@ def collate(data, tokenizer, block_size, device):
|
||||
names = [name for name, _, _ in data]
|
||||
summaries = [" ".join(summary_list) for _, _, summary_list in data]
|
||||
|
||||
encoded_text = [
|
||||
encode_for_summarization(story, summary, tokenizer) for _, story, summary in data
|
||||
]
|
||||
encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
|
||||
encoded_stories = torch.tensor(
|
||||
[
|
||||
fit_to_block_size(story, block_size, tokenizer.pad_token_id)
|
||||
for story, _ in encoded_text
|
||||
]
|
||||
[fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
|
||||
)
|
||||
encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
|
||||
encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
|
||||
@@ -272,38 +266,23 @@ def main():
|
||||
)
|
||||
# EVALUATION options
|
||||
parser.add_argument(
|
||||
"--no_cuda",
|
||||
default=False,
|
||||
type=bool,
|
||||
help="Whether to force the execution on CPU.",
|
||||
"--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
|
||||
)
|
||||
# BEAM SEARCH arguments
|
||||
parser.add_argument(
|
||||
"--min_length",
|
||||
default=50,
|
||||
type=int,
|
||||
help="Minimum number of tokens for the summaries.",
|
||||
"--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_length",
|
||||
default=200,
|
||||
type=int,
|
||||
help="Maixmum number of tokens for the summaries.",
|
||||
"--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--beam_size",
|
||||
default=5,
|
||||
type=int,
|
||||
help="The number of beams to start with for each example.",
|
||||
"--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--alpha",
|
||||
default=0.95,
|
||||
type=float,
|
||||
help="The value of alpha for the length penalty in the beam search.",
|
||||
"--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--block_trigram",
|
||||
|
||||
@@ -17,12 +17,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from utils_summarization import (
|
||||
compute_token_type_ids,
|
||||
fit_to_block_size,
|
||||
build_mask,
|
||||
process_story,
|
||||
)
|
||||
from utils_summarization import build_mask, compute_token_type_ids, fit_to_block_size, process_story
|
||||
|
||||
|
||||
class SummarizationDataProcessingTest(unittest.TestCase):
|
||||
@@ -33,25 +28,19 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
||||
""" Pad the sequence with 0 if the sequence is smaller than the block size."""
|
||||
sequence = [1, 2, 3, 4]
|
||||
expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
|
||||
self.assertEqual(
|
||||
fit_to_block_size(sequence, self.block_size, 0), expected_output
|
||||
)
|
||||
self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
|
||||
|
||||
def test_fit_to_block_sequence_fit_exactly(self):
|
||||
""" Do nothing if the sequence is the right size. """
|
||||
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
self.assertEqual(
|
||||
fit_to_block_size(sequence, self.block_size, 0), expected_output
|
||||
)
|
||||
self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
|
||||
|
||||
def test_fit_to_block_sequence_too_big(self):
|
||||
""" Truncate the sequence if it is too long. """
|
||||
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
|
||||
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
self.assertEqual(
|
||||
fit_to_block_size(sequence, self.block_size, 0), expected_output
|
||||
)
|
||||
self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
|
||||
|
||||
def test_process_story_no_highlights(self):
|
||||
""" Processing a story with no highlights returns an empty list for the summary.
|
||||
@@ -95,9 +84,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
||||
def test_build_mask(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
|
||||
expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
|
||||
np.testing.assert_array_equal(
|
||||
build_mask(sequence, 23).numpy(), expected.numpy()
|
||||
)
|
||||
np.testing.assert_array_equal(build_mask(sequence, 23).numpy(), expected.numpy())
|
||||
|
||||
def test_build_mask_with_padding_equal_to_one(self):
|
||||
sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
|
||||
@@ -106,16 +93,8 @@ class SummarizationDataProcessingTest(unittest.TestCase):
|
||||
|
||||
def test_compute_token_type_ids(self):
|
||||
separator = 101
|
||||
batch = torch.tensor(
|
||||
[[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
|
||||
)
|
||||
expected = torch.tensor(
|
||||
[[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]
|
||||
)
|
||||
batch = torch.tensor([[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]])
|
||||
expected = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]])
|
||||
|
||||
result = compute_token_type_ids(batch, separator)
|
||||
np.testing.assert_array_equal(result, expected)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,5 +1,5 @@
|
||||
from collections import deque
|
||||
import os
|
||||
from collections import deque
|
||||
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
@@ -68,9 +68,7 @@ def process_story(raw_story):
|
||||
Raises:
|
||||
IndexError: If the stoy is empty or contains no highlights.
|
||||
"""
|
||||
nonempty_lines = list(
|
||||
filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
|
||||
)
|
||||
nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
|
||||
|
||||
# for some unknown reason some lines miss a period, add it
|
||||
nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
|
||||
@@ -96,7 +94,7 @@ def process_story(raw_story):
|
||||
|
||||
|
||||
def _add_missing_period(line):
|
||||
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
|
||||
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', "\u2019", "\u2019", ")"]
|
||||
if line.startswith("@highlight"):
|
||||
return line
|
||||
if line[-1] in END_TOKENS:
|
||||
@@ -135,13 +133,9 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
|
||||
sentences.
|
||||
"""
|
||||
story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
|
||||
story_token_ids = [
|
||||
token for sentence in story_lines_token_ids for token in sentence
|
||||
]
|
||||
story_token_ids = [token for sentence in story_lines_token_ids for token in sentence]
|
||||
summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
|
||||
summary_token_ids = [
|
||||
token for sentence in summary_lines_token_ids for token in sentence
|
||||
]
|
||||
summary_token_ids = [token for sentence in summary_lines_token_ids for token in sentence]
|
||||
|
||||
return story_token_ids, summary_token_ids
|
||||
|
||||
|
||||
@@ -12,57 +12,53 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
try:
|
||||
# python 3.4+ can use builtin unittest.mock instead of mock package
|
||||
from unittest.mock import patch
|
||||
except ImportError:
|
||||
from mock import patch
|
||||
|
||||
import run_generation
|
||||
import run_glue
|
||||
import run_squad
|
||||
import run_generation
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
def get_setup_file():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-f')
|
||||
parser.add_argument("-f")
|
||||
args = parser.parse_args()
|
||||
return args.f
|
||||
|
||||
class ExamplesTests(unittest.TestCase):
|
||||
|
||||
class ExamplesTests(unittest.TestCase):
|
||||
def test_run_glue(self):
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
testargs = ["run_glue.py",
|
||||
"--data_dir=./examples/tests_samples/MRPC/",
|
||||
"--task_name=mrpc",
|
||||
"--do_train",
|
||||
"--do_eval",
|
||||
"--output_dir=./examples/tests_samples/temp_dir",
|
||||
"--per_gpu_train_batch_size=2",
|
||||
"--per_gpu_eval_batch_size=1",
|
||||
"--learning_rate=1e-4",
|
||||
"--max_steps=10",
|
||||
"--warmup_steps=2",
|
||||
"--overwrite_output_dir",
|
||||
"--seed=42"]
|
||||
model_type, model_name = ("--model_type=bert",
|
||||
"--model_name_or_path=bert-base-uncased")
|
||||
with patch.object(sys, 'argv', testargs + [model_type, model_name]):
|
||||
testargs = [
|
||||
"run_glue.py",
|
||||
"--data_dir=./examples/tests_samples/MRPC/",
|
||||
"--task_name=mrpc",
|
||||
"--do_train",
|
||||
"--do_eval",
|
||||
"--output_dir=./examples/tests_samples/temp_dir",
|
||||
"--per_gpu_train_batch_size=2",
|
||||
"--per_gpu_eval_batch_size=1",
|
||||
"--learning_rate=1e-4",
|
||||
"--max_steps=10",
|
||||
"--warmup_steps=2",
|
||||
"--overwrite_output_dir",
|
||||
"--seed=42",
|
||||
]
|
||||
model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
|
||||
with patch.object(sys, "argv", testargs + [model_type, model_name]):
|
||||
result = run_glue.main()
|
||||
for value in result.values():
|
||||
self.assertGreaterEqual(value, 0.75)
|
||||
@@ -71,40 +67,34 @@ class ExamplesTests(unittest.TestCase):
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
testargs = ["run_squad.py",
|
||||
"--data_dir=./examples/tests_samples/SQUAD",
|
||||
"--model_name=bert-base-uncased",
|
||||
"--output_dir=./examples/tests_samples/temp_dir",
|
||||
"--max_steps=10",
|
||||
"--warmup_steps=2",
|
||||
"--do_train",
|
||||
"--do_eval",
|
||||
"--version_2_with_negative",
|
||||
"--learning_rate=2e-4",
|
||||
"--per_gpu_train_batch_size=2",
|
||||
"--per_gpu_eval_batch_size=1",
|
||||
"--overwrite_output_dir",
|
||||
"--seed=42"]
|
||||
model_type, model_name = ("--model_type=bert",
|
||||
"--model_name_or_path=bert-base-uncased")
|
||||
with patch.object(sys, 'argv', testargs + [model_type, model_name]):
|
||||
testargs = [
|
||||
"run_squad.py",
|
||||
"--data_dir=./examples/tests_samples/SQUAD",
|
||||
"--model_name=bert-base-uncased",
|
||||
"--output_dir=./examples/tests_samples/temp_dir",
|
||||
"--max_steps=10",
|
||||
"--warmup_steps=2",
|
||||
"--do_train",
|
||||
"--do_eval",
|
||||
"--version_2_with_negative",
|
||||
"--learning_rate=2e-4",
|
||||
"--per_gpu_train_batch_size=2",
|
||||
"--per_gpu_eval_batch_size=1",
|
||||
"--overwrite_output_dir",
|
||||
"--seed=42",
|
||||
]
|
||||
model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
|
||||
with patch.object(sys, "argv", testargs + [model_type, model_name]):
|
||||
result = run_squad.main()
|
||||
self.assertGreaterEqual(result['f1'], 30)
|
||||
self.assertGreaterEqual(result['exact'], 30)
|
||||
self.assertGreaterEqual(result["f1"], 30)
|
||||
self.assertGreaterEqual(result["exact"], 30)
|
||||
|
||||
def test_generation(self):
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
testargs = ["run_generation.py",
|
||||
"--prompt=Hello",
|
||||
"--length=10",
|
||||
"--seed=42"]
|
||||
model_type, model_name = ("--model_type=openai-gpt",
|
||||
"--model_name_or_path=openai-gpt")
|
||||
with patch.object(sys, 'argv', testargs + [model_type, model_name]):
|
||||
testargs = ["run_generation.py", "--prompt=Hello", "--length=10", "--seed=42"]
|
||||
model_type, model_name = ("--model_type=openai-gpt", "--model_name_or_path=openai-gpt")
|
||||
with patch.object(sys, "argv", testargs + [model_type, model_name]):
|
||||
result = run_generation.main()
|
||||
self.assertGreaterEqual(len(result), 10)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -15,18 +15,16 @@
|
||||
# limitations under the License.
|
||||
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from io import open
|
||||
import json
|
||||
import csv
|
||||
import glob
|
||||
import tqdm
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import tqdm
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
|
||||
@@ -55,19 +53,10 @@ class InputExample(object):
|
||||
|
||||
|
||||
class InputFeatures(object):
|
||||
def __init__(self,
|
||||
example_id,
|
||||
choices_features,
|
||||
label
|
||||
|
||||
):
|
||||
def __init__(self, example_id, choices_features, label):
|
||||
self.example_id = example_id
|
||||
self.choices_features = [
|
||||
{
|
||||
'input_ids': input_ids,
|
||||
'input_mask': input_mask,
|
||||
'segment_ids': segment_ids
|
||||
}
|
||||
{"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
|
||||
for input_ids, input_mask, segment_ids in choices_features
|
||||
]
|
||||
self.label = label
|
||||
@@ -99,29 +88,29 @@ class RaceProcessor(DataProcessor):
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} train".format(data_dir))
|
||||
high = os.path.join(data_dir, 'train/high')
|
||||
middle = os.path.join(data_dir, 'train/middle')
|
||||
high = os.path.join(data_dir, "train/high")
|
||||
middle = os.path.join(data_dir, "train/middle")
|
||||
high = self._read_txt(high)
|
||||
middle = self._read_txt(middle)
|
||||
return self._create_examples(high + middle, 'train')
|
||||
return self._create_examples(high + middle, "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} dev".format(data_dir))
|
||||
high = os.path.join(data_dir, 'dev/high')
|
||||
middle = os.path.join(data_dir, 'dev/middle')
|
||||
high = os.path.join(data_dir, "dev/high")
|
||||
middle = os.path.join(data_dir, "dev/middle")
|
||||
high = self._read_txt(high)
|
||||
middle = self._read_txt(middle)
|
||||
return self._create_examples(high + middle, 'dev')
|
||||
return self._create_examples(high + middle, "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info("LOOKING AT {} test".format(data_dir))
|
||||
high = os.path.join(data_dir, 'test/high')
|
||||
middle = os.path.join(data_dir, 'test/middle')
|
||||
high = os.path.join(data_dir, "test/high")
|
||||
middle = os.path.join(data_dir, "test/middle")
|
||||
high = self._read_txt(high)
|
||||
middle = self._read_txt(middle)
|
||||
return self._create_examples(high + middle, 'test')
|
||||
return self._create_examples(high + middle, "test")
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
@@ -131,13 +120,12 @@ class RaceProcessor(DataProcessor):
|
||||
lines = []
|
||||
files = glob.glob(input_dir + "/*txt")
|
||||
for file in tqdm.tqdm(files, desc="read files"):
|
||||
with open(file, 'r', encoding='utf-8') as fin:
|
||||
with open(file, "r", encoding="utf-8") as fin:
|
||||
data_raw = json.load(fin)
|
||||
data_raw["race_id"] = file
|
||||
lines.append(data_raw)
|
||||
return lines
|
||||
|
||||
|
||||
def _create_examples(self, lines, set_type):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
examples = []
|
||||
@@ -145,19 +133,22 @@ class RaceProcessor(DataProcessor):
|
||||
race_id = "%s-%s" % (set_type, data_raw["race_id"])
|
||||
article = data_raw["article"]
|
||||
for i in range(len(data_raw["answers"])):
|
||||
truth = str(ord(data_raw['answers'][i]) - ord('A'))
|
||||
question = data_raw['questions'][i]
|
||||
options = data_raw['options'][i]
|
||||
truth = str(ord(data_raw["answers"][i]) - ord("A"))
|
||||
question = data_raw["questions"][i]
|
||||
options = data_raw["options"][i]
|
||||
|
||||
examples.append(
|
||||
InputExample(
|
||||
example_id=race_id,
|
||||
question=question,
|
||||
contexts=[article, article, article, article], # this is not efficient but convenient
|
||||
contexts=[article, article, article, article], # this is not efficient but convenient
|
||||
endings=[options[0], options[1], options[2], options[3]],
|
||||
label=truth))
|
||||
label=truth,
|
||||
)
|
||||
)
|
||||
return examples
|
||||
|
||||
|
||||
class SwagProcessor(DataProcessor):
|
||||
"""Processor for the SWAG data set."""
|
||||
|
||||
@@ -179,27 +170,19 @@ class SwagProcessor(DataProcessor):
|
||||
"setting!"
|
||||
)
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
return ["0", "1", "2", "3"]
|
||||
|
||||
def _read_csv(self, input_file):
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.reader(f)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
return list(csv.reader(f))
|
||||
|
||||
def _create_examples(self, lines: List[List[str]], type: str):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
if type == "train" and lines[0][-1] != 'label':
|
||||
raise ValueError(
|
||||
"For training, the input file must contain a label column."
|
||||
)
|
||||
if type == "train" and lines[0][-1] != "label":
|
||||
raise ValueError("For training, the input file must contain a label column.")
|
||||
|
||||
examples = [
|
||||
InputExample(
|
||||
@@ -207,10 +190,11 @@ class SwagProcessor(DataProcessor):
|
||||
question=line[5], # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
contexts = [line[4], line[4], line[4], line[4]],
|
||||
endings = [line[7], line[8], line[9], line[10]],
|
||||
label=line[11]
|
||||
) for line in lines[1:] # we skip the line with the column names
|
||||
contexts=[line[4], line[4], line[4], line[4]],
|
||||
endings=[line[7], line[8], line[9], line[10]],
|
||||
label=line[11],
|
||||
)
|
||||
for line in lines[1:] # we skip the line with the column names
|
||||
]
|
||||
|
||||
return examples
|
||||
@@ -238,15 +222,14 @@ class ArcProcessor(DataProcessor):
|
||||
return ["0", "1", "2", "3"]
|
||||
|
||||
def _read_json(self, input_file):
|
||||
with open(input_file, 'r', encoding='utf-8') as fin:
|
||||
with open(input_file, "r", encoding="utf-8") as fin:
|
||||
lines = fin.readlines()
|
||||
return lines
|
||||
|
||||
|
||||
def _create_examples(self, lines, type):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
|
||||
#There are two types of labels. They should be normalized
|
||||
# There are two types of labels. They should be normalized
|
||||
def normalize(truth):
|
||||
if truth in "ABCD":
|
||||
return ord(truth) - ord("A")
|
||||
@@ -283,12 +266,18 @@ class ArcProcessor(DataProcessor):
|
||||
if len(options) == 4:
|
||||
examples.append(
|
||||
InputExample(
|
||||
example_id = id,
|
||||
example_id=id,
|
||||
question=question,
|
||||
contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""),
|
||||
options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")],
|
||||
contexts=[
|
||||
options[0]["para"].replace("_", ""),
|
||||
options[1]["para"].replace("_", ""),
|
||||
options[2]["para"].replace("_", ""),
|
||||
options[3]["para"].replace("_", ""),
|
||||
],
|
||||
endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
|
||||
label=truth))
|
||||
label=truth,
|
||||
)
|
||||
)
|
||||
|
||||
if type == "train":
|
||||
assert len(examples) > 1
|
||||
@@ -316,7 +305,7 @@ def convert_examples_to_features(
|
||||
Loads a data file into a list of `InputFeatures`
|
||||
"""
|
||||
|
||||
label_map = {label : i for i, label in enumerate(label_list)}
|
||||
label_map = {label: i for i, label in enumerate(label_list)}
|
||||
|
||||
features = []
|
||||
for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
|
||||
@@ -331,16 +320,13 @@ def convert_examples_to_features(
|
||||
else:
|
||||
text_b = example.question + " " + ending
|
||||
|
||||
inputs = tokenizer.encode_plus(
|
||||
text_a,
|
||||
text_b,
|
||||
add_special_tokens=True,
|
||||
max_length=max_length,
|
||||
)
|
||||
if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
|
||||
logger.info('Attention! you are cropping tokens (swag task is ok). '
|
||||
'If you are training ARC and RACE and you are poping question + options,'
|
||||
'you need to try to use a bigger max seq length!')
|
||||
inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
|
||||
if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
|
||||
logger.info(
|
||||
"Attention! you are cropping tokens (swag task is ok). "
|
||||
"If you are training ARC and RACE and you are poping question + options,"
|
||||
"you need to try to use a bigger max seq length!"
|
||||
)
|
||||
|
||||
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
|
||||
|
||||
@@ -364,7 +350,6 @@ def convert_examples_to_features(
|
||||
assert len(token_type_ids) == max_length
|
||||
choices_features.append((input_ids, attention_mask, token_type_ids))
|
||||
|
||||
|
||||
label = label_map[example.label]
|
||||
|
||||
if ex_index < 2:
|
||||
@@ -372,33 +357,17 @@ def convert_examples_to_features(
|
||||
logger.info("race_id: {}".format(example.example_id))
|
||||
for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
|
||||
logger.info("choice: {}".format(choice_idx))
|
||||
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
|
||||
logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
|
||||
logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
|
||||
logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
|
||||
logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
|
||||
logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
|
||||
logger.info("label: {}".format(label))
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
example_id=example.example_id,
|
||||
choices_features=choices_features,
|
||||
label=label,
|
||||
)
|
||||
)
|
||||
features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))
|
||||
|
||||
return features
|
||||
|
||||
|
||||
processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}
|
||||
|
||||
|
||||
processors = {
|
||||
"race": RaceProcessor,
|
||||
"swag": SwagProcessor,
|
||||
"arc": ArcProcessor
|
||||
}
|
||||
|
||||
|
||||
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
|
||||
"race", 4,
|
||||
"swag", 4,
|
||||
"arc", 4
|
||||
}
|
||||
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4}
|
||||
|
||||
@@ -15,11 +15,10 @@
|
||||
# limitations under the License.
|
||||
""" Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import logging
|
||||
import os
|
||||
from io import open
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -61,9 +60,7 @@ def read_examples_from_file(data_dir, mode):
|
||||
for line in f:
|
||||
if line.startswith("-DOCSTART-") or line == "" or line == "\n":
|
||||
if words:
|
||||
examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
|
||||
words=words,
|
||||
labels=labels))
|
||||
examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
|
||||
guid_index += 1
|
||||
words = []
|
||||
labels = []
|
||||
@@ -76,27 +73,27 @@ def read_examples_from_file(data_dir, mode):
|
||||
# Examples could have no label for mode = "test"
|
||||
labels.append("O")
|
||||
if words:
|
||||
examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
|
||||
words=words,
|
||||
labels=labels))
|
||||
examples.append(InputExample(guid="%s-%d".format(mode, guid_index), words=words, labels=labels))
|
||||
return examples
|
||||
|
||||
|
||||
def convert_examples_to_features(examples,
|
||||
label_list,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
cls_token_at_end=False,
|
||||
cls_token="[CLS]",
|
||||
cls_token_segment_id=1,
|
||||
sep_token="[SEP]",
|
||||
sep_token_extra=False,
|
||||
pad_on_left=False,
|
||||
pad_token=0,
|
||||
pad_token_segment_id=0,
|
||||
pad_token_label_id=-1,
|
||||
sequence_a_segment_id=0,
|
||||
mask_padding_with_zero=True):
|
||||
def convert_examples_to_features(
|
||||
examples,
|
||||
label_list,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
cls_token_at_end=False,
|
||||
cls_token="[CLS]",
|
||||
cls_token_segment_id=1,
|
||||
sep_token="[SEP]",
|
||||
sep_token_extra=False,
|
||||
pad_on_left=False,
|
||||
pad_token=0,
|
||||
pad_token_segment_id=0,
|
||||
pad_token_label_id=-100,
|
||||
sequence_a_segment_id=0,
|
||||
mask_padding_with_zero=True,
|
||||
):
|
||||
""" Loads a data file into a list of `InputBatch`s
|
||||
`cls_token_at_end` define the location of the CLS token:
|
||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
||||
@@ -122,8 +119,8 @@ def convert_examples_to_features(examples,
|
||||
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
|
||||
special_tokens_count = 3 if sep_token_extra else 2
|
||||
if len(tokens) > max_seq_length - special_tokens_count:
|
||||
tokens = tokens[:(max_seq_length - special_tokens_count)]
|
||||
label_ids = label_ids[:(max_seq_length - special_tokens_count)]
|
||||
tokens = tokens[: (max_seq_length - special_tokens_count)]
|
||||
label_ids = label_ids[: (max_seq_length - special_tokens_count)]
|
||||
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
@@ -174,10 +171,10 @@ def convert_examples_to_features(examples,
|
||||
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
|
||||
label_ids = ([pad_token_label_id] * padding_length) + label_ids
|
||||
else:
|
||||
input_ids += ([pad_token] * padding_length)
|
||||
input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
|
||||
segment_ids += ([pad_token_segment_id] * padding_length)
|
||||
label_ids += ([pad_token_label_id] * padding_length)
|
||||
input_ids += [pad_token] * padding_length
|
||||
input_mask += [0 if mask_padding_with_zero else 1] * padding_length
|
||||
segment_ids += [pad_token_segment_id] * padding_length
|
||||
label_ids += [pad_token_label_id] * padding_length
|
||||
|
||||
assert len(input_ids) == max_seq_length
|
||||
assert len(input_mask) == max_seq_length
|
||||
@@ -194,10 +191,8 @@ def convert_examples_to_features(examples,
|
||||
logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
|
||||
|
||||
features.append(
|
||||
InputFeatures(input_ids=input_ids,
|
||||
input_mask=input_mask,
|
||||
segment_ids=segment_ids,
|
||||
label_ids=label_ids))
|
||||
InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
|
||||
)
|
||||
return features
|
||||
|
||||
|
||||
@@ -209,4 +204,4 @@ def get_labels(path):
|
||||
labels = ["O"] + labels
|
||||
return labels
|
||||
else:
|
||||
return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
|
||||
return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
|
||||
|
||||
16
hubconf.py
16
hubconf.py
@@ -1,13 +1,20 @@
|
||||
from transformers import (
|
||||
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoModelWithLMHead,
|
||||
AutoTokenizer,
|
||||
)
|
||||
from transformers.file_utils import add_start_docstrings
|
||||
|
||||
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
|
||||
|
||||
dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", "sacremoses"]
|
||||
|
||||
|
||||
@add_start_docstrings(AutoConfig.__doc__)
|
||||
def config(*args, **kwargs):
|
||||
r"""
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
@@ -27,7 +34,7 @@ def config(*args, **kwargs):
|
||||
|
||||
@add_start_docstrings(AutoTokenizer.__doc__)
|
||||
def tokenizer(*args, **kwargs):
|
||||
r"""
|
||||
r"""
|
||||
# Using torch.hub !
|
||||
import torch
|
||||
|
||||
@@ -57,6 +64,7 @@ def model(*args, **kwargs):
|
||||
|
||||
return AutoModel.from_pretrained(*args, **kwargs)
|
||||
|
||||
|
||||
@add_start_docstrings(AutoModelWithLMHead.__doc__)
|
||||
def modelWithLMHead(*args, **kwargs):
|
||||
r"""
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
absl-py==0.8.0
|
||||
astor==0.8.0
|
||||
atomicwrites==1.3.0
|
||||
attrs==19.2.0
|
||||
boto3==1.9.243
|
||||
botocore==1.12.243
|
||||
certifi==2019.9.11
|
||||
chardet==3.0.4
|
||||
Click==7.0
|
||||
docutils==0.15.2
|
||||
gast==0.2.2
|
||||
google-pasta==0.1.7
|
||||
grpcio==1.24.1
|
||||
h5py==2.10.0
|
||||
idna==2.8
|
||||
importlib-metadata==0.23
|
||||
jmespath==0.9.4
|
||||
joblib==0.14.0
|
||||
Keras-Applications==1.0.8
|
||||
Keras-Preprocessing==1.1.0
|
||||
Markdown==3.1.1
|
||||
more-itertools==7.2.0
|
||||
numpy==1.17.2
|
||||
opt-einsum==3.1.0
|
||||
packaging==19.2
|
||||
pluggy==0.13.0
|
||||
protobuf==3.10.0
|
||||
py==1.8.0
|
||||
pyparsing==2.4.2
|
||||
pytest==5.2.1
|
||||
python-dateutil==2.8.0
|
||||
regex==2019.8.19
|
||||
requests==2.22.0
|
||||
s3transfer==0.2.1
|
||||
sacremoses==0.0.35
|
||||
sentencepiece==0.1.83
|
||||
six==1.12.0
|
||||
tensorboard==2.0.0
|
||||
tensorflow==2.0.0
|
||||
tensorflow-estimator==2.0.0
|
||||
termcolor==1.1.0
|
||||
torch==1.2.0
|
||||
tqdm==4.36.1
|
||||
urllib3==1.25.6
|
||||
wcwidth==0.1.7
|
||||
Werkzeug==0.16.0
|
||||
wrapt==1.11.2
|
||||
zipp==0.6.0
|
||||
@@ -1,12 +0,0 @@
|
||||
# progress bars in model download and training scripts
|
||||
tqdm
|
||||
# Accessing files from S3 directly.
|
||||
boto3
|
||||
# Used for downloading models over HTTP
|
||||
requests
|
||||
# For OpenAI GPT
|
||||
regex
|
||||
# For XLNet
|
||||
sentencepiece
|
||||
# For XLM
|
||||
sacremoses
|
||||
34
setup.cfg
Normal file
34
setup.cfg
Normal file
@@ -0,0 +1,34 @@
|
||||
[isort]
|
||||
ensure_newline_before_comments = True
|
||||
force_grid_wrap = 0
|
||||
include_trailing_comma = True
|
||||
known_first_party = transformers
|
||||
known_third_party =
|
||||
absl
|
||||
fairseq
|
||||
fastprogress
|
||||
git
|
||||
h5py
|
||||
MeCab
|
||||
nltk
|
||||
numpy
|
||||
packaging
|
||||
PIL
|
||||
psutil
|
||||
seqeval
|
||||
sklearn
|
||||
tensorboardX
|
||||
tensorflow
|
||||
tensorflow_datasets
|
||||
torch
|
||||
torchtext
|
||||
torchvision
|
||||
|
||||
line_length = 119
|
||||
lines_after_imports = 2
|
||||
multi_line_output = 3
|
||||
use_parentheses = True
|
||||
|
||||
[flake8]
|
||||
ignore = E203, E501, W503
|
||||
max-line-length = 119
|
||||
106
setup.py
106
setup.py
@@ -14,7 +14,7 @@ To create the package for pypi.
|
||||
creating the wheel and the source distribution (obviously).
|
||||
|
||||
For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
|
||||
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
|
||||
(this will build a wheel for the python version you use to build it).
|
||||
|
||||
For the sources, run: "python setup.py sdist"
|
||||
You should now have a /dist directory with both .whl and .tar.gz source versions.
|
||||
@@ -23,6 +23,8 @@ To create the package for pypi.
|
||||
|
||||
twine upload dist/* -r pypitest
|
||||
(pypi suggest using twine as other methods upload files via plaintext.)
|
||||
You may have to specify the repository url, use the following command then:
|
||||
twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
|
||||
|
||||
Check that you can install it in a virtualenv by running:
|
||||
pip install -i https://testpypi.python.org/pypi transformers
|
||||
@@ -33,49 +35,89 @@ To create the package for pypi.
|
||||
7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
|
||||
|
||||
"""
|
||||
from io import open
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
extras = {
|
||||
'serving': ['uvicorn', 'fastapi']
|
||||
}
|
||||
extras['all'] = [package for package in extras.values()]
|
||||
# Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
|
||||
stale_egg_info = Path(__file__).parent / "transformers.egg-info"
|
||||
if stale_egg_info.exists():
|
||||
print(
|
||||
(
|
||||
"Warning: {} exists.\n\n"
|
||||
"If you recently updated transformers to 3.0 or later, this is expected,\n"
|
||||
"but it may prevent transformers from installing in editable mode.\n\n"
|
||||
"This directory is automatically generated by Python's packaging tools.\n"
|
||||
"I will remove it now.\n\n"
|
||||
"See https://github.com/pypa/pip/issues/5466 for details.\n"
|
||||
).format(stale_egg_info)
|
||||
)
|
||||
shutil.rmtree(stale_egg_info)
|
||||
|
||||
|
||||
extras = {}
|
||||
|
||||
extras["mecab"] = ["mecab-python3"]
|
||||
extras["sklearn"] = ["scikit-learn"]
|
||||
extras["tf"] = ["tensorflow"]
|
||||
extras["torch"] = ["torch"]
|
||||
|
||||
extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
|
||||
extras["all"] = extras["serving"] + ["tensorflow", "torch"]
|
||||
|
||||
extras["testing"] = ["pytest", "pytest-xdist"]
|
||||
extras["quality"] = ["black", "isort", "flake8"]
|
||||
extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme"]
|
||||
extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
|
||||
|
||||
setup(
|
||||
name="transformers",
|
||||
version="2.2.2",
|
||||
version="2.4.1",
|
||||
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
|
||||
author_email="thomas@huggingface.co",
|
||||
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
|
||||
long_description=open("README.md", "r", encoding='utf-8').read(),
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
|
||||
license='Apache',
|
||||
keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU",
|
||||
license="Apache",
|
||||
url="https://github.com/huggingface/transformers",
|
||||
packages=find_packages(exclude=["*.tests", "*.tests.*",
|
||||
"tests.*", "tests"]),
|
||||
install_requires=['numpy',
|
||||
'boto3',
|
||||
'requests',
|
||||
'tqdm',
|
||||
'regex',
|
||||
'sentencepiece',
|
||||
'sacremoses'],
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
"transformers=transformers.__main__:main",
|
||||
]
|
||||
},
|
||||
extras_require=extras,
|
||||
scripts=[
|
||||
'transformers-cli'
|
||||
package_dir={"": "src"},
|
||||
packages=find_packages("src"),
|
||||
install_requires=[
|
||||
"numpy",
|
||||
"tokenizers == 0.0.11",
|
||||
# accessing files from S3 directly
|
||||
"boto3",
|
||||
# filesystem locks e.g. to prevent parallel downloads
|
||||
"filelock",
|
||||
# for downloading models over HTTPS
|
||||
"requests",
|
||||
# progress bars in model download and training scripts
|
||||
"tqdm >= 4.27",
|
||||
# for OpenAI GPT
|
||||
"regex != 2019.12.17",
|
||||
# for XLNet
|
||||
"sentencepiece",
|
||||
# for XLM
|
||||
"sacremoses",
|
||||
],
|
||||
# python_requires='>=3.5.0',
|
||||
extras_require=extras,
|
||||
scripts=["transformers-cli"],
|
||||
python_requires=">=3.5.0",
|
||||
classifiers=[
|
||||
'Intended Audience :: Science/Research',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Education",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
],
|
||||
)
|
||||
|
||||
429
src/transformers/__init__.py
Executable file
429
src/transformers/__init__.py
Executable file
@@ -0,0 +1,429 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
__version__ = "2.4.1"
|
||||
|
||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||
# default Python logging output behavior when present.
|
||||
# see: https://github.com/abseil/abseil-py/issues/99
|
||||
# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
|
||||
try:
|
||||
import absl.logging
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
absl.logging.set_verbosity("info")
|
||||
absl.logging.set_stderrthreshold("info")
|
||||
absl.logging._warn_preinit_stderr = False
|
||||
|
||||
import logging
|
||||
|
||||
from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
|
||||
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
|
||||
from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
|
||||
from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
|
||||
from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
|
||||
from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
|
||||
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
|
||||
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
|
||||
from .configuration_mmbt import MMBTConfig
|
||||
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
|
||||
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
|
||||
from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
|
||||
from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
|
||||
|
||||
# Configurations
|
||||
from .configuration_utils import PretrainedConfig
|
||||
from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
|
||||
from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
|
||||
from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
|
||||
from .data import (
|
||||
DataProcessor,
|
||||
InputExample,
|
||||
InputFeatures,
|
||||
SingleSentenceClassificationProcessor,
|
||||
SquadExample,
|
||||
SquadFeatures,
|
||||
SquadV1Processor,
|
||||
SquadV2Processor,
|
||||
glue_convert_examples_to_features,
|
||||
glue_output_modes,
|
||||
glue_processors,
|
||||
glue_tasks_num_labels,
|
||||
is_sklearn_available,
|
||||
squad_convert_examples_to_features,
|
||||
xnli_output_modes,
|
||||
xnli_processors,
|
||||
xnli_tasks_num_labels,
|
||||
)
|
||||
|
||||
# Files and general utilities
|
||||
from .file_utils import (
|
||||
CONFIG_NAME,
|
||||
MODEL_CARD_NAME,
|
||||
PYTORCH_PRETRAINED_BERT_CACHE,
|
||||
PYTORCH_TRANSFORMERS_CACHE,
|
||||
TF2_WEIGHTS_NAME,
|
||||
TF_WEIGHTS_NAME,
|
||||
TRANSFORMERS_CACHE,
|
||||
WEIGHTS_NAME,
|
||||
add_end_docstrings,
|
||||
add_start_docstrings,
|
||||
cached_path,
|
||||
is_tf_available,
|
||||
is_torch_available,
|
||||
)
|
||||
|
||||
# Model Cards
|
||||
from .modelcard import ModelCard
|
||||
|
||||
# TF 2.0 <=> PyTorch conversion utilities
|
||||
from .modeling_tf_pytorch_utils import (
|
||||
convert_tf_weight_name_to_pt_weight_name,
|
||||
load_pytorch_checkpoint_in_tf2_model,
|
||||
load_pytorch_model_in_tf2_model,
|
||||
load_pytorch_weights_in_tf2_model,
|
||||
load_tf2_checkpoint_in_pytorch_model,
|
||||
load_tf2_model_in_pytorch_model,
|
||||
load_tf2_weights_in_pytorch_model,
|
||||
)
|
||||
|
||||
# Pipelines
|
||||
from .pipelines import (
|
||||
CsvPipelineDataFormat,
|
||||
FeatureExtractionPipeline,
|
||||
FillMaskPipeline,
|
||||
JsonPipelineDataFormat,
|
||||
NerPipeline,
|
||||
PipedPipelineDataFormat,
|
||||
Pipeline,
|
||||
PipelineDataFormat,
|
||||
QuestionAnsweringPipeline,
|
||||
TextClassificationPipeline,
|
||||
pipeline,
|
||||
)
|
||||
from .tokenization_albert import AlbertTokenizer
|
||||
from .tokenization_auto import AutoTokenizer
|
||||
from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer
|
||||
from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
|
||||
from .tokenization_camembert import CamembertTokenizer
|
||||
from .tokenization_ctrl import CTRLTokenizer
|
||||
from .tokenization_distilbert import DistilBertTokenizer
|
||||
from .tokenization_flaubert import FlaubertTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
from .tokenization_t5 import T5Tokenizer
|
||||
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
|
||||
|
||||
# Tokenizers
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
from .tokenization_xlm import XLMTokenizer
|
||||
from .tokenization_xlm_roberta import XLMRobertaTokenizer
|
||||
from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
if is_sklearn_available():
|
||||
from .data import glue_compute_metrics, xnli_compute_metrics
|
||||
|
||||
|
||||
# Modeling
|
||||
if is_torch_available():
|
||||
from .modeling_utils import PreTrainedModel, prune_layer, Conv1D
|
||||
from .modeling_auto import (
|
||||
AutoModel,
|
||||
AutoModelForPreTraining,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoModelWithLMHead,
|
||||
AutoModelForTokenClassification,
|
||||
ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_bert import (
|
||||
BertPreTrainedModel,
|
||||
BertModel,
|
||||
BertForPreTraining,
|
||||
BertForMaskedLM,
|
||||
BertForNextSentencePrediction,
|
||||
BertForSequenceClassification,
|
||||
BertForMultipleChoice,
|
||||
BertForTokenClassification,
|
||||
BertForQuestionAnswering,
|
||||
load_tf_weights_in_bert,
|
||||
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_openai import (
|
||||
OpenAIGPTPreTrainedModel,
|
||||
OpenAIGPTModel,
|
||||
OpenAIGPTLMHeadModel,
|
||||
OpenAIGPTDoubleHeadsModel,
|
||||
load_tf_weights_in_openai_gpt,
|
||||
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_transfo_xl import (
|
||||
TransfoXLPreTrainedModel,
|
||||
TransfoXLModel,
|
||||
TransfoXLLMHeadModel,
|
||||
AdaptiveEmbedding,
|
||||
load_tf_weights_in_transfo_xl,
|
||||
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_gpt2 import (
|
||||
GPT2PreTrainedModel,
|
||||
GPT2Model,
|
||||
GPT2LMHeadModel,
|
||||
GPT2DoubleHeadsModel,
|
||||
load_tf_weights_in_gpt2,
|
||||
GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
from .modeling_xlnet import (
|
||||
XLNetPreTrainedModel,
|
||||
XLNetModel,
|
||||
XLNetLMHeadModel,
|
||||
XLNetForSequenceClassification,
|
||||
XLNetForTokenClassification,
|
||||
XLNetForMultipleChoice,
|
||||
XLNetForQuestionAnsweringSimple,
|
||||
XLNetForQuestionAnswering,
|
||||
load_tf_weights_in_xlnet,
|
||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_xlm import (
|
||||
XLMPreTrainedModel,
|
||||
XLMModel,
|
||||
XLMWithLMHeadModel,
|
||||
XLMForSequenceClassification,
|
||||
XLMForQuestionAnswering,
|
||||
XLMForQuestionAnsweringSimple,
|
||||
XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_roberta import (
|
||||
RobertaForMaskedLM,
|
||||
RobertaModel,
|
||||
RobertaForSequenceClassification,
|
||||
RobertaForMultipleChoice,
|
||||
RobertaForTokenClassification,
|
||||
RobertaForQuestionAnswering,
|
||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_camembert import (
|
||||
CamembertForMaskedLM,
|
||||
CamembertModel,
|
||||
CamembertForSequenceClassification,
|
||||
CamembertForTokenClassification,
|
||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_distilbert import (
|
||||
DistilBertPreTrainedModel,
|
||||
DistilBertForMaskedLM,
|
||||
DistilBertModel,
|
||||
DistilBertForSequenceClassification,
|
||||
DistilBertForQuestionAnswering,
|
||||
DistilBertForTokenClassification,
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_camembert import (
|
||||
CamembertForMaskedLM,
|
||||
CamembertModel,
|
||||
CamembertForSequenceClassification,
|
||||
CamembertForMultipleChoice,
|
||||
CamembertForTokenClassification,
|
||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||
from .modeling_t5 import (
|
||||
T5PreTrainedModel,
|
||||
T5Model,
|
||||
T5WithLMHeadModel,
|
||||
load_tf_weights_in_t5,
|
||||
T5_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_albert import (
|
||||
AlbertPreTrainedModel,
|
||||
AlbertModel,
|
||||
AlbertForMaskedLM,
|
||||
AlbertForSequenceClassification,
|
||||
AlbertForQuestionAnswering,
|
||||
load_tf_weights_in_albert,
|
||||
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_xlm_roberta import (
|
||||
XLMRobertaForMaskedLM,
|
||||
XLMRobertaModel,
|
||||
XLMRobertaForMultipleChoice,
|
||||
XLMRobertaForSequenceClassification,
|
||||
XLMRobertaForTokenClassification,
|
||||
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification
|
||||
|
||||
from .modeling_flaubert import (
|
||||
FlaubertModel,
|
||||
FlaubertWithLMHeadModel,
|
||||
FlaubertForSequenceClassification,
|
||||
FlaubertForQuestionAnswering,
|
||||
FlaubertForQuestionAnsweringSimple,
|
||||
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
# Optimization
|
||||
from .optimization import (
|
||||
AdamW,
|
||||
get_constant_schedule,
|
||||
get_constant_schedule_with_warmup,
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
|
||||
# TensorFlow
|
||||
if is_tf_available():
|
||||
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
|
||||
from .modeling_tf_auto import (
|
||||
TFAutoModel,
|
||||
TFAutoModelForPreTraining,
|
||||
TFAutoModelForSequenceClassification,
|
||||
TFAutoModelForQuestionAnswering,
|
||||
TFAutoModelWithLMHead,
|
||||
TFAutoModelForTokenClassification,
|
||||
TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_bert import (
|
||||
TFBertPreTrainedModel,
|
||||
TFBertMainLayer,
|
||||
TFBertEmbeddings,
|
||||
TFBertModel,
|
||||
TFBertForPreTraining,
|
||||
TFBertForMaskedLM,
|
||||
TFBertForNextSentencePrediction,
|
||||
TFBertForSequenceClassification,
|
||||
TFBertForMultipleChoice,
|
||||
TFBertForTokenClassification,
|
||||
TFBertForQuestionAnswering,
|
||||
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_gpt2 import (
|
||||
TFGPT2PreTrainedModel,
|
||||
TFGPT2MainLayer,
|
||||
TFGPT2Model,
|
||||
TFGPT2LMHeadModel,
|
||||
TFGPT2DoubleHeadsModel,
|
||||
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_openai import (
|
||||
TFOpenAIGPTPreTrainedModel,
|
||||
TFOpenAIGPTMainLayer,
|
||||
TFOpenAIGPTModel,
|
||||
TFOpenAIGPTLMHeadModel,
|
||||
TFOpenAIGPTDoubleHeadsModel,
|
||||
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_transfo_xl import (
|
||||
TFTransfoXLPreTrainedModel,
|
||||
TFTransfoXLMainLayer,
|
||||
TFTransfoXLModel,
|
||||
TFTransfoXLLMHeadModel,
|
||||
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_xlnet import (
|
||||
TFXLNetPreTrainedModel,
|
||||
TFXLNetMainLayer,
|
||||
TFXLNetModel,
|
||||
TFXLNetLMHeadModel,
|
||||
TFXLNetForSequenceClassification,
|
||||
TFXLNetForTokenClassification,
|
||||
TFXLNetForQuestionAnsweringSimple,
|
||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_xlm import (
|
||||
TFXLMPreTrainedModel,
|
||||
TFXLMMainLayer,
|
||||
TFXLMModel,
|
||||
TFXLMWithLMHeadModel,
|
||||
TFXLMForSequenceClassification,
|
||||
TFXLMForQuestionAnsweringSimple,
|
||||
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_xlm_roberta import (
|
||||
TFXLMRobertaForMaskedLM,
|
||||
TFXLMRobertaModel,
|
||||
TFXLMRobertaForSequenceClassification,
|
||||
TFXLMRobertaForTokenClassification,
|
||||
TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_roberta import (
|
||||
TFRobertaPreTrainedModel,
|
||||
TFRobertaMainLayer,
|
||||
TFRobertaModel,
|
||||
TFRobertaForMaskedLM,
|
||||
TFRobertaForSequenceClassification,
|
||||
TFRobertaForTokenClassification,
|
||||
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_camembert import (
|
||||
TFCamembertModel,
|
||||
TFCamembertForMaskedLM,
|
||||
TFCamembertForSequenceClassification,
|
||||
TFCamembertForTokenClassification,
|
||||
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_distilbert import (
|
||||
TFDistilBertPreTrainedModel,
|
||||
TFDistilBertMainLayer,
|
||||
TFDistilBertModel,
|
||||
TFDistilBertForMaskedLM,
|
||||
TFDistilBertForSequenceClassification,
|
||||
TFDistilBertForTokenClassification,
|
||||
TFDistilBertForQuestionAnswering,
|
||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_ctrl import (
|
||||
TFCTRLPreTrainedModel,
|
||||
TFCTRLModel,
|
||||
TFCTRLLMHeadModel,
|
||||
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_albert import (
|
||||
TFAlbertPreTrainedModel,
|
||||
TFAlbertModel,
|
||||
TFAlbertForMaskedLM,
|
||||
TFAlbertForSequenceClassification,
|
||||
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_tf_t5 import (
|
||||
TFT5PreTrainedModel,
|
||||
TFT5Model,
|
||||
TFT5WithLMHeadModel,
|
||||
TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
# Optimization
|
||||
from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator
|
||||
|
||||
|
||||
# Warn at import time when neither backend is installed: the framework-specific
# imports above are guarded by is_torch_available() / is_tf_available(), so in
# that situation no model class has been imported.
if not is_tf_available() and not is_torch_available():
    # Each literal ends with a space: adjacent string literals are concatenated
    # verbatim, and the sentences were previously glued together.
    logger.warning(
        "Neither PyTorch nor TensorFlow >= 2.0 have been found. "
        "Models won't be available and only tokenizers, configuration "
        "and file/data utilities can be used."
    )
|
||||
@@ -1,6 +1,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from argparse import ArgumentParser
|
||||
|
||||
|
||||
class BaseTransformersCLICommand(ABC):
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
144
src/transformers/commands/convert.py
Normal file
144
src/transformers/commands/convert.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from argparse import ArgumentParser, Namespace
|
||||
from logging import getLogger
|
||||
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
|
||||
|
||||
def convert_command_factory(args: Namespace):
    """
    Build the command object executed by the ``convert`` sub-command.

    :param args: Parsed command line arguments exposing ``model_type``, ``tf_checkpoint``,
        ``pytorch_dump_output``, ``config`` and ``finetuning_task_name``.
    :return: ConvertCommand
    """
    return ConvertCommand(
        model_type=args.model_type,
        tf_checkpoint=args.tf_checkpoint,
        pytorch_dump_output=args.pytorch_dump_output,
        config=args.config,
        finetuning_task_name=args.finetuning_task_name,
    )
|
||||
|
||||
|
||||
class ConvertCommand(BaseTransformersCLICommand):
    """
    ``convert`` sub-command: turn an original author checkpoint into a Transformers PyTorch checkpoint.

    The conversion script matching ``--model_type`` is imported lazily inside :meth:`run`, so the
    dependencies of a given converter are only needed when that converter is actually used.
    """

    # Message of the ImportError re-raised when a conversion script cannot be imported.
    _TF_REQUIRED_MSG = (
        "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
        "In that case, it requires TensorFlow to be installed. Please see "
        "https://www.tensorflow.org/install/ for installation instructions."
    )

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformer-cli
        :param parser: Root parser to register command-specific arguments
        :return:
        """
        convert_parser = parser.add_parser(
            "convert",
            help="CLI tool to run convert model from original "
            "author checkpoints to Transformers PyTorch checkpoints.",
        )
        convert_parser.add_argument("--model_type", type=str, required=True, help="Model's type.")
        convert_parser.add_argument(
            "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
        )
        convert_parser.add_argument(
            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
        )
        convert_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
        convert_parser.add_argument(
            "--finetuning_task_name",
            type=str,
            default=None,
            help="Optional fine-tuning task name if the TF model was a finetuned model.",
        )
        convert_parser.set_defaults(func=convert_command_factory)

    def __init__(
        self,
        model_type: str,
        tf_checkpoint: str,
        pytorch_dump_output: str,
        config: str,
        finetuning_task_name: str,
        *args
    ):
        """
        Store the conversion parameters; nothing is converted until :meth:`run` is called.

        :param model_type: One of ``bert``, ``gpt``, ``gpt2``, ``transfo_xl``, ``xlnet``, ``xlm``.
        :param tf_checkpoint: Path of the original checkpoint (for ``transfo_xl`` it is treated as a
            dataset file unless it contains ``ckpt``, see :meth:`run`).
        :param pytorch_dump_output: Destination path handed to the conversion function.
        :param config: Configuration path handed to the conversion function (unused for ``xlm``).
        :param finetuning_task_name: Only forwarded to the ``xlnet`` converter.
        :param args: Extra positional arguments are accepted and ignored.
        """
        self._logger = getLogger("transformers-cli/converting")

        self._logger.info("Loading model {}".format(model_type))
        self._model_type = model_type
        self._tf_checkpoint = tf_checkpoint
        self._pytorch_dump_output = pytorch_dump_output
        self._config = config
        self._finetuning_task_name = finetuning_task_name

    def run(self):
        """
        Import the conversion function for the selected model type and call it.

        :raises ImportError: if the ``bert``, ``transfo_xl``, ``gpt2`` or ``xlnet`` conversion script
            cannot be imported (re-raised with a TensorFlow installation hint).
        :raises ValueError: if the model type is not one of the supported values.
        """
        model_type = self._model_type

        if model_type == "bert":
            try:
                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import (
                    convert_tf_checkpoint_to_pytorch,
                )
            except ImportError as err:
                raise ImportError(self._TF_REQUIRED_MSG) from err

            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif model_type == "gpt":
            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import (
                convert_openai_checkpoint_to_pytorch,
            )

            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif model_type == "transfo_xl":
            try:
                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import (
                    convert_transfo_xl_checkpoint_to_pytorch,
                )
            except ImportError as err:
                raise ImportError(self._TF_REQUIRED_MSG) from err

            # A path containing "ckpt" is passed as the checkpoint; any other path is passed as
            # the dataset file instead, with an empty checkpoint path.
            if "ckpt" in self._tf_checkpoint.lower():
                tf_checkpoint, tf_dataset_file = self._tf_checkpoint, ""
            else:
                tf_checkpoint, tf_dataset_file = "", self._tf_checkpoint
            convert_transfo_xl_checkpoint_to_pytorch(
                tf_checkpoint, self._config, self._pytorch_dump_output, tf_dataset_file
            )
        elif model_type == "gpt2":
            try:
                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import (
                    convert_gpt2_checkpoint_to_pytorch,
                )
            except ImportError as err:
                raise ImportError(self._TF_REQUIRED_MSG) from err

            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
        elif model_type == "xlnet":
            try:
                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import (
                    convert_xlnet_checkpoint_to_pytorch,
                )
            except ImportError as err:
                raise ImportError(self._TF_REQUIRED_MSG) from err

            convert_xlnet_checkpoint_to_pytorch(
                self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name
            )
        elif model_type == "xlm":
            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import (
                convert_xlm_checkpoint_to_pytorch,
            )

            # The XLM converter takes no configuration argument.
            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
        else:
            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]")
|
||||
32
src/transformers/commands/download.py
Normal file
32
src/transformers/commands/download.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
|
||||
|
||||
def download_command_factory(args):
    """
    Build the command object executed by the ``download`` sub-command.

    :param args: Parsed command line arguments exposing ``model``, ``cache_dir`` and ``force``.
    :return: DownloadCommand
    """
    return DownloadCommand(model=args.model, cache=args.cache_dir, force=args.force)
|
||||
|
||||
|
||||
class DownloadCommand(BaseTransformersCLICommand):
    """
    ``download`` sub-command: load a model and its tokenizer through the ``Auto*`` classes,
    forwarding the cache directory and the force-download flag.
    """

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register the ``download`` sub-command and its arguments.

        :param parser: Root parser to register command-specific arguments
        """
        sub = parser.add_parser("download")
        sub.add_argument("--cache-dir", type=str, default=None, help="Path to location to store the models")
        sub.add_argument(
            "--force", action="store_true", help="Force the model to be download even if already in cache-dir"
        )
        sub.add_argument("model", type=str, help="Name of the model to download")
        sub.set_defaults(func=download_command_factory)

    def __init__(self, model: str, cache: str, force: bool):
        """
        :param model: Name of the model to download.
        :param cache: Value passed as ``cache_dir`` to ``from_pretrained`` (may be None).
        :param force: Value passed as ``force_download`` to ``from_pretrained``.
        """
        self._model, self._cache, self._force = model, cache, force

    def run(self):
        """Call ``from_pretrained`` on ``AutoModel`` and then on ``AutoTokenizer`` for the requested model."""
        # Imported lazily so registering the sub-command does not need these classes.
        from transformers import AutoModel, AutoTokenizer

        for auto_class in (AutoModel, AutoTokenizer):
            auto_class.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
|
||||
96
src/transformers/commands/run.py
Normal file
96
src/transformers/commands/run.py
Normal file
@@ -0,0 +1,96 @@
|
||||
import logging
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
from transformers.pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def try_infer_format_from_ext(path: str):
    """
    Guess the pipeline data format from the end of a file path.

    :param path: Input file path; an empty or missing path selects the ``"pipe"`` format.
    :return: ``"pipe"`` for a falsy path, otherwise the first entry of
        ``PipelineDataFormat.SUPPORTED_FORMATS`` that the path ends with.
    :raises Exception: if the path ends with none of the supported formats.
    """
    if not path:
        return "pipe"

    # First supported format that is a suffix of the path, or None when nothing matches.
    matched = next((ext for ext in PipelineDataFormat.SUPPORTED_FORMATS if path.endswith(ext)), None)
    if matched is not None:
        return matched

    raise Exception(
        "Unable to determine file format from file extension {}. "
        "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS)
    )
|
||||
|
||||
|
||||
def run_command_factory(args):
    """
    Build the command object executed by the ``run`` sub-command.

    Instantiates the pipeline first, then the reader/writer for the selected data format.

    :param args: Parsed command line arguments of the ``run`` sub-command.
    :return: RunCommand
    """
    nlp = pipeline(
        task=args.task,
        model=args.model or None,
        config=args.config,
        tokenizer=args.tokenizer,
        device=args.device,
    )

    # "infer" means: derive the format from the input path.
    if args.format == "infer":
        data_format = try_infer_format_from_ext(args.input)
    else:
        data_format = args.format

    reader = PipelineDataFormat.from_str(
        format=data_format,
        output_path=args.output,
        input_path=args.input,
        # Fall back to the pipeline's default input names when no column is given.
        column=args.column or nlp.default_input_names,
        overwrite=args.overwrite,
    )
    return RunCommand(nlp, reader)
|
||||
|
||||
|
||||
class RunCommand(BaseTransformersCLICommand):
    """
    ``run`` sub-command: feed every entry of a data reader through a pipeline and save the results.
    """

    def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
        """
        :param nlp: Pipeline called on each entry.
        :param reader: Object iterated for the inputs and used to save the outputs.
        """
        self._nlp = nlp
        self._reader = reader

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register the ``run`` sub-command and its arguments.

        :param parser: Root parser to register command-specific arguments
        """
        sub = parser.add_parser("run", help="Run a pipeline through the CLI")
        sub.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run")
        sub.add_argument("--input", type=str, help="Path to the file to use for inference")
        sub.add_argument("--output", type=str, help="Path to the file that will be used post to write results.")
        sub.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
        sub.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.")
        sub.add_argument(
            "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)"
        )
        sub.add_argument(
            "--column",
            type=str,
            help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)",
        )
        sub.add_argument(
            "--format",
            type=str,
            default="infer",
            choices=PipelineDataFormat.SUPPORTED_FORMATS,
            help="Input format to read from",
        )
        sub.add_argument(
            "--device",
            type=int,
            default=-1,
            help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
        )
        sub.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.")
        sub.set_defaults(func=run_command_factory)

    def run(self):
        """Run the pipeline over all reader entries, then save the collected outputs through the reader."""
        results = []

        for entry in self._reader:
            # Multi-column entries are expanded into keyword arguments.
            if self._reader.is_multi_columns:
                prediction = self._nlp(**entry)
            else:
                prediction = self._nlp(entry)

            # A dict is one result; anything else is treated as a sequence of results.
            if isinstance(prediction, dict):
                results.append(prediction)
            else:
                results += prediction

        # Saving data
        if not self._nlp.binary_output:
            self._reader.save(results)
        else:
            binary_path = self._reader.save_binary(results)
            logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path))
|
||||
214
src/transformers/commands/serving.py
Normal file
214
src/transformers/commands/serving.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import logging
|
||||
from argparse import ArgumentParser, Namespace
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from transformers import Pipeline
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
from transformers.pipelines import SUPPORTED_TASKS, pipeline
|
||||
|
||||
|
||||
try:
|
||||
from uvicorn import run
|
||||
from fastapi import FastAPI, HTTPException, Body
|
||||
from fastapi.routing import APIRoute
|
||||
from pydantic import BaseModel
|
||||
from starlette.responses import JSONResponse
|
||||
|
||||
_serve_dependencies_installed = True
|
||||
except (ImportError, AttributeError):
|
||||
BaseModel = object
|
||||
|
||||
def Body(*x, **y):
|
||||
pass
|
||||
|
||||
_serve_dependencies_installed = False
|
||||
|
||||
|
||||
logger = logging.getLogger("transformers-cli/serving")
|
||||
|
||||
|
||||
def serve_command_factory(args: Namespace):
    """
    Build the serving command from the parsed command line arguments.

    The pipeline is instantiated here, before the command object is created.

    :param args: Parsed command line arguments of the ``serve`` sub-command.
    :return: ServeCommand
    """
    served_pipeline = pipeline(
        task=args.task,
        model=args.model or None,
        config=args.config,
        tokenizer=args.tokenizer,
        device=args.device,
    )
    return ServeCommand(served_pipeline, args.host, args.port, args.workers)
|
||||
|
||||
|
||||
class ServeModelInfoResult(BaseModel):
    """
    Expose model information.

    Response model of the ``/`` route served by ``ServeCommand``.
    """

    # Filled by ``ServeCommand.model_info`` with ``vars()`` of the pipeline model's config.
    infos: dict
|
||||
|
||||
|
||||
class ServeTokenizeResult(BaseModel):
    """
    Tokenize result model.

    Response model of the ``/tokenize`` route served by ``ServeCommand``.
    """

    # Output of the pipeline tokenizer's ``tokenize`` call.
    tokens: List[str]
    # Only provided when the request asked for ids (``return_ids``).
    tokens_ids: Optional[List[int]]
|
||||
|
||||
|
||||
class ServeDeTokenizeResult(BaseModel):
    """
    DeTokenize result model.

    Response model of the ``/detokenize`` route served by ``ServeCommand``.
    """

    # Output of the pipeline tokenizer's ``decode`` call.
    text: str
|
||||
|
||||
|
||||
class ServeForwardResult(BaseModel):
    """
    Forward result model.

    Response model of the ``/forward`` route served by ``ServeCommand``.
    """

    # Whatever calling the pipeline on the request inputs returned (an empty list for empty inputs).
    output: Any
|
||||
|
||||
|
||||
class ServeCommand(BaseTransformersCLICommand):
|
||||
@staticmethod
|
||||
def register_subcommand(parser: ArgumentParser):
|
||||
"""
|
||||
Register this command to argparse so it's available for the transformer-cli
|
||||
:param parser: Root parser to register command-specific arguments
|
||||
:return:
|
||||
"""
|
||||
serve_parser = parser.add_parser(
|
||||
"serve", help="CLI tool to run inference requests through REST and GraphQL endpoints."
|
||||
)
|
||||
serve_parser.add_argument(
|
||||
"--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on"
|
||||
)
|
||||
serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.")
|
||||
serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.")
|
||||
serve_parser.add_argument("--workers", type=int, default=1, help="Number of http workers")
|
||||
serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.")
|
||||
serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.")
|
||||
serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.")
|
||||
serve_parser.add_argument(
|
||||
"--device",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)",
|
||||
)
|
||||
serve_parser.set_defaults(func=serve_command_factory)
|
||||
|
||||
def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int):
|
||||
|
||||
self._pipeline = pipeline
|
||||
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.workers = workers
|
||||
|
||||
if not _serve_dependencies_installed:
|
||||
raise RuntimeError(
|
||||
"Using serve command requires FastAPI and unicorn. "
|
||||
'Please install transformers with [serving]: pip install "transformers[serving]".'
|
||||
"Or install FastAPI and unicorn separately."
|
||||
)
|
||||
else:
|
||||
logger.info("Serving model over {}:{}".format(host, port))
|
||||
self._app = FastAPI(
|
||||
routes=[
|
||||
APIRoute(
|
||||
"/",
|
||||
self.model_info,
|
||||
response_model=ServeModelInfoResult,
|
||||
response_class=JSONResponse,
|
||||
methods=["GET"],
|
||||
),
|
||||
APIRoute(
|
||||
"/tokenize",
|
||||
self.tokenize,
|
||||
response_model=ServeTokenizeResult,
|
||||
response_class=JSONResponse,
|
||||
methods=["POST"],
|
||||
),
|
||||
APIRoute(
|
||||
"/detokenize",
|
||||
self.detokenize,
|
||||
response_model=ServeDeTokenizeResult,
|
||||
response_class=JSONResponse,
|
||||
methods=["POST"],
|
||||
),
|
||||
APIRoute(
|
||||
"/forward",
|
||||
self.forward,
|
||||
response_model=ServeForwardResult,
|
||||
response_class=JSONResponse,
|
||||
methods=["POST"],
|
||||
),
|
||||
],
|
||||
timeout=600,
|
||||
)
|
||||
|
||||
def run(self):
|
||||
run(self._app, host=self.host, port=self.port, workers=self.workers)
|
||||
|
||||
def model_info(self):
|
||||
return ServeModelInfoResult(infos=vars(self._pipeline.model.config))
|
||||
|
||||
def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)):
|
||||
"""
|
||||
Tokenize the provided input and eventually returns corresponding tokens id:
|
||||
- **text_input**: String to tokenize
|
||||
- **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping.
|
||||
"""
|
||||
try:
|
||||
tokens_txt = self._pipeline.tokenizer.tokenize(text_input)
|
||||
|
||||
if return_ids:
|
||||
tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt)
|
||||
return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids)
|
||||
else:
|
||||
return ServeTokenizeResult(tokens=tokens_txt)
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail={"model": "", "error": str(e)})
|
||||
|
||||
def detokenize(
    self,
    tokens_ids: List[int] = Body(None, embed=True),
    skip_special_tokens: bool = Body(False, embed=True),
    cleanup_tokenization_spaces: bool = Body(True, embed=True),
):
    """
    Turn the provided token ids back into readable text:
    - **tokens_ids**: List of token ids to decode
    - **skip_special_tokens**: Flag indicating that special tokens should not be decoded
    - **cleanup_tokenization_spaces**: Flag indicating to remove leading/trailing spaces and intermediate ones.
    """
    try:
        decode = self._pipeline.tokenizer.decode
        # The flags are passed positionally, exactly in the order the tokenizer is called with here
        text = decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
        return ServeDeTokenizeResult(model="", text=text)

    except Exception as error:
        # Any failure is reported to the client as an HTTP 500 carrying the error message
        raise HTTPException(status_code=500, detail={"model": "", "error": str(error)})
|
||||
|
||||
async def forward(self, inputs=Body(None, embed=True)):
    """
    Run the provided inputs through the served pipeline:
    - **inputs**: Payload handed unchanged to the pipeline call

    Returns an empty ServeForwardResult when no inputs are given, otherwise a ServeForwardResult
    wrapping the pipeline output. Any exception raised by the pipeline is turned into an HTTP 500.
    """

    # `inputs` defaults to None when the field is missing from the body: treat it like an
    # empty payload instead of letting len(None) raise outside of the error handling below.
    if inputs is None or len(inputs) == 0:
        return ServeForwardResult(output=[], attention=[])

    try:
        # Forward through the model
        output = self._pipeline(inputs)
        return ServeForwardResult(output=output)
    except Exception as e:
        raise HTTPException(500, {"error": str(e)})
|
||||
144
src/transformers/commands/train.py
Normal file
144
src/transformers/commands/train.py
Normal file
@@ -0,0 +1,144 @@
|
||||
import os
|
||||
from argparse import ArgumentParser, Namespace
|
||||
from logging import getLogger
|
||||
|
||||
from transformers import SingleSentenceClassificationProcessor as Processor
|
||||
from transformers import TextClassificationPipeline, is_tf_available, is_torch_available
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
|
||||
|
||||
# CLI training needs a deep learning backend: refuse to load this module when none is installed.
if not (is_tf_available() or is_torch_available()):
    raise RuntimeError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")

# TF training parameters
USE_XLA = False
USE_AMP = False
|
||||
|
||||
|
||||
def train_command_factory(args: Namespace):
    """
    Factory function used to instantiate the training command from provided command line arguments.
    :param args: Parsed command line arguments, passed unchanged to TrainCommand
    :return: TrainCommand
    """
    return TrainCommand(args)
|
||||
|
||||
|
||||
class TrainCommand(BaseTransformersCLICommand):
    """
    CLI command training a pipeline on a csv dataset and saving it to an output directory.

    Only the "text_classification" task and the TensorFlow training path are implemented here;
    the other declared tasks and the PyTorch path raise NotImplementedError.
    """

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """
        Register this command to argparse so it's available for the transformers-cli
        :param parser: Parser object exposing ``add_parser`` on which the "train" command is registered
        :return:
        """
        train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.")

        # Dataset location and csv layout
        train_parser.add_argument(
            "--train_data",
            type=str,
            required=True,
            help="path to train (and optionally evaluation) dataset as a csv with "
            "tab separated labels and sentences.",
        )
        train_parser.add_argument(
            "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels."
        )
        train_parser.add_argument(
            "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts."
        )
        train_parser.add_argument(
            "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids."
        )
        train_parser.add_argument(
            "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)."
        )

        # Validation data: either a dedicated file or a fraction of the training data
        train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.")
        train_parser.add_argument(
            "--validation_split",
            type=float,
            default=0.1,
            help="if validation dataset is not provided, fraction of train dataset to use as validation dataset.",
        )

        train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.")

        # Task, model and optimisation settings
        train_parser.add_argument(
            "--task", type=str, default="text_classification", help="Task to train the model on."
        )
        train_parser.add_argument(
            "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model."
        )
        train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.")
        train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.")
        train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.")
        train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.")
        train_parser.set_defaults(func=train_command_factory)

    def __init__(self, args: Namespace):
        """
        Create the output directory, load the pipeline for the requested task and read the datasets.
        :param args: Parsed command line arguments as declared in ``register_subcommand``
        :raises NotImplementedError: for the "token_classification" and "question_answering" tasks
        :raises ValueError: for any other unknown task
        """
        self.logger = getLogger("transformers-cli/training")

        # TensorFlow takes precedence when both frameworks are installed
        self.framework = "tf" if is_tf_available() else "torch"

        # os.makedirs(exist_ok=True) already raises if the path exists and is not a directory,
        # so no further check is needed before using it as output folder.
        os.makedirs(args.output, exist_ok=True)
        self.output = args.output

        self.column_label = args.column_label
        self.column_text = args.column_text
        self.column_id = args.column_id

        self.logger.info("Loading {} pipeline for {}".format(args.task, args.model))
        if args.task == "text_classification":
            self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
        elif args.task == "token_classification":
            raise NotImplementedError
        elif args.task == "question_answering":
            raise NotImplementedError
        else:
            # Fail fast: without this branch self.pipeline would stay unset and the error
            # would only show up later as an AttributeError while training.
            raise ValueError("Unsupported task: {}".format(args.task))

        self.logger.info("Loading dataset from {}".format(args.train_data))
        self.train_dataset = Processor.create_from_csv(
            args.train_data,
            column_label=args.column_label,
            column_text=args.column_text,
            column_id=args.column_id,
            skip_first_row=args.skip_first_row,
        )
        self.valid_dataset = None
        if args.validation_data:
            self.logger.info("Loading validation dataset from {}".format(args.validation_data))
            self.valid_dataset = Processor.create_from_csv(
                args.validation_data,
                column_label=args.column_label,
                column_text=args.column_text,
                column_id=args.column_id,
                skip_first_row=args.skip_first_row,
            )

        self.validation_split = args.validation_split
        self.train_batch_size = args.train_batch_size
        self.valid_batch_size = args.valid_batch_size
        self.learning_rate = args.learning_rate
        self.adam_epsilon = args.adam_epsilon

    def run(self):
        """
        Dispatch training to the implementation matching the framework selected in the constructor.
        """
        if self.framework == "tf":
            return self.run_tf()
        return self.run_torch()

    def run_torch(self):
        """
        PyTorch training path, not implemented.
        :raises NotImplementedError: always
        """
        raise NotImplementedError

    def run_tf(self):
        """
        Fit the pipeline on the training dataset with the stored hyper-parameters, then save it
        to the output directory.
        """
        self.pipeline.fit(
            self.train_dataset,
            validation_data=self.valid_dataset,
            validation_split=self.validation_split,
            learning_rate=self.learning_rate,
            adam_epsilon=self.adam_epsilon,
            train_batch_size=self.train_batch_size,
            valid_batch_size=self.valid_batch_size,
        )

        # Save trained pipeline
        self.pipeline.save_pretrained(self.output)
|
||||
@@ -1,36 +1,52 @@
|
||||
import os
|
||||
import sys
|
||||
from argparse import ArgumentParser
|
||||
from getpass import getpass
|
||||
import os
|
||||
from typing import List, Union
|
||||
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
from transformers.hf_api import HfApi, HfFolder, HTTPError
|
||||
from transformers.hf_api import HfApi, HfFolder
|
||||
|
||||
|
||||
UPLOAD_MAX_FILES = 15
|
||||
|
||||
|
||||
class UserCommands(BaseTransformersCLICommand):
|
||||
@staticmethod
|
||||
def register_subcommand(parser: ArgumentParser):
|
||||
login_parser = parser.add_parser('login')
|
||||
login_parser = parser.add_parser("login", help="Log in using the same credentials as on huggingface.co")
|
||||
login_parser.set_defaults(func=lambda args: LoginCommand(args))
|
||||
whoami_parser = parser.add_parser('whoami')
|
||||
whoami_parser = parser.add_parser("whoami", help="Find out which huggingface.co account you are logged in as.")
|
||||
whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
|
||||
logout_parser = parser.add_parser('logout')
|
||||
logout_parser = parser.add_parser("logout", help="Log out")
|
||||
logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
|
||||
list_parser = parser.add_parser('ls')
|
||||
list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
|
||||
# s3
|
||||
s3_parser = parser.add_parser("s3", help="{ls, rm} Commands to interact with the files you upload on S3.")
|
||||
s3_subparsers = s3_parser.add_subparsers(help="s3 related commands")
|
||||
ls_parser = s3_subparsers.add_parser("ls")
|
||||
ls_parser.set_defaults(func=lambda args: ListObjsCommand(args))
|
||||
rm_parser = s3_subparsers.add_parser("rm")
|
||||
rm_parser.add_argument("filename", type=str, help="individual object filename to delete from S3.")
|
||||
rm_parser.set_defaults(func=lambda args: DeleteObjCommand(args))
|
||||
# upload
|
||||
upload_parser = parser.add_parser('upload')
|
||||
upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
|
||||
upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
|
||||
upload_parser = parser.add_parser("upload")
|
||||
upload_parser.add_argument("path", type=str, help="Local path of the folder or individual file to upload.")
|
||||
upload_parser.add_argument(
|
||||
"--filename", type=str, default=None, help="Optional: override individual object filename on S3."
|
||||
)
|
||||
upload_parser.set_defaults(func=lambda args: UploadCommand(args))
|
||||
|
||||
|
||||
|
||||
class ANSI:
|
||||
"""
|
||||
Helper for en.wikipedia.org/wiki/ANSI_escape_code
|
||||
"""
|
||||
_bold = u"\u001b[1m"
|
||||
_reset = u"\u001b[0m"
|
||||
|
||||
_bold = "\u001b[1m"
|
||||
_reset = "\u001b[0m"
|
||||
|
||||
@classmethod
|
||||
def bold(cls, s):
|
||||
return "{}{}{}".format(cls._bold, s, cls._reset)
|
||||
@@ -44,14 +60,16 @@ class BaseUserCommand:
|
||||
|
||||
class LoginCommand(BaseUserCommand):
|
||||
def run(self):
|
||||
print("""
|
||||
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
|
||||
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
|
||||
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
|
||||
print(
|
||||
"""
|
||||
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
|
||||
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
|
||||
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
|
||||
|
||||
""")
|
||||
"""
|
||||
)
|
||||
username = input("Username: ")
|
||||
password = getpass()
|
||||
try:
|
||||
@@ -91,8 +109,7 @@ class LogoutCommand(BaseUserCommand):
|
||||
|
||||
|
||||
class ListObjsCommand(BaseUserCommand):
|
||||
def tabulate(self, rows, headers):
|
||||
# type: (List[List[Union[str, int]]], List[str]) -> str
|
||||
def tabulate(self, rows: List[List[Union[str, int]]], headers: List[str]) -> str:
|
||||
"""
|
||||
Inspired by:
|
||||
stackoverflow.com/a/8356620/593036
|
||||
@@ -101,16 +118,10 @@ class ListObjsCommand(BaseUserCommand):
|
||||
col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
|
||||
row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
|
||||
lines = []
|
||||
lines.append(
|
||||
row_format.format(*headers)
|
||||
)
|
||||
lines.append(
|
||||
row_format.format(*["-" * w for w in col_widths])
|
||||
)
|
||||
lines.append(row_format.format(*headers))
|
||||
lines.append(row_format.format(*["-" * w for w in col_widths]))
|
||||
for row in rows:
|
||||
lines.append(
|
||||
row_format.format(*row)
|
||||
)
|
||||
lines.append(row_format.format(*row))
|
||||
return "\n".join(lines)
|
||||
|
||||
def run(self):
|
||||
@@ -126,15 +137,22 @@ class ListObjsCommand(BaseUserCommand):
|
||||
if len(objs) == 0:
|
||||
print("No shared file yet")
|
||||
exit()
|
||||
rows = [ [
|
||||
obj.filename,
|
||||
obj.LastModified,
|
||||
obj.ETag,
|
||||
obj.Size
|
||||
] for obj in objs ]
|
||||
print(
|
||||
self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])
|
||||
)
|
||||
rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs]
|
||||
print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]))
|
||||
|
||||
|
||||
class DeleteObjCommand(BaseUserCommand):
    """
    Command deleting a single object, identified by its filename, through the API client.
    """

    def run(self):
        """
        Delete ``self.args.filename`` using the stored token.

        Prints "Not logged in" and exits with status 1 when no token is stored; prints the
        HTTP error and exits with status 1 when the deletion request fails.
        """
        auth_token = HfFolder.get_token()
        if auth_token is None:
            print("Not logged in")
            exit(1)

        target = self.args.filename
        try:
            self._api.delete_obj(auth_token, filename=target)
        except HTTPError as error:
            print(error)
            exit(1)

        print("Done")
|
||||
|
||||
|
||||
class UploadCommand(BaseUserCommand):
|
||||
@@ -143,13 +161,7 @@ class UploadCommand(BaseUserCommand):
|
||||
Recursively list all files in a folder.
|
||||
"""
|
||||
entries: List[os.DirEntry] = list(os.scandir(rel_path))
|
||||
files = [
|
||||
(
|
||||
os.path.join(os.getcwd(), f.path), # filepath
|
||||
f.path # filename
|
||||
)
|
||||
for f in entries if f.is_file()
|
||||
]
|
||||
files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()] # (filepath, filename)
|
||||
for f in entries:
|
||||
if f.is_dir():
|
||||
files += self.walk_dir(f.path)
|
||||
@@ -172,23 +184,26 @@ class UploadCommand(BaseUserCommand):
|
||||
else:
|
||||
raise ValueError("Not a valid file or directory: {}".format(local_path))
|
||||
|
||||
for filepath, filename in files:
|
||||
if sys.platform == "win32":
|
||||
files = [(filepath, filename.replace(os.sep, "/")) for filepath, filename in files]
|
||||
|
||||
if len(files) > UPLOAD_MAX_FILES:
|
||||
print(
|
||||
"About to upload file {} to S3 under filename {}".format(
|
||||
ANSI.bold(filepath), ANSI.bold(filename)
|
||||
"About to upload {} files to S3. This is probably wrong. Please filter files before uploading.".format(
|
||||
ANSI.bold(len(files))
|
||||
)
|
||||
)
|
||||
exit(1)
|
||||
|
||||
for filepath, filename in files:
|
||||
print("About to upload file {} to S3 under filename {}".format(ANSI.bold(filepath), ANSI.bold(filename)))
|
||||
|
||||
choice = input("Proceed? [Y/n] ").lower()
|
||||
if not(choice == "" or choice == "y" or choice == "yes"):
|
||||
if not (choice == "" or choice == "y" or choice == "yes"):
|
||||
print("Abort")
|
||||
exit()
|
||||
print(
|
||||
ANSI.bold("Uploading... This might take a while if files are large")
|
||||
)
|
||||
print(ANSI.bold("Uploading... This might take a while if files are large"))
|
||||
for filepath, filename in files:
|
||||
access_url = self._api.presign_and_upload(
|
||||
token=token, filename=filename, filepath=filepath
|
||||
)
|
||||
access_url = self._api.presign_and_upload(token=token, filename=filename, filepath=filepath)
|
||||
print("Your file now lives at:")
|
||||
print(access_url)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user