Add m2m100 (#10236)

* m2m_100

* no layernorm_embedding

* sinusoidal positional embeddings

* update pos embeddings

* add default config values

* tokenizer

* add conversion script

* fix config

* fix pos embed

* remove _float_tensor

* update tokenizer

* update lang codes

* handle lang codes

* fix pos embeds

* fix spm key

* put embedding weights on device

* remove qa and seq classification heads

* fix convert script

* lang codes pn one line

* fix embeds

* fix tokenizer

* fix tokenizer

* add fast tokenizer

* style

* M2M100MT => M2M100

* fix copyright, style

* tokenizer converter

* vocab file

* remove fast tokenizer

* fix embeds

* fix tokenizer

* fix tests

* add tokenizer tests

* add integration test

* quality

* fix model name

* fix test

* doc

* doc

* fix doc

* add copied from statements

* fix tokenizer tests

* apply review suggestions

* fix urls

* fix shift_tokens_right

* apply review suggestions

* fix

* fix doc

* add lang code to id

* remove unused function

* update checkpoint names

* fix copy

* fix tokenizer

* fix checkpoint names

* fix merge issue

* style
This commit is contained in:
Suraj Patil
2021-03-06 22:14:16 +05:30
committed by GitHub
parent fd01104435
commit f6e74a63ca
17 changed files with 2699 additions and 18 deletions

View File

@@ -30,6 +30,8 @@ PATH_TO_DOC = "docs/source"
# Being in this list is an exception and should **not** be the rule.
IGNORE_NON_TESTED = [
# models to ignore for not tested
"M2M100Encoder", # Building part of bigger (tested) model.
"M2M100Decoder", # Building part of bigger (tested) model.
"LEDEncoder", # Building part of bigger (tested) model.
"LEDDecoder", # Building part of bigger (tested) model.
"BartDecoderWrapper", # Building part of bigger (tested) model.
@@ -75,6 +77,8 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [
# should **not** be the rule.
IGNORE_NON_AUTO_CONFIGURED = [
# models to ignore for model xxx mapping
"M2M100Encoder",
"M2M100Decoder",
"LEDEncoder",
"LEDDecoder",
"BartDecoder",