Longformer (#4352)

* first commit * bug fixes * better examples * undo padding * remove wrong VOCAB_FILES_NAMES * License * make style * make isort happy * unit tests * integration test * make `black` happy by undoing `isort` changes!! * lint * no need for the padding value * batch_size not bsz * remove unused type casting * seqlen not seq_len * staticmethod * `bert` selfattention instead of `n2` * uint8 instead of bool + lints * pad inputs_embeds using embeddings not a constant * black * unit test with padding * fix unit tests * remove redundant unit test * upload model weights * resolve todo * simpler _mask_invalid_locations without lru_cache + backward compatible masked_fill_ * increase unittest coverage
2020-05-19 07:04:43 -07:00
parent 31eedff5a0
commit 8f1d047148
10 changed files with 1113 additions and 3 deletions
--- a/src/transformers/tokenization_longformer.py
+++ b/src/transformers/tokenization_longformer.py
@@ -0,0 +1,42 @@
+# coding=utf-8
+# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from .tokenization_roberta import RobertaTokenizer
+
+
+logger = logging.getLogger(__name__)
+
+
+# Longformer reuses the RoBERTa vocab and merges files; both URLs point at the roberta-large artifacts.
+vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
+merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
+_all_longformer_models = ["longformer-base-4096", "longformer-large-4096"]  # names used as dict keys below
+
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {  # model name -> size; assigned to max_model_input_sizes below
+    "longformer-base-4096": 4096,
+    "longformer-large-4096": 4096,
+}
+
+
+class LongformerTokenizer(RobertaTokenizer):
+    # RobertaTokenizer subclass that sets two class attributes; the vocab and merges files are the RoBERTa ones.
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_vocab_files_map = {
+        "vocab_file": {m: vocab_url for m in _all_longformer_models},  # every model name -> same vocab URL
+        "merges_file": {m: merges_url for m in _all_longformer_models},  # every model name -> same merges URL
+    }