Longformer (#4352)
* first commit
* bug fixes
* better examples
* undo padding
* remove wrong VOCAB_FILES_NAMES
* License
* make style
* make isort happy
* unit tests
* integration test
* make `black` happy by undoing `isort` changes!!
* lint
* no need for the padding value
* batch_size not bsz
* remove unused type casting
* seqlen not seq_len
* staticmethod
* `bert` selfattention instead of `n2`
* uint8 instead of bool + lints
* pad inputs_embeds using embeddings not a constant
* black
* unit test with padding
* fix unit tests
* remove redundant unit test
* upload model weights
* resolve todo
* simpler _mask_invalid_locations without lru_cache + backward compatible masked_fill_
* increase unittest coverage
This commit is contained in:
@@ -29,6 +29,7 @@ from .configuration_auto import (
|
||||
ElectraConfig,
|
||||
FlaubertConfig,
|
||||
GPT2Config,
|
||||
LongformerConfig,
|
||||
OpenAIGPTConfig,
|
||||
ReformerConfig,
|
||||
RobertaConfig,
|
||||
@@ -50,6 +51,7 @@ from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFas
|
||||
from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
|
||||
from .tokenization_flaubert import FlaubertTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
|
||||
from .tokenization_longformer import LongformerTokenizer
|
||||
from .tokenization_marian import MarianTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
|
||||
from .tokenization_reformer import ReformerTokenizer
|
||||
@@ -73,6 +75,7 @@ TOKENIZER_MAPPING = OrderedDict(
|
||||
(XLMRobertaConfig, (XLMRobertaTokenizer, None)),
|
||||
(MarianConfig, (MarianTokenizer, None)),
|
||||
(BartConfig, (BartTokenizer, None)),
|
||||
(LongformerConfig, (LongformerTokenizer, None)),
|
||||
(RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
|
||||
(ReformerConfig, (ReformerTokenizer, None)),
|
||||
(ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
|
||||
@@ -105,6 +108,7 @@ class AutoTokenizer:
|
||||
- contains `albert`: AlbertTokenizer (ALBERT model)
|
||||
- contains `camembert`: CamembertTokenizer (CamemBERT model)
|
||||
- contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
|
||||
- contains `longformer`: LongformerTokenizer (AllenAI Longformer model)
|
||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||
@@ -136,6 +140,7 @@ class AutoTokenizer:
|
||||
- contains `albert`: AlbertTokenizer (ALBERT model)
|
||||
- contains `camembert`: CamembertTokenizer (CamemBERT model)
|
||||
- contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
|
||||
- contains `longformer`: LongformerTokenizer (AllenAI Longformer model)
|
||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||
- contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
|
||||
Reference in New Issue
Block a user