[Tokenizer Utils Base] Make pad function more flexible (#9928)

* change tokenizer requirement

* split line

* Correct typo from list to str

* improve style

* make other function pretty as well

* add comment

* correct typo

* add new test

* pass tests for tok without padding token

* Apply suggestions from code review
This commit is contained in:
Patrick von Platen
2021-02-02 10:35:27 +03:00
committed by GitHub
parent d1b14c9b54
commit 538b3b4607
40 changed files with 187 additions and 107 deletions

View File

@@ -125,7 +125,7 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
@@ -270,7 +270,7 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,