diff --git a/setup.py b/setup.py index 52fef444f7..2753236f9d 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,7 @@ setup( packages=find_packages("src"), install_requires=[ "numpy", - "tokenizers", + "tokenizers == 0.0.10", # accessing files from S3 directly "boto3", # filesystem locks e.g. to prevent parallel downloads diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index cb78f03df7..5362806bdc 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -583,12 +583,14 @@ class BertTokenizerFast(FastPreTrainedTokenizer): ) ) if max_length is not None: - self._tokenizer.with_truncation(max_length, stride, truncation_strategy) + self._tokenizer.with_truncation(max_length, + stride=stride, + strategy=truncation_strategy) self._tokenizer.with_padding( - max_length if pad_to_max_length else None, - self.padding_side, - self.pad_token_id, - self.pad_token_type_id, - self.pad_token, + max_length=max_length if pad_to_max_length else None, + direction=self.padding_side, + pad_id=self.pad_token_id, + pad_type_id=self.pad_token_type_id, + pad_token=self.pad_token, ) self._decoder = tk.decoders.WordPiece.new() diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index bba5eeb762..4cc1d4708e 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -274,15 +274,17 @@ class GPT2TokenizerFast(FastPreTrainedTokenizer): self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file)) self._update_special_tokens() - self._tokenizer.with_pre_tokenizer(tk.pre_tokenizers.ByteLevel.new(add_prefix_space)) + self._tokenizer.with_pre_tokenizer(tk.pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)) self._tokenizer.with_decoder(tk.decoders.ByteLevel.new()) if max_length: - self._tokenizer.with_truncation(max_length, stride, truncation_strategy) + self._tokenizer.with_truncation(max_length, + stride=stride, + strategy=truncation_strategy) self._tokenizer.with_padding( - max_length if pad_to_max_length else None, - self.padding_side, - self.pad_token_id if self.pad_token_id is not None else 0, - self.pad_token_type_id, - self.pad_token if self.pad_token is not None else "", + max_length=max_length if pad_to_max_length else None, + direction=self.padding_side, + pad_id=self.pad_token_id if self.pad_token_id is not None else 0, + pad_type_id=self.pad_token_type_id, + pad_token=self.pad_token if self.pad_token is not None else "", ) self._decoder = tk.decoders.ByteLevel.new() diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 57e2b909f7..c52ba6ff79 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -1430,10 +1430,10 @@ class FastPreTrainedTokenizer(PreTrainedTokenizer): @property def vocab_size(self): - return self.tokenizer.get_vocab_size(False) + return self.tokenizer.get_vocab_size(with_added_tokens=False) def __len__(self): - return self.tokenizer.get_vocab_size(True) + return self.tokenizer.get_vocab_size(with_added_tokens=True) def _update_special_tokens(self): self.tokenizer.add_special_tokens(self.all_special_tokens)