From 734d29b03d298936db7d7a41824ce15065bdf16c Mon Sep 17 00:00:00 2001 From: Anthony MOI Date: Tue, 24 Dec 2019 13:32:41 -0500 Subject: [PATCH] tokenizers is now a real dependency --- setup.py | 1 + src/transformers/tokenization_bert.py | 76 ++++++++++++--------------- src/transformers/tokenization_gpt2.py | 43 +++++++-------- 3 files changed, 54 insertions(+), 66 deletions(-) diff --git a/setup.py b/setup.py index 558a38ea8b..52fef444f7 100644 --- a/setup.py +++ b/setup.py @@ -86,6 +86,7 @@ setup( packages=find_packages("src"), install_requires=[ "numpy", + "tokenizers", # accessing files from S3 directly "boto3", # filesystem locks e.g. to prevent parallel downloads diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index 05aa42cadc..cb78f03df7 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -20,6 +20,8 @@ import logging import os import unicodedata +import tokenizers as tk + from .tokenization_utils import FastPreTrainedTokenizer, PreTrainedTokenizer @@ -552,49 +554,41 @@ class BertTokenizerFast(FastPreTrainedTokenizer): add_special_tokens=True, **kwargs ): + super(BertTokenizerFast, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) - try: - from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors - - super(BertTokenizerFast, self).__init__( - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs + self._tokenizer = tk.Tokenizer(tk.models.WordPiece.from_files(vocab_file, unk_token=unk_token)) + self._update_special_tokens() + self._tokenizer.with_pre_tokenizer( + tk.pre_tokenizers.BertPreTokenizer.new( + do_basic_tokenize=do_basic_tokenize, + do_lower_case=do_lower_case, + tokenize_chinese_chars=tokenize_chinese_chars, + never_split=never_split if never_split is not None else [], ) + ) + self._tokenizer.with_decoder(tk.decoders.WordPiece.new()) - self._tokenizer = Tokenizer(models.WordPiece.from_files(vocab_file, unk_token=unk_token)) - self._update_special_tokens() - self._tokenizer.with_pre_tokenizer( - pre_tokenizers.BertPreTokenizer.new( - do_basic_tokenize=do_basic_tokenize, - do_lower_case=do_lower_case, - tokenize_chinese_chars=tokenize_chinese_chars, - never_split=never_split if never_split is not None else [], + if add_special_tokens: + self._tokenizer.with_post_processor( + tk.processors.BertProcessing.new( + (sep_token, self._tokenizer.token_to_id(sep_token)), + (cls_token, self._tokenizer.token_to_id(cls_token)), ) ) - self._tokenizer.with_decoder(decoders.WordPiece.new()) - - if add_special_tokens: - self._tokenizer.with_post_processor( - processors.BertProcessing.new( - (sep_token, self._tokenizer.token_to_id(sep_token)), - (cls_token, self._tokenizer.token_to_id(cls_token)), - ) - ) - if max_length is not None: - self._tokenizer.with_truncation(max_length, stride, truncation_strategy) - self._tokenizer.with_padding( - max_length if pad_to_max_length else None, - self.padding_side, - self.pad_token_id, - self.pad_token_type_id, - self.pad_token, - ) - self._decoder = decoders.WordPiece.new() - - except (AttributeError, ImportError) as e: - logger.error("Make sure you installed `tokenizers` with `pip install tokenizers==0.0.8`") - raise e + if max_length is not None: + self._tokenizer.with_truncation(max_length, stride, truncation_strategy) + self._tokenizer.with_padding( + max_length if pad_to_max_length else None, + self.padding_side, + self.pad_token_id, + self.pad_token_type_id, + self.pad_token, + ) + self._decoder = tk.decoders.WordPiece.new() diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index 9514975079..bba5eeb762 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -21,6 +21,7 @@ import os from functools import lru_cache import regex as re +import tokenizers as tk from .tokenization_utils import FastPreTrainedTokenizer, PreTrainedTokenizer @@ -267,29 +268,21 @@ class GPT2TokenizerFast(FastPreTrainedTokenizer): truncation_strategy="longest_first", **kwargs ): + super(GPT2TokenizerFast, self).__init__( + bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs + ) - try: - from tokenizers import Tokenizer, models, pre_tokenizers, decoders - - super(GPT2TokenizerFast, self).__init__( - bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs - ) - - self._tokenizer = Tokenizer(models.BPE.from_files(vocab_file, merges_file)) - self._update_special_tokens() - self._tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space)) - self._tokenizer.with_decoder(decoders.ByteLevel.new()) - if max_length: - self._tokenizer.with_truncation(max_length, stride, truncation_strategy) - self._tokenizer.with_padding( - max_length if pad_to_max_length else None, - self.padding_side, - self.pad_token_id if self.pad_token_id is not None else 0, - self.pad_token_type_id, - self.pad_token if self.pad_token is not None else "", - ) - self._decoder = decoders.ByteLevel.new() - - except (AttributeError, ImportError) as e: - logger.error("Make sure you installed `tokenizers` with `pip install tokenizers==0.0.8`") - raise e + self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file)) + self._update_special_tokens() + self._tokenizer.with_pre_tokenizer(tk.pre_tokenizers.ByteLevel.new(add_prefix_space)) + self._tokenizer.with_decoder(tk.decoders.ByteLevel.new()) + if max_length: + self._tokenizer.with_truncation(max_length, stride, truncation_strategy) + self._tokenizer.with_padding( + max_length if pad_to_max_length else None, + self.padding_side, + self.pad_token_id if self.pad_token_id is not None else 0, + self.pad_token_type_id, + self.pad_token if self.pad_token is not None else "", + ) + self._decoder = tk.decoders.ByteLevel.new()