diff --git a/docs/source/en/model_doc/bert.mdx b/docs/source/en/model_doc/bert.mdx index 4975346a7a..67da0012a6 100644 --- a/docs/source/en/model_doc/bert.mdx +++ b/docs/source/en/model_doc/bert.mdx @@ -58,6 +58,10 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o [[autodoc]] BertTokenizerFast +## TFBertTokenizer + +[[autodoc]] TFBertTokenizer + ## Bert specific outputs [[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput diff --git a/setup.py b/setup.py index e73fca64d1..d2eab1f558 100644 --- a/setup.py +++ b/setup.py @@ -155,6 +155,7 @@ _deps = [ "starlette", "tensorflow-cpu>=2.3", "tensorflow>=2.3", + "tensorflow-text", "tf2onnx", "timeout-decorator", "timm", @@ -238,8 +239,8 @@ extras = {} extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic") extras["sklearn"] = deps_list("scikit-learn") -extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx") -extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx") +extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text") +extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text") extras["torch"] = deps_list("torch") extras["accelerate"] = deps_list("accelerate") diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0e1a33ba71..32f865a847 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -35,6 +35,7 @@ from .utils import ( is_scatter_available, is_sentencepiece_available, is_speech_available, + is_tensorflow_text_available, is_tf_available, is_timm_available, is_tokenizers_available, @@ -435,6 +436,7 @@ _import_structure = { "is_sentencepiece_available", "is_sklearn_available", "is_speech_available", + "is_tensorflow_text_available", "is_tf_available", "is_timm_available", "is_tokenizers_available", @@ -575,6 +577,19 @@ else: _import_structure["models.mctct"].append("MCTCTFeatureExtractor") _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor") +# Tensorflow-text-specific objects +try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_tensorflow_text_objects + + _import_structure["utils.dummy_tensorflow_text_objects"] = [ + name for name in dir(dummy_tensorflow_text_objects) if not name.startswith("_") + ] +else: + _import_structure["models.bert"].append("TFBertTokenizer") + try: if not (is_sentencepiece_available() and is_speech_available()): raise OptionalDependencyNotAvailable() @@ -3067,6 +3082,7 @@ if TYPE_CHECKING: is_sentencepiece_available, is_sklearn_available, is_speech_available, + is_tensorflow_text_available, is_tf_available, is_timm_available, is_tokenizers_available, @@ -3183,6 +3199,14 @@ if TYPE_CHECKING: from .models.mctct import MCTCTFeatureExtractor from .models.speech_to_text import Speech2TextFeatureExtractor + try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_tensorflow_text_objects import * + else: + from .models.bert import TFBertTokenizer + try: if not (is_speech_available() and is_sentencepiece_available()): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index e0da1a303f..359a07d70e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -61,6 +61,7 @@ deps = { "starlette": "starlette", "tensorflow-cpu": "tensorflow-cpu>=2.3", "tensorflow": "tensorflow>=2.3", + "tensorflow-text": "tensorflow-text", "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "timm": "timm", diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py index 0abd0c228c..1699b6d5ee 100644 --- a/src/transformers/models/bert/__init__.py +++ b/src/transformers/models/bert/__init__.py @@ -22,6 +22,7 @@ from ...utils import ( OptionalDependencyNotAvailable, _LazyModule, is_flax_available, + is_tensorflow_text_available, is_tf_available, is_tokenizers_available, is_torch_available, @@ -84,6 +85,13 @@ else: "TFBertModel", "TFBertPreTrainedModel", ] +try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"] try: if not is_flax_available(): @@ -160,6 +168,14 @@ if TYPE_CHECKING: TFBertPreTrainedModel, ) + try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_bert_tf import TFBertTokenizer + try: if not is_flax_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py new file mode 100644 index 0000000000..477ba37e0c --- /dev/null +++ b/src/transformers/models/bert/tokenization_bert_tf.py @@ -0,0 +1,222 @@ +import os +from typing import List, Union + +import tensorflow as tf + +from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs + +from .tokenization_bert import BertTokenizer + + +class TFBertTokenizer(tf.keras.layers.Layer): + """ + This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the + `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings + from an existing standard tokenizer object. + + In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run + when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options + than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes + straight from `tf.string` inputs to outputs. + + Args: + vocab_list (`list`): + List containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + cls_token_id (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + sep_token_id (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token_id (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + padding (`str`, defaults to `"longest"`): + The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch, + or `"max_length", to pad all inputs to the maximum length supported by the tokenizer. + truncation (`bool`, *optional*, defaults to `True`): + Whether to truncate the sequence to the maximum length. + max_length (`int`, *optional*, defaults to `512`): + The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if + `truncation` is `True`). + pad_to_multiple_of (`int`, *optional*, defaults to `None`): + If set, the sequence will be padded to a multiple of this value. + return_token_type_ids (`bool`, *optional*, defaults to `True`): + Whether to return token_type_ids. + return_attention_mask (`bool`, *optional*, defaults to `True`): + Whether to return the attention_mask. + """ + + def __init__( + self, + vocab_list: List, + do_lower_case: bool, + cls_token_id: int = None, + sep_token_id: int = None, + pad_token_id: int = None, + padding: str = "longest", + truncation: bool = True, + max_length: int = 512, + pad_to_multiple_of: int = None, + return_token_type_ids: bool = True, + return_attention_mask: bool = True, + ): + super().__init__() + self.tf_tokenizer = FastBertTokenizer( + vocab_list, token_out_type=tf.int64, lower_case_nfd_strip_accents=do_lower_case + ) + self.vocab_list = vocab_list + self.do_lower_case = do_lower_case + self.cls_token_id = cls_token_id or vocab_list.index("[CLS]") + self.sep_token_id = sep_token_id or vocab_list.index("[SEP]") + self.pad_token_id = pad_token_id or vocab_list.index("[PAD]") + self.paired_trimmer = ShrinkLongestTrimmer(max_length - 3, axis=1) # Allow room for special tokens + self.max_length = max_length + self.padding = padding + self.truncation = truncation + self.pad_to_multiple_of = pad_to_multiple_of + self.return_token_type_ids = return_token_type_ids + self.return_attention_mask = return_attention_mask + + @classmethod + def from_tokenizer(cls, tokenizer: "PreTrainedTokenizerBase", **kwargs): # noqa: F821 + """ + Initialize a `TFBertTokenizer` from an existing `Tokenizer`. + + Args: + tokenizer (`PreTrainedTokenizerBase`): + The tokenizer to use to initialize the `TFBertTokenizer`. + + Examples: + + ```python + from transformers import AutoTokenizer, TFBertTokenizer + + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer) + ``` + """ + vocab = tokenizer.get_vocab() + vocab = sorted([(wordpiece, idx) for wordpiece, idx in vocab.items()], key=lambda x: x[1]) + vocab_list = [entry[0] for entry in vocab] + return cls( + vocab_list=vocab_list, + do_lower_case=tokenizer.do_lower_case, + cls_token_id=tokenizer.cls_token_id, + sep_token_id=tokenizer.sep_token_id, + pad_token_id=tokenizer.pad_token_id, + **kwargs, + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs): + """ + Instantiate a `TFBertTokenizer` from a pre-trained tokenizer. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The name or path to the pre-trained tokenizer. + + Examples: + + ```python + from transformers import TFBertTokenizer + + tf_tokenizer = TFBertTokenizer.from_pretrained("bert-base-uncased") + ``` + """ + try: + tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + except: # noqa: E722 + from .tokenization_bert_fast import BertTokenizerFast + + tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + return cls.from_tokenizer(tokenizer, **kwargs) + + def unpaired_tokenize(self, texts): + if self.do_lower_case: + texts = case_fold_utf8(texts) + return self.tf_tokenizer.tokenize(texts) + + def call( + self, + text, + text_pair=None, + padding=None, + truncation=None, + max_length=None, + pad_to_multiple_of=None, + return_token_type_ids=None, + return_attention_mask=None, + ): + if padding is None: + padding = self.padding + if padding not in ("longest", "max_length"): + raise ValueError("Padding must be either 'longest' or 'max_length'!") + if max_length is not None and text_pair is not None: + # Because we have to instantiate a Trimmer to do it properly + raise ValueError("max_length cannot be overridden at call time when truncating paired texts!") + if max_length is None: + max_length = self.max_length + if truncation is None: + truncation = self.truncation + if pad_to_multiple_of is None: + pad_to_multiple_of = self.pad_to_multiple_of + if return_token_type_ids is None: + return_token_type_ids = self.return_token_type_ids + if return_attention_mask is None: + return_attention_mask = self.return_attention_mask + if not isinstance(text, tf.Tensor): + text = tf.convert_to_tensor(text) + if text_pair is not None and not isinstance(text_pair, tf.Tensor): + text_pair = tf.convert_to_tensor(text_pair) + if text_pair is not None: + if text.shape.rank > 1: + raise ValueError("text argument should not be multidimensional when a text pair is supplied!") + if text_pair.shape.rank > 1: + raise ValueError("text_pair should not be multidimensional!") + if text.shape.rank == 2: + text, text_pair = text[:, 0], text[:, 1] + text = self.unpaired_tokenize(text) + if text_pair is None: # Unpaired text + if truncation: + text = text[:, : max_length - 2] # Allow room for special tokens + input_ids, token_type_ids = combine_segments( + (text,), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id + ) + else: # Paired text + text_pair = self.unpaired_tokenize(text_pair) + if truncation: + text, text_pair = self.paired_trimmer.trim([text, text_pair]) + input_ids, token_type_ids = combine_segments( + (text, text_pair), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id + ) + if padding == "longest": + pad_length = input_ids.bounding_shape(axis=1) + if pad_to_multiple_of is not None: + # No ceiling division in tensorflow, so we negate floordiv instead + pad_length = pad_to_multiple_of * (-tf.math.floordiv(-pad_length, pad_to_multiple_of)) + else: + pad_length = max_length + + input_ids, attention_mask = pad_model_inputs(input_ids, max_seq_length=pad_length, pad_value=self.pad_token_id) + output = {"input_ids": input_ids} + if return_attention_mask: + output["attention_mask"] = attention_mask + if return_token_type_ids: + token_type_ids, _ = pad_model_inputs( + token_type_ids, max_seq_length=pad_length, pad_value=self.pad_token_id + ) + output["token_type_ids"] = token_type_ids + return output + + def get_config(self): + return { + "vocab_list": self.vocab_list, + "do_lower_case": self.do_lower_case, + "cls_token_id": self.cls_token_id, + "sep_token_id": self.sep_token_id, + "pad_token_id": self.pad_token_id, + } diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 211f94f1f9..569b117667 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -63,6 +63,7 @@ from .utils import ( is_soundfile_availble, is_spacy_available, is_tensorflow_probability_available, + is_tensorflow_text_available, is_tf2onnx_available, is_tf_available, is_timm_available, @@ -361,6 +362,14 @@ def require_tokenizers(test_case): return unittest.skipUnless(is_tokenizers_available(), "test requires tokenizers")(test_case) +def require_tensorflow_text(test_case): + """ + Decorator marking a test that requires tensorflow_text. These tests are skipped when tensroflow_text isn't + installed. + """ + return unittest.skipUnless(is_tensorflow_text_available(), "test requires tensorflow_text")(test_case) + + def require_pandas(test_case): """ Decorator marking a test that requires pandas. These tests are skipped when pandas isn't installed. diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 920b3c0386..f96c281c01 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -119,6 +119,7 @@ from .import_utils import ( is_spacy_available, is_speech_available, is_tensorflow_probability_available, + is_tensorflow_text_available, is_tf2onnx_available, is_tf_available, is_timm_available, diff --git a/src/transformers/utils/dummy_tensorflow_text_objects.py b/src/transformers/utils/dummy_tensorflow_text_objects.py new file mode 100644 index 0000000000..691774bb6b --- /dev/null +++ b/src/transformers/utils/dummy_tensorflow_text_objects.py @@ -0,0 +1,10 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +# flake8: noqa +from ..utils import DummyObject, requires_backends + + +class TFBertTokenizer(metaclass=DummyObject): + _backends = ["tensorflow_text"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tensorflow_text"]) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index da42c73587..270f6c06c0 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -491,6 +491,10 @@ def is_spacy_available(): return importlib.util.find_spec("spacy") is not None +def is_tensorflow_text_available(): + return importlib.util.find_spec("tensorflow_text") is not None + + def is_in_notebook(): try: # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py @@ -721,6 +725,12 @@ TENSORFLOW_PROBABILITY_IMPORT_ERROR = """ explained here: https://github.com/tensorflow/probability. """ +# docstyle-ignore +TENSORFLOW_TEXT_IMPORT_ERROR = """ +{0} requires the tensorflow_text library but it was not found in your environment. You can install it with pip as +explained here: https://www.tensorflow.org/text/guide/tf_text_intro. +""" + # docstyle-ignore PANDAS_IMPORT_ERROR = """ @@ -800,6 +810,7 @@ BACKENDS_MAPPING = OrderedDict( ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)), ("tensorflow_probability", (is_tensorflow_probability_available, TENSORFLOW_PROBABILITY_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), + ("tensorflow_text", (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR)), ("timm", (is_timm_available, TIMM_IMPORT_ERROR)), ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), diff --git a/tests/models/bert/test_tokenization_bert_tf.py b/tests/models/bert/test_tokenization_bert_tf.py new file mode 100644 index 0000000000..4ace9c9360 --- /dev/null +++ b/tests/models/bert/test_tokenization_bert_tf.py @@ -0,0 +1,100 @@ +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from transformers import AutoConfig, TFAutoModel, is_tensorflow_text_available, is_tf_available +from transformers.models.bert.tokenization_bert import BertTokenizer +from transformers.testing_utils import require_tensorflow_text, slow + + +if is_tensorflow_text_available(): + from transformers.models.bert import TFBertTokenizer + +if is_tf_available(): + import tensorflow as tf + + +TOKENIZER_CHECKPOINTS = ["bert-base-uncased", "bert-base-cased"] +TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only" + +if is_tf_available(): + + class ModelToSave(tf.keras.Model): + def __init__(self, tokenizer): + super().__init__() + self.tokenizer = tokenizer + config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT) + self.bert = TFAutoModel.from_config(config) + + def call(self, inputs): + tokenized = self.tokenizer(inputs) + out = self.bert(**tokenized) + return out["pooler_output"] + + +@require_tensorflow_text +class BertTokenizationTest(unittest.TestCase): + # The TF tokenizers are usually going to be used as pretrained tokenizers from existing model checkpoints, + # so that's what we focus on here. + + def setUp(self): + super().setUp() + + self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS] + self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS] + self.test_sentences = [ + "This is a straightforward English test sentence.", + "This one has some weird characters\rto\nsee\r\nif those\u00E9break things.", + "Now we're going to add some Chinese: 一 二 三 一二三", + "And some much more rare Chinese: 齉 堃 齉堃", + "Je vais aussi écrire en français pour tester les accents", + "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ", + ] + self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1])) + + def test_output_equivalence(self): + for tokenizer, tf_tokenizer in zip(self.tokenizers, self.tf_tokenizers): + for test_inputs in (self.test_sentences, self.paired_sentences): + python_outputs = tokenizer(test_inputs, return_tensors="tf", padding="longest") + tf_outputs = tf_tokenizer(test_inputs) + + for key in python_outputs.keys(): + self.assertTrue(tf.reduce_all(python_outputs[key].shape == tf_outputs[key].shape)) + self.assertTrue(tf.reduce_all(tf.cast(python_outputs[key], tf.int64) == tf_outputs[key])) + + @slow + def test_different_pairing_styles(self): + for tf_tokenizer in self.tf_tokenizers: + merged_outputs = tf_tokenizer(self.paired_sentences) + separated_outputs = tf_tokenizer( + text=[sentence[0] for sentence in self.paired_sentences], + text_pair=[sentence[1] for sentence in self.paired_sentences], + ) + for key in merged_outputs.keys(): + self.assertTrue(tf.reduce_all(tf.cast(merged_outputs[key], tf.int64) == separated_outputs[key])) + + @slow + def test_graph_mode(self): + for tf_tokenizer in self.tf_tokenizers: + compiled_tokenizer = tf.function(tf_tokenizer) + for test_inputs in (self.test_sentences, self.paired_sentences): + test_inputs = tf.constant(test_inputs) + compiled_outputs = compiled_tokenizer(test_inputs) + eager_outputs = tf_tokenizer(test_inputs) + + for key in eager_outputs.keys(): + self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key])) + + @slow + def test_saved_model(self): + for tf_tokenizer in self.tf_tokenizers: + model = ModelToSave(tokenizer=tf_tokenizer) + test_inputs = tf.convert_to_tensor(self.test_sentences) + out = model(test_inputs) # Build model with some sample inputs + with TemporaryDirectory() as tempdir: + save_path = Path(tempdir) / "saved.model" + model.save(save_path) + loaded_model = tf.keras.models.load_model(save_path) + loaded_output = loaded_model(test_inputs) + # We may see small differences because the loaded model is compiled, so we need an epsilon for the test + self.assertLessEqual(tf.reduce_max(tf.abs(out - loaded_output)), 1e-5) diff --git a/utils/check_dummies.py b/utils/check_dummies.py index d6c1c4b592..484aac2545 100644 --- a/utils/check_dummies.py +++ b/utils/check_dummies.py @@ -26,7 +26,7 @@ PATH_TO_TRANSFORMERS = "src/transformers" _re_backend = re.compile(r"is\_([a-z_]*)_available()") # Matches from xxx import bla _re_single_line_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n") -_re_test_backend = re.compile(r"^\s+if\s+not\s+is\_[a-z]*\_available\(\)") +_re_test_backend = re.compile(r"^\s+if\s+not\s+is\_[a-z_]*\_available\(\)") DUMMY_CONSTANT = """