Add a TF in-graph tokenizer for BERT (#17701)

* Add a TF in-graph tokenizer for BERT * Add from_pretrained * Add proper truncation, option handling to match other tokenizers * Add proper imports and guards * Add test, fix all the bugs exposed by said test * Fix truncation of paired texts in graph mode, more test updates * Small fixes, add a (very careful) test for savedmodel * Add tensorflow-text dependency, make fixup * Update documentation * Update documentation * make fixup * Slight changes to tests * Add some docstring examples * Update tests * Update tests and add proper lowercasing/normalization * make fixup * Add docstring for padding! * Mark slow tests * make fixup * Fall back to BertTokenizerFast if BertTokenizer is unavailable * Fall back to BertTokenizerFast if BertTokenizer is unavailable * make fixup * Properly handle tensorflow-text dummies
2022-06-27 12:06:21 +01:00
parent 401fcca6c5
commit ee0d001de7
12 changed files with 402 additions and 3 deletions
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -63,6 +63,7 @@ from .utils import (
    is_soundfile_availble,
    is_spacy_available,
    is_tensorflow_probability_available,
+    is_tensorflow_text_available,
    is_tf2onnx_available,
    is_tf_available,
    is_timm_available,
@@ -361,6 +362,14 @@ def require_tokenizers(test_case):
    return unittest.skipUnless(is_tokenizers_available(), "test requires tokenizers")(test_case)


+def require_tensorflow_text(test_case):
+    """
+    Decorator marking a test that requires tensorflow_text. These tests are skipped when tensorflow_text isn't
+    installed.
+    """
+    return unittest.skipUnless(is_tensorflow_text_available(), "test requires tensorflow_text")(test_case)
+
+
 def require_pandas(test_case):
    """
    Decorator marking a test that requires pandas. These tests are skipped when pandas isn't installed.