Add a TF in-graph tokenizer for BERT (#17701)

* Add a TF in-graph tokenizer for BERT * Add from_pretrained * Add proper truncation, option handling to match other tokenizers * Add proper imports and guards * Add test, fix all the bugs exposed by said test * Fix truncation of paired texts in graph mode, more test updates * Small fixes, add a (very careful) test for savedmodel * Add tensorflow-text dependency, make fixup * Update documentation * Update documentation * make fixup * Slight changes to tests * Add some docstring examples * Update tests * Update tests and add proper lowercasing/normalization * make fixup * Add docstring for padding! * Mark slow tests * make fixup * Fall back to BertTokenizerFast if BertTokenizer is unavailable * Fall back to BertTokenizerFast if BertTokenizer is unavailable * make fixup * Properly handle tensorflow-text dummies
2022-06-27 12:06:21 +01:00
parent 401fcca6c5
commit ee0d001de7
12 changed files with 402 additions and 3 deletions
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -26,7 +26,7 @@ PATH_TO_TRANSFORMERS = "src/transformers"
 _re_backend = re.compile(r"is\_([a-z_]*)_available()")
 # Matches from xxx import bla
 _re_single_line_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n")
-_re_test_backend = re.compile(r"^\s+if\s+not\s+is\_[a-z]*\_available\(\)")
+_re_test_backend = re.compile(r"^\s+if\s+not\s+is\_[a-z_]*\_available\(\)")


 DUMMY_CONSTANT = """