add in-layer gpt2 tokenizer (#20421)

* add minimal working gpt2 tokenizer

* graph mode and output equivalence tests working

* not today tensorflow. serialization test passing!

* fix style, documentation, docstrings and all that jazz

* passing consistency checks

* move keras nlp to tf dependencies

* fix tf modeling utils and gpt2 attention to enable compiling

* fix (I hope) keras nlp dependencies

* revert changes on generation

* remove debug prints

* remove redundant tf dummy objects

* add from config, get config and max length settings to address review

* let flake ignore the error on distillation you are welcome

* test from config

* add padding test

* address sgugger review
This commit is contained in:
Pi Esposito
2022-11-29 12:02:40 -03:00
committed by GitHub
parent e8d448edcf
commit fb2b45e562
11 changed files with 297 additions and 4 deletions

View File

@@ -124,6 +124,7 @@ _deps = [
"jaxlib>=0.1.65,<=0.3.6",
"jieba",
"kenlm",
"keras-nlp>=0.3.1",
"nltk",
"natten>=0.14.4",
"numpy>=1.17",
@@ -241,14 +242,13 @@ class DepsTableUpdateCommand(Command):
with open(target, "w", encoding="utf-8", newline="\n") as f:
f.write("\n".join(content))
extras = {}
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp")
extras["sklearn"] = deps_list("scikit-learn")
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text")
extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text")
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
extras["torch"] = deps_list("torch")
extras["accelerate"] = deps_list("accelerate")