diff --git a/docs/source/index.rst b/docs/source/index.rst index 43b73efcb4..35b801278a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -388,6 +388,7 @@ TensorFlow and/or Flax. model_doc/gpt model_doc/gpt2 model_doc/pegasus + model_doc/phobert model_doc/prophetnet model_doc/rag model_doc/reformer diff --git a/docs/source/model_doc/phobert.rst b/docs/source/model_doc/phobert.rst new file mode 100644 index 0000000000..5ef99b4080 --- /dev/null +++ b/docs/source/model_doc/phobert.rst @@ -0,0 +1,59 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +PhoBERT +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The PhoBERT model was proposed in `PhoBERT: Pre-trained language models for Vietnamese +<https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf>`__ by Dat Quoc Nguyen and Anh Tuan Nguyen. + +The abstract from the paper is the following: + +*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual +language models pre-trained for Vietnamese. 
Experimental results show that PhoBERT consistently outperforms the recent +best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple +Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and +Natural language inference.* + +Example of use: + +.. code-block:: + + import torch + from transformers import AutoModel, AutoTokenizer + + phobert = AutoModel.from_pretrained("vinai/phobert-base") + tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base") + + # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED! + line = "Tôi là sinh_viên trường đại_học Công_nghệ ." + + input_ids = torch.tensor([tokenizer.encode(line)]) + + with torch.no_grad(): + features = phobert(input_ids) # Model outputs are now tuples + + ## With TensorFlow 2.0+: + # from transformers import TFAutoModel + # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") + + +The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__. + +PhobertTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PhobertTokenizer + :members: diff --git a/utils/check_repo.py b/utils/check_repo.py index 0f6f9db8aa..aefac35684 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -402,8 +402,6 @@ SHOULD_HAVE_THEIR_OWN_PAGE = [ "BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer", - # Phoebus - "PhobertTokenizer", # Benchmarks "PyTorchBenchmark", "PyTorchBenchmarkArguments",