diff --git a/docs/source/index.rst b/docs/source/index.rst index 1ada9c18d7..f8b9c43670 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -356,6 +356,7 @@ TensorFlow and/or Flax. model_doc/bart model_doc/barthez model_doc/bert + model_doc/bertweet model_doc/bertgeneration model_doc/blenderbot model_doc/camembert diff --git a/docs/source/model_doc/bertweet.rst b/docs/source/model_doc/bertweet.rst new file mode 100644 index 0000000000..4fe1470def --- /dev/null +++ b/docs/source/model_doc/bertweet.rst @@ -0,0 +1,64 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +Bertweet +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The BERTweet model was proposed in `BERTweet: A pre-trained language model for English Tweets +<https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf>`__ by Dat Quoc Nguyen, Thanh Vu, Anh Tuan Nguyen. + +The abstract from the paper is the following: + +*We present BERTweet, the first public large-scale pre-trained language model for English Tweets. Our BERTweet, having +the same architecture as BERT-base (Devlin et al., 2019), is trained using the RoBERTa pre-training procedure (Liu et +al., 2019). 
Experiments show that BERTweet outperforms strong baselines RoBERTa-base and XLM-R-base (Conneau et al., +2020), producing better performance results than the previous state-of-the-art models on three Tweet NLP tasks: +Part-of-speech tagging, Named-entity recognition and text classification.* + +Example of use: + +.. code-block:: + + import torch + from transformers import AutoModel, AutoTokenizer + + bertweet = AutoModel.from_pretrained("vinai/bertweet-base") + + # For transformers v4.x+: + tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False) + + # For transformers v3.x: + # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base") + + # INPUT TWEET IS ALREADY NORMALIZED! + line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:" + + input_ids = torch.tensor([tokenizer.encode(line)]) + + with torch.no_grad(): + features = bertweet(input_ids) # Model outputs are now tuples + + ## With TensorFlow 2.0+: + # from transformers import TFAutoModel + # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base") + + +The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__. + +BertweetTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BertweetTokenizer + :members: diff --git a/utils/check_repo.py b/utils/check_repo.py index 7cd80232dc..343432adc2 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -382,8 +382,6 @@ SHOULD_HAVE_THEIR_OWN_PAGE = [ "BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer", - # Bertweet - "BertweetTokenizer", # Herbert "HerbertTokenizer", "HerbertTokenizerFast",