From 3d587c53432c7c392aa08f582015c846ca076540 Mon Sep 17 00:00:00 2001 From: Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> Date: Mon, 18 Oct 2021 21:16:46 +0700 Subject: [PATCH] Add BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese (#13788) * Add the pre-trained BARTpho model * Add the pre-trained BARTpho model * Add the pre-trained BARTpho model * Fix incorrectly sorted and/or formatted imports * Fix incorrectly sorted and/or formatted style * Fix check_dummies * Fix check_dummies * Fix check_dummies * Update docs/source/model_doc/bartpho.rst Co-authored-by: Suraj Patil * Update src/transformers/models/bartpho/__init__.py Co-authored-by: Suraj Patil * Update src/transformers/models/bartpho/tokenization_bartpho.py Co-authored-by: Suraj Patil * Update tests/test_tokenization_bartpho.py Co-authored-by: Suraj Patil * Update src/transformers/models/bartpho/tokenization_bartpho.py Co-authored-by: Suraj Patil * Update tests/test_tokenization_bartpho.py Co-authored-by: Suraj Patil * Update docs/source/model_doc/bartpho.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update docs/source/model_doc/bartpho.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/bartpho/__init__.py Co-authored-by: Suraj Patil * Add the pre-trained BARTpho model * Add Tips section in doc and details of monolingual_vocab_file * Fix conflicts * Add another tip related to monolingual_vocab_file * Readd dependency_versions_table.py * Handle failing checks * Remove test_list.txt * Remove md5sum.saved * Revise Readme.md Co-authored-by: Suraj Patil Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- README.md | 3 + README_zh-hans.md | 3 + README_zh-hant.md | 3 + docs/source/index.rst | 165 +++++----- docs/source/model_doc/bartpho.rst | 86 +++++ docs/source/model_doc/bertweet.rst | 2 +- docs/source/model_doc/phobert.rst | 3 +- src/transformers/__init__.py | 3 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/bartpho/__init__.py | 36 +++ .../models/bartpho/tokenization_bartpho.py | 304 ++++++++++++++++++ .../utils/dummy_sentencepiece_objects.py | 9 + tests/test_tokenization_bartpho.py | 66 ++++ 15 files changed, 607 insertions(+), 79 deletions(-) create mode 100644 docs/source/model_doc/bartpho.rst create mode 100644 src/transformers/models/bartpho/__init__.py create mode 100644 src/transformers/models/bartpho/tokenization_bartpho.py create mode 100644 tests/test_tokenization_bartpho.py diff --git a/README.md b/README.md index 6d2c9a3820..36db757fb9 100644 --- a/README.md +++ b/README.md @@ -211,8 +211,10 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](https://huggingface.co/transformers/master/model_doc/bartpho.html)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. 1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. +1. **[BERTweet](https://huggingface.co/transformers/master/model_doc/bertweet.html)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. @@ -262,6 +264,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[PhoBERT](https://huggingface.co/transformers/master/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_zh-hans.md b/README_zh-hans.md index f234ac505b..fd6e05ce4c 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -235,9 +235,11 @@ conda install -c huggingface transformers 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。 +1. **[BARTpho](https://huggingface.co/transformers/master/model_doc/bartpho.html)** (来自 VinAI Research) 伴随论文 [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) 由 Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen 发布。 1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (来自 Microsoft) 伴随论文 [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) 由 Hangbo Bao, Li Dong, Furu Wei 发布。 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 +1. **[BERTweet](https://huggingface.co/transformers/master/model_doc/bertweet.html)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 @@ -284,6 +286,7 @@ conda install -c huggingface transformers 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 +1. **[PhoBERT](https://huggingface.co/transformers/master/model_doc/phobert.html)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。 1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 4a428ed7b8..0950c7fc2b 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -247,9 +247,11 @@ conda install -c huggingface transformers 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. +1. **[BARTpho](https://huggingface.co/transformers/master/model_doc/bartpho.html)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. 1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. +1. **[BERTweet](https://huggingface.co/transformers/master/model_doc/bertweet.html)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. @@ -296,6 +298,7 @@ conda install -c huggingface transformers 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. +1. **[PhoBERT](https://huggingface.co/transformers/master/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/docs/source/index.rst b/docs/source/index.rst index 60792ce2c1..93b446cdd2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -105,228 +105,238 @@ Supported models 3. :doc:`BARThez ` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained French Sequence-to-Sequence Model `__ by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -4. :doc:`BEiT ` (from Microsoft) released with the paper `BEiT: BERT Pre-Training of Image Transformers +4. `BARTpho `__ (from VinAI Research) released with + the paper `BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese `__ by + Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. +5. :doc:`BEiT ` (from Microsoft) released with the paper `BEiT: BERT Pre-Training of Image Transformers `__ by Hangbo Bao, Li Dong, Furu Wei. -5. :doc:`BERT ` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional +6. :doc:`BERT ` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. -6. :doc:`BERT For Sequence Generation ` (from Google) released with the paper `Leveraging +7. `BERTweet `__ (from VinAI Research) released + with the paper `BERTweet: A pre-trained language model for English Tweets + `__ by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen. +8. :doc:`BERT For Sequence Generation ` (from Google) released with the paper `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -7. :doc:`BigBird-RoBERTa ` (from Google Research) released with the paper `Big Bird: Transformers +9. :doc:`BigBird-RoBERTa ` (from Google Research) released with the paper `Big Bird: Transformers for Longer Sequences `__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -8. :doc:`BigBird-Pegasus ` (from Google Research) released with the paper `Big Bird: - Transformers for Longer Sequences `__ by Manzil Zaheer, Guru Guruganesh, Avinava - Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -9. :doc:`Blenderbot ` (from Facebook) released with the paper `Recipes for building an - open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary - Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -10. :doc:`BlenderbotSmall ` (from Facebook) released with the paper `Recipes for building +10. :doc:`BigBird-Pegasus ` (from Google Research) released with the paper `Big Bird: + Transformers for Longer Sequences `__ by Manzil Zaheer, Guru Guruganesh, Avinava + Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr + Ahmed. +11. :doc:`Blenderbot ` (from Facebook) released with the paper `Recipes for building an + open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary + Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. +12. :doc:`BlenderbotSmall ` (from Facebook) released with the paper `Recipes for building an open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -11. :doc:`BORT ` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT +13. :doc:`BORT ` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT `__ by Adrian de Wynter and Daniel J. Perry. -12. :doc:`ByT5 ` (from Google Research) released with the paper `ByT5: Towards a token-free future with +14. :doc:`ByT5 ` (from Google Research) released with the paper `ByT5: Towards a token-free future with pre-trained byte-to-byte models `__ by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -13. :doc:`CamemBERT ` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty +15. :doc:`CamemBERT ` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty French Language Model `__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. -14. :doc:`CANINE ` (from Google Research) released with the paper `CANINE: Pre-training an Efficient +16. :doc:`CANINE ` (from Google Research) released with the paper `CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation `__ by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -15. :doc:`CLIP ` (from OpenAI) released with the paper `Learning Transferable Visual Models From +17. :doc:`CLIP ` (from OpenAI) released with the paper `Learning Transferable Visual Models From Natural Language Supervision `__ by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -16. :doc:`ConvBERT ` (from YituTech) released with the paper `ConvBERT: Improving BERT with +18. :doc:`ConvBERT ` (from YituTech) released with the paper `ConvBERT: Improving BERT with Span-based Dynamic Convolution `__ by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -17. :doc:`CPM ` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative +19. :doc:`CPM ` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative Chinese Pre-trained Language Model `__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -18. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language +20. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -19. :doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with +21. :doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -20. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT +22. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -21. :doc:`DeiT ` (from Facebook) released with the paper `Training data-efficient image transformers & +23. :doc:`DeiT ` (from Facebook) released with the paper `Training data-efficient image transformers & distillation through attention `__ by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -22. :doc:`DETR ` (from Facebook) released with the paper `End-to-End Object Detection with Transformers +24. :doc:`DETR ` (from Facebook) released with the paper `End-to-End Object Detection with Transformers `__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -23. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale +25. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation `__ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -24. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a +26. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 `__, RoBERTa into `DistilRoBERTa `__, Multilingual BERT into `DistilmBERT `__ and a German version of DistilBERT. -25. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain +27. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -26. :doc:`EncoderDecoder ` (from Google Research) released with the paper `Leveraging +28. :doc:`EncoderDecoder ` (from Google Research) released with the paper `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -27. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: +29. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators `__ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -28. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model +30. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -29. :doc:`FNet ` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier +31. :doc:`FNet ` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier Transforms `__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -30. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: +32. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing `__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -31. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative +33. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -32. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask +34. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -33. :doc:`GPT-J ` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax +35. :doc:`GPT-J ` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax `__ by Ben Wang and Aran Komatsuzaki. -34. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo +36. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo `__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -35. :doc:`Hubert ` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech +37. :doc:`Hubert ` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units `__ by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -36. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization +38. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization `__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -37. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training +39. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -38. :doc:`LayoutLMv2 ` (from Microsoft Research Asia) released with the paper `LayoutLMv2: +40. :doc:`LayoutLMv2 ` (from Microsoft Research Asia) released with the paper `LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding `__ by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -39. :doc:`LayoutXLM ` (from Microsoft Research Asia) released with the paper `LayoutXLM: +41. :doc:`LayoutXLM ` (from Microsoft Research Asia) released with the paper `LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding `__ by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -40. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer +42. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -41. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document +43. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -42. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity +44. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention `__ by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -43. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality +45. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering `__ by Hao Tan and Mohit Bansal. -44. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual +46. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual Machine Translation `__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -45. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by +47. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft Translator Team. -46. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for +48. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -47. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible +49. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning `__ by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -48. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training +50. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -49. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training +51. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -50. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted +52. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -51. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained +53. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -52. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted +54. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization `__ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -53. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting +55. `PhoBERT `__ (from VinAI Research) released with + the paper `PhoBERT: Pre-trained language models for Vietnamese + `__ by Dat Quoc Nguyen and Anh Tuan Nguyen. +56. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -54. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient +57. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -55. :doc:`RemBERT ` (from Google Research) released with the paper `Rethinking embedding coupling in +58. :doc:`RemBERT ` (from Google Research) released with the paper `Rethinking embedding coupling in pre-trained language models `__ by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -56. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT +59. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -57. :doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: +60. :doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: Enhanced Transformer with Rotary Position Embedding `__ by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -58. :doc:`SEW ` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in Unsupervised +61. :doc:`SEW ` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition `__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -59. :doc:`SEW-D ` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in +62. :doc:`SEW-D ` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition `__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi. -60. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper +63. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -61. :doc:`SpeechToTextTransformer2 ` (from Facebook), released together with the paper +64. :doc:`SpeechToTextTransformer2 ` (from Facebook), released together with the paper `Large-Scale Self- and Semi-Supervised Learning for Speech Translation `__ by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -62. :doc:`Splinter ` (from Tel Aviv University), released together with the paper `Few-Shot +65. :doc:`Splinter ` (from Tel Aviv University), released together with the paper `Few-Shot Question Answering by Pretraining Span Selection `__ by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -63. :doc:`SqueezeBert ` (from Berkeley) released with the paper `SqueezeBERT: What can computer +66. :doc:`SqueezeBert ` (from Berkeley) released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -64. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +67. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -65. :doc:`T5v1.1 ` (from Google AI) released in the repository +68. :doc:`T5v1.1 ` (from Google AI) released in the repository `google-research/text-to-text-transfer-transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -66. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via +69. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -67. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: +70. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -68. `TrOCR `__ (from Microsoft), released together +71. `TrOCR `__ (from Microsoft), released together with the paper `TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models `__ by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei. -69. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 +72. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -70. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and +73. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and Performant Baseline for Vision and Language `__ by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -71. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for +74. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -72. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model +75. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -73. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: +76. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -74. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised +77. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -75. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive +78. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -76. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised +79. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -570,6 +580,7 @@ Flax), PyTorch, and/or TensorFlow. model_doc/auto model_doc/bart model_doc/barthez + model_doc/bartpho model_doc/beit model_doc/bert model_doc/bertweet diff --git a/docs/source/model_doc/bartpho.rst b/docs/source/model_doc/bartpho.rst new file mode 100644 index 0000000000..6e5113d328 --- /dev/null +++ b/docs/source/model_doc/bartpho.rst @@ -0,0 +1,86 @@ +.. + Copyright 2021 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +BARTpho +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The BARTpho model was proposed in `BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese +`__ by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. + +The abstract from the paper is the following: + +*We present BARTpho with two versions -- BARTpho_word and BARTpho_syllable -- the first public large-scale monolingual +sequence-to-sequence models pre-trained for Vietnamese. Our BARTpho uses the "large" architecture and pre-training +scheme of the sequence-to-sequence denoising model BART, thus especially suitable for generative NLP tasks. Experiments +on a downstream task of Vietnamese text summarization show that in both automatic and human evaluations, our BARTpho +outperforms the strong baseline mBART and improves the state-of-the-art. We release BARTpho to facilitate future +research and applications of generative Vietnamese NLP tasks.* + +Example of use: + +.. code-block:: + + >>> import torch + >>> from transformers import AutoModel, AutoTokenizer + + >>> bartpho = AutoModel.from_pretrained("vinai/bartpho-syllable") + + >>> tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable") + + >>> line = "Chúng tôi là những nghiên cứu viên." + + >>> input_ids = tokenizer(line, return_tensors="pt") + + >>> with torch.no_grad(): + ... features = bartpho(**input_ids) # Models outputs are now tuples + + >>> # With TensorFlow 2.0+: + >>> from transformers import TFAutoModel + >>> bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable") + >>> input_ids = tokenizer(line, return_tensors="tf") + >>> features = bartpho(**input_ids) + +Tips: + +- Following mBART, BARTpho uses the "large" architecture of BART with an additional layer-normalization layer on top of + both the encoder and decoder. Thus, usage examples in the :doc:`documentation of BART `, when adapting to use + with BARTpho, should be adjusted by replacing the BART-specialized classes with the mBART-specialized counterparts. + For example: + +.. code-block:: + + >>> from transformers import MBartForConditionalGeneration + >>> bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable") + >>> TXT = 'Chúng tôi là nghiên cứu viên.' + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = bartpho(input_ids).logits + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + >>> print(tokenizer.decode(predictions).split()) + +- This implementation is only for tokenization: "monolingual_vocab_file" consists of Vietnamese-specialized types + extracted from the pre-trained SentencePiece model "vocab_file" that is available from the multilingual XLM-RoBERTa. + Other languages, if employing this pre-trained multilingual SentencePiece model "vocab_file" for subword + segmentation, can reuse BartphoTokenizer with their own language-specialized "monolingual_vocab_file". + +This model was contributed by `dqnguyen `__. The original code can be found `here +`__. + +BartphoTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartphoTokenizer + :members: diff --git a/docs/source/model_doc/bertweet.rst b/docs/source/model_doc/bertweet.rst index 6a66c3202f..0f6a9befee 100644 --- a/docs/source/model_doc/bertweet.rst +++ b/docs/source/model_doc/bertweet.rst @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -Bertweet +BERTweet ----------------------------------------------------------------------------------------------------------------------- Overview diff --git a/docs/source/model_doc/phobert.rst b/docs/source/model_doc/phobert.rst index bb35a460eb..ffe8657ab3 100644 --- a/docs/source/model_doc/phobert.rst +++ b/docs/source/model_doc/phobert.rst @@ -50,7 +50,8 @@ Example of use: >>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") - This model was contributed by `dqnguyen `__. The original code can be found `here `__. +This model was contributed by `dqnguyen `__. The original code can be found `here +`__. PhobertTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 879d3b0e57..8fb4817997 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -160,6 +160,7 @@ _import_structure = { ], "models.bart": ["BartConfig", "BartTokenizer"], "models.barthez": [], + "models.bartpho": [], "models.beit": ["BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BeitConfig"], "models.bert": [ "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -350,6 +351,7 @@ _import_structure = { if is_sentencepiece_available(): _import_structure["models.albert"].append("AlbertTokenizer") _import_structure["models.barthez"].append("BarthezTokenizer") + _import_structure["models.bartpho"].append("BartphoTokenizer") _import_structure["models.bert_generation"].append("BertGenerationTokenizer") _import_structure["models.big_bird"].append("BigBirdTokenizer") _import_structure["models.camembert"].append("CamembertTokenizer") @@ -2209,6 +2211,7 @@ if TYPE_CHECKING: if is_sentencepiece_available(): from .models.albert import AlbertTokenizer from .models.barthez import BarthezTokenizer + from .models.bartpho import BartphoTokenizer from .models.bert_generation import BertGenerationTokenizer from .models.big_bird import BigBirdTokenizer from .models.camembert import CamembertTokenizer diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index df6e841038..ec961e4252 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -21,6 +21,7 @@ from . import ( auto, bart, barthez, + bartpho, beit, bert, bert_generation, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 37162deefe..58a760754b 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -243,6 +243,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("hubert", "Hubert"), ("barthez", "BARThez"), ("phobert", "PhoBERT"), + ("bartpho", "BARTpho"), ("cpm", "CPM"), ("bertweet", "Bertweet"), ("bert-japanese", "BertJapanese"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a30d981874..bc0443526f 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -189,6 +189,7 @@ else: ), ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), + ("bartpho", ("BartphoTokenizer", None)), ( "barthez", ( diff --git a/src/transformers/models/bartpho/__init__.py b/src/transformers/models/bartpho/__init__.py new file mode 100644 index 0000000000..9d57a21845 --- /dev/null +++ b/src/transformers/models/bartpho/__init__.py @@ -0,0 +1,36 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_sentencepiece_available + + +_import_structure = {} + +if is_sentencepiece_available(): + _import_structure["tokenization_bartpho"] = ["BartphoTokenizer"] + +if TYPE_CHECKING: + if is_sentencepiece_available(): + from .tokenization_bartpho import BartphoTokenizer + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py new file mode 100644 index 0000000000..adb6af893f --- /dev/null +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -0,0 +1,304 @@ +# coding=utf-8 +# Copyright 2021 VinAI Research and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +""" Tokenization classes for BARTpho-syllable model.""" + + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "monolingual_vocab_file": "dict.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/sentencepiece.bpe.model", + }, + "monolingual_vocab_file": { + "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/dict.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024} + + +class BartphoTokenizer(PreTrainedTokenizer): + """ + Adapted from :class:`~transformers.XLMRobertaTokenizer`. Based on `SentencePiece + `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the + multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types. + monolingual_vocab_file (:obj:`str`): + Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized + types extracted from the multilingual vocabulary vocab_file of 250K types. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + monolingual_vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self.vocab_file = vocab_file + self.monolingual_vocab_file = monolingual_vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + + # Load the reduced vocab + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + with open(monolingual_vocab_file, "r", encoding="utf-8") as f: + for line in f.readlines(): + token = line.strip().split()[0] + self.fairseq_tokens_to_ids[token] = len(self.fairseq_tokens_to_ids) + self.fairseq_tokens_to_ids[""] = len(self.fairseq_tokens_to_ids) + + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An BARTPho sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTPho does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.fairseq_ids_to_tokens) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + else: + return self.fairseq_tokens_to_ids[""] + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.fairseq_ids_to_tokens[index] + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + out_monolingual_vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["monolingual_vocab_file"], + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath(out_monolingual_vocab_file): + copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file) + + return out_vocab_file, out_monolingual_vocab_file diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py index 518703520c..cdc2a892f8 100644 --- a/src/transformers/utils/dummy_sentencepiece_objects.py +++ b/src/transformers/utils/dummy_sentencepiece_objects.py @@ -20,6 +20,15 @@ class BarthezTokenizer: requires_backends(cls, ["sentencepiece"]) +class BartphoTokenizer: + def __init__(self, *args, **kwargs): + requires_backends(self, ["sentencepiece"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["sentencepiece"]) + + class BertGenerationTokenizer: def __init__(self, *args, **kwargs): requires_backends(self, ["sentencepiece"]) diff --git a/tests/test_tokenization_bartpho.py b/tests/test_tokenization_bartpho.py new file mode 100644 index 0000000000..f0aa2a9592 --- /dev/null +++ b/tests/test_tokenization_bartpho.py @@ -0,0 +1,66 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest + +from transformers.models.bartpho.tokenization_bartpho import VOCAB_FILES_NAMES, BartphoTokenizer + +from .test_tokenization_common import TokenizerTesterMixin + + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece_bpe.model") + + +class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BartphoTokenizer + test_rust_tokenizer = False + test_sentencepiece = True + + def setUp(self): + super().setUp() + + vocab = ["▁This", "▁is", "▁a", "▁t", "est"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + self.special_tokens_map = {"unk_token": ""} + + self.monolingual_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"]) + with open(self.monolingual_vocab_file, "w", encoding="utf-8") as fp: + for token in vocab_tokens: + fp.write(f"{token} {vocab_tokens[token]}\n") + + tokenizer = BartphoTokenizer(SAMPLE_VOCAB, self.monolingual_vocab_file, **self.special_tokens_map) + tokenizer.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return BartphoTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "This is a là test" + output_text = "This is a test" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = BartphoTokenizer(SAMPLE_VOCAB, self.monolingual_vocab_file, **self.special_tokens_map) + text = "This is a là test" + bpe_tokens = "▁This ▁is ▁a ▁l à ▁t est".split() + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [tokenizer.unk_token] + input_bpe_tokens = [4, 5, 6, 3, 3, 7, 8, 3] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)