From 7b13bd01df821ccb5c2253f23f93b89a821e7729 Mon Sep 17 00:00:00 2001 From: rmroczkowski <64909124+rmroczkowski@users.noreply.github.com> Date: Fri, 16 Oct 2020 09:06:51 +0200 Subject: [PATCH] Herbert polish model (#7798) * HerBERT transformer model for Polish language understanding. * HerbertTokenizerFast generated with HerbertConverter * Herbert base and large model cards * Herbert model cards with tags * Herbert tensorflow models * Herbert model tests based on Bert test suit * src/transformers/tokenization_herbert.py edited online with Bitbucket * src/transformers/tokenization_herbert.py edited online with Bitbucket * docs/source/model_doc/herbert.rst edited online with Bitbucket * Herbert tokenizer tests and bug fixes * src/transformers/configuration_herbert.py edited online with Bitbucket * Copyrights and tests for TFHerbertModel * model_cards/allegro/herbert-base-cased/README.md edited online with Bitbucket * model_cards/allegro/herbert-large-cased/README.md edited online with Bitbucket * Bug fixes after testing * Reformat modified_only_fixup * Proper order of configuration * Herbert proper documentation formatting * Formatting with make modified_only_fixup * Dummies fixed * Adding missing models to documentation * Removing HerBERT model as it is a simple extension of BERT * Update model_cards/allegro/herbert-base-cased/README.md Co-authored-by: Julien Chaumond * Update model_cards/allegro/herbert-large-cased/README.md Co-authored-by: Julien Chaumond * HerbertTokenizer deprecated configuration removed Co-authored-by: Julien Chaumond --- .../allegro/herbert-base-cased/README.md | 51 +++++ .../allegro/herbert-large-cased/README.md | 50 +++++ src/transformers/__init__.py | 1 + src/transformers/convert_slow_tokenizer.py | 32 +++ src/transformers/tokenization_herbert.py | 197 ++++++++++++++++++ tests/test_tokenization_herbert.py | 122 +++++++++++ 6 files changed, 453 insertions(+) create mode 100644 model_cards/allegro/herbert-base-cased/README.md create mode 100644 model_cards/allegro/herbert-large-cased/README.md create mode 100644 src/transformers/tokenization_herbert.py create mode 100644 tests/test_tokenization_herbert.py diff --git a/model_cards/allegro/herbert-base-cased/README.md b/model_cards/allegro/herbert-base-cased/README.md new file mode 100644 index 0000000000..0afd84d4a6 --- /dev/null +++ b/model_cards/allegro/herbert-base-cased/README.md @@ -0,0 +1,51 @@ +--- +language: pl +tags: +- herbert +license: cc-by-sa-4.0 +--- + +# HerBERT +**[HerBERT](https://en.wikipedia.org/wiki/Zbigniew_Herbert)** is a BERT-based Language Model trained on Polish Corpora +using MLM and SSO objectives with dynamic masking of whole words. +Model training and experiments were conducted with [transformers](https://github.com/huggingface/transformers) in version 2.9. + +## Tokenizer +The training dataset was tokenized into subwords using ``CharBPETokenizer`` a character level byte-pair encoding with +a vocabulary size of 50k tokens. The tokenizer itself was trained with a [tokenizers](https://github.com/huggingface/tokenizers) library. +We kindly encourage you to use the **Fast** version of tokenizer, namely ``HerbertTokenizerFast``. + +## HerBERT usage + + +Example code: +```python +from transformers import AutoTokenizer, AutoModel + +tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased") +model = AutoModel.from_pretrained("allegro/herbert-base-cased") + +output = model( + **tokenizer.batch_encode_plus( + [ + ( + "A potem szedł środkiem drogi w kurzawie, bo zamiatał nogami, ślepy dziad prowadzony przez tłustego kundla na sznurku.", + "A potem leciał od lasu chłopak z butelką, ale ten ujrzawszy księdza przy drodze okrążył go z dala i biegł na przełaj pól do karczmy." + ) + ], + padding='longest', + add_special_tokens=True, + return_tensors='pt' + ) +) +``` + + +## License +CC BY-SA 4.0 + + +## Authors +Model was trained by **Allegro Machine Learning Research** team. + +You can contact us at: klejbenchmark@allegro.pl diff --git a/model_cards/allegro/herbert-large-cased/README.md b/model_cards/allegro/herbert-large-cased/README.md new file mode 100644 index 0000000000..583586f747 --- /dev/null +++ b/model_cards/allegro/herbert-large-cased/README.md @@ -0,0 +1,50 @@ +--- +language: pl +tags: +- herbert +license: cc-by-sa-4.0 +--- +# HerBERT +**[HerBERT](https://en.wikipedia.org/wiki/Zbigniew_Herbert)** is a BERT-based Language Model trained on Polish Corpora +using MLM and SSO objectives with dynamic masking of whole words. +Model training and experiments were conducted with [transformers](https://github.com/huggingface/transformers) in version 2.9. + +## Tokenizer +The training dataset was tokenized into subwords using ``CharBPETokenizer`` a character level byte-pair encoding with +a vocabulary size of 50k tokens. The tokenizer itself was trained with a [tokenizers](https://github.com/huggingface/tokenizers) library. +We kindly encourage you to use the **Fast** version of tokenizer, namely ``HerbertTokenizerFast``. + +## HerBERT usage + + +Example code: +```python +from transformers import AutoTokenizer, AutoModel + +tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-large-cased") +model = AutoModel.from_pretrained("allegro/herbert-large-cased") + +output = model( + **tokenizer.batch_encode_plus( + [ + ( + "A potem szedł środkiem drogi w kurzawie, bo zamiatał nogami, ślepy dziad prowadzony przez tłustego kundla na sznurku.", + "A potem leciał od lasu chłopak z butelką, ale ten ujrzawszy księdza przy drodze okrążył go z dala i biegł na przełaj pól do karczmy." + ) + ], + padding='longest', + add_special_tokens=True, + return_tensors='pt' + ) +) +``` + + +## License +CC BY-SA 4.0 + + +## Authors +Model was trained by **Allegro Machine Learning Research** team. + +You can contact us at: klejbenchmark@allegro.pl diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4e6d8bdd34..358901967a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -177,6 +177,7 @@ from .tokenization_flaubert import FlaubertTokenizer from .tokenization_fsmt import FSMTTokenizer from .tokenization_funnel import FunnelTokenizer, FunnelTokenizerFast from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast +from .tokenization_herbert import HerbertTokenizer, HerbertTokenizerFast from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index fa7cd2d38a..e65c2f4399 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -227,6 +227,37 @@ class GPT2Converter(Converter): return tokenizer +class HerbertConverter(Converter): + def converted(self) -> Tokenizer: + tokenizer_info_str = "#version:" + token_suffix = "" + + vocab = self.original_tokenizer.encoder + merges = list(self.original_tokenizer.bpe_ranks.keys()) + if tokenizer_info_str in merges[0][0]: + merges = merges[1:] + + tokenizer = Tokenizer( + BPE( + vocab, + merges, + dropout=None, + unk_token=self.original_tokenizer.unk_token, + end_of_word_suffix=token_suffix, + ) + ) + + tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False, strip_accents=False) + tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() + tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix) + tokenizer.post_processor = processors.BertProcessing( + sep=(self.original_tokenizer.sep_token, self.original_tokenizer.sep_token_id), + cls=(self.original_tokenizer.cls_token, self.original_tokenizer.cls_token_id), + ) + + return tokenizer + + class RobertaConverter(Converter): def converted(self) -> Tokenizer: ot = self.original_tokenizer @@ -550,6 +581,7 @@ CONVERTERS = { "ElectraTokenizer": BertConverter, "FunnelTokenizer": FunnelConverter, "GPT2Tokenizer": GPT2Converter, + "HerbertTokenizer": HerbertConverter, "LxmertTokenizer": BertConverter, "MBartTokenizer": MBartConverter, "OpenAIGPTTokenizer": OpenAIGPTConverter, diff --git a/src/transformers/tokenization_herbert.py b/src/transformers/tokenization_herbert.py new file mode 100644 index 0000000000..8104485f0f --- /dev/null +++ b/src/transformers/tokenization_herbert.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +from .tokenization_bert import BasicTokenizer +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .tokenization_xlm import XLMTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +class HerbertTokenizer(XLMTokenizer): + """ + Construct a BPE tokenizer for HerBERT. + + Peculiarities: + + - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. + Each occurence of a punctuation character will be treated separately. + + - Such pretokenized input is BPE subtokenized + + This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__(self, **kwargs): + + kwargs["cls_token"] = "" + kwargs["unk_token"] = "" + kwargs["pad_token"] = "" + kwargs["mask_token"] = "" + kwargs["sep_token"] = "" + kwargs["do_lowercase_and_remove_accent"] = False + kwargs["additional_special_tokens"] = [] + + super().__init__(**kwargs) + self.bert_pre_tokenizer = BasicTokenizer( + do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False + ) + + def _tokenize(self, text): + + pre_tokens = self.bert_pre_tokenizer.tokenize(text) + + split_tokens = [] + for token in pre_tokens: + if token: + split_tokens.extend([t for t in self.bpe(token).split(" ")]) + + return split_tokens + + +class HerbertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library). + + Peculiarities: + + - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. + Each occurence of a punctuation character will be treated separately. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = HerbertTokenizer + + def __init__(self, vocab_file, merges_file, **kwargs): + + kwargs["cls_token"] = "" + kwargs["unk_token"] = "" + kwargs["pad_token"] = "" + kwargs["mask_token"] = "" + kwargs["sep_token"] = "" + + super().__init__( + vocab_file, + merges_file, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + An HerBERT, like BERT sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + + cls = [self.cls_token_id] + sep = [self.sep_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + HerBERT, like BERT sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/tests/test_tokenization_herbert.py b/tests/test_tokenization_herbert.py new file mode 100644 index 0000000000..bf876a05ec --- /dev/null +++ b/tests/test_tokenization_herbert.py @@ -0,0 +1,122 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, Allegro.pl and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import unittest + +from transformers.testing_utils import slow +from transformers.tokenization_herbert import VOCAB_FILES_NAMES, HerbertTokenizer, HerbertTokenizerFast + +from .test_tokenization_common import TokenizerTesterMixin + + +class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = HerbertTokenizer + rust_tokenizer_class = HerbertTokenizerFast + test_rust_tokenizer = True + + def setUp(self): + super().setUp() + + vocab = [ + "", + "", + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w", + "r", + "t", + "lo", + "low", + "er", + "low", + "lowest", + "newer", + "wider", + ",", + "", + ] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + merges = ["l o 123", "lo w 1456", "e r 1789", ""] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w") as fp: + fp.write(json.dumps(vocab_tokens)) + with open(self.merges_file, "w") as fp: + fp.write("\n".join(merges)) + + def get_input_output_texts(self, tokenizer): + input_text = "lower newer" + output_text = "lower newer" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(vocab_file=self.vocab_file, merges_file=self.merges_file) + + text = "lower" + bpe_tokens = ["low", "er"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = tokens + [""] + input_bpe_tokens = [16, 17, 23] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "lower,newer" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("allegro/herbert-base-cased") + + text = tokenizer.encode("konstruowanie sekwencji", add_special_tokens=False) + text_2 = tokenizer.encode("konstruowanie wielu sekwencji", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [0] + text + [2] + assert encoded_pair == [0] + text + [2] + text_2 + [2]