Release: v3.0.2

Fix fast tokenizers too (#5562 )
Various tokenizers fixes (#5558 )
2020-07-06 18:49:44 -04:00 · 2020-07-06 18:45:01 -04:00 · 2020-07-06 18:27:53 -04:00 · 2020-07-06 17:26:48 -04:00 · 2020-07-06 12:17:05 -04:00 · 2020-07-06 11:33:57 -04:00
17 changed files with 153 additions and 87 deletions
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'3.0.0'
+release = u'3.0.2'


 # -- General configuration ---------------------------------------------------
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -39,7 +39,7 @@ of the specified model are used to initialize the model. The
 library also includes a number of task-specific final layers or 'heads' whose
 weights are instantiated randomly when not present in the specified
 pre-trained model. For example, instantiating a model with
-``BertForSequenceClassification.from_pretrained('bert-base-uncased', num_classes=2)``
+``BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)``
 will create a BERT model instance with encoder weights copied from the
 ``bert-base-uncased`` model and a randomly initialized sequence
 classification head on top of the encoder with an output size of 2. Models
@@ -272,7 +272,7 @@ optimize.
 :func:`~transformers.Trainer` uses a built-in default function to collate
 batches and prepare them to be fed into the model. If needed, you can also
 use the ``data_collator`` argument to pass your own collator function which
-takes in the data in the format provides by your dataset and returns a
+takes in the data in the format provided by your dataset and returns a
 batch ready to be fed into the model. Note that
 :func:`~transformers.TFTrainer` expects the passed datasets to be dataset
 objects from ``tensorflow_datasets``.
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -214,8 +214,14 @@ def main():
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
        preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)
+
+        if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
+            tokenizer_kwargs = {"add_space_before_punct_symbol": True}
+        else:
+            tokenizer_kwargs = {}
+
        encoded_prompt = tokenizer.encode(
-            preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", add_space_before_punct_symbol=True
+            preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
        )
    else:
        encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
--- a/setup.py
+++ b/setup.py
@@ -101,7 +101,7 @@ extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "sci

 setup(
    name="transformers",
-    version="3.0.1",
+    version="3.0.2",
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
@@ -114,7 +114,7 @@ setup(
    packages=find_packages("src"),
    install_requires=[
        "numpy",
-        "tokenizers == 0.8.0-rc4",
+        "tokenizers == 0.8.1.rc1",
        # dataclasses for Python versions that don't have it
        "dataclasses;python_version<'3.7'",
        # utilities from PyPA to e.g. compare versions
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2,7 +2,7 @@
 # There's no way to ignore "F401 '...' imported but unused" warnings in this
 # module, but to preserve other warnings. So, don't check this module at all.

-__version__ = "3.0.1"
+__version__ = "3.0.2"

 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -71,10 +71,10 @@ from transformers import (
    XLMRobertaConfig,
    XLNetConfig,
    cached_path,
-    hf_bucket_url,
    is_torch_available,
    load_pytorch_checkpoint_in_tf2_model,
 )
+from transformers.file_utils import hf_bucket_url


 if is_torch_available():
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -347,6 +347,10 @@ class TFGenerationMixin:
            encoder_outputs = None
            cur_len = shape_list(input_ids)[-1]

+        assert (
+            cur_len < max_length
+        ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`"
+
        if num_beams > 1:
            output = self._generate_beam_search(
                input_ids,
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -428,6 +428,10 @@ class GenerationMixin:
            encoder_outputs = None
            cur_len = input_ids.shape[-1]

+        assert (
+            cur_len < max_length
+        ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`"
+
        if num_beams > 1:
            output = self._generate_beam_search(
                input_ids,
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -478,7 +478,7 @@ class TFT5Block(tf.keras.layers.Layer):
        return outputs  # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)


-class _NoLayerEmbedTokens(object):
+class _NoLayerEmbedTokens:
    """
     this class wraps a the TFSharedEmbeddingTokens layer into a python 'no-keras-layer'
     class to avoid problem with weight restoring. Also it makes sure that the layer is
@@ -655,7 +655,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.

-        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
+        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
        # extended_attention_mask = tf.math.equal(extended_attention_mask,
        #                                         tf.transpose(extended_attention_mask, perm=(-1, -2)))
@@ -682,16 +682,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        else:
            encoder_extended_attention_mask = None

-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            raise NotImplementedError
-        else:
-            head_mask = [None] * self.num_hidden_layers
-            # head_mask = tf.constant([0] * self.num_hidden_layers)
+        assert head_mask is None, "Head mask not supported"
+        head_mask = [None] * self.num_hidden_layers

        present_key_value_states = ()
        all_hidden_states = ()
@@ -1054,8 +1046,6 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
        r"""
    Returns:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
-        loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided):
-            Classification loss (cross entropy).
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -606,7 +606,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
        mask_token="[MASK]",
        clean_text=True,
        tokenize_chinese_chars=True,
-        strip_accents=True,
+        strip_accents=None,
        wordpieces_prefix="##",
        **kwargs
    ):
--- a/src/transformers/tokenization_gpt2.py
+++ b/src/transformers/tokenization_gpt2.py
@@ -102,17 +102,26 @@ def get_pairs(word):

 class GPT2Tokenizer(PreTrainedTokenizer):
    """
-    GPT-2 BPE tokenizer. Peculiarities:
+    GPT-2 BPE tokenizer, using byte-level Byte-Pair-Encoding.

-    - Byte-level Byte-Pair-Encoding
-    - Requires a space to start the input string => the encoding methods should be called with the
-      ``add_prefix_space`` flag set to ``True``.
-      Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-      the absence of a space at the beginning of a string:
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ::

-        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+        >>> from transformers import GPT2Tokenizer
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        >>> tokenizer("Hello world")['input_ids']
+        [15496, 995]
+        >>> tokenizer(" Hello world")['input_ids']
+        [18435, 995]
+
+    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    .. note::
+
+        When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.
@@ -137,6 +146,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

    def __init__(
        self,
@@ -287,21 +297,30 @@ class GPT2Tokenizer(PreTrainedTokenizer):

 class GPT2TokenizerFast(PreTrainedTokenizerFast):
    """
-    Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library).
+    Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library), using byte-level
+    Byte-Pair-Encoding.

-    Peculiarities:
-
-    - Byte-level Byte-Pair-Encoding
-    - Requires a space to start the input string => the encoding methods should be called with the
-      ``add_prefix_space`` flag set to ``True``.
-      Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-      the absence of a space at the beginning of a string:
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ::

-        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+        >>> from transformers import GPT2TokenizerFast
+        >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+        >>> tokenizer("Hello world")['input_ids']
+        [15496, 995]
+        >>> tokenizer(" Hello world")['input_ids']
+        [18435, 995]

-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
+    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    .. note::
+
+        When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
+        ``add_prefix_space=True``.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
@@ -330,6 +349,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

    def __init__(
        self,
--- a/src/transformers/tokenization_openai.py
+++ b/src/transformers/tokenization_openai.py
@@ -96,6 +96,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        super().__init__(unk_token=unk_token, **kwargs)
@@ -261,6 +262,7 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        kwargs.setdefault("unk_token", unk_token)
--- a/src/transformers/tokenization_roberta.py
+++ b/src/transformers/tokenization_roberta.py
@@ -62,17 +62,26 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

 class RobertaTokenizer(GPT2Tokenizer):
    """
-    Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
+    Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.

-    - Byte-level Byte-Pair-Encoding
-    - Requires a space to start the input string => the encoding methods should be called with the
-      ``add_prefix_space`` flag set to ``True``.
-      Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-      the absence of a space at the beginning of a string:
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ::

-        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+        >>> from transformers import RobertaTokenizer
+        >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+        >>> tokenizer("Hello world")['input_ids']
+        [0, 31414, 232, 2]
+        >>> tokenizer(" Hello world")['input_ids']
+        [0, 20920, 232, 2]
+
+    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    .. note::
+
+        When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.
@@ -251,19 +260,28 @@ class RobertaTokenizer(GPT2Tokenizer):

 class RobertaTokenizerFast(GPT2TokenizerFast):
    """
-    Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library).
+    Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
+    tokenizer, using byte-level Byte-Pair-Encoding.

-    Peculiarities:
-
-    - Byte-level Byte-Pair-Encoding
-    - Requires a space to start the input string => the encoding methods should be called with the
-      ``add_prefix_space`` flag set to ``True``.
-      Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-      the absence of a space at the beginning of a string:
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ::

-        tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+        >>> from transformers import RobertaTokenizerFast
+        >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        >>> tokenizer("Hello world")['input_ids']
+        [0, 31414, 232, 2]
+        >>> tokenizer(" Hello world")['input_ids']
+        [0, 20920, 232, 2]
+
+    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    .. note::
+
+        When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
+        ``add_prefix_space=True``.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -588,8 +588,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
-                padding_strategy=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
-                truncation_strategy=truncation_strategy,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
+                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
@@ -649,7 +649,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
-    ) -> Union[int, List[int]]:
+    ) -> Union[str, List[str]]:
        """ Converts a single index or a sequence of indices (integers) in a token "
            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.

--- a/tests/test_modeling_tf_t5.py
+++ b/tests/test_modeling_tf_t5.py
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, Dict, List, Tuple, Union

 from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
 from transformers.testing_utils import require_tf, require_torch, slow
+from transformers.tokenization_utils import AddedToken


 if TYPE_CHECKING:
@@ -233,6 +234,12 @@ class TokenizerTesterMixin:

                self.assertListEqual(subwords, subwords_loaded)

+    def test_pickle_added_tokens(self):
+        tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
+        tok2 = pickle.loads(pickle.dumps(tok1))
+
+        self.assertEqual(tok1.__getstate__(), tok2.__getstate__())
+
    def test_added_tokens_do_lower_case(self):
        # TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
--- a/tests/test_tokenization_fast.py
+++ b/tests/test_tokenization_fast.py
@@ -91,8 +91,6 @@ class CommonFastTokenizerTest(unittest.TestCase):
        self.assert_padding(tokenizer_r, tokenizer_p)
        self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
        self.assert_prepare_for_model(tokenizer_r, tokenizer_p)
-        # TODO: enable for v3.0.0
-        # self.assert_empty_output_no_special_tokens(tokenizer_r, tokenizer_p)

    def fast_only(self, tokenizer_r):
        # Ensure None raise an error
@@ -748,29 +746,41 @@ class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
            add_special_tokens=True,
        )

-        expected_results = [
-            ((0, 1), "A"),
-            ((1, 2), ","),
-            ((3, 8), "naive"),  # BERT normalizes this away
-            # Append MASK here after lower-casing
-            ((16, 21), "Allen"),
-            ((22, 24), "##NL"),
-            ((24, 25), "##P"),
-            ((26, 34), "sentence"),
-            ((35, 36), "."),
-        ]
-
-        # Check if the tokenizer is uncased
-        if tokenizer_r.init_kwargs.get("do_lower_case"):
-            expected_results = [(offset, token.lower()) for (offset, token) in expected_results]
-
-        # Append the special tokens
-        expected_results.insert(3, ((9, 15), "[MASK]"))
-        expected_results.insert(0, (None, "[CLS]"))
-        expected_results.append((None, "[SEP]"))
+        do_lower_case = tokenizer_r.init_kwargs.get("do_lower_case")
+        expected_results = (
+            [
+                ((0, 0), "[CLS]"),
+                ((0, 1), "A"),
+                ((1, 2), ","),
+                ((3, 5), "na"),
+                ((5, 6), "##ï"),
+                ((6, 8), "##ve"),
+                ((9, 15), "[MASK]"),
+                ((16, 21), "Allen"),
+                ((21, 23), "##NL"),
+                ((23, 24), "##P"),
+                ((25, 33), "sentence"),
+                ((33, 34), "."),
+                ((0, 0), "[SEP]"),
+            ]
+            if not do_lower_case
+            else [
+                ((0, 0), "[CLS]"),
+                ((0, 1), "a"),
+                ((1, 2), ","),
+                ((3, 8), "naive"),
+                ((9, 15), "[MASK]"),
+                ((16, 21), "allen"),
+                ((21, 23), "##nl"),
+                ((23, 24), "##p"),
+                ((25, 33), "sentence"),
+                ((33, 34), "."),
+                ((0, 0), "[SEP]"),
+            ]
+        )

        self.assertEqual([e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]))
-        # self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
+        self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])


 class RobertaFastTokenizerTest(CommonFastTokenizerTest):
Author	SHA1	Message	Date
Lysandre	b0892fa0e8	Release: v3.0.2 Some checks failed GitHub-hosted runner / check_code_quality (push) Has been cancelled Details	2020-07-06 18:49:44 -04:00
Sylvain Gugger	f1e2e423ab	Fix fast tokenizers too (#5562 )	2020-07-06 18:45:01 -04:00
Anthony MOI	5787e4c159	Various tokenizers fixes (#5558 ) * BertTokenizerFast - Do not specify strip_accents by default * Bump tokenizers to new version * Add test for AddedToken serialization	2020-07-06 18:27:53 -04:00
Sylvain Gugger	21f28c34b7	Fix #5507 (#5559 ) * Fix #5507 * Fix formatting	2020-07-06 17:26:48 -04:00
Lysandre Debut	9d9b872b66	The `add_space_before_punct_symbol` is only for TransfoXL (#5549 )	2020-07-06 12:17:05 -04:00
Lysandre Debut	d6b0b9d451	GPT2 tokenizer should not output token type IDs (#5546 ) * GPT2 tokenizer should not output token type IDs * Same for OpenAIGPT	2020-07-06 11:33:57 -04:00
Sylvain Gugger	7833b21a5a	Fix #5544 (#5551 )	2020-07-06 11:22:24 -04:00
Thomas Wolf	c473484087	Fix the tokenization warning noted in #5505 (#5550 ) * fix warning * style and quality	2020-07-06 11:15:25 -04:00
Lysandre	1bbc28bee7	Imports organization	2020-07-06 10:27:10 -04:00
Mohamed Taher Alrefaie	1bc13697b1	Update convert_pytorch_checkpoint_to_tf2.py (#5531 ) fixed ImportError: cannot import name 'hf_bucket_url'	2020-07-06 09:55:10 -04:00
Arnav Sharma	b2309cc6bf	Typo fix in `training` doc (#5495 )	2020-07-06 09:15:22 -04:00
ELanning	7ecff0ccbb	Fix typo in training (#5510 )	2020-07-06 09:14:57 -04:00
Sam Shleifer	58cca47c16	[cleanup] TF T5 tests only init t5-base once. (#5410 )	2020-07-03 14:27:49 -04:00
Patrick von Platen	991172922f	better error message (#5497 )	2020-07-03 19:25:25 +02:00
Thomas Wolf	b58a15a31e	unpining specific git versions in setup.py	2020-07-03 17:38:39 +02:00