Merge branch 'master' into do_lower_case

2019-11-27 17:04:39 +01:00
parent 7246d3c2f9 de2696f68e
commit 21637d4924
72 changed files with 4048 additions and 333 deletions
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -21,6 +21,7 @@ import os
 import json
 import six
 import copy
+import itertools
 from io import open

 from .file_utils import cached_path, is_tf_available, is_torch_available
@@ -646,9 +647,9 @@ class PreTrainedTokenizer(object):
                        tokenized_text += [sub_text]
                text_list = tokenized_text

-            return sum((self._tokenize(token, **kwargs) if token not \
+            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
                    in self.added_tokens_encoder and token not in self.all_special_tokens \
-                    else [token] for token in tokenized_text), [])
+                    else [token] for token in tokenized_text)))

        added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
        tokenized_text = split_on_tokens(added_tokens, text)
@@ -676,10 +677,6 @@ class PreTrainedTokenizer(object):
        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
-                           "for this model ({} > {}). Running this sequence through the model will result in "
-                           "indexing errors".format(len(ids), self.max_len))
        return ids

    def _convert_token_to_id_with_added_voc(self, token):
@@ -882,6 +879,11 @@ class PreTrainedTokenizer(object):
            encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
            encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]

+        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(encoded_inputs["input_ids"]), self.max_len))
+
        return encoded_inputs

    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
@@ -1060,7 +1062,7 @@ class PreTrainedTokenizer(object):
            class attributes (cls_token, unk_token...).
        """
        all_toks = self.all_special_tokens
-        all_ids = list(self._convert_token_to_id(t) for t in all_toks)
+        all_ids = self.convert_tokens_to_ids(all_toks)
        return all_ids

    @staticmethod