Merge branch 'master' into iterative_split_on_token

2019-08-21 01:33:02 +02:00
parent 9ce36e3e4b 9beaa85b07
commit d30cbaf5dc
34 changed files with 1486 additions and 182 deletions
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -180,9 +180,10 @@ class PreTrainedTokenizer(object):

    @classmethod
    def from_pretrained(cls, *inputs, **kwargs):
-        r""" Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+        r"""
+        Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.

-        Parameters:
+        Args:
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
@@ -383,14 +384,15 @@ class PreTrainedTokenizer(object):


    def add_tokens(self, new_tokens):
-        """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
+        """
+        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
        vocabulary, they are added to it with indices starting from length of the current vocabulary.

-            Parameters:
-                new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
+        Args:
+            new_tokens: list of strings. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).

-            Returns:
-                Number of tokens added to the vocabulary.
+        Returns:
+            Number of tokens added to the vocabulary.

        Examples::

@@ -422,17 +424,20 @@ class PreTrainedTokenizer(object):


    def add_special_tokens(self, special_tokens_dict):
-        """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
-            to class attributes. If special tokens are NOT in the vocabulary, they are added
-            to it (indexed starting from the last index of the current vocabulary).
+        """
+        Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
+        to class attributes. If special tokens are NOT in the vocabulary, they are added
+        to it (indexed starting from the last index of the current vocabulary).

-            Parameters:
-                special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
+        Args:
+            special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
+                [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
+                ``additional_special_tokens``].

-                    Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
+                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).

-            Returns:
-                Number of tokens added to the vocabulary.
+        Returns:
+            Number of tokens added to the vocabulary.

        Examples::

@@ -549,14 +554,37 @@ class PreTrainedTokenizer(object):
    def _convert_token_to_id(self, token):
        raise NotImplementedError

-
-    def encode(self, text):
-        """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-
-        Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
+    def encode(self, text, text_pair=None, add_special_tokens=False):
        """
-        return self.convert_tokens_to_ids(self.tokenize(text))
+        Converts a string (or a pair of strings) into a sequence of ids (integers), using the tokenizer and vocabulary.
+
+        Without ``text_pair`` and special tokens, same as ``self.convert_tokens_to_ids(self.tokenize(text))``.

+        Args:
+            text: The first sequence to be encoded.
+            text_pair: Optional second sequence to be encoded.
+            add_special_tokens: if ``True``, the result of the matching ``add_special_tokens_*`` hook is returned.
+
+        Returns:
+            The ids of ``text``, or a tuple ``(ids, pair_ids)`` when ``text_pair`` is given without special tokens.
+        """
+        first_ids = self.convert_tokens_to_ids(self.tokenize(text))
+        if text_pair is None:
+            if add_special_tokens:
+                return self.add_special_tokens_single_sentence(first_ids)
+            return first_ids
+
+        # Same conversion path as the single-sequence case, so both sequences are mapped to ids consistently.
+        second_ids = self.convert_tokens_to_ids(self.tokenize(text_pair))
+        if add_special_tokens:
+            return self.add_special_tokens_sentences_pair(first_ids, second_ids)
+        return first_ids, second_ids
+
+    def add_special_tokens_single_sentence(self, token_ids):  # hook: ``encode`` passes the ids of a single sequence
+        raise NotImplementedError  # no base implementation; presumably overridden per model -- TODO confirm
+
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):  # hook: ``encode`` passes both id lists
+        raise NotImplementedError  # no base implementation; presumably overridden per model -- TODO confirm

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) in a token "
@@ -591,16 +619,28 @@ class PreTrainedTokenizer(object):
        return ' '.join(self.convert_ids_to_tokens(tokens))

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
-            with options to remove special tokens and clean up tokenization spaces.
-
+        """
+        Converts a sequence of ids (integers) into a string, or into a list of strings (one per non-empty
+        segment) when the decoded text contains ``sep_token``; can remove special tokens and clean up spaces.
        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
        text = self.convert_tokens_to_string(filtered_tokens)
-        if clean_up_tokenization_spaces:
-            text = self.clean_up_tokenization(text)
-        return text
+
+        if self.sep_token is None or self.sep_token not in text:
+            # No separator in the decoded text: return a single string.
+            return self.clean_up_tokenization(text) if clean_up_tokenization_spaces else text
+
+        # ``cls_token`` may be unset; ``str.replace(None, ...)`` would raise a TypeError.
+        if self.cls_token is not None:
+            # Treat the classification token as one more segment boundary.
+            text = text.replace(self.cls_token, self.sep_token)
+
+        # Split on the separator and drop the empty segments.
+        segments = [segment for segment in text.split(self.sep_token) if len(segment) > 0]
+        if clean_up_tokenization_spaces:
+            return [self.clean_up_tokenization(segment) for segment in segments]
+        return segments

    @property
    def special_tokens_map(self):
@@ -632,7 +672,7 @@ class PreTrainedTokenizer(object):
            class attributes (cls_token, unk_token...).
        """
        all_toks = self.all_special_tokens
-        all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
+        all_ids = list(self._convert_token_to_id(t) for t in all_toks)
        return all_ids

    @staticmethod