Updated DistilBERT

2019-09-24 07:03:24 -04:00
parent ab984a8b72
commit 9d44236f70
3 changed files with 98 additions and 62 deletions
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -412,7 +412,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
            output_mask=True,
            max_length=max_seq_length
        )
-        input_ids, segment_ids = inputs["sequence"], inputs["mask"]
+        input_ids, segment_ids = inputs["input_ids"], inputs["output_token_type"]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -196,8 +196,8 @@ class CommonTestCases:
            if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                seq_0 = "Test this method."
                seq_1 = "With these inputs."
-                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
+                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
-                sequences, mask = information["sequence"], information["mask"]
+                sequences, mask = information["input_ids"], information["output_token_type"]
                assert len(sequences) == len(mask)
        def test_number_of_added_tokens(self):
@@ -224,7 +224,7 @@ class CommonTestCases:
            total_length = len(sequence) + num_added_tokens
            information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
-            truncated_sequence = information["sequence"]
+            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]
            assert len(overflowing_tokens) == 2 + stride
@@ -249,12 +249,12 @@ class CommonTestCases:
            )
            information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
-                                                stride=stride)
+                                                stride=stride, truncate_first_sequence=False)
            information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
                                                                add_special_tokens=True, stride=stride,
-                                                                truncate_second_sequence_first=False)
+                                                                truncate_first_sequence=True)
-            truncated_sequence = information["sequence"]
+            truncated_sequence = information["input_ids"]
            overflowing_tokens = information["overflowing_tokens"]
            overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -536,13 +536,7 @@ class PreTrainedTokenizer(object):
        if pair:
            initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
-            final_tokens = self.encode("This is a sequence", "This is another", add_special_tokens=True)
+            final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
            # In some models (e.g. GPT-2), there is no sequence pair encoding.
            if len(final_tokens) == 2:
                return 0
            else:
                final_tokens_len = len(final_tokens)
        else:
            initial_tokens_len = len(self.encode("This is a sequence"))
            final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
@@ -700,86 +694,93 @@ class PreTrainedTokenizer(object):
        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
        Args:
-            text: The first sequence to be encoded.
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
-            text_pair: Optional second sequence to be encoded.
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method)
            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
                string using the `tokenize` method) or a list of integers (tokenized string ids using the
                `convert_tokens_to_ids` method)
            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                to their model.
            **kwargs: passed to the `self.tokenize()` method
        """
-        if text_pair is None:
+        return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]
            if add_special_tokens:
                sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
                return self.add_special_tokens_single_sequence(sequence_tokens)
            else:
                ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
                return ids
        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text
        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair
        if add_special_tokens:
            return self.add_special_tokens_sequence_pair(first_sentence_tokens, second_sentence_tokens)
        else:
            logger.warning("No special tokens were added. The two sequences have been concatenated.")
            return first_sentence_tokens + second_sentence_tokens
    def encode_plus(self,
                    text,
                    text_pair=None,
                    add_special_tokens=False,
-                    output_mask=False,
+                    output_token_type=False,
                    max_length=None,
                    stride=0,
-                    truncate_second_sequence_first=True,
+                    truncate_first_sequence=True,
                    **kwargs):
        """
        Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
        method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
        Args:
-            text: The first sequence to be encoded.
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
-            text_pair: Optional second sequence to be encoded.
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method)
            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
                string using the `tokenize` method) or a list of integers (tokenized string ids using the
                `convert_tokens_to_ids` method)
            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                to their model.
-            output_mask: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
+            output_token_type: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
                and 1 for the second.
            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                If there are overflowing tokens, those will be added to the returned dictionary
            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
                from the main sequence returned. The value of this argument defines the number of additional tokens.
-            truncate_second_sequence_first: if there is a specified max_length, this flag will choose which sequence
+            truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                will be truncated.
            **kwargs: passed to the `self.tokenize()` method
        """
        information = {}
        def get_input_ids(text):
            if isinstance(text, six.string_types):
                input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                input_ids = self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                input_ids = text
            else:
                raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
            return input_ids
        if text_pair is None:
-            sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
+            sequence_tokens = get_input_ids(text)
            if add_special_tokens:
-                information = self.prepare_for_model(sequence_tokens, max_length, stride)
+                information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
            else:
                if max_length:
                    information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
                    sequence_tokens = sequence_tokens[:max_length]
-                information["sequence"] = sequence_tokens
+                information["input_ids"] = sequence_tokens
-            if output_mask:
+            if output_token_type:
-                information["mask"] = [0] * len(information["sequence"])
+                information["output_token_type"] = [0] * len(information["input_ids"])
        else:
-            first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text
+            first_sentence_tokens = get_input_ids(text)
-            second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair
+            second_sentence_tokens = get_input_ids(text_pair)
            if add_special_tokens:
                information = self.prepare_pair_for_model(
                    first_sentence_tokens,
                    second_sentence_tokens,
-                    max_length,
+                    max_length=max_length,
-                    truncate_second_sequence_first,
+                    truncate_first_sequence=truncate_first_sequence,
-                    stride
+                    stride=stride
                )
-                if output_mask:
+                if output_token_type:
-                    information["mask"] = self.create_mask_from_sequences(text, text_pair)
+                    information["output_token_type"] = self.create_mask_from_sequences(text, text_pair)
            else:
                logger.warning("No special tokens were added. The two sequences have been concatenated.")
                sequence = first_sentence_tokens + second_sentence_tokens
@@ -787,43 +788,78 @@ class PreTrainedTokenizer(object):
                if max_length:
                    information["overflowing_tokens"] = sequence[max_length - stride:]
                    sequence = sequence[:max_length]
-                if output_mask:
+                if output_token_type:
-                    information["mask"] = [0] * len(sequence)
+                    information["output_token_type"] = [0] * len(sequence)
-                information["sequence"] = sequence
+                information["input_ids"] = sequence
        return information
    def prepare_for_model(self, ids, max_length=None, stride=0):
        """
        Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
        sequences if overflowing while taking into account the special tokens and manages a window stride for
        overflowing tokens
        Args:
            ids: list of tokenized input ids. Can be obtained from a string by chaining the
                `tokenize` and `convert_tokens_to_ids` methods.
            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
                list of inputs.
        Return:
            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
        """
        information = {}
        n_added_tokens = self.num_added_tokens()
        if max_length:
            n_added_tokens = self.num_added_tokens()
            information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
            ids = ids[:max_length - n_added_tokens]
-        information["sequence"] = self.add_special_tokens_single_sequence(ids)
+        information["input_ids"] = self.add_special_tokens_single_sequence(ids)
        return information
-    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_second_sequence_first=True, stride=0):
+    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
        """
        Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
        truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
        overflowing tokens
        Args:
            ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
                `tokenize` and `convert_tokens_to_ids` methods.
            ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
                `tokenize` and `convert_tokens_to_ids` methods.
            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
            truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
                sequence if the total size is greater than the specified `max_length`. If set to `False`, will
                truncate the second sequence instead.
            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
                list of inputs.
        Return:
            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
        """
        f_len, s_len = len(ids_0), len(ids_1)
        n_added_tokens = self.num_added_tokens(pair=True)
        information = {}
        if max_length:
            n_added_tokens = self.num_added_tokens(pair=True)
            if len(ids_0) + n_added_tokens >= max_length:
                logger.warning(
                    "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
            else:
                if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
-                    if truncate_second_sequence_first:
+                    if truncate_first_sequence:
                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
                    else:
                        information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
                        ids_0 = ids_0[:max_length - s_len - n_added_tokens]
                    else:
                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
        sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
-        information["sequence"] = sequence
+        information["input_ids"] = sequence
        return information