From a7ca6d738b7801c680bd25d9e910f962d3f8bf2d Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 4 Dec 2019 15:43:34 -0500
Subject: [PATCH] Padding side is tokenizer-dependent

---
 transformers/data/processors/squad.py         | 11 ++--
 .../tests/tokenization_tests_commons.py       | 21 +++++--
 transformers/tokenization_utils.py            | 60 ++++++++++++-------
 transformers/tokenization_xlnet.py            |  1 +
 4 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 9306189eb4..6599c54330 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -73,8 +73,7 @@ def _is_whitespace(c):
     return False
 
 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training,
-                                       sequence_a_is_doc=False):
+                                       doc_stride, max_query_length, is_training):
     """Loads a data file into a list of `InputBatch`s."""
 
     # Defining helper methods    
@@ -127,13 +126,13 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         while len(spans) * doc_stride < len(all_doc_tokens):
             
             encoded_dict = tokenizer.encode_plus(
-                truncated_query if not sequence_a_is_doc else span_doc_tokens, 
-                span_doc_tokens if not sequence_a_is_doc else truncated_query, 
+                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, 
+                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, 
                 max_length=max_seq_length, 
                 return_overflowing_tokens=True, 
-                padding_strategy='right',
+                pad_to_max_length=True,
                 stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-                truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first'
+                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
             )
 
             paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index 40d68d0ab2..6592005c67 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -344,17 +344,19 @@ class CommonTestCases:
             padding_idx = tokenizer.pad_token_id
 
             # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+            tokenizer.padding_side = "right"
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='right')
+            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
             padded_sequence_length = len(padded_sequence)
             assert sequence_length + padding_size == padded_sequence_length
             assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
 
             # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
+            tokenizer.padding_side = "left"
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='left')
+            padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
             padded_sequence_length = len(padded_sequence)
             assert sequence_length + padding_size == padded_sequence_length
             assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
@@ -362,10 +364,15 @@ class CommonTestCases:
             # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
             encoded_sequence = tokenizer.encode(sequence)
             sequence_length = len(encoded_sequence)
-            padded_sequence_right = tokenizer.encode(sequence, padding_strategy='right')
+
+            tokenizer.padding_side = "right"
+            padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
             padded_sequence_right_length = len(padded_sequence_right)
-            padded_sequence_left = tokenizer.encode(sequence, padding_strategy='left')
+
+            tokenizer.padding_side = "left"
+            padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
             padded_sequence_left_length = len(padded_sequence_left)
+
             assert sequence_length == padded_sequence_right_length
             assert encoded_sequence == padded_sequence_right
             assert sequence_length == padded_sequence_left_length
@@ -387,7 +394,8 @@ class CommonTestCases:
             sequence_length = len(input_ids)
 
             # Test right padding
-            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='right', return_special_tokens_mask=True)
+            tokenizer.padding_side = "right"
+            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
             padded_input_ids = padded_sequence['input_ids']
             padded_token_type_ids = padded_sequence['token_type_ids']
             padded_attention_mask = padded_sequence['attention_mask']
@@ -401,7 +409,8 @@ class CommonTestCases:
             assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask 
 
             # Test left padding
-            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='left', return_special_tokens_mask=True)
+            tokenizer.padding_side = "left"
+            padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
             padded_input_ids = padded_sequence['input_ids']
             padded_token_type_ids = padded_sequence['token_type_ids']
             padded_attention_mask = padded_sequence['attention_mask']
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index dbbabd0e1a..41a611ea49 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -77,6 +77,8 @@ class PreTrainedTokenizer(object):
                                  "pad_token", "cls_token", "mask_token",
                                  "additional_special_tokens"]
 
+    padding_side = "right"
+
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
@@ -223,6 +225,9 @@ class PreTrainedTokenizer(object):
 
         self.max_len = max_len if max_len is not None else int(1e12)
 
+        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
+        self.padding_side = kwargs.pop('padding_side', self.padding_side)
+        
         # Added tokens
         self.added_tokens_encoder = {}
         self.added_tokens_decoder = {}
@@ -702,7 +707,7 @@ class PreTrainedTokenizer(object):
                max_length=None,
                stride=0,
                truncation_strategy='longest_first',
-               padding_strategy=None,
+               pad_to_max_length=False,
                return_tensors=None,
                **kwargs):
         """
@@ -729,12 +734,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
-                padding index, up to their max length. If no max length is specified, no padding is done.
-                The strategies are handled by the following strings:
+            pad_to_max_length: if set to True, the returned sequences will be padded on the tokenizer's padding side, using the
+                padding index, up to `max_length`. If no max length is specified, padding is done up to the model's max length
+                (skipped with a warning when that length exceeds 10000). The padding side (`padding_side`) is one of the following strings:
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences   
-                Defaults to None: no padding.
+                Defaults to False: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             **kwargs: passed to the `self.tokenize()` method
@@ -745,7 +750,7 @@ class PreTrainedTokenizer(object):
                                           add_special_tokens=add_special_tokens,
                                           stride=stride,
                                           truncation_strategy=truncation_strategy,
-                                          padding_strategy=padding_strategy,
+                                          pad_to_max_length=pad_to_max_length,
                                           return_tensors=return_tensors,
                                           **kwargs)
 
@@ -758,7 +763,7 @@ class PreTrainedTokenizer(object):
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',
-                    padding_strategy=None,
+                    pad_to_max_length=False,
                     return_tensors=None,
                     return_token_type_ids=True,
                     return_attention_mask=True,
@@ -788,12 +793,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
-                padding index, up to their max length. If no max length is specified, no padding is done.
-                The strategies are handled by the following strings:
+            pad_to_max_length: if set to True, the returned sequences will be padded on the tokenizer's padding side, using the
+                padding index, up to `max_length`. If no max length is specified, padding is done up to the model's max length
+                (skipped with a warning when that length exceeds 10000). The padding side (`padding_side`) is one of the following strings:
                 - 'left': pads on the left of the sequences
                 - 'right': pads on the right of the sequences   
-                Defaults to None: no padding.
+                Defaults to False: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
@@ -841,7 +846,7 @@ class PreTrainedTokenizer(object):
         return self.prepare_for_model(first_ids,
                                       pair_ids=second_ids,
                                       max_length=max_length,
-                                      padding_strategy=padding_strategy,
+                                      pad_to_max_length=pad_to_max_length,
                                       add_special_tokens=add_special_tokens,
                                       stride=stride,
                                       truncation_strategy=truncation_strategy,
@@ -853,7 +858,7 @@ class PreTrainedTokenizer(object):
 
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first',
-                          padding_strategy=None,
+                          pad_to_max_length=False,
                           return_tensors=None,
                           return_token_type_ids=True,
                           return_attention_mask=True,
@@ -881,12 +886,12 @@ class PreTrainedTokenizer(object):
                 - 'only_first': Only truncate the first sequence
                 - 'only_second': Only truncate the second sequence
                 - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
-            padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's 
-                padding index, up to their max length. If no max length is specified, no padding is done.
-                The strategies are handled by the following strings:
+            pad_to_max_length: if set to True, the returned sequences will be padded on the tokenizer's padding side, using the
+                padding index, up to `max_length`. If no max length is specified, padding is done up to the model's max length
+                (skipped with a warning when that length exceeds 10000). The padding side (`padding_side`) is one of the following strings:
                 - 'left': pads on the left of the sequences
-                - 'right': pads on the right of the sequences            
-                Defaults to None: no padding.
+                - 'right': pads on the right of the sequences   
+                Defaults to False: no padding.
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
@@ -955,10 +960,19 @@ class PreTrainedTokenizer(object):
                            "for this model ({} > {}). Running this sequence through the model will result in "
                            "indexing errors".format(len(ids), self.max_len))
                            
-        if padding_strategy is not None and max_length and len(encoded_inputs["input_ids"]) < max_length:
-            difference = max_length - len(encoded_inputs["input_ids"])
+        needs_to_be_padded = pad_to_max_length and (
+            max_length and len(encoded_inputs["input_ids"]) < max_length
+            or 
+            max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000
+        )
 
-            if padding_strategy == 'right':
+        if pad_to_max_length and max_length is None and self.max_len > 10000:
+            logger.warning("Sequence can't be padded as the maximum  ")
+
+        if needs_to_be_padded:
+            difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
+
+            if self.padding_side == 'right':
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
                 if return_token_type_ids:
@@ -967,7 +981,7 @@ class PreTrainedTokenizer(object):
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
 
-            elif padding_strategy == 'left':
+            elif self.padding_side == 'left':
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
                 if return_token_type_ids:
@@ -977,7 +991,7 @@ class PreTrainedTokenizer(object):
                 encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
 
             else:
-                raise ValueError("Invalid padding strategy:" + str(padding_strategy))
+                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
             
         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index 3ea71f4438..1c43c0943a 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -60,6 +60,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    padding_side = "left"
 
     def __init__(self, vocab_file,
                  do_lower_case=False, remove_space=True, keep_accents=False,