From 8cba0572603516f4f0d7fcd52fb76dce885b1358 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 19 Sep 2019 09:42:13 +0200 Subject: [PATCH] Doc + remove artefacts --- pytorch_transformers/tokenization_utils.py | 41 ++-------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 3a3ebd49be..2a31aec887 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -724,9 +724,8 @@ class PreTrainedTokenizer(object): def encode_plus(self, text, text_pair=None, add_special_tokens=False, output_mask=False, max_length=None, **kwargs): """ - Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. - - Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this + method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. Args: text: The first sequence to be encoded. @@ -801,42 +800,6 @@ class PreTrainedTokenizer(object): return information - if text_pair is None: - if add_special_tokens: - sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) - if max_length: - sequence_tokens = sequence_tokens[:max_length - self.num_added_tokens()] - return self.add_special_tokens_single_sentence(sequence_tokens) - else: - ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) - return ids[:max_length] if max_length != -1 else ids - - first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] - second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] - - if add_special_tokens: - if max_length: - if len(first_sentence_tokens) + self.num_added_tokens(pair=True) >= max_length: - logger.warning( - "The first sequence is longer than the maximum specified length. This sequence will not be truncated.") - else: - if len(second_sentence_tokens) + len(first_sentence_tokens) + self.num_added_tokens( - pair=True) > max_length: - second_sentence_tokens = second_sentence_tokens[ - :max_length - len(first_sentence_tokens) - self.num_added_tokens( - pair=True)] - - return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens, - output_mask) - else: - if max_length: - first_sentence_tokens = first_sentence_tokens[:max_length] - second_sentence_tokens = second_sentence_tokens[:max_length] - - if output_mask: - logger.warning("Can't output mask if you're not joining two sequences.") - return first_sentence_tokens, second_sentence_tokens - def add_special_tokens_single_sentence(self, token_ids): logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.") return token_ids