From 9a8c168f56fe3c0e21d554a577ac03beb004ef89 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 14 Aug 2020 10:36:58 +0200 Subject: [PATCH] Sort unique_no_split_tokens to make it deterministic (#6461) * change unique_no_split_tokens's type to set * use sorted list instead of set * style --- src/transformers/tokenization_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index cbe9b34bee..3121980c0d 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -207,10 +207,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) if special_tokens: - self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens))) + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens))) else: # Or on the newly added tokens - self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add))) + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add))) return len(tokens_to_add)