Sort unique_no_split_tokens to make it deterministic (#6461)

* change unique_no_split_tokens's type to set

* use sorted list instead of set

* style
This commit is contained in:
Quentin Lhoest
2020-08-14 10:36:58 +02:00
committed by GitHub
parent 1d6e71e116
commit 9a8c168f56

View File

@@ -207,10 +207,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
# Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
if special_tokens:
self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
else:
# Or on the newly added tokens
self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
return len(tokens_to_add)