Preserve spaces in GPT-2 tokenizers (#2778)
* Preserve spaces in GPT-2 tokenizers: preserves spaces after special tokens in GPT-2 and inherited (RoBERTa) tokenizers, enabling correct BPE encoding. Automatically inserts a space in front of the first token in the encode function when adding special tokens. * Add tokenization preprocessing method * Add framework argument to pipeline factory. Also fixes a pipeline test issue: each test input is now treated as a distinct sequence.
This commit is contained in:
@@ -154,3 +154,12 @@ class RobertaTokenizer(GPT2Tokenizer):
|
||||
if token_ids_1 is None:
|
||||
return len(cls + token_ids_0 + sep) * [0]
|
||||
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
|
||||
|
||||
def prepare_for_tokenization(self, text, add_special_tokens=False, **kwargs):
    """Optionally prepend a space to ``text`` before it is tokenized.

    Args:
        text: The input string about to be tokenized.
        add_special_tokens: Used as the default for ``add_prefix_space``
            when that keyword is not supplied.
        **kwargs: May contain ``add_prefix_space``; when present, its value
            overrides ``add_special_tokens`` for deciding whether to prefix.

    Returns:
        ``text`` with a single leading space added when prefixing is
        requested and ``text`` is non-empty and does not already start with
        whitespace; otherwise ``text`` unchanged.
    """
    # An explicit ``add_prefix_space`` wins; otherwise follow add_special_tokens.
    add_prefix_space = kwargs.get("add_prefix_space", add_special_tokens)
    # Check ``text`` is non-empty before indexing: ``""[0]`` raises IndexError.
    if add_prefix_space and text and not text[0].isspace():
        text = " " + text
    return text
|
||||
|
||||
Reference in New Issue
Block a user