Tokenizer whitespace: revert to previous behavior (encode bag-of-words entries with add_prefix_space=True)
This commit is contained in:
@@ -373,7 +373,7 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str]) -> List[List[
|
|||||||
filepath = id_or_path
|
filepath = id_or_path
|
||||||
with open(filepath, "r") as f:
|
with open(filepath, "r") as f:
|
||||||
words = f.read().split("\n")
|
words = f.read().split("\n")
|
||||||
bow_indices.append([TOKENIZER.encode(word) for word in words])
|
bow_indices.append([TOKENIZER.encode(word, add_prefix_space=True) for word in words])
|
||||||
return bow_indices
|
return bow_indices
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user