Bug fix: token classification pipeline when passing offset_mapping (#22034)
Fix slow tokenizers (which have no `_tokenizer` attribute) when passing offset_mapping
This commit is contained in:
@@ -304,7 +304,9 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
start_ind = start_ind.item()
|
start_ind = start_ind.item()
|
||||||
end_ind = end_ind.item()
|
end_ind = end_ind.item()
|
||||||
word_ref = sentence[start_ind:end_ind]
|
word_ref = sentence[start_ind:end_ind]
|
||||||
if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None):
|
if getattr(self.tokenizer, "_tokenizer", None) and getattr(
|
||||||
|
self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
|
||||||
|
):
|
||||||
# This is a BPE, word aware tokenizer, there is a correct way
|
# This is a BPE, word aware tokenizer, there is a correct way
|
||||||
# to fuse tokens
|
# to fuse tokens
|
||||||
is_subword = len(word) != len(word_ref)
|
is_subword = len(word) != len(word_ref)
|
||||||
|
|||||||
Reference in New Issue
Block a user