Merge pull request #337 from CatalinVoss/patch-2
Allow tokenization of sequences > 512 for caching
This commit is contained in:
@@ -120,7 +120,7 @@ class BertTokenizer(object):
|
||||
for token in tokens:
|
||||
ids.append(self.vocab[token])
|
||||
if len(ids) > self.max_len:
|
||||
raise ValueError(
|
||||
logger.warning(
|
||||
"Token indices sequence length is longer than the specified maximum "
|
||||
" sequence length for this BERT model ({} > {}). Running this"
|
||||
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
|
||||
|
||||
Reference in New Issue
Block a user