Allow tokenization of sequences > 512 for caching
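For many applications requiring randomized data access, it's easier to cache the tokenized representations than the raw words. So why not turn the `ValueError` raised for sequences longer than `max_len` into a warning, so that such sequences can still be tokenized and cached?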
This commit is contained in:
@@ -232,7 +232,7 @@ class OpenAIGPTTokenizer(object):
|
|||||||
else:
|
else:
|
||||||
ids.append(self.encoder.get(token, 0))
|
ids.append(self.encoder.get(token, 0))
|
||||||
if len(ids) > self.max_len:
|
if len(ids) > self.max_len:
|
||||||
raise ValueError(
|
logger.warning(
|
||||||
"Token indices sequence length is longer than the specified maximum "
|
"Token indices sequence length is longer than the specified maximum "
|
||||||
" sequence length for this OpenAI GPT model ({} > {}). Running this"
|
" sequence length for this OpenAI GPT model ({} > {}). Running this"
|
||||||
" sequence through the model will result in indexing errors".format(len(ids), self.max_len)
|
" sequence through the model will result in indexing errors".format(len(ids), self.max_len)
|
||||||
|
|||||||
Reference in New Issue
Block a user