[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)
* remove references to old API in docstring - update data processors
* style
* fix tests - better type checking error messages
* better type checking
* include awesome fix by @LysandreJik for #5310
* updated doc and examples
This commit is contained in:
@@ -91,7 +91,7 @@ class LineByLineTextDataset(Dataset):
|
||||
with open(file_path, encoding="utf-8") as f:
|
||||
lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
|
||||
|
||||
batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
|
||||
batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
|
||||
self.examples = batch_encoding["input_ids"]
|
||||
|
||||
def __len__(self):
|
||||
|
||||
@@ -137,8 +137,11 @@ def _glue_convert_examples_to_features(
|
||||
|
||||
labels = [label_from_example(example) for example in examples]
|
||||
|
||||
batch_encoding = tokenizer.batch_encode_plus(
|
||||
[(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True,
|
||||
batch_encoding = tokenizer(
|
||||
[(example.text_a, example.text_b) for example in examples],
|
||||
max_length=max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
)
|
||||
|
||||
features = []
|
||||
|
||||
@@ -120,7 +120,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
|
||||
|
||||
spans = []
|
||||
|
||||
truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
|
||||
truncated_query = tokenizer.encode(
|
||||
example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
|
||||
)
|
||||
sequence_added_tokens = (
|
||||
tokenizer.max_len - tokenizer.max_len_single_sentence + 1
|
||||
if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
|
||||
@@ -131,14 +133,14 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
|
||||
span_doc_tokens = all_doc_tokens
|
||||
while len(spans) * doc_stride < len(all_doc_tokens):
|
||||
|
||||
encoded_dict = tokenizer.encode_plus(
|
||||
encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic
|
||||
truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
|
||||
span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
|
||||
truncation="only_second" if tokenizer.padding_side == "right" else "only_first",
|
||||
padding="max_length",
|
||||
max_length=max_seq_length,
|
||||
return_overflowing_tokens=True,
|
||||
pad_to_max_length=True,
|
||||
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
|
||||
truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
|
||||
return_token_type_ids=True,
|
||||
)
|
||||
|
||||
@@ -176,7 +178,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
|
||||
|
||||
spans.append(encoded_dict)
|
||||
|
||||
if "overflowing_tokens" not in encoded_dict:
|
||||
if "overflowing_tokens" not in encoded_dict or (
|
||||
"overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
|
||||
):
|
||||
break
|
||||
span_doc_tokens = encoded_dict["overflowing_tokens"]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user