Fix QA examples for roberta tokenizer (#12928)
This commit is contained in:
@@ -339,6 +339,11 @@ def main():
|
|||||||
|
|
||||||
# Training preprocessing
|
# Training preprocessing
|
||||||
def prepare_train_features(examples):
|
def prepare_train_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
@@ -433,6 +438,11 @@ def main():
|
|||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
|
|||||||
@@ -327,6 +327,11 @@ def main():
|
|||||||
|
|
||||||
# Training preprocessing
|
# Training preprocessing
|
||||||
def prepare_train_features(examples):
|
def prepare_train_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
|
|||||||
@@ -315,6 +315,11 @@ def main():
|
|||||||
|
|
||||||
# Training preprocessing
|
# Training preprocessing
|
||||||
def prepare_train_features(examples):
|
def prepare_train_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
@@ -430,6 +435,11 @@ def main():
|
|||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
|
|||||||
@@ -367,6 +367,11 @@ def main():
|
|||||||
|
|
||||||
# Training preprocessing
|
# Training preprocessing
|
||||||
def prepare_train_features(examples):
|
def prepare_train_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
@@ -459,6 +464,11 @@ def main():
|
|||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
|
|||||||
@@ -393,6 +393,11 @@ def main():
|
|||||||
|
|
||||||
# Training preprocessing
|
# Training preprocessing
|
||||||
def prepare_train_features(examples):
|
def prepare_train_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
@@ -487,6 +492,11 @@ def main():
|
|||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
|
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||||
|
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||||
|
# left whitespace
|
||||||
|
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||||
|
|
||||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||||
# in one example possibly giving several features when a context is long, each of those features having a
|
# in one example possibly giving several features when a context is long, each of those features having a
|
||||||
# context that overlaps a bit the context of the previous feature.
|
# context that overlaps a bit the context of the previous feature.
|
||||||
|
|||||||
Reference in New Issue
Block a user