Fix QA examples for roberta tokenizer (#12928)
This commit is contained in:
@@ -339,6 +339,11 @@ def main():
|
||||
|
||||
# Training preprocessing
|
||||
def prepare_train_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
@@ -433,6 +438,11 @@ def main():
|
||||
|
||||
# Validation preprocessing
|
||||
def prepare_validation_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
|
||||
@@ -327,6 +327,11 @@ def main():
|
||||
|
||||
# Training preprocessing
|
||||
def prepare_train_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
|
||||
@@ -315,6 +315,11 @@ def main():
|
||||
|
||||
# Training preprocessing
|
||||
def prepare_train_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
@@ -430,6 +435,11 @@ def main():
|
||||
|
||||
# Validation preprocessing
|
||||
def prepare_validation_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
|
||||
@@ -367,6 +367,11 @@ def main():
|
||||
|
||||
# Training preprocessing
|
||||
def prepare_train_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
@@ -459,6 +464,11 @@ def main():
|
||||
|
||||
# Validation preprocessing
|
||||
def prepare_validation_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
|
||||
@@ -393,6 +393,11 @@ def main():
|
||||
|
||||
# Training preprocessing
|
||||
def prepare_train_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
@@ -487,6 +492,11 @@ def main():
|
||||
|
||||
# Validation preprocessing
|
||||
def prepare_validation_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
|
||||
Reference in New Issue
Block a user