Black preview (#17217)

* Black preview

* Fixup too!

* Fix check copies

* Use the same version as the CI

* Bump black
This commit is contained in:
Sylvain Gugger
2022-05-12 16:25:55 -04:00
committed by GitHub
parent 9bd67ac7bb
commit afe5d42d8d
578 changed files with 8274 additions and 3296 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -187,7 +187,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

View File

@@ -144,7 +144,10 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow
def test_tokenization_base_hard_symbols(self):
symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [
871,
419,

View File

@@ -176,7 +176,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
@@ -249,7 +249,7 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")
@@ -288,7 +288,8 @@ class BertTokenizerMismatchTest(unittest.TestCase):
BertTokenizer.from_pretrained(EXAMPLE_BERT_JAPANESE_ID)
self.assertTrue(
cm.records[0].message.startswith(
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
"The tokenizer class you load from this checkpoint is not the same type as the class this function"
" is called from."
)
)
EXAMPLE_BERT_ID = "bert-base-cased"
@@ -296,6 +297,7 @@ class BertTokenizerMismatchTest(unittest.TestCase):
BertJapaneseTokenizer.from_pretrained(EXAMPLE_BERT_ID)
self.assertTrue(
cm.records[0].message.startswith(
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from."
"The tokenizer class you load from this checkpoint is not the same type as the class this function"
" is called from."
)
)

View File

@@ -799,7 +799,16 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
model.to(torch_device)
text = [
"Transformer-based models are unable to process long sequences due to their self-attention operation, which scales quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or longer. Longformers attention mechanism is a drop-in replacement for the standard self-attention and combines a local windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA."
"Transformer-based models are unable to process long sequences due to their self-attention operation,"
" which scales quadratically with the sequence length. To address this limitation, we introduce the"
" Longformer with an attention mechanism that scales linearly with sequence length, making it easy to"
" process documents of thousands of tokens or longer. Longformers attention mechanism is a drop-in"
" replacement for the standard self-attention and combines a local windowed attention with a task"
" motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer"
" on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In"
" contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream"
" tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new"
" state-of-the-art results on WikiHop and TriviaQA."
]
inputs = tokenizer(text)
@@ -837,7 +846,18 @@ class BigBirdModelIntegrationTest(unittest.TestCase):
)
model.to(torch_device)
context = "The BigBird model was proposed in Big Bird: Transformers for Longer Sequences by Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it has been shown that applying sparse, global, and random attention approximates full attention, while being computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context, BigBird has shown improved performance on various long document NLP tasks, such as question answering and summarization, compared to BERT or RoBERTa."
context = (
"The BigBird model was proposed in Big Bird: Transformers for Longer Sequences by Zaheer, Manzil and"
" Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago"
" and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a"
" sparse-attention based transformer which extends Transformer based models, such as BERT to much longer"
" sequences. In addition to sparse attention, BigBird also applies global attention as well as random"
" attention to the input sequence. Theoretically, it has been shown that applying sparse, global, and"
" random attention approximates full attention, while being computationally much more efficient for longer"
" sequences. As a consequence of the capability to handle longer context, BigBird has shown improved"
" performance on various long document NLP tasks, such as question answering and summarization, compared"
" to BERT or RoBERTa."
)
question = [
"Which is better for longer sequences- BigBird or BERT?",

View File

@@ -168,7 +168,10 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow
def test_tokenization_base_hard_symbols(self):
symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
)
# fmt: off
original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66] # noqa: E231
# fmt: on

View File

@@ -538,9 +538,26 @@ class BigBirdPegasusModelIntegrationTests(unittest.TestCase):
hypotheses_batch = model.generate(**inputs)
EXPECTED_LEP = "motivated by some recent studies on the light cp - odd higgs boson @xmath0 in non - minimal supersymmetric models, we investigate the rare @xmath1-decays @xmath2 ( @xmath3 ) in the two higgs doublet model ( 2hdm ), the nearly minimal supersymmetric standard model ( nmssm ), the next - to - minimal supersymmetric standard model ( nmssm ) and the minimal supersymmetric standard model ( mssm ).<n> we find that the branching ratios of @xmath4 can reach @xmath5 in 2hdm, @xmath6 in nmssm and @xmath7 in mssm, which are at the level of @xmath8 in 2hdm, @xmath9 in nmssm and @xmath10 in mssm, respectively.<n> these rates can be significantly enhanced in new physics models which lie within the expected sensitivity of the gigaz option of the international linear collider ( ilc ). <n> = # 1,nucl. <n> phys. <n> b * # 1"
EXPECTED_LEP = (
"motivated by some recent studies on the light cp - odd higgs boson @xmath0 in non - minimal"
" supersymmetric models, we investigate the rare @xmath1-decays @xmath2 ( @xmath3 ) in the two higgs"
" doublet model ( 2hdm ), the nearly minimal supersymmetric standard model ( nmssm ), the next - to -"
" minimal supersymmetric standard model ( nmssm ) and the minimal supersymmetric standard model ( mssm"
" ).<n> we find that the branching ratios of @xmath4 can reach @xmath5 in 2hdm, @xmath6 in nmssm and"
" @xmath7 in mssm, which are at the level of @xmath8 in 2hdm, @xmath9 in nmssm and @xmath10 in mssm,"
" respectively.<n> these rates can be significantly enhanced in new physics models which lie within the"
" expected sensitivity of the gigaz option of the international linear collider ( ilc ). <n> = # 1,nucl."
" <n> phys. <n> b * # 1"
)
EXPECTED_MAGNET = "a positive, nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic - field range in the surface state of a topological insulator having a positive and finite effective g - factor. this linear magnetoresistance shows up in the system of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels, and persists up to room temperature, providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons."
EXPECTED_MAGNET = (
"a positive, nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic -"
" field range in the surface state of a topological insulator having a positive and finite effective g -"
" factor. this linear magnetoresistance shows up in the system of high carrier concentration and low"
" mobility when electrons are in extended states and spread over many smeared landau levels, and persists"
" up to room temperature, providing a possible mechanism for the recently observed linear"
" magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons."
)
generated = tokenizer.batch_decode(
hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True

View File

@@ -304,7 +304,10 @@ class Blenderbot3BIntegrationTests(unittest.TestCase):
generated_txt = self.tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW)
assert generated_txt[0].strip() == tgt_text
src_text = "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like i'm going to throw up.\nand why is that?"
src_text = (
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel"
" like i'm going to throw up.\nand why is that?"
)
model_inputs = self.tokenizer([src_text], return_tensors="pt").to(torch_device)

View File

@@ -290,8 +290,8 @@ class Blenderbot90MIntegrationTests(unittest.TestCase):
def test_90_generation_from_long_input(self):
src_text = [
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like\
i'm going to throw up.\nand why is that?"
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel"
" like i'm going to throw up.\nand why is that?"
]
model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device)

View File

@@ -305,7 +305,8 @@ def _long_tensor(tok_lst):
@require_tf
class TFBlenderbot90MIntegrationTests(unittest.TestCase):
src_text = [
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like i'm going to throw up.\nand why is that?"
"Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like"
" i'm going to throw up.\nand why is that?"
]
model_name = "facebook/blenderbot_small-90M"

View File

@@ -378,7 +378,12 @@ class CanineModelTest(ModelTesterMixin, unittest.TestCase):
torch.allclose(
set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
),
msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.",
msg=(
"Tuple and dict output are not equal. Difference:"
f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
),
)
recursive_check(tuple_output, dict_output)

View File

@@ -219,7 +219,10 @@ class TFConvNextModelTest(TFModelTesterMixin, unittest.TestCase):
else:
self.assertTrue(
all(tf.equal(tuple_object, dict_object)),
msg=f"Tuple and dict output are not equal. Difference: {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}",
msg=(
"Tuple and dict output are not equal. Difference:"
f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}"
),
)
recursive_check(tuple_output, dict_output)

View File

@@ -736,7 +736,8 @@ class Data2VecAudioModelIntegrationTest(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"a man said to the universe sir i exist",
"sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
" him with thousands of spectators were trivialities not worth thinking about",
"his instant of panic was followed by a small sharp blow high on his chest",
]
self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)

View File

@@ -126,7 +126,9 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
sequences = [
"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations",
"ALBERT incorporates two parameter reduction techniques",
"The first one is a factorized embedding parameterization. By decomposing the large vocabulary embedding matrix into two small matrices, we separate the size of the hidden layers from the size of vocabulary embedding.",
"The first one is a factorized embedding parameterization. By decomposing the large vocabulary"
" embedding matrix into two small matrices, we separate the size of the hidden layers from the size of"
" vocabulary embedding.",
]
encoding = tokenizer(sequences, padding=True)
@@ -155,7 +157,9 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
expected_decoded_sequence = [
"ALBERT: A Lite BERT for Self-supervised Learning of Language Representations",
"ALBERT incorporates two parameter reduction techniques",
"The first one is a factorized embedding parameterization. By decomposing the large vocabulary embedding matrix into two small matrices, we separate the size of the hidden layers from the size of vocabulary embedding.",
"The first one is a factorized embedding parameterization. By decomposing the large vocabulary"
" embedding matrix into two small matrices, we separate the size of the hidden layers from the size of"
" vocabulary embedding.",
]
self.assertDictEqual(encoding.data, expected_encoding)

View File

@@ -333,7 +333,12 @@ class FNetModelTest(ModelTesterMixin, unittest.TestCase):
torch.allclose(
set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
),
msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.",
msg=(
"Tuple and dict output are not equal. Difference:"
f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
),
)
recursive_check(tuple_output, dict_output)

View File

@@ -576,7 +576,8 @@ class TFGPT2ModelLanguageGenerationTest(unittest.TestCase):
sentence = ["The dog"]
expected_output_string = [
"The dog owner asked why did our vet decide there needed to be extra ventilation inside because most puppies"
"The dog owner asked why did our vet decide there needed to be extra ventilation inside because most"
" puppies"
]
expected_output_string_xla = [
"The dog has been named in connection with the murder of a 20-year-old man in!"

View File

@@ -539,7 +539,8 @@ class TFHubertModelIntegrationTest(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"a man said to the universe sir i exist",
"sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
" him with the thousands of spectators were trivialities not worth thinking about",
"his instant of panic was followed by a small sharp blow high on his chest",
]
self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)

View File

@@ -181,7 +181,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
@@ -1634,11 +1634,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
break
self.assertTrue(
find,
(
f"'{new_special_token_str}' doesn't appear in the list "
f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}"
),
f"'{new_special_token_str}' doesn't appear in the list "
f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
)
elif special_token not in special_tokens_map:
# The special token must appear identically in the list of the new tokenizer.
@@ -1923,7 +1921,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(len(cm.records), 1)
self.assertTrue(
cm.records[0].message.startswith(
"Token indices sequence length is longer than the specified maximum sequence length for this model"
"Token indices sequence length is longer than the specified maximum sequence length"
" for this model"
)
)
@@ -1937,7 +1936,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(len(cm.records), 1)
self.assertTrue(
cm.records[0].message.startswith(
"Token indices sequence length is longer than the specified maximum sequence length for this model"
"Token indices sequence length is longer than the specified maximum sequence length"
" for this model"
)
)
# Check the order of Sequence of input ids, overflowing tokens and bbox sequence with truncation
@@ -2232,7 +2232,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(len(cm.records), 1)
self.assertTrue(
cm.records[0].message.startswith(
"Token indices sequence length is longer than the specified maximum sequence length for this model"
"Token indices sequence length is longer than the specified maximum sequence length"
" for this model"
)
)
@@ -2244,7 +2245,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(len(cm.records), 1)
self.assertTrue(
cm.records[0].message.startswith(
"Token indices sequence length is longer than the specified maximum sequence length for this model"
"Token indices sequence length is longer than the specified maximum sequence length"
" for this model"
)
)
# Check the order of Sequence of input ids, overflowing tokens and bbox sequence with truncation

View File

@@ -1543,11 +1543,9 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
break
self.assertTrue(
find,
(
f"'{new_special_token_str}' doesn't appear in the list "
f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}"
),
f"'{new_special_token_str}' doesn't appear in the list "
f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
)
elif special_token not in special_tokens_map:
# The special token must appear identically in the list of the new tokenizer.

View File

@@ -528,9 +528,26 @@ class LEDModelIntegrationTests(unittest.TestCase):
no_repeat_ngram_size=3,
)
EXPECTED_LEP = " the physics of @xmath0-boson will again play the central role in the frontier of particle physics if the gigaz option of the international linear collider ( ilc ) can be realized in its first phase. \n the expected sensitivity to the branching ratio of the rare decays, especially its exotic or rare processes, should be investigated comprehensively to evaluate their potential in probing new physics. in this work \n, we extend the previous studies of these decays to some new models and investigate the decays altogether. we are motivated by some recent studies on the singlet extension of the mssm, such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly - minimal - supersymmetry - standard - model(nmssm)@xcite, where a light cp - odd higgs boson with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry. # 1#2#3#4#5#6#7#8#9#10#11#12 "
EXPECTED_LEP = (
" the physics of @xmath0-boson will again play the central role in the frontier of particle physics if the"
" gigaz option of the international linear collider ( ilc ) can be realized in its first phase. \n the"
" expected sensitivity to the branching ratio of the rare decays, especially its exotic or rare processes,"
" should be investigated comprehensively to evaluate their potential in probing new physics. in this work"
" \n, we extend the previous studies of these decays to some new models and investigate the decays"
" altogether. we are motivated by some recent studies on the singlet extension of the mssm, such as the"
" next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly - minimal -"
" supersymmetry - standard - model(nmssm)@xcite, where a light cp - odd higgs boson with singlet -"
" dominant component may naturally arise from the spontaneous breaking of some approximate global"
" symmetry. # 1#2#3#4#5#6#7#8#9#10#11#12 "
)
EXPECTED_MAGNET = " the recent experiment in the surface states of the topological insulator bi@xmath0se @xmath1, however, reported that a large positive magnetoresistance becomes very linear in perpendicular magnetic field even in an opposite situation where the carrier sheet density is high that all electrons occupy more than one landau levels. \n it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model. "
EXPECTED_MAGNET = (
" the recent experiment in the surface states of the topological insulator bi@xmath0se @xmath1, however,"
" reported that a large positive magnetoresistance becomes very linear in perpendicular magnetic field"
" even in an opposite situation where the carrier sheet density is high that all electrons occupy more"
" than one landau levels. \n it is striking that this observation is in conflict with abrikosov s model"
" and also with the classical parish - littlewood model. "
)
generated = tok.batch_decode(
hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True

View File

@@ -624,7 +624,10 @@ class LukeModelIntegrationTests(unittest.TestCase):
model.to(torch_device)
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_classification")
text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
text = (
"Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped"
" the new world number one avoid a humiliating second- round exit at Wimbledon ."
)
span = (39, 42)
encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")
@@ -656,7 +659,10 @@ class LukeModelIntegrationTests(unittest.TestCase):
model.to(torch_device)
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large", task="entity_classification")
text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
text = (
"Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped"
" the new world number one avoid a humiliating second- round exit at Wimbledon ."
)
span = (39, 42)
encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")

View File

@@ -480,7 +480,10 @@ class LukeTokenizerIntegrationTests(unittest.TestCase):
def test_entity_classification_no_padding_or_truncation(self):
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_classification")
sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
sentence = (
"Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped"
" the new world number one avoid a humiliating second- round exit at Wimbledon ."
)
span = (39, 42)
encoding = tokenizer(sentence, entity_spans=[span], return_token_type_ids=True)
@@ -491,7 +494,8 @@ class LukeTokenizerIntegrationTests(unittest.TestCase):
self.assertEqual(len(encoding["token_type_ids"]), 42)
self.assertEqual(
tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False),
"<s>Top seed Ana Ivanovic said on Thursday<ent> she<ent> could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon.</s>",
"<s>Top seed Ana Ivanovic said on Thursday<ent> she<ent> could hardly believe her luck as a fortuitous"
" netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon.</s>",
)
self.assertEqual(
tokenizer.decode(encoding["input_ids"][9:12], spaces_between_special_tokens=False), "<ent> she<ent>"
@@ -514,7 +518,10 @@ class LukeTokenizerIntegrationTests(unittest.TestCase):
tokenizer = LukeTokenizer.from_pretrained(
"studio-ousia/luke-base", task="entity_classification", return_token_type_ids=True
)
sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
sentence = (
"Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped"
" the new world number one avoid a humiliating second- round exit at Wimbledon ."
)
# entity information
span = (39, 42)

View File

@@ -354,7 +354,9 @@ class M2M100ModelIntegrationTests(unittest.TestCase):
src_fr = [
"L'affaire NSA souligne l'absence totale de débat sur le renseignement",
"Selon moi, il y a deux niveaux de réponse de la part du gouvernement français.",
"Lorsque François Hollande téléphone à Barack Obama ou quand le ministre des affaires étrangères Laurent Fabius convoque l'ambassadeur des Etats-Unis, ils réagissent à une vraie découverte, qui est celle de l'ampleur de la surveillance américaine sur l'ensemble des communications en France.",
"Lorsque François Hollande téléphone à Barack Obama ou quand le ministre des affaires étrangères Laurent"
" Fabius convoque l'ambassadeur des Etats-Unis, ils réagissent à une vraie découverte, qui est celle de"
" l'ampleur de la surveillance américaine sur l'ensemble des communications en France.",
]
# The below article tests that we don't add any hypotheses outside of the top n_beams
@@ -370,7 +372,9 @@ class M2M100ModelIntegrationTests(unittest.TestCase):
expected_en = [
"The NSA case highlights the total absence of intelligence debate",
"I think there are two levels of response from the French government.",
"When François Hollande calls Barack Obama or when Foreign Minister Laurent Fabius calls the U.S. Ambassador, they respond to a real discovery, which is that of the scale of U.S. surveillance on all communications in France.",
"When François Hollande calls Barack Obama or when Foreign Minister Laurent Fabius calls the U.S."
" Ambassador, they respond to a real discovery, which is that of the scale of U.S. surveillance on all"
" communications in France.",
]
generated = tokenizer.batch_decode(

View File

@@ -348,7 +348,9 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest):
]
tgt_text = [
"Şeful ONU declară că nu există o soluţie militară în Siria",
'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţa şi mizeria pentru milioane de oameni.',
"Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei"
' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor'
" face decât să înrăutăţească violenţa şi mizeria pentru milioane de oameni.",
]
expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, 250004]

View File

@@ -213,7 +213,9 @@ class MBartEnroIntegrationTest(unittest.TestCase):
]
tgt_text = [
"Şeful ONU declară că nu există o soluţie militară în Siria",
'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.',
"Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei"
' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor'
" face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.",
]
expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, EN_CODE]

View File

@@ -203,7 +203,9 @@ class MBart50OneToManyIntegrationTest(unittest.TestCase):
]
tgt_text = [
"Şeful ONU declară că nu există o soluţie militară în Siria",
'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.',
"Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei"
' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor'
" face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.",
]
expected_src_tokens = [EN_CODE, 8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2]

View File

@@ -365,7 +365,8 @@ class MLukeTokenizerIntegrationTests(unittest.TestCase):
self.assertEqual(
tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False),
"<s> ISO 639-3 uses the code fas</s></s> for the dialects spoken across Iran and アフガニスタン ( Afghanistan ).</s>",
"<s> ISO 639-3 uses the code fas</s></s> for the dialects spoken across Iran and アフガニスタン ( Afghanistan"
" ).</s>",
)
self.assertEqual(
tokenizer.decode(encoding["input_ids"][1:5], spaces_between_special_tokens=False), "ISO 639-3"
@@ -423,7 +424,8 @@ class MLukeTokenizerIntegrationTests(unittest.TestCase):
self.assertEqual(
tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False),
"<s> ISO 639-3 uses the code fas</s></s> for the dialects spoken across Iran and アフガニスタン ( Afghanistan ).</s>",
"<s> ISO 639-3 uses the code fas</s></s> for the dialects spoken across Iran and アフガニスタン ( Afghanistan"
" ).</s>",
)
self.assertEqual(
tokenizer.decode(encoding["input_ids"][1:5], spaces_between_special_tokens=False), "ISO 639-3"
@@ -506,7 +508,8 @@ class MLukeTokenizerIntegrationTests(unittest.TestCase):
self.assertEqual(len(encoding["token_type_ids"]), 23)
self.assertEqual(
tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False),
"<s> Japanese is an<ent>East Asian language<ent>spoken by about 128 million people, primarily in Japan.</s>",
"<s> Japanese is an<ent>East Asian language<ent>spoken by about 128 million people, primarily in"
" Japan.</s>",
)
self.assertEqual(
tokenizer.decode(encoding["input_ids"][4:9], spaces_between_special_tokens=False),
@@ -559,7 +562,8 @@ class MLukeTokenizerIntegrationTests(unittest.TestCase):
self.assertEqual(
tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False),
"<s><ent>Japanese<ent>is an East Asian language spoken by about 128 million people, primarily in<ent2>Japan<ent2>.</s>",
"<s><ent>Japanese<ent>is an East Asian language spoken by about 128 million people, primarily"
" in<ent2>Japan<ent2>.</s>",
)
self.assertEqual(
tokenizer.decode(encoding["input_ids"][1:4], spaces_between_special_tokens=False),

View File

@@ -194,7 +194,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

View File

@@ -339,7 +339,8 @@ class TFPegasusIntegrationTests(unittest.TestCase):
""" The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """,
]
expected_text = [
"California's largest electricity provider has cut power to hundreds of thousands of customers in an effort to reduce the risk of wildfires.",
"California's largest electricity provider has cut power to hundreds of thousands of customers in an effort to"
" reduce the risk of wildfires.",
'N-Dubz have revealed they\'re "grateful" to have been nominated for four Mobo Awards.',
] # differs slightly from pytorch, likely due to numerical differences in linear layers
model_name = "google/pegasus-xsum"

View File

@@ -72,7 +72,10 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_mask_tokens_rust_pegasus(self):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
raw_input_str = "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important </s> <pad> <pad> <pad>"
raw_input_str = (
"Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important"
" </s> <pad> <pad> <pad>"
)
rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
self.assertListEqual(py_ids, rust_ids)
@@ -158,7 +161,10 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_mask_tokens_rust_pegasus(self):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
raw_input_str = "Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s> <pad> <pad> <pad>"
raw_input_str = (
"Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s>"
" <pad> <pad> <pad>"
)
rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
self.assertListEqual(py_ids, rust_ids)
@@ -198,7 +204,10 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.tokenize(test_str)
"""
test_str = "This is an example string that is used to test the original TF implementation against the HF implementation"
test_str = (
"This is an example string that is used to test the original TF implementation against the HF"
" implementation"
)
token_ids = self._large_tokenizer(test_str).input_ids

View File

@@ -542,9 +542,12 @@ class PerceiverModelTest(ModelTesterMixin, unittest.TestCase):
torch.allclose(
set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
),
msg=f"Tuple and dict output are not equal. Difference: {torch.max(torch.abs(tuple_object - dict_object))}. "
f"Tuple has `nan`: {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. "
f"Dict has `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}.",
msg=(
"Tuple and dict output are not equal. Difference:"
f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
),
)
recursive_check(tuple_output, dict_output)
@@ -767,7 +770,10 @@ class PerceiverModelTest(ModelTesterMixin, unittest.TestCase):
@require_torch_multi_gpu
@unittest.skip(
reason="Perceiver does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
reason=(
"Perceiver does not work with data parallel (DP) because of a bug in PyTorch:"
" https://github.com/pytorch/pytorch/issues/36035"
)
)
def test_multi_gpu_data_parallel_forward(self):
pass

View File

@@ -1226,7 +1226,15 @@ class ProphetNetModelIntegrationTest(unittest.TestCase):
tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
ARTICLE_TO_SUMMARIZE = "USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a high-level science and technology workforce, as deemed critical for development of China's economy, defense, and science and technology education. The establishment was hailed as \"A Major Event in the History of Chinese Education and Science.\" CAS has supported USTC by combining most of its institutes with the departments of the university. USTC is listed in the top 16 national key universities, becoming the youngest national key university.".lower()
ARTICLE_TO_SUMMARIZE = (
"USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of"
" CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a"
" high-level science and technology workforce, as deemed critical for development of China's economy,"
' defense, and science and technology education. The establishment was hailed as "A Major Event in the'
' History of Chinese Education and Science." CAS has supported USTC by combining most of its institutes'
" with the departments of the university. USTC is listed in the top 16 national key universities, becoming"
" the youngest national key university.".lower()
)
input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=511, return_tensors="pt").input_ids
input_ids = input_ids.to(torch_device)
@@ -1234,7 +1242,10 @@ class ProphetNetModelIntegrationTest(unittest.TestCase):
summary_ids = model.generate(
input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True
)
EXPECTED_SUMMARIZE_512 = "us ##tc was founded by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc is listed in the top 16 national key universities ."
EXPECTED_SUMMARIZE_512 = (
"us ##tc was founded by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc is listed in the"
" top 16 national key universities ."
)
generated_titles = [
" ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids
]
@@ -1251,7 +1262,8 @@ class ProphetNetModelIntegrationTest(unittest.TestCase):
EXPECTED_SUMMARIZE_100 = (
r"us ##tc was founded in beijing by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc "
"'"
' s founding mission was to develop a high - level science and technology workforce . [X_SEP] establishment hailed as " a major event in the history of chinese education and science "'
" s founding mission was to develop a high - level science and technology workforce . [X_SEP]"
' establishment hailed as " a major event in the history of chinese education and science "'
)
generated_titles = [
" ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids

View File

@@ -141,7 +141,7 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

View File

@@ -186,7 +186,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

View File

@@ -574,7 +574,10 @@ class ReformerTesterMixin:
@require_torch_multi_gpu
@unittest.skip(
reason="Reformer does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
reason=(
"Reformer does not work with data parallel (DP) because of a bug in PyTorch:"
" https://github.com/pytorch/pytorch/issues/36035"
)
)
def test_multi_gpu_data_parallel_forward(self):
pass

View File

@@ -214,7 +214,10 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow
def test_tokenization_base_hard_symbols(self):
symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [
108,
265,

View File

@@ -189,7 +189,7 @@ class RetriBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

View File

@@ -770,8 +770,10 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"mister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
"nor is mister cultar's manner less interesting than his matter",
"he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind",
"he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it but little of rocky ithaca",
"he tells us that at this festive season of the year with christmas and roast beef looming before us"
" similes drawn from eating and its results occur most readily to the mind",
"he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it"
" but little of rocky ithaca",
]
self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS)

View File

@@ -602,7 +602,9 @@ class TFSpeech2TextModelIntegrationTests(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"mister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
"nor is mister cultar's manner less interesting than his matter",
"he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind",
"he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it but little of rocky ithaca",
"he tells us that at this festive season of the year with christmas and roast beef looming before us"
" similes drawn from eating and its results occur most readily to the mind",
"he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it"
" but little of rocky ithaca",
]
self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -251,7 +251,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
for i, token in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

View File

@@ -589,7 +589,10 @@ class ViltModelIntegrationTest(unittest.TestCase):
image1 = Image.open(dataset[0]["file"]).convert("RGB")
image2 = Image.open(dataset[1]["file"]).convert("RGB")
text = "The left image contains twice the number of dogs as the right image, and at least two dogs in total are standing."
text = (
"The left image contains twice the number of dogs as the right image, and at least two dogs in total are"
" standing."
)
encoding_1 = processor(image1, text, return_tensors="pt")
encoding_2 = processor(image2, text, return_tensors="pt")

View File

@@ -463,7 +463,8 @@ class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"a man said to the universe sir i exist",
"sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
" him with the thousands of spectators were trivialities not worth thinking about",
"his instant panic was followed by a small sharp blow high on his chest",
]
self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)

View File

@@ -548,7 +548,8 @@ class TFWav2Vec2ModelIntegrationTest(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"a man said to the universe sir i exist",
"sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
" him with the thousands of spectators were trivialities not worth thinking about",
"his instant panic was followed by a small sharp blow high on his chest",
]
self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)

View File

@@ -1179,7 +1179,8 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"a man said to the universe sir i exist",
"sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
"the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
" him with the thousands of spectators were trivialities not worth thinking about",
"his instant panic was followed by a small sharp blow high on his chest",
]
self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
@@ -1461,8 +1462,11 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
EXPECTED_TRANSCRIPTIONS = [
"ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t",
"s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ",
"ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t",
"s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ"
" n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ",
"ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː"
" v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ"
" ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t",
"h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t",
]
# should correspond to =>:

View File

@@ -179,7 +179,10 @@ class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow
def test_tokenization_base_hard_symbols(self):
symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to unk, such as saoneuhaoesuth'
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to unk, such as saoneuhaoesuth"
)
# fmt: off
original_tokenizer_encodings = [2, 1018, 67, 11, 1988, 2617, 5631, 278, 11, 3407, 48, 71630, 28085, 4, 3234, 157, 13, 6, 5, 6, 4, 3526, 768, 15, 659, 57, 298, 3983, 864, 129, 21, 6, 5, 13675, 377, 652, 7580, 10341, 155, 2817, 422, 1666, 7, 1674, 53, 113, 202277, 17892, 33, 60, 87, 4, 3234, 157, 61, 2667, 52376, 19, 88, 23, 735]
# fmt: on

View File

@@ -102,8 +102,18 @@ class XLMProphetNetModelIntegrationTest(unittest.TestCase):
tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased-xglue-ntg")
EN_SENTENCE = "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after January 14, 2020, according to the official portal of the organization. From that day, users of this system will not be able to receive security updates, which could make their computers vulnerable to cyber attacks."
RU_SENTENCE = "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7 после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми к кибератакам."
EN_SENTENCE = (
"Microsoft Corporation intends to officially end free support for the Windows 7 operating system after"
" January 14, 2020, according to the official portal of the organization. From that day, users of this"
" system will not be able to receive security updates, which could make their computers vulnerable to"
" cyber attacks."
)
RU_SENTENCE = (
"орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7"
" после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи"
" этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми"
" к кибератакам."
)
ZH_SENTENCE = (
"根据该组织的官方门户网站微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起该系统的用户将无法接收安全更新这可能会使他们的计算机容易受到网络攻击。"
)
@@ -132,8 +142,9 @@ class XLMProphetNetModelIntegrationTest(unittest.TestCase):
tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1
]
EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ")
EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split(
" "
EXPECTED_TITLE_RU_BEAM1_TOK = (
"▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года"
.split(" ")
)
EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ")
self.assertListEqual(

View File

@@ -256,7 +256,10 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@slow
def test_tokenization_base_hard_symbols(self):
symbols = 'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth'
symbols = (
'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
" add words that should not exsist and be tokenized to <unk>, such as saoneuhaoesuth"
)
original_tokenizer_encodings = [
0,
3293,