Can test examples spread over multiple blocks

2020-01-14 17:30:31 -05:00
parent eaa6b9afc6
commit 632675ea88
2 changed files with 77 additions and 14 deletions
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -24,6 +24,7 @@ The tokenizer takes care of splitting the sequence into tokens available in the

 ::

+    # Continuation of the previous script
    tokenized_sequence = tokenizer.tokenize(sequence)
    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']

@@ -33,6 +34,7 @@ this, the recommended being `encode` or `encode_plus`, which leverage the Rust i

 ::

+    # Continuation of the previous script
    encoded_sequence = tokenizer.encode(sequence)
    assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]

@@ -48,6 +50,9 @@ For example, consider these two sequences:

 ::

+    from transformers import BertTokenizer
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
    sequence_a = "This is a short sequence."
    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

@@ -65,10 +70,11 @@ In the first case, the list of IDs will be extended by the padding indices:

 ::

+    # Continuation of the previous script
    padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)

-    assert padded_sequence_a = [101, 1188, 1110, 170, 1603, 4954,  119, 102,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,   0]
-    assert encoded_sequence_b = [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
+    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954,  119, 102,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,   0]
+    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]

 These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
 the position of the padded indices so that the model does not attend to them. For the
@@ -79,6 +85,7 @@ The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to

 ::

+    # Continuation of the previous script
    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)

    assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
@@ -94,6 +101,9 @@ tokens. For example, the BERT model builds its two sequence input as such:

 ::

+    from transformers import BertTokenizer
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
    # [CLS] SEQ_A [SEP] SEQ_B [SEP]

    sequence_a = "HuggingFace is based in NYC"
@@ -110,10 +120,11 @@ We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output

 ::

+    # Continuation of the previous script
    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)

-    assert sequence_a_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
-    assert sequence_a_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
+    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

 The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
 question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel`, use an