diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index f6af4ae8c0..cfd8c50dd6 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -24,6 +24,7 @@ The tokenizer takes care of splitting the sequence into tokens available in the :: + # Continuation of the previous script tokenized_sequence = tokenizer.tokenize(sequence) assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M'] @@ -33,6 +34,7 @@ this, the recommended being `encode` or `encode_plus`, which leverage the Rust i :: + # Continuation of the previous script encoded_sequence = tokenizer.encode(sequence) assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102] @@ -48,6 +50,9 @@ For example, consider these two sequences: :: + from transformers import BertTokenizer + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + sequence_a = "This is a short sequence." sequence_b = "This is a rather long sequence. It is at least longer than the sequence A." @@ -65,10 +70,11 @@ In the first case, the list of IDs will be extended by the padding indices: :: + # Continuation of the previous script padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True) - assert padded_sequence_a = [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - assert encoded_sequence_b = [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102] + assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102] These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the position of the padded indices so that the model does not attend to them. For the @@ -79,6 +85,7 @@ The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to :: + # Continuation of the previous script sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True) assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] @@ -94,6 +101,9 @@ tokens. For example, the BERT model builds its two sequence input as such: :: + from transformers import BertTokenizer + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + # [CLS] SEQ_A [SEP] SEQ_B [SEP] sequence_a = "HuggingFace is based in NYC" @@ -110,10 +120,11 @@ We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output :: + # Continuation of the previous script encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b) - assert sequence_a_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102] - assert sequence_a_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1] + assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102] + assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1] The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an diff --git a/tests/test_examples.py b/tests/test_examples.py index f8a204e800..f61e85f73a 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -15,6 +15,8 @@ import os import unittest +from typing import List, Union + from .utils import require_torch @@ -26,34 +28,84 @@ def get_examples_from_file(file): for i, line in enumerate(file): if example_mode: current_indentation = len(line) - len(line.strip()) - 1 - if current_indentation == example_indentation or '"""' in line: + + # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return. + empty_line = example_indentation == 0 and len(line) == 1 + + # If we're back to the example indentation or if it's the end of the docstring. + if (current_indentation == example_indentation and not empty_line) or '"""' in line: + # Exit the example mode and add the example to the examples list example_mode = False example_indentation = None examples.append(example) example = [] else: + # If line is not empty, add it to the current example if line is not "\n": example.append(line[example_indentation + 4 : -1]) + + # Detect the example from '::' or 'example::' if "example::" in line.lower(): example_mode = True example_indentation = line.lower().find("example::") + elif "examples::" in line.lower(): + example_mode = True + example_indentation = line.lower().find("examples::") + elif "::" in line.lower(): + example_mode = True + example_indentation = line.lower().find("::") - return ['\n'.join(example) for example in examples] + return ["\n".join(example) for example in examples] @require_torch class TestCodeExamples(unittest.TestCase): - def test_configuration_examples(self): - transformers_directory = "../src/transformers" - configuration_files = [file for file in os.listdir(transformers_directory) if "configuration" in file] + def analyze_directory( + self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None + ): + files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))] - for configuration_file in configuration_files: - with open(os.path.join(transformers_directory, configuration_file)) as f: + if identifier is not None: + files = [file for file in files if identifier in file] + + if ignore_files is not None: + files = [file for file in files if file not in ignore_files] + + for file in files: + # Open all files + with open(os.path.join(directory, file)) as f: + # Retrieve examples examples = get_examples_from_file(f) - print("Testing", configuration_file, str(len(examples)) + "/" + str(len(examples))) + joined_examples = [] def execute_example(code_example): exec(code_example) - with self.subTest(msg=configuration_file): - [execute_example(code_example) for code_example in examples] + # Some examples are the continuation of others. + if len(examples) > 1: + joined_examples.append(examples[0]) + joined_examples_index = 0 + for example in examples[1:]: + # If they contain this line, then they're a continuation of the previous script + if "# Continuation of the previous script" in example: + joined_examples[joined_examples_index] += "\n" + example + # If not, create a new example and increment the index + else: + joined_examples.append(example) + joined_examples_index += 1 + + print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples))) + + # Execute sub tests with every example. + with self.subTest(msg=file): + [execute_example(code_example) for code_example in joined_examples] + + def test_configuration_examples(self): + transformers_directory = "src/transformers" + configuration_files = "configuration" + ignore_files = ["configuration_auto.py", "configuration_utils.py"] + self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files) + + def test_main_doc_examples(self): + doc_directory = "docs/source" + self.analyze_directory(doc_directory)