Refactor Code samples; Test code samples (#5036)

* Refactor code samples * Test docstrings * Style * Tokenization examples * Run rest of tests * First step to testing source docs * Style and BART comment * Test the remainder of the code samples * Style * let to const * Formatting fixes * Ready for merge * Fix fixture + Style * Fix last tests * Update docs/source/quicktour.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Addressing @sgugger's comments + Fix MobileBERT in TF Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-06-25 16:46:00 -04:00
parent 315f464b0a
commit 364a5ae1f0
68 changed files with 1962 additions and 2979 deletions
--- a/tests/test_doc_samples.py
+++ b/tests/test_doc_samples.py
@@ -13,52 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import doctest
+import logging
 import os
 import unittest
+from pathlib import Path
 from typing import List, Union

+import transformers
+
 from .utils import require_tf, require_torch, slow


-def get_examples_from_file(file):
-    examples = []
-    example = []
-    example_mode = False
-    example_indentation = None
-    for i, line in enumerate(file):
-        if example_mode:
-            current_indentation = len(line) - len(line.strip()) - 1
-
-            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
-            empty_line = example_indentation == 0 and len(line) == 1
-
-            # If we're back to the example indentation or if it's the end of the docstring.
-            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
-                # Exit the example mode and add the example to the examples list
-                example_mode = False
-                example_indentation = None
-                examples.append(example)
-                example = []
-            else:
-                # If line is not empty, add it to the current example
-                if line != "\n":
-                    example.append(line[example_indentation + 4 : -1])
-
-        # Detect the example from '::' or 'example::'
-        if "example::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("example::")
-        elif "examples::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("examples::")
-        # elif "::" in line.lower() and len(line.strip()) == 2:
-        #     example_mode = True
-        #     example_indentation = line.lower().find("::")
-
-    examples = ["\n".join(example) for example in examples]
-    examples = [example for example in examples if "not runnable" not in example.lower()]
-
-    return examples
+logger = logging.getLogger()


@require_torch
@@ -66,68 +33,81 @@ def get_examples_from_file(file):
@slow
 class TestCodeExamples(unittest.TestCase):
    def analyze_directory(
-        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
+        self,
+        directory: Path,
+        identifier: Union[str, None] = None,
+        ignore_files: Union[List[str], None] = [],
+        n_identifier: Union[str, None] = None,
+        only_modules: bool = True,
    ):
+        """
+        Runs through the specified directory, looking for the files identified with `identifier`, and executes
+        the doctests in those files.
+
+        Args:
+            directory (:obj:`Path`): Directory containing the files
+            identifier (:obj:`str`): Only files whose name contains this string are parsed
+            ignore_files (:obj:`List[str]`): List of files to skip
+            n_identifier (:obj:`str` or :obj:`List[str]`): Will not parse files containing this/these identifiers.
+            only_modules (:obj:`bool`): Whether to only analyze modules
+        """
        files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

        if identifier is not None:
            files = [file for file in files if identifier in file]

-        if ignore_files is not None:
-            files = [file for file in files if file not in ignore_files]
+        if n_identifier is not None:
+            if isinstance(n_identifier, List):
+                for n_ in n_identifier:
+                    files = [file for file in files if n_ not in file]
+            else:
+                files = [file for file in files if n_identifier not in file]
+
+        ignore_files.append("__init__.py")
+        files = [file for file in files if file not in ignore_files]

        for file in files:
            # Open all files
-            print("Testing", file, end=" ")
-            with open(os.path.join(directory, file)) as f:
-                # Retrieve examples
-                examples = get_examples_from_file(f)
-                joined_examples = []
+            print("Testing", file)

-                def execute_example(code_example):
-                    exec(code_example, {})
-
-                # Some examples are the continuation of others.
-                if len(examples) > 0:
-                    joined_examples.append(examples[0])
-                    joined_examples_index = 0
-                    for example in examples[1:]:
-                        # If they contain this line, then they're a continuation of the previous script
-                        if "# Continuation of the previous script" in example:
-                            joined_examples[joined_examples_index] += "\n" + example
-                        # If not, create a new example and increment the index
-                        else:
-                            joined_examples.append(example)
-                            joined_examples_index += 1
-
-                print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
-
-                # Execute sub tests with every example.
-                for index, code_example in enumerate(joined_examples):
-                    with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
-                        execute_example(code_example)
-
-    def test_configuration_examples(self):
-        transformers_directory = "src/transformers"
-        configuration_files = "configuration"
-        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
-        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
-
-    def test_main_doc_examples(self):
-        doc_directory = "docs/source"
-        ignore_files = ["favicon.ico"]
-        self.analyze_directory(doc_directory, ignore_files=ignore_files)
+            if only_modules:
+                try:
+                    module_identifier = file.split(".")[0]
+                    module_identifier = getattr(transformers, module_identifier)
+                    suite = doctest.DocTestSuite(module_identifier)
+                    result = unittest.TextTestRunner().run(suite)
+                    self.assertIs(len(result.failures), 0)
+                except AttributeError:
+                    logger.info(f"{module_identifier} is not a module.")
+            else:
+                result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS)
+                self.assertIs(result.failed, 0)

    def test_modeling_examples(self):
        transformers_directory = "src/transformers"
-        modeling_files = "modeling"
+        files = "modeling"
        ignore_files = [
-            "modeling_auto.py",
-            "modeling_t5.py",
-            "modeling_tf_auto.py",
-            "modeling_utils.py",
-            "modeling_tf_t5.py",
-            "modeling_bart.py",
-            "modeling_tf_utils.py",
+            "modeling_ctrl.py",
+            "modeling_tf_ctrl.py",
        ]
-        self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
+        self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
+
+    def test_tokenization_examples(self):
+        transformers_directory = Path("src/transformers")
+        files = "tokenization"
+        self.analyze_directory(transformers_directory, identifier=files)
+
+    def test_configuration_examples(self):
+        transformers_directory = Path("src/transformers")
+        files = "configuration"
+        self.analyze_directory(transformers_directory, identifier=files)
+
+    def test_remaining_examples(self):
+        transformers_directory = Path("src/transformers")
+        n_identifiers = ["configuration", "modeling", "tokenization"]
+        self.analyze_directory(transformers_directory, n_identifier=n_identifiers)
+
+    def test_doc_sources(self):
+        doc_source_directory = Path("docs/source")
+        ignore_files = ["favicon.ico"]
+        self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False)
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -31,6 +31,7 @@ if is_tf_available():
        TFXLMWithLMHeadModel,
        TFXLMForSequenceClassification,
        TFXLMForQuestionAnsweringSimple,
+        TFXLMForTokenClassification,
        TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
    )

@@ -219,6 +220,26 @@ class TFXLMModelTester:

        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])

+    def create_and_check_xlm_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        config.num_labels = self.num_labels
+        model = TFXLMForTokenClassification(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        (logits,) = model(inputs)
+        result = {
+            "logits": logits.numpy(),
+        }
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
+
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (
@@ -244,7 +265,14 @@ class TFXLMModelTester:
 class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (
-        (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
+        # TODO The multiple choice model is missing and should be added.
+        (
+            TFXLMModel,
+            TFXLMWithLMHeadModel,
+            TFXLMForSequenceClassification,
+            TFXLMForQuestionAnsweringSimple,
+            TFXLMForTokenClassification,
+        )
        if is_tf_available()
        else ()
    )
@@ -275,6 +303,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)

+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
+
    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: