diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml new file mode 100644 index 0000000000..913fb7f4c5 --- /dev/null +++ b/.github/workflows/doctests.yml @@ -0,0 +1,42 @@ +name: Doctests + +on: + push: + branches: + - doctest* + repository_dispatch: + schedule: + - cron: "0 0 * * *" + + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + RUN_SLOW: yes + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 + PYTEST_TIMEOUT: 600 + +jobs: + run_doctests: + runs-on: [self-hosted, docker-gpu, single-gpu] + container: + image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Launcher docker + uses: actions/checkout@v2 + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Install dependencies + run: | + apt -y update && apt install -y libsndfile1-dev + pip install --upgrade pip + pip install .[dev] + + - name: Run doctests + run: | + pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst index 4db1645c09..b0039ff646 100644 --- a/docs/source/quicktour.rst +++ b/docs/source/quicktour.rst @@ -65,7 +65,7 @@ make them readable. For instance: .. code-block:: >>> classifier('We are very happy to show you the 🤗 Transformers library.') - [{'label': 'POSITIVE', 'score': 0.99978}] + [{'label': 'POSITIVE', 'score': 0.9998}] That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a `batch`, returning a list of dictionaries like this one: diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst index 0aa0d83412..fc6766ed1b 100644 --- a/docs/source/task_summary.rst +++ b/docs/source/task_summary.rst @@ -345,27 +345,27 @@ This outputs the sequences with the mask filled, the confidence score, and the t >>> from pprint import pprint >>> pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks.")) - [{'score': 0.179275, + [{'score': 0.1793, 'sequence': 'HuggingFace is creating a tool that the community uses to solve ' 'NLP tasks.', 'token': 3944, 'token_str': ' tool'}, - {'score': 0.113494, + {'score': 0.1135, 'sequence': 'HuggingFace is creating a framework that the community uses to ' 'solve NLP tasks.', 'token': 7208, 'token_str': ' framework'}, - {'score': 0.0524355, + {'score': 0.0524, 'sequence': 'HuggingFace is creating a library that the community uses to ' 'solve NLP tasks.', 'token': 5560, 'token_str': ' library'}, - {'score': 0.0349353, + {'score': 0.0349, 'sequence': 'HuggingFace is creating a database that the community uses to ' 'solve NLP tasks.', 'token': 8503, 'token_str': ' database'}, - {'score': 0.0286025, + {'score': 0.0286, 'sequence': 'HuggingFace is creating a prototype that the community uses to ' 'solve NLP tasks.', 'token': 17715, @@ -458,7 +458,7 @@ of tokens. .. code-block:: >>> ## PYTORCH CODE - >>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, top_k_top_p_filtering + >>> from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering >>> import torch >>> from torch import nn @@ -476,9 +476,6 @@ of tokens. >>> # filter >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) - >>> # set seed for reproducibility - >>> set_seed(42) - >>> # sample >>> probs = nn.functional.softmax(filtered_next_token_logits, dim=-1) >>> next_token = torch.multinomial(probs, num_samples=1) @@ -486,8 +483,10 @@ of tokens. >>> generated = torch.cat([input_ids, next_token], dim=-1) >>> resulting_string = tokenizer.decode(generated.tolist()[0]) + >>> print(resulting_string) + Hugging Face is based in DUMBO, New York City, and ... >>> ## TENSORFLOW CODE - >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer, set_seed, tf_top_k_top_p_filtering + >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer, tf_top_k_top_p_filtering >>> import tensorflow as tf >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") @@ -504,24 +503,17 @@ of tokens. >>> # filter >>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) - >>> # set seed for reproducibility - >>> set_seed(42) - >>> # sample >>> next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1) >>> generated = tf.concat([input_ids, next_token], axis=1) >>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0]) - - -This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word -*features*: - -.. code-block:: - >>> print(resulting_string) - Hugging Face is based in DUMBO, New York City, and features + Hugging Face is based in DUMBO, New York City, and ... + +This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *is* or +*features*. In the next section, we show how :func:`~transformers.generation_utils.GenerationMixin.generate` can be used to generate multiple tokens up to a specified length instead of one token at a time. @@ -555,7 +547,7 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which .. code-block:: >>> ## PYTORCH CODE - >>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed + >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased") >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") @@ -573,19 +565,14 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which ... with people, even a bishop, begging for his blessing. """ >>> prompt = "Today the weather is really nice and I am planning on " - >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt") + >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"] - >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) + >>> prompt_length = len(tokenizer.decode(inputs[0])) >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60) - >>> # set seed for reproducibility - >>> set_seed(42) - >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:] + >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:] >>> print(generated) - Today the weather is really nice and I am planning on anning on going to a nearby restaurant on Monday. It is very - cool with the clouds and the wind. A nice afternoon is on the way out of there, when I get my phone in the sun. - Sounds like its a good day in my house, but on that "good"" thing. There is a group of people who'd want to be out - and + Today the weather is really nice and I am planning ... >>> ## TENSORFLOW CODE >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer @@ -605,18 +592,14 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which ... with people, even a bishop, begging for his blessing. """ >>> prompt = "Today the weather is really nice and I am planning on " - >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf") + >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")["input_ids"] - >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) - >>> # set seed for reproducibility - >>> set_seed(42) + >>> prompt_length = len(tokenizer.decode(inputs[0])) >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60) - >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:] + >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:] >>> print(generated) - Today the weather is really nice and I am planning on anning on riding a "" over to the coast. It is also an ideal - day to fly to Bali and see more of the local "". “...The weather is great for travel and traveling as far as - local "".”. When the weather is good, I will ride my "" over to the coast and see more of + Today the weather is really nice and I am planning ... Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in @@ -669,18 +652,18 @@ Here are the expected results: >>> for entity in ner_pipe(sequence): ... print(entity) - {'entity': 'I-ORG', 'score': 0.999579, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} - {'entity': 'I-ORG', 'score': 0.990976, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} - {'entity': 'I-ORG', 'score': 0.998223, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} - {'entity': 'I-ORG', 'score': 0.999488, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16} - {'entity': 'I-LOC', 'score': 0.999434, 'index': 11, 'word': 'New', 'start': 40, 'end': 43} - {'entity': 'I-LOC', 'score': 0.999320, 'index': 12, 'word': 'York', 'start': 44, 'end': 48} - {'entity': 'I-LOC', 'score': 0.999379, 'index': 13, 'word': 'City', 'start': 49, 'end': 53} - {'entity': 'I-LOC', 'score': 0.986258, 'index': 19, 'word': 'D', 'start': 79, 'end': 80} - {'entity': 'I-LOC', 'score': 0.951427, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82} - {'entity': 'I-LOC', 'score': 0.933659, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84} - {'entity': 'I-LOC', 'score': 0.976165, 'index': 28, 'word': 'Manhattan', 'start': 114, 'end': 123} - {'entity': 'I-LOC', 'score': 0.991463, 'index': 29, 'word': 'Bridge', 'start': 124, 'end': 130} + {'entity': 'I-ORG', 'score': 0.9996, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} + {'entity': 'I-ORG', 'score': 0.9910, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} + {'entity': 'I-ORG', 'score': 0.9982, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} + {'entity': 'I-ORG', 'score': 0.9995, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16} + {'entity': 'I-LOC', 'score': 0.9994, 'index': 11, 'word': 'New', 'start': 40, 'end': 43} + {'entity': 'I-LOC', 'score': 0.9993, 'index': 12, 'word': 'York', 'start': 44, 'end': 48} + {'entity': 'I-LOC', 'score': 0.9994, 'index': 13, 'word': 'City', 'start': 49, 'end': 53} + {'entity': 'I-LOC', 'score': 0.9863, 'index': 19, 'word': 'D', 'start': 79, 'end': 80} + {'entity': 'I-LOC', 'score': 0.9514, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82} + {'entity': 'I-LOC', 'score': 0.9337, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84} + {'entity': 'I-LOC', 'score': 0.9762, 'index': 28, 'word': 'Manhattan', 'start': 114, 'end': 123} + {'entity': 'I-LOC', 'score': 0.9915, 'index': 29, 'word': 'Bridge', 'start': 124, 'end': 130} Note how the tokens of the sequence "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and "Manhattan Bridge" have been identified as locations. diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt new file mode 100644 index 0000000000..5aaed9f481 --- /dev/null +++ b/utils/documentation_tests.txt @@ -0,0 +1,2 @@ +docs/source/quicktour.rst +docs/source/task_summary.rst \ No newline at end of file