From c1aaa439350051acdcd585946e91525502a6b063 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 9 Mar 2022 13:09:56 +0100 Subject: [PATCH] [Doctests] Move doctests to new GPU & Fix bugs (#15969) * test * up * up * Empty test commit * up * update tests * up * fix some vision models * correct * correct docs * Trigger notification * finalize * check * correct quicktour * Apply suggestions from code review * improve doctests * Trigger Build * next try * next try * and again * Output current clone information * Output current clone information * Correct path * add tf round again * revert to daily job Co-authored-by: Lysandre --- .github/workflows/doctests.yml | 30 ++++++++++------ docs/source/quicktour.mdx | 11 +++--- src/transformers/models/beit/modeling_beit.py | 2 +- .../models/convnext/modeling_convnext.py | 2 +- src/transformers/models/deit/modeling_deit.py | 5 ++- .../models/poolformer/modeling_poolformer.py | 4 +-- .../models/segformer/modeling_segformer.py | 2 +- .../speech_to_text/modeling_speech_to_text.py | 10 +++--- .../modeling_speech_to_text_2.py | 15 ++++---- src/transformers/models/swin/modeling_swin.py | 2 +- src/transformers/models/vit/modeling_vit.py | 2 +- .../models/wav2vec2/modeling_wav2vec2.py | 35 ------------------- 12 files changed, 50 insertions(+), 70 deletions(-) diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 6603941131..843ff84b63 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -16,35 +16,43 @@ env: OMP_NUM_THREADS: 16 MKL_NUM_THREADS: 16 PYTEST_TIMEOUT: 600 + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true jobs: run_doctests: - runs-on: [self-hosted, docker-gpu-test, single-gpu] + runs-on: [self-hosted, doc-tests-gpu] container: - image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime + image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - name: Launcher docker - uses: actions/checkout@v2 + - uses: actions/checkout@v2 + with: + repository: 'huggingface/transformers' + path: transformers - name: NVIDIA-SMI run: | nvidia-smi - - name: Install dependencies + - name: GPU visibility + working-directory: transformers run: | - apt -y update && apt install -y libsndfile1-dev - pip install --upgrade pip - pip install .[testing,torch-speech] + utils/print_env_pt.py + TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" + TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" - name: Prepare files for doctests + working-directory: transformers run: | - python utils/prepare_for_doc_test.py src docs + python3 utils/prepare_for_doc_test.py src docs - name: Run doctests + working-directory: transformers run: | - pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" + python3 -m pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx" - name: Clean files after doctests + working-directory: transformers run: | - python utils/prepare_for_doc_test.py src docs --remove_new_line + python3 utils/prepare_for_doc_test.py src docs --remove_new_line diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index 9f6572b5d4..30a58eb0b7 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -99,12 +99,13 @@ The [`pipeline`] can also iterate over an entire dataset. Start by installing th pip install datasets ``` -Create a [`pipeline`] with the task you want to solve for and the model you want to use. Set the `device` parameter to `0` to place the tensors on a CUDA device: +Create a [`pipeline`] with the task you want to solve for and the model you want to use. ```py +>>> import torch >>> from transformers import pipeline ->>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0) +>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") ``` Next, load a dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) for more details) you'd like to iterate over. For example, let's load the [SUPERB](https://huggingface.co/datasets/superb) dataset: @@ -264,10 +265,10 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], >>> import tensorflow as tf >>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> print(tf_predictions) +>>> print(tf.math.round(tf_predictions * 10**4) / 10**4) tf.Tensor( -[[0.00206 0.00177 0.01155 0.21209 0.77253] - [0.20842 0.18262 0.19693 0.1755 0.23652]], shape=(2, 5), dtype=float32) +[[0.0021 0.0018 0.0116 0.2121 0.7725] + [0.2084 0.1826 0.1969 0.1755 0.2365]], shape=(2, 5), dtype=float32) ``` diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index ce12de6e8d..d88f26a308 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -55,7 +55,7 @@ _EXPECTED_OUTPUT_SHAPE = [1, 197, 768] # Image classification docstring _IMAGE_CLASS_CHECKPOINT = "microsoft/beit-base-patch16-224" -_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" BEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "microsoft/beit-base-patch16-224", diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py index f66c320255..3d53a8fe72 100755 --- a/src/transformers/models/convnext/modeling_convnext.py +++ b/src/transformers/models/convnext/modeling_convnext.py @@ -46,7 +46,7 @@ _EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7] # Image classification docstring _IMAGE_CLASS_CHECKPOINT = "facebook/convnext-tiny-224" -_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" CONVNEXT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/convnext-tiny-224", diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index 9b3b3a1539..9696db6a87 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -51,7 +51,7 @@ _EXPECTED_OUTPUT_SHAPE = [1, 198, 768] # Image classification docstring _IMAGE_CLASS_CHECKPOINT = "facebook/deit-base-distilled-patch16-224" -_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" DEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -697,9 +697,11 @@ class DeiTForImageClassification(DeiTPreTrainedModel): ```python >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassification + >>> import torch >>> from PIL import Image >>> import requests + >>> torch.manual_seed(3) # doctest: +IGNORE_RESULT >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -714,6 +716,7 @@ class DeiTForImageClassification(DeiTPreTrainedModel): >>> # model predicts one of the 1000 ImageNet classes >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) + Predicted class: maillot ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py index 17205e3112..40fa4e38e3 100755 --- a/src/transformers/models/poolformer/modeling_poolformer.py +++ b/src/transformers/models/poolformer/modeling_poolformer.py @@ -44,11 +44,11 @@ _FEAT_EXTRACTOR_FOR_DOC = "PoolFormerFeatureExtractor" # Base docstring _CHECKPOINT_FOR_DOC = "sail/poolformer_s12" -_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] +_EXPECTED_OUTPUT_SHAPE = [1, 512, 7, 7] # Image classification docstring _IMAGE_CLASS_CHECKPOINT = "sail/poolformer_s12" -_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "sail/poolformer_s12", diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index 309d18d78d..34bbbb29d3 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -49,7 +49,7 @@ _EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16] # Image classification docstring _IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0" -_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ "nvidia/segformer-b0-finetuned-ade-512-512", diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index d674b12273..b0d5ee7a28 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1168,9 +1168,10 @@ class Speech2TextModel(Speech2TextPreTrainedModel): >>> model = Speech2TextModel.from_pretrained("facebook/s2t-small-librispeech-asr") >>> feature_extractor = Speech2TextFeatureExtractor.from_pretrained("facebook/s2t-small-librispeech-asr") >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> input_features = feature_extractor( + >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" - >>> ).input_features + ... ) + >>> input_features = inputs.input_features >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state >>> list(last_hidden_state.shape) @@ -1322,9 +1323,10 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel): >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> input_features = processor( + >>> inputs = processor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" - >>> ).input_features + ... ) + >>> input_features = inputs.input_features >>> generated_ids = model.generate(inputs=input_features) diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index c454a9ab67..292c58c828 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -874,24 +874,25 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): >>> encoder = Wav2Vec2Model(Wav2Vec2Config()) >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config()) - # init random speech2text model + >>> # init random speech2text model >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder) >>> model.config.pad_token_id = tokenizer.pad_token_id >>> model.config.decoder_start_token_id = tokenizer.bos_token_id - # pre-process inputs and labels + >>> # pre-process inputs and labels >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> input_values = feature_extractor( + >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" - >>> ).input_values # Batch size 1 + ... ) + >>> input_values = inputs.input_values >>> decoder_input_ids = tokenizer(ds[0]["text"], return_tensors="pt").input_ids - # compute loss + >>> # compute loss >>> loss = model(inputs=input_values, labels=decoder_input_ids).loss - # backprop loss + >>> # backprop loss - >>> loss.backward() + >>> loss.backward() # doctest: +IGNORE_RESULT ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index ea255a6d6d..bdfc66b0dc 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -48,7 +48,7 @@ _EXPECTED_OUTPUT_SHAPE = [1, 49, 768] # Image classification docstring _IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224" -_IMAGE_CLASS_EXPECTED_OUTPUT = "'tabby, tabby cat'" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [ diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index bee1cd92ac..6422755e62 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -48,7 +48,7 @@ _EXPECTED_OUTPUT_SHAPE = [1, 197, 768] # Image classification docstring _IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224" -_IMAGE_CLASS_EXPECTED_OUTPUT = "'Egyptian cat'" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" VIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index d64747e007..ccacb74118 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1611,7 +1611,6 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): self.post_init() @add_start_docstrings_to_model_forward(WAV_2_VEC_2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Wav2Vec2BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_values, @@ -1621,40 +1620,6 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): return_dict=None, labels=None, ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - TODO(PVP): Fill out when adding training - - Returns: - - Example: - - ```python - >>> from transformers import Wav2Vec2Processor, Wav2Vec2ForMaskedLM - >>> from datasets import load_dataset - >>> import soundfile as sf - >>> import torch - - >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") - >>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h") - - - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch - - - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> ds = ds.map(map_to_array) - - >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 - >>> logits = model(input_values).logits - - >>> predicted_ids = torch.argmax(logits, dim=-1) - >>> transcription = processor.decode(predicted_ids[0]) - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.wav2vec2(