diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst
index c1c2f5129e..857612afb1 100644
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -23,6 +23,7 @@ There are two categories of pipeline abstractions to be aware about:
 - The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
 - The other task-specific pipelines:
 
+    - :class:`~transformers.AudioClassificationPipeline`
     - :class:`~transformers.AutomaticSpeechRecognitionPipeline`
     - :class:`~transformers.ConversationalPipeline`
     - :class:`~transformers.FeatureExtractionPipeline`
@@ -30,13 +31,13 @@ There are two categories of pipeline abstractions to be aware about:
     - :class:`~transformers.ImageClassificationPipeline`
     - :class:`~transformers.QuestionAnsweringPipeline`
     - :class:`~transformers.SummarizationPipeline`
+    - :class:`~transformers.TableQuestionAnsweringPipeline`
     - :class:`~transformers.TextClassificationPipeline`
     - :class:`~transformers.TextGenerationPipeline`
+    - :class:`~transformers.Text2TextGenerationPipeline`
     - :class:`~transformers.TokenClassificationPipeline`
     - :class:`~transformers.TranslationPipeline`
     - :class:`~transformers.ZeroShotClassificationPipeline`
-    - :class:`~transformers.Text2TextGenerationPipeline`
-    - :class:`~transformers.TableQuestionAnsweringPipeline`
 
 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -50,6 +51,13 @@ pipeline but requires an additional argument which is the `task`.
 The task specific pipelines
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+AudioClassificationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.AudioClassificationPipeline
+    :special-members: __call__
+    :members:
+
 AutomaticSpeechRecognitionPipeline
 =======================================================================================================================
 
diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
index 7ccfbdf87d..2a013ed9b9 100644
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -135,6 +135,13 @@ AutoModelForImageClassification
     :members:
 
 
+AutoModelForAudioClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForAudioClassification
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index f35b3c7a48..d7b06c7275 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -277,6 +277,7 @@ _import_structure = {
     "models.xlm_roberta": ["XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMRobertaConfig"],
     "models.xlnet": ["XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLNetConfig"],
     "pipelines": [
+        "AudioClassificationPipeline",
         "AutomaticSpeechRecognitionPipeline",
         "Conversation",
         "ConversationalPipeline",
@@ -527,6 +528,7 @@ if is_torch_available():
     )
     _import_structure["models.auto"].extend(
         [
+            "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
             "MODEL_FOR_CAUSAL_LM_MAPPING",
             "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
             "MODEL_FOR_MASKED_LM_MAPPING",
@@ -542,6 +544,7 @@ if is_torch_available():
             "MODEL_MAPPING",
             "MODEL_WITH_LM_HEAD_MAPPING",
             "AutoModel",
+            "AutoModelForAudioClassification",
             "AutoModelForCausalLM",
             "AutoModelForImageClassification",
             "AutoModelForMaskedLM",
@@ -2040,6 +2043,7 @@ if TYPE_CHECKING:
 
     # Pipelines
     from .pipelines import (
+        AudioClassificationPipeline,
         AutomaticSpeechRecognitionPipeline,
         Conversation,
         ConversationalPipeline,
@@ -2248,6 +2252,7 @@ if TYPE_CHECKING:
             load_tf_weights_in_albert,
         )
         from .models.auto import (
+            MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
             MODEL_FOR_CAUSAL_LM_MAPPING,
             MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
             MODEL_FOR_MASKED_LM_MAPPING,
@@ -2263,6 +2268,7 @@ if TYPE_CHECKING:
             MODEL_MAPPING,
             MODEL_WITH_LM_HEAD_MAPPING,
             AutoModel,
+            AutoModelForAudioClassification,
             AutoModelForCausalLM,
             AutoModelForImageClassification,
             AutoModelForMaskedLM,
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index 76f21ee9cb..d4b3dc7d65 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -42,6 +42,7 @@ from .file_utils import (
     is_torch_available,
 )
 from .models.auto.modeling_auto import (
+    MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
     MODEL_FOR_MASKED_LM_MAPPING_NAMES,
@@ -66,6 +67,7 @@ TASK_MAPPING = {
     "text-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
     "table-question-answering": MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES,
     "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
+    "audio-classification": MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
 }
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index e0673fd124..b1db8f95e8 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -30,6 +30,7 @@ _import_structure = {
 
 if is_torch_available():
     _import_structure["modeling_auto"] = [
+        "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
         "MODEL_FOR_CAUSAL_LM_MAPPING",
         "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
         "MODEL_FOR_MASKED_LM_MAPPING",
@@ -45,6 +46,7 @@ if is_torch_available():
         "MODEL_MAPPING",
         "MODEL_WITH_LM_HEAD_MAPPING",
         "AutoModel",
+        "AutoModelForAudioClassification",
         "AutoModelForCausalLM",
         "AutoModelForImageClassification",
         "AutoModelForMaskedLM",
@@ -119,6 +121,7 @@ if TYPE_CHECKING:
 
     if is_torch_available():
         from .modeling_auto import (
+            MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
             MODEL_FOR_CAUSAL_LM_MAPPING,
             MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
             MODEL_FOR_MASKED_LM_MAPPING,
@@ -134,6 +137,7 @@ if TYPE_CHECKING:
             MODEL_MAPPING,
             MODEL_WITH_LM_HEAD_MAPPING,
             AutoModel,
+            AutoModelForAudioClassification,
             AutoModelForCausalLM,
             AutoModelForImageClassification,
             AutoModelForMaskedLM,
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 3a3cb9dc6f..be3a964904 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -444,6 +444,14 @@ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict(
     ]
 )
 
+MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+    [
+        # Model for Audio Classification mapping
+        ("wav2vec2", "Wav2Vec2ForSequenceClassification"),
+        ("hubert", "HubertForSequenceClassification"),
+    ]
+)
+
 MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
 MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES)
 MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES)
@@ -472,6 +480,9 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL
 MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES
 )
+MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
+)
 
 
 class AutoModel(_BaseAutoModelClass):
@@ -576,6 +587,13 @@ class AutoModelForImageClassification(_BaseAutoModelClass):
 AutoModelForImageClassification = auto_class_update(AutoModelForImageClassification, head_doc="image classification")
 
 
+class AutoModelForAudioClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
+
+
+AutoModelForAudioClassification = auto_class_update(AutoModelForAudioClassification, head_doc="audio classification")
+
+
 class AutoModelWithLMHead(_AutoModelWithLMHead):
     @classmethod
     def from_config(cls, config):
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index f603cbe5e6..ade54417f1 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -1616,7 +1616,6 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        # End copy
 
         if self.config.use_weighted_layer_sum:
             hidden_states = outputs[2]
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index c55fe6b181..ae460028ef 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -27,6 +27,7 @@ from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, Aut
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..tokenization_utils import PreTrainedTokenizer
 from ..utils import logging
+from .audio_classification import AudioClassificationPipeline
 from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
 from .base import (
     ArgumentHandler,
@@ -86,6 +87,7 @@ if is_torch_available():
         MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
         MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
         AutoModel,
+        AutoModelForAudioClassification,
         AutoModelForCausalLM,
         AutoModelForImageClassification,
         AutoModelForMaskedLM,
@@ -108,6 +110,12 @@ TASK_ALIASES = {
     "ner": "token-classification",
 }
 SUPPORTED_TASKS = {
+    "audio-classification": {
+        "impl": AudioClassificationPipeline,
+        "tf": (),
+        "pt": (AutoModelForAudioClassification,) if is_torch_available() else (),
+        "default": {"model": {"pt": "superb/wav2vec2-base-superb-ks"}},
+    },
     "automatic-speech-recognition": {
         "impl": AutomaticSpeechRecognitionPipeline,
         "tf": (),
diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py
new file mode 100644
index 0000000000..b3a3c7f948
--- /dev/null
+++ b/src/transformers/pipelines/audio_classification.py
@@ -0,0 +1,160 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import subprocess
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..file_utils import add_end_docstrings, is_torch_available
+from ..utils import logging
+from .base import PIPELINE_INIT_ARGS, Pipeline
+
+
+if TYPE_CHECKING:
+    from ..modeling_tf_utils import TFPreTrainedModel
+    from ..modeling_utils import PreTrainedModel
+
+if is_torch_available():
+    import torch
+
+    from ..models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
+
+logger = logging.get_logger(__name__)
+
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
+    """
+    Helper function to read an audio file through ffmpeg.
+    """
+    ar = f"{sampling_rate}"
+    ac = "1"
+    format_for_conversion = "f32le"
+    ffmpeg_command = [
+        "ffmpeg",
+        "-i",
+        "pipe:0",
+        "-ac",
+        ac,
+        "-ar",
+        ar,
+        "-f",
+        format_for_conversion,
+        "-hide_banner",
+        "-loglevel",
+        "quiet",
+        "pipe:1",
+    ]
+
+    try:
+        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    except FileNotFoundError:
+        raise ValueError("ffmpeg was not found but is required to load audio files from filename")
+    output_stream = ffmpeg_process.communicate(bpayload)
+    out_bytes = output_stream[0]
+
+    audio = np.frombuffer(out_bytes, np.float32)
+    if audio.shape[0] == 0:
+        raise ValueError("Malformed soundfile")
+    return audio
+
+
+@add_end_docstrings(PIPELINE_INIT_ARGS)
+class AudioClassificationPipeline(Pipeline):
+    """
+    Audio classification pipeline using any :obj:`AutoModelForAudioClassification`. This pipeline predicts the class of
+    a raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
+    formats.
+
+    This pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task identifier:
+    :obj:`"audio-classification"`.
+
+    See the list of available models on `huggingface.co/models
+    <https://huggingface.co/models?filter=audio-classification>`__.
+    """
+
+    def __init__(
+        self,
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        feature_extractor: PreTrainedFeatureExtractor,
+        framework: Optional[str] = None,
+        **kwargs
+    ):
+        super().__init__(model, feature_extractor=feature_extractor, framework=framework, **kwargs)
+
+        if self.framework != "pt":
+            raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+        self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING)
+
+    def __call__(
+        self,
+        inputs: Union[np.ndarray, bytes, str],
+        top_k: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Classify the sequence(s) given as inputs. See the :obj:`~transformers.AutomaticSpeechRecognitionPipeline`
+        documentation for more information.
+
+        Args:
+            inputs (:obj:`np.ndarray` or :obj:`bytes` or :obj:`str`):
+                The inputs is either a raw waveform (:obj:`np.ndarray` of shape (n, ) of type :obj:`np.float32` or
+                :obj:`np.float64`) at the correct sampling rate (no further check will be done) or a :obj:`str` that is
+                the filename of the audio file, the file will be read at the correct sampling rate to get the waveform
+                using `ffmpeg`. This requires `ffmpeg` to be installed on the system. If `inputs` is :obj:`bytes` it is
+                supposed to be the content of an audio file and is interpreted by `ffmpeg` in the same way.
+            top_k (:obj:`int`, `optional`, defaults to None):
+                The number of top labels that will be returned by the pipeline. If the provided number is `None` or
+                higher than the number of labels available in the model configuration, it will default to the number of
+                labels.
+
+        Return:
+            A list of :obj:`dict` with the following keys:
+
+            - **label** (:obj:`str`) -- The label predicted.
+            - **score** (:obj:`float`) -- The corresponding probability.
+        """
+        if isinstance(inputs, str):
+            with open(inputs, "rb") as f:
+                inputs = f.read()
+
+        if isinstance(inputs, bytes):
+            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+
+        if not isinstance(inputs, np.ndarray):
+            raise ValueError("We expect a numpy ndarray as input")
+        if len(inputs.shape) != 1:
+            raise ValueError("We expect a single channel audio input for AudioClassificationPipeline")
+
+        if top_k is None or top_k > self.model.config.num_labels:
+            top_k = self.model.config.num_labels
+
+        processed = self.feature_extractor(
+            inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+        )
+        processed = self.ensure_tensor_on_device(**processed)
+
+        with torch.no_grad():
+            outputs = self.model(**processed)
+
+            probs = outputs.logits[0].softmax(-1)
+            scores, ids = probs.topk(top_k)
+
+            scores = scores.tolist()
+            ids = ids.tolist()
+
+        labels = [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
+
+        return labels
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 86de6778ca..b63e3abdaa 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -307,6 +307,9 @@ def load_tf_weights_in_albert(*args, **kwargs):
     requires_backends(load_tf_weights_in_albert, ["torch"])
 
 
+MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None
+
+
 MODEL_FOR_CAUSAL_LM_MAPPING = None
 
 
@@ -358,6 +361,15 @@ class AutoModel:
         requires_backends(cls, ["torch"])
 
 
+class AutoModelForAudioClassification:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class AutoModelForCausalLM:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
diff --git a/tests/test_pipelines_audio_classification.py b/tests/test_pipelines_audio_classification.py
new file mode 100644
index 0000000000..4837b05c2a
--- /dev/null
+++ b/tests/test_pipelines_audio_classification.py
@@ -0,0 +1,120 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from transformers import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, PreTrainedTokenizer
+from transformers.pipelines import AudioClassificationPipeline, pipeline
+from transformers.testing_utils import (
+    is_pipeline_test,
+    nested_simplify,
+    require_datasets,
+    require_tf,
+    require_torch,
+    slow,
+)
+
+from .test_pipelines_common import ANY, PipelineTestCaseMeta
+
+
+@is_pipeline_test
+@require_torch
+class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
+    model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
+
+    @require_datasets
+    @slow
+    def run_pipeline_test(self, model, tokenizer, feature_extractor):
+        import datasets
+
+        audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=feature_extractor)
+
+        # test with a raw waveform
+        audio = np.zeros((34000,))
+        output = audio_classifier(audio)
+        # by default a model is initialized with num_labels=2
+        self.assertEqual(
+            output,
+            [
+                {"score": ANY(float), "label": ANY(str)},
+                {"score": ANY(float), "label": ANY(str)},
+            ],
+        )
+        output = audio_classifier(audio, top_k=1)
+        self.assertEqual(
+            output,
+            [
+                {"score": ANY(float), "label": ANY(str)},
+            ],
+        )
+
+        # test with a local file
+        dataset = datasets.load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        filename = dataset[0]["file"]
+        output = audio_classifier(filename)
+        self.assertEqual(
+            output,
+            [
+                {"score": ANY(float), "label": ANY(str)},
+                {"score": ANY(float), "label": ANY(str)},
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt(self):
+        model = "anton-l/wav2vec2-random-tiny-classifier"
+        tokenizer = PreTrainedTokenizer()
+        audio_classifier = pipeline("audio-classification", model=model, tokenizer=tokenizer)
+
+        audio = np.ones((8000,))
+        output = audio_classifier(audio, top_k=4)
+        self.assertEqual(
+            nested_simplify(output, decimals=4),
+            [
+                {"score": 0.0843, "label": "on"},
+                {"score": 0.0840, "label": "left"},
+                {"score": 0.0837, "label": "off"},
+                {"score": 0.0835, "label": "yes"},
+            ],
+        )
+
+    @require_torch
+    @require_datasets
+    @slow
+    def test_large_model_pt(self):
+        import datasets
+
+        model = "superb/wav2vec2-base-superb-ks"
+        tokenizer = PreTrainedTokenizer()
+        audio_classifier = pipeline("audio-classification", model=model, tokenizer=tokenizer)
+        dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test")
+
+        audio = np.array(dataset[3]["speech"], dtype=np.float32)
+        output = audio_classifier(audio, top_k=4)
+        self.assertEqual(
+            nested_simplify(output, decimals=4),
+            [
+                {"score": 0.9809, "label": "go"},
+                {"score": 0.0073, "label": "up"},
+                {"score": 0.0064, "label": "_unknown_"},
+                {"score": 0.0015, "label": "down"},
+            ],
+        )
+
+    @require_tf
+    @unittest.skip("Audio classification is not implemented for TF")
+    def test_small_model_tf(self):
+        pass
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 088d760aa9..068efc0b15 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -122,8 +122,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "TFRagTokenForGeneration",
     "Wav2Vec2ForCTC",
     "HubertForCTC",
-    "Wav2Vec2ForSequenceClassification",
-    "HubertForSequenceClassification",
     "XLMForQuestionAnswering",
     "XLNetForQuestionAnswering",
     "SeparableConv1D",