From 7829c890db958279ca49519cc009e4f2def3fccb Mon Sep 17 00:00:00 2001
From: raghavanone <115454562+raghavanone@users.noreply.github.com>
Date: Wed, 26 Oct 2022 20:36:46 +0530
Subject: [PATCH] Change the import of kenlm from github to pypi (#19770)

* Change the import of kenlm from github to pypi

* Change the import of kenlm from github to pypi in circleci config

* Fix code quality issues

* Fix isort issue, add kenlm in extras for audio

* Add kenlm to deps

* Add kenlm to deps

* Commit 'make fixup' changes

* Remove version from kenlm deps

* commit make fixup changes

* Remove manual installation of kenlm

* Remove manual installation of kenlm

* Remove manual installation of kenlm
---
 .circleci/create_circleci_config.py           |  6 ------
 docker/transformers-all-latest-gpu/Dockerfile |  2 +-
 docker/transformers-doc-builder/Dockerfile    |  2 +-
 docker/transformers-pytorch-gpu/Dockerfile    |  2 +-
 setup.py                                      |  3 ++-
 src/transformers/dependency_versions_table.py |  1 +
 src/transformers/pipelines/__init__.py        | 20 +++++++++++++------
 src/transformers/utils/__init__.py            |  1 +
 src/transformers/utils/import_utils.py        |  4 ++++
 9 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py
index 097bd71d01..9ef5ea1a63 100644
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@@ -127,7 +127,6 @@ torch_and_tf_job = CircleCIJob(
         "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]",
         TORCH_SCATTER_INSTALL,
         "pip install tensorflow_probability",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
         "pip install git+https://github.com/huggingface/accelerate",
     ],
     marker="is_pt_tf_cross_test",
@@ -143,7 +142,6 @@ torch_and_flax_job = CircleCIJob(
         "pip install --upgrade pip",
         "pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]",
         TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
         "pip install git+https://github.com/huggingface/accelerate",
     ],
     marker="is_pt_flax_cross_test",
@@ -158,7 +156,6 @@ torch_job = CircleCIJob(
         "pip install --upgrade pip",
         "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
         TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
         "pip install git+https://github.com/huggingface/accelerate",
     ],
     pytest_num_workers=3,
@@ -172,7 +169,6 @@ tf_job = CircleCIJob(
         "pip install --upgrade pip",
         "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]",
         "pip install tensorflow_probability",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
     ],
     pytest_options={"rA": None},
 )
@@ -184,7 +180,6 @@ flax_job = CircleCIJob(
         "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
         "pip install --upgrade pip",
         "pip install .[flax,testing,sentencepiece,flax-speech,vision]",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
     ],
     pytest_options={"rA": None},
 )
@@ -197,7 +192,6 @@ pipelines_torch_job = CircleCIJob(
         "pip install --upgrade pip",
         "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
         TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
     ],
     pytest_options={"rA": None},
     tests_to_run="tests/pipelines/"
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 27f0965f5c..10ee71890a 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -46,7 +46,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow
 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html
 RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable
 
-RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
diff --git a/docker/transformers-doc-builder/Dockerfile b/docker/transformers-doc-builder/Dockerfile
index de0eb17137..c693f2843c 100644
--- a/docker/transformers-doc-builder/Dockerfile
+++ b/docker/transformers-doc-builder/Dockerfile
@@ -11,7 +11,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]
 
 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "from torch import version; print(version.__version__.split('+')[0])")+cpu.html
-RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
 
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
index 668bec3e71..7d0a2c878d 100644
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -23,7 +23,7 @@ RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' ||  VERSI
 RUN python3 -m pip uninstall -y tensorflow flax
 
 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu113.html
-RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"
 
 # When installing in editable mode, `transformers` is not recognized as a package.
diff --git a/setup.py b/setup.py
index cb5c50b058..9266a1de7f 100644
--- a/setup.py
+++ b/setup.py
@@ -123,6 +123,7 @@ _deps = [
     "jax>=0.2.8,!=0.3.2,<=0.3.6",
     "jaxlib>=0.1.65,<=0.3.6",
     "jieba",
+    "kenlm",
     "nltk",
     "numpy>=1.17",
     "onnxconverter-common",
@@ -274,7 +275,7 @@ extras["sigopt"] = deps_list("sigopt")
 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm")
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index e55219b796..1d6223f2a7 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -29,6 +29,7 @@ deps = {
     "jax": "jax>=0.2.8,!=0.3.2,<=0.3.6",
     "jaxlib": "jaxlib>=0.1.65,<=0.3.6",
     "jieba": "jieba",
+    "kenlm": "kenlm",
     "nltk": "nltk",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 4e8faa58d2..fece240680 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -36,7 +36,14 @@ from ..models.auto.modeling_auto import AutoModelForDepthEstimation
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..tokenization_utils import PreTrainedTokenizer
 from ..tokenization_utils_fast import PreTrainedTokenizerFast
-from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, is_tf_available, is_torch_available, logging
+from ..utils import (
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    is_kenlm_available,
+    is_pyctcdecode_available,
+    is_tf_available,
+    is_torch_available,
+    logging,
+)
 from .audio_classification import AudioClassificationPipeline
 from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
 from .base import (
@@ -837,11 +844,12 @@ def pipeline(
 
                     kwargs["decoder"] = decoder
                 except ImportError as e:
-                    logger.warning(
-                        f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install"
-                        " `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install"
-                        f" https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}"
-                    )
+                    logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Error: {e}")
+                    if not is_kenlm_available():
+                        logger.warning("Try to install `kenlm`: `pip install kenlm`")
+
+                    if not is_pyctcdecode_available():
+                        logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode`")
 
     if task == "translation" and model.config.task_specific_params:
         for key in model.config.task_specific_params:
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 03aa17bc83..2dbca85df0 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -108,6 +108,7 @@ from .import_utils import (
     is_in_notebook,
     is_ipex_available,
     is_jumanpp_available,
+    is_kenlm_available,
     is_librosa_available,
     is_more_itertools_available,
     is_ninja_available,
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index b0a3a888d3..9d7ffc4297 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -271,6 +271,10 @@ TORCH_FX_REQUIRED_VERSION = version.parse("1.10")
 TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8")
 
 
+def is_kenlm_available():  # True if a module spec for `kenlm` is found; the check itself imports nothing.
+    return importlib.util.find_spec("kenlm") is not None
+
+
 def is_torch_available():
     return _torch_available