Change the import of kenlm from github to pypi (#19770)

* Change the import of kenlm from github to pypi
* Change the import of kenlm from github to pypi in circleci config
* Fix code quality issues
* Fix isort issue, add kenlm in extras for audio
* Add kenlm to deps
* Add kenlm to deps
* Commit 'make fixup' changes
* Remove version from kenlm deps
* commit make fixup changes
* Remove manual installation of kenlm
* Remove manual installation of kenlm
* Remove manual installation of kenlm
2022-10-26 20:36:46 +05:30
parent aeae97829f
commit 7829c890db
9 changed files with 25 additions and 16 deletions
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@@ -127,7 +127,6 @@ torch_and_tf_job = CircleCIJob(
        "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]",
        TORCH_SCATTER_INSTALL,
        "pip install tensorflow_probability",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
        "pip install git+https://github.com/huggingface/accelerate",
    ],
    marker="is_pt_tf_cross_test",
@@ -143,7 +142,6 @@ torch_and_flax_job = CircleCIJob(
        "pip install --upgrade pip",
        "pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]",
        TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
        "pip install git+https://github.com/huggingface/accelerate",
    ],
    marker="is_pt_flax_cross_test",
@@ -158,7 +156,6 @@ torch_job = CircleCIJob(
        "pip install --upgrade pip",
        "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
        TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
        "pip install git+https://github.com/huggingface/accelerate",
    ],
    pytest_num_workers=3,
@@ -172,7 +169,6 @@ tf_job = CircleCIJob(
        "pip install --upgrade pip",
        "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]",
        "pip install tensorflow_probability",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
    ],
    pytest_options={"rA": None},
 )
@@ -184,7 +180,6 @@ flax_job = CircleCIJob(
        "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng",
        "pip install --upgrade pip",
        "pip install .[flax,testing,sentencepiece,flax-speech,vision]",
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
    ],
    pytest_options={"rA": None},
 )
@@ -197,7 +192,6 @@ pipelines_torch_job = CircleCIJob(
        "pip install --upgrade pip",
        "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]",
        TORCH_SCATTER_INSTALL,
-        "pip install https://github.com/kpu/kenlm/archive/master.zip",
    ],
    pytest_options={"rA": None},
    tests_to_run="tests/pipelines/"
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -46,7 +46,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow
 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html
 RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable

-RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
--- a/docker/transformers-doc-builder/Dockerfile
+++ b/docker/transformers-doc-builder/Dockerfile
@@ -11,7 +11,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]

 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "from torch import version; print(version.__version__.split('+')[0])")+cpu.html
-RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -23,7 +23,7 @@ RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' ||  VERSI
 RUN python3 -m pip uninstall -y tensorflow flax

 RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu113.html
-RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

 # When installing in editable mode, `transformers` is not recognized as a package.
--- a/setup.py
+++ b/setup.py
@@ -123,6 +123,7 @@ _deps = [
    "jax>=0.2.8,!=0.3.2,<=0.3.6",
    "jaxlib>=0.1.65,<=0.3.6",
    "jieba",
+    "kenlm",
    "nltk",
    "numpy>=1.17",
    "onnxconverter-common",
@@ -274,7 +275,7 @@ extras["sigopt"] = deps_list("sigopt")
 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]

 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm")
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -29,6 +29,7 @@ deps = {
    "jax": "jax>=0.2.8,!=0.3.2,<=0.3.6",
    "jaxlib": "jaxlib>=0.1.65,<=0.3.6",
    "jieba": "jieba",
+    "kenlm": "kenlm",
    "nltk": "nltk",
    "numpy": "numpy>=1.17",
    "onnxconverter-common": "onnxconverter-common",
--- a/src/transformers/pipelines/init.py
+++ b/src/transformers/pipelines/init.py
@@ -36,7 +36,14 @@ from ..models.auto.modeling_auto import AutoModelForDepthEstimation
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..tokenization_utils import PreTrainedTokenizer
 from ..tokenization_utils_fast import PreTrainedTokenizerFast
-from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, is_tf_available, is_torch_available, logging
+from ..utils import (
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    is_kenlm_available,
+    is_pyctcdecode_available,
+    is_tf_available,
+    is_torch_available,
+    logging,
+)
 from .audio_classification import AudioClassificationPipeline
 from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
 from .base import (
@@ -837,11 +844,12 @@ def pipeline(

                    kwargs["decoder"] = decoder
                except ImportError as e:
-                    logger.warning(
-                        f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install"
-                        " `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install"
-                        f" https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}"
-                    )
+                    logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Error: {e}")
+                    if not is_kenlm_available():
+                        logger.warning("Try to install `kenlm`: `pip install kenlm")
+
+                    if not is_pyctcdecode_available():
+                        logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")

    if task == "translation" and model.config.task_specific_params:
        for key in model.config.task_specific_params:
--- a/src/transformers/utils/init.py
+++ b/src/transformers/utils/init.py
@@ -108,6 +108,7 @@ from .import_utils import (
    is_in_notebook,
    is_ipex_available,
    is_jumanpp_available,
+    is_kenlm_available,
    is_librosa_available,
    is_more_itertools_available,
    is_ninja_available,
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -271,6 +271,10 @@ TORCH_FX_REQUIRED_VERSION = version.parse("1.10")
 TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8")


+def is_kenlm_available():
+    return importlib.util.find_spec("kenlm") is not None
+
+
 def is_torch_available():
    return _torch_available