From 7829c890db958279ca49519cc009e4f2def3fccb Mon Sep 17 00:00:00 2001 From: raghavanone <115454562+raghavanone@users.noreply.github.com> Date: Wed, 26 Oct 2022 20:36:46 +0530 Subject: [PATCH] Change the import of kenlm from github to pypi (#19770) * Change the import of kenlm from github to pypi * Change the import of kenlm from github to pypi in circleci config * Fix code quality issues * Fix isort issue, add kenlm in extras for audio * Add kenlm to deps * Add kenlm to deps * Commit 'make fixup' changes * Remove version from kenlm deps * commit make fixup changes * Remove manual installation of kenlm * Remove manual installation of kenlm * Remove manual installation of kenlm --- .circleci/create_circleci_config.py | 6 ------ docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-doc-builder/Dockerfile | 2 +- docker/transformers-pytorch-gpu/Dockerfile | 2 +- setup.py | 3 ++- src/transformers/dependency_versions_table.py | 1 + src/transformers/pipelines/__init__.py | 20 +++++++++++++------ src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 4 ++++ 9 files changed, 25 insertions(+), 16 deletions(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 097bd71d01..9ef5ea1a63 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -127,7 +127,6 @@ torch_and_tf_job = CircleCIJob( "pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]", TORCH_SCATTER_INSTALL, "pip install tensorflow_probability", - "pip install https://github.com/kpu/kenlm/archive/master.zip", "pip install git+https://github.com/huggingface/accelerate", ], marker="is_pt_tf_cross_test", @@ -143,7 +142,6 @@ torch_and_flax_job = CircleCIJob( "pip install --upgrade pip", "pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]", TORCH_SCATTER_INSTALL, - "pip install https://github.com/kpu/kenlm/archive/master.zip", "pip install 
git+https://github.com/huggingface/accelerate", ], marker="is_pt_flax_cross_test", @@ -158,7 +156,6 @@ torch_job = CircleCIJob( "pip install --upgrade pip", "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", TORCH_SCATTER_INSTALL, - "pip install https://github.com/kpu/kenlm/archive/master.zip", "pip install git+https://github.com/huggingface/accelerate", ], pytest_num_workers=3, @@ -172,7 +169,6 @@ tf_job = CircleCIJob( "pip install --upgrade pip", "pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]", "pip install tensorflow_probability", - "pip install https://github.com/kpu/kenlm/archive/master.zip", ], pytest_options={"rA": None}, ) @@ -184,7 +180,6 @@ flax_job = CircleCIJob( "sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng", "pip install --upgrade pip", "pip install .[flax,testing,sentencepiece,flax-speech,vision]", - "pip install https://github.com/kpu/kenlm/archive/master.zip", ], pytest_options={"rA": None}, ) @@ -197,7 +192,6 @@ pipelines_torch_job = CircleCIJob( "pip install --upgrade pip", "pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]", TORCH_SCATTER_INSTALL, - "pip install https://github.com/kpu/kenlm/archive/master.zip", ], pytest_options={"rA": None}, tests_to_run="tests/pipelines/" diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 27f0965f5c..10ee71890a 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -46,7 +46,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/onnx/tensorflow RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable -RUN 
python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip +RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract RUN python3 -m pip install -U "itsdangerous<2.1.0" RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate diff --git a/docker/transformers-doc-builder/Dockerfile b/docker/transformers-doc-builder/Dockerfile index de0eb17137..c693f2843c 100644 --- a/docker/transformers-doc-builder/Dockerfile +++ b/docker/transformers-doc-builder/Dockerfile @@ -11,7 +11,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed] RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "from torch import version; print(version.__version__.split('+')[0])")+cpu.html -RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip +RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com RUN python3 -m pip install -U "itsdangerous<2.1.0" diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index 668bec3e71..7d0a2c878d 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -23,7 +23,7 @@ RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' || VERSI RUN python3 -m pip uninstall -y tensorflow flax RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; 
print(version.__version__.split('+')[0])")+cu113.html -RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip +RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract RUN python3 -m pip install -U "itsdangerous<2.1.0" # When installing in editable mode, `transformers` is not recognized as a package. diff --git a/setup.py b/setup.py index cb5c50b058..9266a1de7f 100644 --- a/setup.py +++ b/setup.py @@ -123,6 +123,7 @@ _deps = [ "jax>=0.2.8,!=0.3.2,<=0.3.6", "jaxlib>=0.1.65,<=0.3.6", "jieba", + "kenlm", "nltk", "numpy>=1.17", "onnxconverter-common", @@ -274,7 +275,7 @@ extras["sigopt"] = deps_list("sigopt") extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"] extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") -extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer") +extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm") # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead extras["speech"] = deps_list("torchaudio") + extras["audio"] extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index e55219b796..1d6223f2a7 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -29,6 +29,7 @@ deps = { "jax": "jax>=0.2.8,!=0.3.2,<=0.3.6", "jaxlib": "jaxlib>=0.1.65,<=0.3.6", "jieba": "jieba", + "kenlm": "kenlm", "nltk": "nltk", "numpy": "numpy>=1.17", "onnxconverter-common": "onnxconverter-common", diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 4e8faa58d2..fece240680 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -36,7 
+36,14 @@ from ..models.auto.modeling_auto import AutoModelForDepthEstimation from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from ..tokenization_utils import PreTrainedTokenizer from ..tokenization_utils_fast import PreTrainedTokenizerFast -from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, is_tf_available, is_torch_available, logging +from ..utils import ( + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + is_kenlm_available, + is_pyctcdecode_available, + is_tf_available, + is_torch_available, + logging, +) from .audio_classification import AudioClassificationPipeline from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline from .base import ( @@ -837,11 +844,12 @@ def pipeline( kwargs["decoder"] = decoder except ImportError as e: - logger.warning( - f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install" - " `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install" - f" https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}" - ) + logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. 
Error: {e}") + if not is_kenlm_available(): + logger.warning("Try to install `kenlm`: `pip install kenlm`") + + if not is_pyctcdecode_available(): + logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode`") if task == "translation" and model.config.task_specific_params: for key in model.config.task_specific_params: diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 03aa17bc83..2dbca85df0 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -108,6 +108,7 @@ from .import_utils import ( is_in_notebook, is_ipex_available, is_jumanpp_available, + is_kenlm_available, is_librosa_available, is_more_itertools_available, is_ninja_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index b0a3a888d3..9d7ffc4297 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -271,6 +271,10 @@ TORCH_FX_REQUIRED_VERSION = version.parse("1.10") TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8") +def is_kenlm_available(): + return importlib.util.find_spec("kenlm") is not None + + def is_torch_available(): return _torch_available