From 2071a9b86e7bc533a52f4fa03f89f8adc2a25bc2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 11 Feb 2019 10:35:36 +0100 Subject: [PATCH] fix python 2.7 imports --- README.md | 13 +++++++++++++ pytorch_pretrained_bert/file_utils.py | 2 +- tests/tokenization_openai_test.py | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 607ab3b689..4549c6ffd0 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,12 @@ PyTorch pretrained bert can be installed by pip as follows: pip install pytorch-pretrained-bert ``` +If you want to use the tokenizer associated with the `OpenAI GPT` model, you will need to install `ftfy` (if you are using Python 2, version 4.4.3 is the last version that supports it) and `spaCy`: +```bash +pip install spacy ftfy==4.4.3 +python -m spacy download en +``` + ### From source Clone the repository and run: @@ -52,6 +58,13 @@ Clone the repository and run: pip install [--editable] . ``` +Here too, if you want to use the `OpenAI GPT` tokenizer, you will need to install `ftfy` (limit it to version 4.4.3 if you are using Python 2) and `spaCy`: +```bash +pip install spacy ftfy==4.4.3 +python -m spacy download en +``` + + A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`). 
You can run the tests with the command: diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 6954bec0e1..b475d450f6 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -29,7 +29,7 @@ try: from pathlib import Path PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', Path.home() / '.pytorch_pretrained_bert')) -except ImportError: +except AttributeError: PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py index dadcd9699a..8a67015ffd 100644 --- a/tests/tokenization_openai_test.py +++ b/tests/tokenization_openai_test.py @@ -32,7 +32,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase): "low", "lowest", "newer", "wider"] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - with open("/tmp/openai_tokenizer_vocab_test.json", "w", encoding='utf-8') as fp: + with open("/tmp/openai_tokenizer_vocab_test.json", "wb") as fp: json.dump(vocab_tokens, fp) vocab_file = fp.name with open("/tmp/openai_tokenizer_merges_test.txt", "w", encoding='utf-8') as fp: