From ae88eb88a4baffdd23fa38acf7493aedd23fa6b5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 14 Dec 2018 13:48:58 +0100 Subject: [PATCH] set encoding to 'utf-8' in calls to open --- examples/extract_features.py | 2 +- examples/run_classifier.py | 5 +++-- examples/run_squad.py | 4 ++-- examples/run_swag.py | 5 +++-- pytorch_pretrained_bert/file_utils.py | 2 +- pytorch_pretrained_bert/modeling.py | 4 ++-- setup.py | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/extract_features.py b/examples/extract_features.py index dbab934c08..4f8812121e 100644 --- a/examples/extract_features.py +++ b/examples/extract_features.py @@ -168,7 +168,7 @@ def read_examples(input_file): """Read a list of `InputExample`s from an input file.""" examples = [] unique_id = 0 - with open(input_file, "r") as reader: + with open(input_file, "r", encoding='utf-8') as reader: while True: line = reader.readline() if not line: diff --git a/examples/run_classifier.py b/examples/run_classifier.py index e1dcd36344..adf81f4e28 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -91,7 +91,7 @@ class DataProcessor(object): @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" - with open(input_file, "r") as f: + with open(input_file, "r", encoding='utf-8') as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] for line in reader: @@ -413,7 +413,8 @@ def main(): n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( diff --git a/examples/run_squad.py b/examples/run_squad.py index d6e96f4ac9..6a97dd300b 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -108,7 +108,7 @@ class InputFeatures(object): def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r") as reader: + with open(input_file, "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] def is_whitespace(c): @@ -757,7 +757,7 @@ def main(): n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format( + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: diff --git a/examples/run_swag.py b/examples/run_swag.py index bedfff0b13..caddbee8ab 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -100,7 +100,7 @@ class InputFeatures(object): def read_swag_examples(input_file, is_training): - with open(input_file, 'r') as f: + with open(input_file, 'r', encoding='utf-8') as f: reader = csv.reader(f) lines = list(reader) @@ -333,7 +333,8 @@ def main(): n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index 139418f1a5..43fa8ca87e 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -227,7 +227,7 @@ def read_set_from_file(filename: str) -> Set[str]: Expected file format is one item per line. ''' collection = set() - with open(filename, 'r') as file_: + with open(filename, 'r', encoding='utf-8') as file_: for line in file_: collection.add(line.rstrip()) return collection diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index c6940c74eb..28f22287d2 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -106,7 +106,7 @@ class BertConfig(object): initializing all weight matrices. """ if isinstance(vocab_size_or_config_json_file, str): - with open(vocab_size_or_config_json_file, "r") as reader: + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): self.__dict__[key] = value @@ -137,7 +137,7 @@ class BertConfig(object): @classmethod def from_json_file(cls, json_file): """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r") as reader: + with open(json_file, "r", encoding='utf-8') as reader: text = reader.read() return cls.from_dict(json.loads(text)) diff --git a/setup.py b/setup.py index a1e1f68db6..dbfeb2c694 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors", author_email="thomas@huggingface.co", description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", - long_description=open("README.md", "r").read(), + long_description=open("README.md", "r", encoding='utf-8').read(), long_description_content_type="text/markdown", keywords='BERT NLP deep learning google', license='Apache',