From ae88eb88a4baffdd23fa38acf7493aedd23fa6b5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 14 Dec 2018 13:48:58 +0100
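Subject: [PATCH] set encoding to 'utf-8' in calls to open

Also report the fp16 setting in the device info log message of
examples/run_classifier.py and examples/run_swag.py (matching
examples/run_squad.py), and fix the "trainiing" typo in that message in
examples/run_squad.py.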
---
 examples/extract_features.py          | 2 +-
 examples/run_classifier.py            | 5 +++--
 examples/run_squad.py                 | 4 ++--
 examples/run_swag.py                  | 5 +++--
 pytorch_pretrained_bert/file_utils.py | 2 +-
 pytorch_pretrained_bert/modeling.py   | 4 ++--
 setup.py                              | 2 +-
 7 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/examples/extract_features.py b/examples/extract_features.py
index dbab934c08..4f8812121e 100644
--- a/examples/extract_features.py
+++ b/examples/extract_features.py
@@ -168,7 +168,7 @@ def read_examples(input_file):
     """Read a list of `InputExample`s from an input file."""
     examples = []
     unique_id = 0
-    with open(input_file, "r") as reader:
+    with open(input_file, "r", encoding='utf-8') as reader:
         while True:
             line = reader.readline()
             if not line:
diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index e1dcd36344..adf81f4e28 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -91,7 +91,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
+        with open(input_file, "r", encoding='utf-8') as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
@@ -413,7 +413,8 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     if args.gradient_accumulation_steps < 1:
         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
diff --git a/examples/run_squad.py b/examples/run_squad.py
index d6e96f4ac9..6a97dd300b 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -108,7 +108,7 @@ class InputFeatures(object):
 
 def read_squad_examples(input_file, is_training):
     """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r") as reader:
+    with open(input_file, "r", encoding='utf-8') as reader:
         input_data = json.load(reader)["data"]
 
     def is_whitespace(c):
@@ -757,7 +757,7 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
         device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     if args.gradient_accumulation_steps < 1:
diff --git a/examples/run_swag.py b/examples/run_swag.py
index bedfff0b13..caddbee8ab 100644
--- a/examples/run_swag.py
+++ b/examples/run_swag.py
@@ -100,7 +100,7 @@ class InputFeatures(object):
 
 
 def read_swag_examples(input_file, is_training):
-    with open(input_file, 'r') as f:
+    with open(input_file, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         lines = list(reader)
 
@@ -333,7 +333,8 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     if args.gradient_accumulation_steps < 1:
         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 139418f1a5..43fa8ca87e 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -227,7 +227,7 @@ def read_set_from_file(filename: str) -> Set[str]:
     Expected file format is one item per line.
     '''
     collection = set()
-    with open(filename, 'r') as file_:
+    with open(filename, 'r', encoding='utf-8') as file_:
         for line in file_:
             collection.add(line.rstrip())
     return collection
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index c6940c74eb..28f22287d2 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -106,7 +106,7 @@ class BertConfig(object):
                 initializing all weight matrices.
         """
         if isinstance(vocab_size_or_config_json_file, str):
-            with open(vocab_size_or_config_json_file, "r") as reader:
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
                 self.__dict__[key] = value
@@ -137,7 +137,7 @@ class BertConfig(object):
     @classmethod
     def from_json_file(cls, json_file):
         """Constructs a `BertConfig` from a json file of parameters."""
-        with open(json_file, "r") as reader:
+        with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
         return cls.from_dict(json.loads(text))
 
diff --git a/setup.py b/setup.py
index a1e1f68db6..dbfeb2c694 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ setup(
     author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
-    long_description=open("README.md", "r").read(),
+    long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     keywords='BERT NLP deep learning google',
     license='Apache',