tokenizer.save_pretrained: only save the added-tokens file if there are added tokens

2020-01-15 17:36:52 +00:00
parent 6e2c28a14a
commit 9d8fd2d40e
3 changed files with 6 additions and 8 deletions
--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Auto Model class. """
+""" Auto Config class. """


 import logging
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -513,11 +513,9 @@ class PreTrainedTokenizer(object):
        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))

+        if len(self.added_tokens_encoder) > 0:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
-            if self.added_tokens_encoder:
                out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False)
-            else:
-                out_str = "{}"
                f.write(out_str)

        vocab_files = self.save_vocabulary(save_directory)
--- a/tests/test_tokenization_auto.py
+++ b/tests/test_tokenization_auto.py
@@ -33,13 +33,13 @@ class AutoTokenizerTest(unittest.TestCase):
    # @slow
    def test_tokenizer_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
-        for model_name in [x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x]:
+        for model_name in (x for x in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys() if "japanese" not in x):
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, BertTokenizer)
            self.assertGreater(len(tokenizer), 0)

-        for model_name in list(GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
+        for model_name in GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys():
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, GPT2Tokenizer)