ALBERT can now load pre-trained models. It no longer inherits from BERT.

2019-10-31 16:37:34 +00:00
parent c4403006b8
commit 4f3a54bfc8
4 changed files with 68 additions and 12 deletions
--- a/transformers/tokenization_albert.py
+++ b/transformers/tokenization_albert.py
@@ -15,7 +15,7 @@
 """ Tokenization classes for ALBERT model."""
 from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
-                        
+
 from .tokenization_utils import PreTrainedTokenizer
 import logging
 import unicodedata
@@ -24,8 +24,25 @@ import os
 from shutil import copyfile

 logger = logging.getLogger(__name__)
-
 VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+    'albert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
+    'albert-large': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
+    'albert-xlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
+    'albert-xxlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'albert-base': 512,
+    'albert-large': 512,
+    'albert-xlarge': 512,
+    'albert-xxlarge': 512,
+}
+
 SPIECE_UNDERLINE = u'▁'

 class AlbertTokenizer(PreTrainedTokenizer):
@@ -35,8 +52,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
-    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file,
                 do_lower_case=True, remove_space=True, keep_accents=False,