From 8678ff8df5cc9997537fb62251ba91e58eefc0ec Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 16:26:04 +0200 Subject: [PATCH] adding 17 and 100 xlm models --- pytorch_transformers/tokenization_xlm.py | 247 ++++++++++++++++++++++- 1 file changed, 244 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index c40d4cd16e..d14acb39c6 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -49,7 +49,7 @@ PRETRAINED_VOCAB_FILES_MAP = { 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", - } + }, 'merges_file': { 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", @@ -167,8 +167,249 @@ PRETRAINED_INIT_CONFIGURATION = { "1": "en"}, "lang2id": { "de": 0, "en": 1 }}, - 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False}, - 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False}, + 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "ar", + "1": "de", + "2": "en", + "3": "es", + "4": "fr", + "5": "hi", + "6": "it", + "7": "ja", + "8": "ko", + "9": "nl", + "10": "pl", + "11": "pt", + "12": "ru", + "13": "sv", + "14": "tr", + "15": "vi", + "16": "zh" + }, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16}}, + 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "af", + "1": "als", + "2": "am", + "3": "an", + "4": "ang", + "5": "ar", + "6": "arz", + "7": "ast", + "8": "az", + "9": "bar", + "10": "be", + "11": "bg", + "12": "bn", + "13": "br", + "14": "bs", + "15": "ca", + "16": "ceb", + "17": "ckb", + "18": "cs", + "19": "cy", + "20": "da", + "21": "de", + "22": "el", + "23": "en", + "24": "eo", + "25": "es", + "26": "et", + "27": "eu", + "28": "fa", + "29": "fi", + "30": "fr", + "31": "fy", + "32": "ga", + "33": "gan", + "34": "gl", + "35": "gu", + "36": "he", + "37": "hi", + "38": "hr", + "39": "hu", + "40": "hy", + "41": "ia", + "42": "id", + "43": "is", + "44": "it", + "45": "ja", + "46": "jv", + "47": "ka", + "48": "kk", + "49": "kn", + "50": "ko", + "51": "ku", + "52": "la", + "53": "lb", + "54": "lt", + "55": "lv", + "56": "mk", + "57": "ml", + "58": "mn", + "59": "mr", + "60": "ms", + "61": "my", + "62": "nds", + "63": "ne", + "64": "nl", + "65": "nn", + "66": "no", + "67": "oc", + "68": "pl", + "69": "pt", + "70": "ro", + "71": "ru", + "72": "scn", + "73": "sco", + "74": "sh", + "75": "si", + "76": "simple", + "77": "sk", + "78": "sl", + "79": "sq", + "80": "sr", + "81": "sv", + "82": "sw", + "83": "ta", + "84": "te", + "85": "th", + "86": "tl", + "87": "tr", + "88": "tt", + "89": "uk", + "90": "ur", + "91": "uz", + "92": "vi", + "93": "war", + "94": "wuu", + "95": "yi", + "96": "zh", + "97": "zh_classical", + "98": "zh_min_nan", + "99": "zh_yue" + }, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + "ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, + "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + "pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99 + }}, } def get_pairs(word):