From ae4c9fee734e587a1873c88a44909a93bb478c36 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Sat, 13 Apr 2019 15:00:48 -0700 Subject: [PATCH] add hubconf --- hubconf.py | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 hubconf.py diff --git a/hubconf.py b/hubconf.py new file mode 100644 index 0000000000..b2e44af5d9 --- /dev/null +++ b/hubconf.py @@ -0,0 +1,187 @@ +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import (BertForNextSentencePrediction, + BertForMaskedLM, + BertForMultipleChoice, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + ) + +dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex'] + + +def bertTokenizer(*args, **kwargs): + """ + Instantiate a BertTokenizer from a pre-trained/customized vocab file + Args: + pretrained_model_name_or_path: Path to pretrained model archive + or one of pre-trained vocab configs below. + * bert-base-uncased + * bert-large-uncased + * bert-base-cased + * bert-large-cased + * bert-base-multilingual-uncased + * bert-base-multilingual-cased + * bert-base-chinese + Keyword args: + cache_dir: an optional path to a specific directory to download and cache + the pre-trained model weights. + Default: None + do_lower_case: Whether to lower case the input. + Only has an effect when do_wordpiece_only=False + Default: True + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + Default: True + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + Default: None + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] + + Example: + >>> sentence = 'Hello, World!' + >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'BertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False) + >>> toks = tokenizer.tokenize(sentence) + ['Hello', '##,', 'World', '##!'] + >>> ids = tokenizer.convert_tokens_to_ids(toks) + [8667, 28136, 1291, 28125] + """ + tokenizer = BertTokenizer.from_pretrained(*args, **kwargs) + return tokenizer + + +def bertForNextSentencePrediction(*args, **kwargs): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs) + return model + + +def bertForPreTraining(*args, **kwargs): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + + """ + model = BertForPreTraining.from_pretrained(*args, **kwargs) + return model + + +def bertForMaskedLM(*args, **kwargs): + """ + BertForMaskedLM includes the BertModel Transformer followed by the (possibly) + pre-trained masked language modeling head. + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + model = BertForMaskedLM.from_pretrained(*args, **kwargs) + return model + + +#def bertForSequenceClassification(*args, **kwargs): +# model = BertForSequenceClassification.from_pretrained(*args, **kwargs) +# return model + + +#def bertForMultipleChoice(*args, **kwargs): +# model = BertForMultipleChoice.from_pretrained(*args, **kwargs) +# return model + + +def bertForQuestionAnswering(*args, **kwargs): + """ + BertForQuestionAnswering is a fine-tuning model that includes BertModel with + a token-level classifiers on top of the full sequence of last hidden states. + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) + return model + + +