diff --git a/.gitignore b/.gitignore index e673ce5f47..d829943209 100644 --- a/.gitignore +++ b/.gitignore @@ -131,4 +131,7 @@ examples/runs # data /data -serialization_dir \ No newline at end of file +serialization_dir + +# emacs +*.*~ \ No newline at end of file diff --git a/README.md b/README.md index 8dd2c2fb66..4a2690f324 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@
State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
### Features
@@ -122,6 +122,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
) by Victor Sanh, Lysandre Debut and Thomas Wolf.
+9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce), released together with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar\*, Bryan McCann\*, Lav R. Varshney, Caiming Xiong and Richard Socher.
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
@@ -148,6 +149,7 @@ from transformers import *
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
(GPT2Model, GPT2Tokenizer, 'gpt2'),
+ (CTRLModel, CTRLTokenizer, 'ctrl'),
(TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
@@ -253,7 +255,7 @@ The library comprises several example scripts with SOTA performances for NLU and
- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
- `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*)
-- `run_generation.py`: an example using GPT, GPT-2, Transformer-XL and XLNet for conditional language generation
+- `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
- other model-specific examples (see the documentation).
Here are three quick usage examples for these scripts:
@@ -391,7 +393,7 @@ python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncase
This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.
-### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
+### `run_generation.py`: Text generation with GPT, GPT-2, CTRL, Transformer-XL and XLNet
A conditional generation script is also included to generate text from a prompt.
The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
@@ -405,6 +407,16 @@ python ./examples/run_generation.py \
--model_name_or_path=gpt2 \
```
+and from the Salesforce CTRL model:
+```shell
+python ./examples/run_generation.py \
+ --model_type=ctrl \
+ --length=20 \
+ --model_name_or_path=ctrl \
+ --temperature=0 \
+ --repetition_penalty=1.2
+```
+
## Migrating from pytorch-transformers to transformers
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
diff --git a/examples/run_generation.py b/examples/run_generation.py
index 9e98a9e870..d0fc962267 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -14,7 +14,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/Transformer-XL/XLNet)
+""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
@@ -26,13 +26,13 @@ import torch
import torch.nn.functional as F
import numpy as np
-from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, CTRLConfig
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
from transformers import XLNetLMHeadModel, XLNetTokenizer
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
-
+from transformers import CTRLLMHeadModel, CTRLTokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
@@ -41,10 +41,11 @@ logger = logging.getLogger(__name__)
MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, CTRLConfig)), ())
MODEL_CLASSES = {
'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
+ 'ctrl': (CTRLLMHeadModel, CTRLTokenizer),
'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
@@ -103,7 +104,7 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
return logits
-def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, is_xlnet=False, device='cpu'):
context = torch.tensor(context, dtype=torch.long, device=device)
context = context.unsqueeze(0).repeat(num_samples, 1)
generated = context
@@ -122,9 +123,17 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
- next_token_logits = outputs[0][0, -1, :] / temperature
+ next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.)
+
+ # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
+ for _ in set(generated):
+ next_token_logits[_] /= repetition_penalty
+
filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
- next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
+ if temperature == 0: #greedy sampling:
+ next_token = torch.argmax(filtered_logits).unsqueeze(0)
+ else:
+ next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
return generated
@@ -138,7 +147,10 @@ def main():
parser.add_argument("--prompt", type=str, default="")
parser.add_argument("--padding_text", type=str, default="")
parser.add_argument("--length", type=int, default=20)
- parser.add_argument("--temperature", type=float, default=1.0)
+ parser.add_argument("--temperature", type=float, default=1.0,
+ help="temperature of 0 implies greedy sampling")
+ parser.add_argument("--repetition_penalty", type=float, default=1.0,
+ help="primarily useful for CTRL model; in that case, use 1.2")
parser.add_argument("--top_k", type=int, default=0)
parser.add_argument("--top_p", type=float, default=0.9)
parser.add_argument("--no_cuda", action='store_true',
@@ -146,7 +158,10 @@ def main():
parser.add_argument('--seed', type=int, default=42,
help="random seed for initialization")
args = parser.parse_args()
-
+ if args.model_type in ["ctrl"]:
+ if args.temperature > 0.7 :
+ print('CTRL typically works better with lower temperatures (and lower top_k).')
+
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
@@ -180,6 +195,7 @@ def main():
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
+ repetition_penalty=args.repetition_penalty,
device=args.device,
is_xlnet=bool(args.model_type == "xlnet"),
)
diff --git a/transformers/__init__.py b/transformers/__init__.py
index 5248bc9f1b..3a6d2aee2c 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -37,6 +37,7 @@ from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_ctrl import CTRLTokenizer
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer
@@ -49,7 +50,9 @@ from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -73,6 +76,9 @@ if is_torch_available():
from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
GPT2LMHeadModel, GPT2DoubleHeadsModel,
load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+ from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel,
+ CTRLLMHeadModel,
+ CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
XLNetForSequenceClassification, XLNetForMultipleChoice,
XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py
index 74dda59fcf..edd21a670c 100644
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -26,6 +26,7 @@ from .configuration_xlnet import XLNetConfig
from .configuration_xlm import XLMConfig
from .configuration_roberta import RobertaConfig
from .configuration_distilbert import DistilBertConfig
+from .configuration_ctrl import CTRLConfig
logger = logging.getLogger(__name__)
@@ -49,7 +50,7 @@ class AutoConfig(object):
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
- contains `roberta`: RobertaConfig (RoBERTa model)
-
+ - contains `ctrl`: CTRLConfig (CTRL model)
+
This class cannot be instantiated using `__init__()` (throw an error).
"""
def __init__(self):
@@ -71,7 +72,7 @@ class AutoConfig(object):
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
- contains `roberta`: RobertaConfig (RoBERTa model)
-
+ - contains `ctrl`: CTRLConfig (CTRL model)
+
Params:
pretrained_model_name_or_path: either:
@@ -129,7 +130,8 @@ class AutoConfig(object):
return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
+ elif 'ctrl' in pretrained_model_name_or_path:
+ return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
- "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+ "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py
new file mode 100644
index 0000000000..4525936885
--- /dev/null
+++ b/transformers/configuration_ctrl.py
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Salesforce CTRL configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Maps each pretrained shortcut name to the URL of its configuration JSON;
+# exposed on CTRLConfig as `pretrained_config_archive_map`.
+CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"}
+
+class CTRLConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `CTRLModel`.
+
+    The first constructor argument is either an ``int`` vocabulary size, in
+    which case every other named argument is stored as an attribute of the
+    same name, or the path to a JSON file whose top-level keys are copied
+    onto the instance verbatim (the other named arguments are then ignored).
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `CTRLModel` or a configuration json file.
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        dff: Size of the inner dimension of the FFN.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        resid_pdrop: The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        embd_pdrop: The dropout ratio for the embeddings.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        initializer_range: The stddev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        num_labels, summary_type, summary_use_proj, summary_activation,
+        summary_proj_to_labels, summary_first_dropout: stored unchanged on
+            the config; this class does not interpret them.
+        **kwargs: forwarded to ``PretrainedConfig.__init__``.
+
+    Raises:
+        ValueError: if the first argument is neither an int nor a string path.
+    """
+    pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=246534,
+        n_positions=50000,
+        n_ctx=512,
+        n_embd=1280,
+        dff=8192,
+        n_layer=48,
+        n_head=16,
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-6,
+        initializer_range=0.02,
+
+        num_labels=1,
+        summary_type='cls_index',
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
+        """Constructs CTRLConfig.
+
+        The arguments are described in the class docstring.
+        """
+        super(CTRLConfig, self).__init__(**kwargs)
+
+        # On Python 2 a text path may arrive as `unicode` rather than `str`.
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            # Path branch: copy every top-level JSON key onto the instance.
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.n_ctx = n_ctx
+            self.n_positions = n_positions
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.dff = dff
+            self.resid_pdrop = resid_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.layer_norm_epsilon = layer_norm_epsilon
+            self.initializer_range = initializer_range
+
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+        else:
+            # The trailing space keeps the two literals from running together
+            # ("(int)or the path ...") when they are concatenated.
+            raise ValueError(
+                "First argument must be either a vocabulary size (int) "
+                "or the path to a pretrained model config file (str)"
+            )
+
+    @property
+    def max_position_embeddings(self):
+        """Alias for ``n_positions``."""
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        """Alias for ``n_embd``."""
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        """Alias for ``n_head``."""
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        """Alias for ``n_layer``."""
+        return self.n_layer
diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index b76a883b19..d98110d4bd 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -21,6 +21,7 @@ import logging
from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
@@ -51,6 +52,7 @@ class AutoModel(object):
- contains `bert`: BertModel (Bert model)
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+ - contains `ctrl`: CTRLModel (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
- contains `xlnet`: XLNetModel (XLNet model)
- contains `xlm`: XLMModel (XLM model)
@@ -73,6 +75,7 @@ class AutoModel(object):
- contains `bert`: BertModel (Bert model)
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+ - contains `ctrl`: CTRLModel (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
- contains `xlnet`: XLNetModel (XLNet model)
- contains `xlm`: XLMModel (XLM model)
@@ -149,10 +152,11 @@ class AutoModel(object):
return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-
+ elif 'ctrl' in pretrained_model_name_or_path:
+ return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
- "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+ "'xlm', 'roberta, 'ctrl'".format(pretrained_model_name_or_path))
class AutoModelWithLMHead(object):
@@ -172,6 +176,7 @@ class AutoModelWithLMHead(object):
- contains `bert`: BertForMaskedLM (Bert model)
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
- contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+ - contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
- contains `xlm`: XLMWithLMHeadModel (XLM model)
@@ -273,10 +278,11 @@ class AutoModelWithLMHead(object):
return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-
+ elif 'ctrl' in pretrained_model_name_or_path:
+ return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
- "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+ "'xlm', 'roberta','ctrl'".format(pretrained_model_name_or_path))
class AutoModelForSequenceClassification(object):
diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py
new file mode 100644
index 0000000000..3633c2d8fb
--- /dev/null
+++ b/transformers/modeling_ctrl.py
@@ -0,0 +1,387 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CTRL model."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+import numpy as np
+import torch
+import torch.nn as nn
+import pdb
+from torch.nn import CrossEntropyLoss
+from torch.nn.parameter import Parameter
+
+from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
+from .configuration_ctrl import CTRLConfig
+from .file_utils import add_start_docstrings
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Maps each pretrained shortcut name to the URL of its checkpoint file
+# (presumably the serialized PyTorch weights - confirm); exposed on
+# CTRLPreTrainedModel as `pretrained_model_archive_map`.
+CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/seqlen256_v1.bin"}
+
+
+def angle_defn(pos, i, d_model_size):
+    """Return ``pos / 10000 ** (2 * (i // 2) / d_model_size)``.
+
+    ``pos`` and ``i`` are combined with ordinary broadcasting, so a column of
+    positions and a row of dimension indices yield a full angle table.
+    """
+    # Consecutive even/odd indices share one exponent because of `i // 2`.
+    exponent = (2 * (i//2)) / d_model_size
+    inverse_frequency = 1 / torch.pow(10000, exponent)
+    return pos * inverse_frequency
+
+def positional_encoding(position, d_model_size, dtype):
+    """Build the sinusoidal positional-encoding table.
+
+    Returns a tensor of shape ``(1, position, d_model_size)``. The sines of
+    the even-indexed angle columns fill the first half of the last axis and
+    the cosines of the odd-indexed columns fill the second half (the two are
+    concatenated, not interleaved).
+    """
+    positions = torch.arange(position, dtype=dtype).unsqueeze(1)
+    dimensions = torch.arange(d_model_size, dtype=dtype).unsqueeze(0)
+    angle_rads = angle_defn(positions, dimensions, d_model_size)
+
+    sin_half = torch.sin(angle_rads[:, 0::2])
+    cos_half = torch.cos(angle_rads[:, 1::2])
+
+    # Leading singleton axis added by the final unsqueeze.
+    return torch.cat([sin_half, cos_half], dim=-1).unsqueeze(0)
+
+def scaled_dot_product_attention(q, k, v, mask):
+    """Compute softmax(q @ k^T / sqrt(d_k)) @ v for 4-D ``q``, ``k``, ``v``.
+
+    Args:
+        q, k, v: 4-D tensors; the last two axes of ``k`` are swapped before
+            the product with ``q``.
+        mask: optional tensor added to the logits after scaling by ``-1e4``,
+            so entries equal to 1 are effectively suppressed by the softmax.
+
+    Returns:
+        Tuple ``(output, attention_weights)``.
+    """
+    key_depth = k.shape[-1]
+    logits = torch.matmul(q, k.permute(0,1,3,2)) / np.sqrt(key_depth)
+
+    if mask is not None:
+        # Large negative bias instead of -inf keeps the softmax finite.
+        logits += (mask * -1e4)
+
+    attention_weights = torch.softmax(logits, dim=-1)
+    return torch.matmul(attention_weights, v), attention_weights
+
+
+class MultiHeadAttention(torch.nn.Module):
+    """Multi-head attention built on `scaled_dot_product_attention`.
+
+    Queries, keys and values are each projected by a linear layer, split into
+    ``num_heads`` heads of width ``depth``, attended, merged back and passed
+    through a final linear layer.
+    """
+
+    def __init__(self, d_model_size, num_heads):
+        super(MultiHeadAttention, self).__init__()
+        self.num_heads = num_heads
+        self.d_model_size = d_model_size
+
+        # Width of a single head; truncated if the division is not exact.
+        self.depth = int(d_model_size / self.num_heads)
+
+        make_projection = torch.nn.Linear
+        self.Wq = make_projection(d_model_size, d_model_size)
+        self.Wk = make_projection(d_model_size, d_model_size)
+        self.Wv = make_projection(d_model_size, d_model_size)
+
+        self.dense = make_projection(d_model_size, d_model_size)
+
+    def split_into_heads(self, x, batch_size):
+        """Reshape ``x`` to ``(batch_size, num_heads, -1, depth)``."""
+        per_head = x.reshape(batch_size, -1, self.num_heads, self.depth)
+        return per_head.permute([0, 2, 1, 3])
+
+    def forward(self, v, k, q, mask):
+        """Return ``(output, attention_weights)``; note the (v, k, q) argument order."""
+        batch_size = q.shape[0]
+
+        q = self.split_into_heads(self.Wq(q), batch_size)
+        k = self.split_into_heads(self.Wk(k), batch_size)
+        v = self.split_into_heads(self.Wv(v), batch_size)
+
+        context, attn = scaled_dot_product_attention(q, k, v, mask)
+
+        # Move the head axis back next to depth, then fold heads together.
+        merged = context.permute([0, 2, 1, 3]).reshape(batch_size, -1, self.d_model_size)
+        return self.dense(merged), attn
+
+
+
+def point_wise_feed_forward_network(d_model_size, dff):
+    """Return a Linear -> ReLU -> Linear stack mapping d_model_size -> dff -> d_model_size."""
+    layers = [
+        torch.nn.Linear(d_model_size, dff),
+        torch.nn.ReLU(),
+        torch.nn.Linear(dff, d_model_size),
+    ]
+    return torch.nn.Sequential(*layers)
+
+
+class EncoderLayer(torch.nn.Module):
+    """One transformer layer: self-attention then feed-forward, each with a residual.
+
+    Layer normalisation is applied to the input of each sub-layer, and
+    dropout to each sub-layer's output before the residual addition.
+    """
+
+    def __init__(self, d_model_size, num_heads, dff, rate=0.1):
+        super(EncoderLayer, self).__init__()
+
+        self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads)
+        self.ffn = point_wise_feed_forward_network(d_model_size, dff)
+
+        norm_eps = 1e-6
+        self.layernorm1 = torch.nn.LayerNorm(d_model_size, eps=norm_eps)
+        self.layernorm2 = torch.nn.LayerNorm(d_model_size, eps=norm_eps)
+
+        self.dropout1 = torch.nn.Dropout(rate)
+        self.dropout2 = torch.nn.Dropout(rate)
+
+    def forward(self, x, mask):
+        """Return ``(layer_output, attention_weights)`` for input ``x``."""
+        # Self-attention sub-layer: the normed input serves as v, k and q.
+        normed = self.layernorm1(x)
+        attn_output, attn = self.multi_head_attention(normed, normed, normed, mask)
+        after_attention = x + self.dropout1(attn_output)
+
+        # Feed-forward sub-layer on the normalised attention result.
+        ffn_output = self.dropout2(self.ffn(self.layernorm2(after_attention)))
+        return after_attention + ffn_output, attn
+
+
+class CTRLPreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = CTRLConfig
+    pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super(CTRLPreTrainedModel, self).__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """ Initialize the weights of a single sub-module.
+
+            Linear / Embedding / Conv1D weights are drawn from a normal
+            distribution with std ``config.initializer_range`` and their
+            biases (when present) are zeroed; LayerNorm is reset to
+            weight 1 and bias 0. Other module types are left untouched.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # Embeddings carry no bias, hence the narrower type check.
+            has_bias = isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None
+            if has_bias:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+# Introductory docstring text for the CTRL model; passed to
+# `add_start_docstrings` when decorating CTRLModel.
+CTRL_START_DOCSTRING = r""" CTRL model was proposed in
+ `CTRL: A Conditional Transformer Language Model for Controllable Generation`_
+ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+ It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
+ corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
+
+ This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+ refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+ .. _`CTRL: A Conditional Transformer Language Model for Controllable Generation`:
+ https://www.github.com/salesforce/ctrl
+
+ .. _`torch.nn.Module`:
+ https://pytorch.org/docs/stable/nn.html#module
+
+ Parameters:
+ config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
+ Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+# Description of the model inputs; passed to `add_start_docstrings`
+# when decorating CTRLModel.
+CTRL_INPUTS_DOCSTRING = r""" Inputs:
+ **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Indices of input sequence tokens in the vocabulary.
+ CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
+ the right rather than the left.
+ Indices can be obtained using :class:`transformers.CTRLTokenizer`.
+ See :func:`transformers.PreTrainedTokenizer.encode` and
+ :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+ **past**:
+ list of ``torch.FloatTensor`` (one for each layer):
+ that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+ (see `past` output below). Can be used to speed up sequential decoding.
+ **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+ Mask to avoid performing attention on padding token indices.
+ Mask values selected in ``[0, 1]``:
+ ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+ **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+ The embeddings from these tokens will be summed with the respective token embeddings.
+ Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+ **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Indices of positions of each input sequence tokens in the position embeddings.
+ Selected in the range ``[0, config.max_position_embeddings - 1]``.
+ **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+ Mask to nullify selected heads of the self-attention modules.
+ Mask values selected in ``[0, 1]``:
+ ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
+ CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+class CTRLModel(CTRLPreTrainedModel):
+ r"""
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+ Sequence of hidden-states at the last layer of the model.
+ **past**:
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ that contains pre-computed hidden-states (key and values in the attention blocks).
+ Can be used (see `past` input) to speed up sequential decoding.
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = CTRLTokenizer.from_pretrained('ctrl')
+ model = CTRLModel.from_pretrained('ctrl')
+ input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids)
+ last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
+
+ """
+ def __init__(self, config):
+ super(CTRLModel, self).__init__(config)
+ self.output_hidden_states = config.output_hidden_states
+ self.d_model_size = config.n_embd
+ self.num_layers = config.n_layer
+
+ self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float)
+
+ self.output_attentions = config.output_attentions
+
+ self.w = nn.Embedding(config.vocab_size, config.n_embd)
+
+
+ self.dropout = nn.Dropout(config.embd_pdrop)
+ self.h = nn.ModuleList([EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop) for _ in range(config.n_layer)])
+ self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+ self.init_weights()
+
+ def _resize_token_embeddings(self, new_num_tokens):
+ self.w = self._get_resized_embeddings(self.w, new_num_tokens)
+ return self.w
+
+ def _prune_heads(self, heads_to_prune):
+ """ Prunes heads of the model.
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+ """
+ for layer, heads in heads_to_prune.items():
+ self.h[layer].attn.prune_heads(heads)
+
+ def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+ labels=None):
+
+ embedded = self.w(input_ids)
+ x = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
+ seq_len = input_ids.shape[1]
+ mask = torch.triu(torch.ones(seq_len, seq_len), 1).to(x.device)
+
+ x *= np.sqrt(self.d_model_size)
+
+ x += self.pos_encoding[:, :seq_len, :].to(x.device)
+
+ x = self.dropout(x)
+ all_hidden_states = ()
+ all_attentions = []
+ for i in range(self.num_layers):
+ if self.output_hidden_states:
+ all_hidden_states = all_hidden_states + (x,)
+ x, attn = self.h[i](x, mask)
+ if self.output_attentions:
+ all_attentions.append(attn)
+
+ x = self.layernorm(x)
+ if self.output_hidden_states:
+ all_hidden_states = all_hidden_states + (x,)
+
+ outputs = (x, None)
+ if self.output_hidden_states:
+ outputs = outputs + (all_hidden_states,)
+ if self.output_attentions:
+ outputs = outputs + (all_attentions,)
+ return outputs
+
+
+@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+class CTRLLMHeadModel(CTRLPreTrainedModel):
+ r"""
+ **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Labels for language modeling.
+ Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+ Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+ All labels set to ``-1`` are ignored (masked), the loss is only
+ computed for labels in ``[0, ..., config.vocab_size]``
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Language modeling loss.
+ **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ **past**:
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ that contains pre-computed hidden-states (key and values in the attention blocks).
+ Can be used (see `past` input) to speed up sequential decoding.
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ import torch
+ from transformers import CTRLTokenizer, CTRLLMHeadModel
+
+ tokenizer = CTRLTokenizer.from_pretrained('ctrl')
+ model = CTRLLMHeadModel.from_pretrained('ctrl')
+
+ input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids, labels=input_ids)
+ loss, logits = outputs[:2]
+
+ """
+ def __init__(self, config):
+ super(CTRLLMHeadModel, self).__init__(config)
+ self.transformer = CTRLModel(config)
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
+
+ self.init_weights()
+ self.tie_weights()
+
+ def tie_weights(self):
+ """ Make sure we are sharing the input and output embeddings.
+ Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+ """
+ self._tie_or_clone_weights(self.lm_head,
+ self.transformer.w)
+ #self._tie_or_clone_weights(self.lm_head.bias,
+ # self.transformer.w.bias)
+ def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+ labels=None):
+ transformer_outputs = self.transformer(input_ids)
+ hidden_states = transformer_outputs[0]
+
+ lm_logits = self.lm_head(hidden_states)
+
+ outputs = (lm_logits,) + transformer_outputs[1:]
+
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ shift_logits = lm_logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss(ignore_index=-1)
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+ shift_labels.view(-1))
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
+
+
diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py
new file mode 100644
index 0000000000..ac7c32b113
--- /dev/null
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -0,0 +1,213 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import pytest
+import shutil
+import pdb
+
+from transformers import is_torch_available
+
+if is_torch_available():
+ from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+ CTRLLMHeadModel)
+else:
+ pytestmark = pytest.mark.skip("Require Torch")
+
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+
+
+class CTRLModelTest(CommonTestCases.CommonModelTester):
+
+ all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
+ test_pruning = False
+ test_torchscript = False
+ test_resize_embeddings = False
+ test_head_masking = False
+
+ class CTRLModelTester(object):
+
+ def __init__(self,
+ parent,
+ batch_size=13,
+ seq_length=7,
+ is_training=True,
+ use_token_type_ids=True,
+ use_input_mask=True,
+ use_labels=True,
+ use_mc_token_ids=True,
+ vocab_size=99,
+ hidden_size=32,
+ num_hidden_layers=5,
+ num_attention_heads=4,
+ intermediate_size=37,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=16,
+ type_sequence_label_size=2,
+ initializer_range=0.02,
+ num_labels=3,
+ num_choices=4,
+ scope=None,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.seq_length = seq_length
+ self.is_training = is_training
+ self.use_token_type_ids = use_token_type_ids
+ self.use_input_mask = use_input_mask
+ self.use_labels = use_labels
+ self.use_mc_token_ids = use_mc_token_ids
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.type_sequence_label_size = type_sequence_label_size
+ self.initializer_range = initializer_range
+ self.num_labels = num_labels
+ self.num_choices = num_choices
+ self.scope = scope
+
+ def prepare_config_and_inputs(self):
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+ input_mask = None
+ if self.use_input_mask:
+ input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+ token_type_ids = None
+ if self.use_token_type_ids:
+ token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+ mc_token_ids = None
+ if self.use_mc_token_ids:
+ mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
+
+ sequence_labels = None
+ token_labels = None
+ choice_labels = None
+ if self.use_labels:
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+ config = CTRLConfig(
+ vocab_size_or_config_json_file=self.vocab_size,
+ n_embd=self.hidden_size,
+ n_layer=self.num_hidden_layers,
+ n_head=self.num_attention_heads,
+ # intermediate_size=self.intermediate_size,
+ # hidden_act=self.hidden_act,
+ # hidden_dropout_prob=self.hidden_dropout_prob,
+ # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+ n_positions=self.max_position_embeddings,
+ n_ctx=self.max_position_embeddings
+ # type_vocab_size=self.type_vocab_size,
+ # initializer_range=self.initializer_range
+ )
+
+ head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
+
+ return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
+
+ def check_loss_output(self, result):
+ self.parent.assertListEqual(
+ list(result["loss"].size()),
+ [])
+
+ def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+ model = CTRLModel(config=config)
+ model.eval()
+
+ model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
+ model(input_ids, token_type_ids=token_type_ids)
+ sequence_output, _ = model(input_ids)
+
+ result = {
+ "sequence_output": sequence_output,
+ }
+ self.parent.assertListEqual(
+ list(result["sequence_output"].size()),
+ [self.batch_size, self.seq_length, self.hidden_size])
+
+ def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+ model = CTRLLMHeadModel(config)
+ model.eval()
+
+ loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
+
+ result = {
+ "loss": loss,
+ "lm_logits": lm_logits
+ }
+ self.parent.assertListEqual(
+ list(result["loss"].size()),
+ [])
+ self.parent.assertListEqual(
+ list(result["lm_logits"].size()),
+ [self.batch_size, self.seq_length, self.vocab_size])
+
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+
+ (config, input_ids, input_mask, head_mask, token_type_ids,
+ mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+
+ inputs_dict = {
+ 'input_ids': input_ids,
+ 'token_type_ids': token_type_ids,
+ 'head_mask': head_mask
+ }
+
+ return config, inputs_dict
+
+ def setUp(self):
+ self.model_tester = CTRLModelTest.CTRLModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37)
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ def test_ctrl_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_ctrl_model(*config_and_inputs)
+
+ def test_ctrl_lm_head_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
+
+ @pytest.mark.slow
+ def test_model_from_pretrained(self):
+ cache_dir = "/tmp/transformers_test/"
+ for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+ model = CTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
+ shutil.rmtree(cache_dir)
+ self.assertIsNotNone(model)
+
+
+if __name__ == "__main__":  # run this module's tests when it is executed directly
+ unittest.main()
diff --git a/transformers/tests/tokenization_ctrl_test.py b/transformers/tests/tokenization_ctrl_test.py
new file mode 100644
index 0000000000..fbd99af7bb
--- /dev/null
+++ b/transformers/tests/tokenization_ctrl_test.py
@@ -0,0 +1,69 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+import json
+from io import open
+
+from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES
+
+from .tokenization_tests_commons import CommonTestCases
+
+class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+ tokenizer_class = CTRLTokenizer
+
+ def setUp(self):
+ super(CTRLTokenizationTest, self).setUp()
+
+ # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
+ vocab = ['adapt', 're@@', 'a@@', 'apt', 'c@@', 't', '