Merge branch 'master' into squad-refactor
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
__version__ = "2.1.1"
|
||||
__version__ = "2.2.1"
|
||||
|
||||
# Work around to update TensorFlow's absl.logging threshold which alters the
|
||||
# default Python logging output behavior when present.
|
||||
@@ -26,11 +26,12 @@ from .data import (is_sklearn_available,
|
||||
InputExample, InputFeatures, DataProcessor,
|
||||
glue_output_modes, glue_convert_examples_to_features,
|
||||
glue_processors, glue_tasks_num_labels,
|
||||
xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
|
||||
squad_convert_examples_to_features, SquadFeatures,
|
||||
SquadExample, SquadV1Processor, SquadV2Processor)
|
||||
|
||||
if is_sklearn_available():
|
||||
from .data import glue_compute_metrics
|
||||
from .data import glue_compute_metrics, xnli_compute_metrics
|
||||
|
||||
# Tokenizers
|
||||
from .tokenization_utils import (PreTrainedTokenizer)
|
||||
@@ -44,6 +45,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
||||
from .tokenization_xlm import XLMTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
from .tokenization_distilbert import DistilBertTokenizer
|
||||
from .tokenization_albert import AlbertTokenizer
|
||||
from .tokenization_camembert import CamembertTokenizer
|
||||
|
||||
# Configurations
|
||||
@@ -59,6 +61,7 @@ from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
# Modeling
|
||||
@@ -85,9 +88,10 @@ if is_torch_available():
|
||||
CTRLLMHeadModel,
|
||||
CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
|
||||
XLNetForSequenceClassification, XLNetForMultipleChoice,
|
||||
XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
|
||||
load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
XLNetForSequenceClassification, XLNetForTokenClassification,
|
||||
XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple,
|
||||
XLNetForQuestionAnswering, load_tf_weights_in_xlnet,
|
||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
|
||||
XLMWithLMHeadModel, XLMForSequenceClassification,
|
||||
XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
|
||||
@@ -96,7 +100,7 @@ if is_torch_available():
|
||||
RobertaForSequenceClassification, RobertaForMultipleChoice,
|
||||
RobertaForTokenClassification,
|
||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
|
||||
from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
|
||||
DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
|
||||
DistilBertForTokenClassification,
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
@@ -106,6 +110,10 @@ if is_torch_available():
|
||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
|
||||
|
||||
from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
|
||||
AlbertForQuestionAnswering,
|
||||
load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
# Optimization
|
||||
from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
|
||||
@@ -113,7 +121,7 @@ if is_torch_available():
|
||||
|
||||
# TensorFlow
|
||||
if is_tf_available():
|
||||
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
|
||||
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
|
||||
from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
|
||||
TFAutoModelWithLMHead)
|
||||
|
||||
@@ -139,6 +147,7 @@ if is_tf_available():
|
||||
from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
|
||||
TFXLNetModel, TFXLNetLMHeadModel,
|
||||
TFXLNetForSequenceClassification,
|
||||
TFXLNetForTokenClassification,
|
||||
TFXLNetForQuestionAnsweringSimple,
|
||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
@@ -157,6 +166,7 @@ if is_tf_available():
|
||||
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
|
||||
TFDistilBertModel, TFDistilBertForMaskedLM,
|
||||
TFDistilBertForSequenceClassification,
|
||||
TFDistilBertForTokenClassification,
|
||||
TFDistilBertForQuestionAnswering,
|
||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
@@ -164,6 +174,12 @@ if is_tf_available():
|
||||
TFCTRLLMHeadModel,
|
||||
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
|
||||
TFAlbertForSequenceClassification,
|
||||
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
# Optimization
|
||||
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
|
||||
|
||||
# TF 2.0 <=> PyTorch conversion utilities
|
||||
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
|
||||
load_pytorch_checkpoint_in_tf2_model,
|
||||
|
||||
12
transformers/commands/__init__.py
Normal file
12
transformers/commands/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from argparse import ArgumentParser
|
||||
|
||||
class BaseTransformersCLICommand(ABC):
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def register_subcommand(parser: ArgumentParser):
|
||||
raise NotImplementedError()
|
||||
|
||||
@abstractmethod
|
||||
def run(self):
|
||||
raise NotImplementedError()
|
||||
165
transformers/commands/user.py
Normal file
165
transformers/commands/user.py
Normal file
@@ -0,0 +1,165 @@
|
||||
from argparse import ArgumentParser
|
||||
from getpass import getpass
|
||||
import os
|
||||
|
||||
from transformers.commands import BaseTransformersCLICommand
|
||||
from transformers.hf_api import HfApi, HfFolder, HTTPError
|
||||
|
||||
|
||||
class UserCommands(BaseTransformersCLICommand):
|
||||
@staticmethod
|
||||
def register_subcommand(parser: ArgumentParser):
|
||||
login_parser = parser.add_parser('login')
|
||||
login_parser.set_defaults(func=lambda args: LoginCommand(args))
|
||||
whoami_parser = parser.add_parser('whoami')
|
||||
whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args))
|
||||
logout_parser = parser.add_parser('logout')
|
||||
logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
|
||||
list_parser = parser.add_parser('ls')
|
||||
list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
|
||||
# upload
|
||||
upload_parser = parser.add_parser('upload')
|
||||
upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
|
||||
upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
|
||||
upload_parser.set_defaults(func=lambda args: UploadCommand(args))
|
||||
|
||||
|
||||
|
||||
class ANSI:
|
||||
"""
|
||||
Helper for en.wikipedia.org/wiki/ANSI_escape_code
|
||||
"""
|
||||
_bold = u"\u001b[1m"
|
||||
_reset = u"\u001b[0m"
|
||||
@classmethod
|
||||
def bold(cls, s):
|
||||
return "{}{}{}".format(cls._bold, s, cls._reset)
|
||||
|
||||
|
||||
class BaseUserCommand:
|
||||
def __init__(self, args):
|
||||
self.args = args
|
||||
self._api = HfApi()
|
||||
|
||||
|
||||
class LoginCommand(BaseUserCommand):
|
||||
def run(self):
|
||||
print("""
|
||||
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
|
||||
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
|
||||
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
|
||||
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
|
||||
|
||||
""")
|
||||
username = input("Username: ")
|
||||
password = getpass()
|
||||
try:
|
||||
token = self._api.login(username, password)
|
||||
except HTTPError as e:
|
||||
# probably invalid credentials, display error message.
|
||||
print(e)
|
||||
exit(1)
|
||||
HfFolder.save_token(token)
|
||||
print("Login successful")
|
||||
print("Your token:", token, "\n")
|
||||
print("Your token has been saved to", HfFolder.path_token)
|
||||
|
||||
|
||||
class WhoamiCommand(BaseUserCommand):
|
||||
def run(self):
|
||||
token = HfFolder.get_token()
|
||||
if token is None:
|
||||
print("Not logged in")
|
||||
exit()
|
||||
try:
|
||||
user = self._api.whoami(token)
|
||||
print(user)
|
||||
except HTTPError as e:
|
||||
print(e)
|
||||
|
||||
|
||||
class LogoutCommand(BaseUserCommand):
|
||||
def run(self):
|
||||
token = HfFolder.get_token()
|
||||
if token is None:
|
||||
print("Not logged in")
|
||||
exit()
|
||||
HfFolder.delete_token()
|
||||
self._api.logout(token)
|
||||
print("Successfully logged out.")
|
||||
|
||||
|
||||
class ListObjsCommand(BaseUserCommand):
|
||||
def tabulate(self, rows, headers):
|
||||
# type: (List[List[Union[str, int]]], List[str]) -> str
|
||||
"""
|
||||
Inspired by:
|
||||
stackoverflow.com/a/8356620/593036
|
||||
stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
|
||||
"""
|
||||
col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
|
||||
row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
|
||||
lines = []
|
||||
lines.append(
|
||||
row_format.format(*headers)
|
||||
)
|
||||
lines.append(
|
||||
row_format.format(*["-" * w for w in col_widths])
|
||||
)
|
||||
for row in rows:
|
||||
lines.append(
|
||||
row_format.format(*row)
|
||||
)
|
||||
return "\n".join(lines)
|
||||
|
||||
def run(self):
|
||||
token = HfFolder.get_token()
|
||||
if token is None:
|
||||
print("Not logged in")
|
||||
exit(1)
|
||||
try:
|
||||
objs = self._api.list_objs(token)
|
||||
except HTTPError as e:
|
||||
print(e)
|
||||
exit(1)
|
||||
if len(objs) == 0:
|
||||
print("No shared file yet")
|
||||
exit()
|
||||
rows = [ [
|
||||
obj.filename,
|
||||
obj.LastModified,
|
||||
obj.ETag,
|
||||
obj.Size
|
||||
] for obj in objs ]
|
||||
print(
|
||||
self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])
|
||||
)
|
||||
|
||||
|
||||
class UploadCommand(BaseUserCommand):
|
||||
def run(self):
|
||||
token = HfFolder.get_token()
|
||||
if token is None:
|
||||
print("Not logged in")
|
||||
exit(1)
|
||||
filepath = os.path.join(os.getcwd(), self.args.file)
|
||||
filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
|
||||
print(
|
||||
"About to upload file {} to S3 under filename {}".format(
|
||||
ANSI.bold(filepath), ANSI.bold(filename)
|
||||
)
|
||||
)
|
||||
|
||||
choice = input("Proceed? [Y/n] ").lower()
|
||||
if not(choice == "" or choice == "y" or choice == "yes"):
|
||||
print("Abort")
|
||||
exit()
|
||||
print(
|
||||
ANSI.bold("Uploading... This might take a while if file is large")
|
||||
)
|
||||
access_url = self._api.presign_and_upload(
|
||||
token=token, filename=filename, filepath=filepath
|
||||
)
|
||||
print("Your file now lives at:")
|
||||
print(access_url)
|
||||
100
transformers/configuration_albert.py
Normal file
100
transformers/configuration_albert.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" ALBERT model configuration """
|
||||
|
||||
from .configuration_utils import PretrainedConfig
|
||||
|
||||
ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json",
|
||||
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json",
|
||||
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json",
|
||||
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json",
|
||||
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json",
|
||||
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json",
|
||||
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json",
|
||||
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json",
|
||||
}
|
||||
|
||||
class AlbertConfig(PretrainedConfig):
|
||||
"""Configuration for `AlbertModel`.
|
||||
|
||||
The default settings match the configuration of model `albert_xxlarge`.
|
||||
"""
|
||||
|
||||
pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=30000,
|
||||
embedding_size=128,
|
||||
hidden_size=4096,
|
||||
num_hidden_layers=12,
|
||||
num_hidden_groups=1,
|
||||
num_attention_heads=64,
|
||||
intermediate_size=16384,
|
||||
inner_group_num=1,
|
||||
hidden_act="gelu_new",
|
||||
hidden_dropout_prob=0,
|
||||
attention_probs_dropout_prob=0,
|
||||
max_position_embeddings=512,
|
||||
type_vocab_size=2,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-12, **kwargs):
|
||||
"""Constructs AlbertConfig.
|
||||
|
||||
Args:
|
||||
vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
|
||||
embedding_size: size of voc embeddings.
|
||||
hidden_size: Size of the encoder layers and the pooler layer.
|
||||
num_hidden_layers: Number of hidden layers in the Transformer encoder.
|
||||
num_hidden_groups: Number of group for the hidden layers, parameters in
|
||||
the same group are shared.
|
||||
num_attention_heads: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
|
||||
layer in the Transformer encoder.
|
||||
inner_group_num: int, number of inner repetition of attention and ffn.
|
||||
down_scale_factor: float, the scale to apply
|
||||
hidden_act: The non-linear activation function (function or string) in the
|
||||
encoder and pooler.
|
||||
hidden_dropout_prob: The dropout probability for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob: The dropout ratio for the attention
|
||||
probabilities.
|
||||
max_position_embeddings: The maximum sequence length that this model might
|
||||
ever be used with. Typically set this to something large just in case
|
||||
(e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
|
||||
`AlbertModel`.
|
||||
initializer_range: The stdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
"""
|
||||
super(AlbertConfig, self).__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.embedding_size = embedding_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_hidden_groups = num_hidden_groups
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.inner_group_num = inner_group_num
|
||||
self.hidden_act = hidden_act
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
@@ -28,6 +28,7 @@ from .configuration_roberta import RobertaConfig
|
||||
from .configuration_distilbert import DistilBertConfig
|
||||
from .configuration_ctrl import CTRLConfig
|
||||
from .configuration_camembert import CamembertConfig
|
||||
from .configuration_albert import AlbertConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -44,14 +45,15 @@ class AutoConfig(object):
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||
- contains `albert`: AlbertConfig (ALBERT model)
|
||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
"""
|
||||
@@ -67,14 +69,15 @@ class AutoConfig(object):
|
||||
The configuration class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertConfig (DistilBERT model)
|
||||
- contains `albert`: AlbertConfig (ALBERT model)
|
||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
- contains `roberta`: RobertaConfig (RoBERTa model)
|
||||
- contains `camembert`: CamembertConfig (CamemBERT model)
|
||||
- contains `ctrl` : CTRLConfig (CTRL model)
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
@@ -95,6 +98,9 @@ class AutoConfig(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -119,6 +125,8 @@ class AutoConfig(object):
|
||||
"""
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'albert' in pretrained_model_name_or_path:
|
||||
return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'camembert' in pretrained_model_name_or_path:
|
||||
return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
@@ -139,4 +147,4 @@ class AutoConfig(object):
|
||||
return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))
|
||||
"'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
|
||||
|
||||
@@ -27,7 +27,9 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json",
|
||||
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json",
|
||||
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -94,6 +94,9 @@ class PretrainedConfig(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -120,6 +123,7 @@ class PretrainedConfig(object):
|
||||
"""
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
force_download = kwargs.pop('force_download', False)
|
||||
resume_download = kwargs.pop('resume_download', False)
|
||||
proxies = kwargs.pop('proxies', None)
|
||||
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
|
||||
|
||||
@@ -131,7 +135,8 @@ class PretrainedConfig(object):
|
||||
config_file = pretrained_model_name_or_path
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
|
||||
proxies=proxies, resume_download=resume_download)
|
||||
except EnvironmentError:
|
||||
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
|
||||
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert ALBERT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import torch
|
||||
|
||||
from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
|
||||
|
||||
import logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
|
||||
# Initialise PyTorch model
|
||||
config = AlbertConfig.from_json_file(albert_config_file)
|
||||
print("Building PyTorch model from configuration: {}".format(str(config)))
|
||||
model = AlbertForMaskedLM(config)
|
||||
|
||||
# Load weights from tf checkpoint
|
||||
load_tf_weights_in_albert(model, config, tf_checkpoint_path)
|
||||
|
||||
# Save pytorch-model
|
||||
print("Save PyTorch model to {}".format(pytorch_dump_path))
|
||||
torch.save(model.state_dict(), pytorch_dump_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
## Required parameters
|
||||
parser.add_argument("--tf_checkpoint_path",
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path to the TensorFlow checkpoint path.")
|
||||
parser.add_argument("--albert_config_file",
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "The config json file corresponding to the pre-trained ALBERT model. \n"
|
||||
"This specifies the model architecture.")
|
||||
parser.add_argument("--pytorch_dump_path",
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path to the output PyTorch model.")
|
||||
args = parser.parse_args()
|
||||
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
|
||||
args.albert_config_file,
|
||||
args.pytorch_dump_path)
|
||||
|
||||
@@ -33,7 +33,8 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
|
||||
OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||
CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
@@ -46,7 +47,8 @@ if is_torch_available():
|
||||
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
(BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
@@ -56,7 +58,8 @@ else:
|
||||
OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
||||
CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) = (
|
||||
None, None, None, None,
|
||||
None, None,
|
||||
None, None,
|
||||
@@ -65,6 +68,7 @@ else:
|
||||
None, None,
|
||||
None, None, None,
|
||||
None, None, None,
|
||||
None, None,
|
||||
None, None)
|
||||
|
||||
|
||||
@@ -85,7 +89,8 @@ MODEL_CLASSES = {
|
||||
'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||
'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||
'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||
'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
|
||||
'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
|
||||
}
|
||||
|
||||
def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
|
||||
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||
from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
|
||||
from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||
|
||||
from .metrics import is_sklearn_available
|
||||
if is_sklearn_available():
|
||||
from .metrics import glue_compute_metrics
|
||||
from .metrics import glue_compute_metrics, xnli_compute_metrics
|
||||
|
||||
@@ -81,3 +81,11 @@ if _has_sklearn:
|
||||
return {"acc": simple_accuracy(preds, labels)}
|
||||
else:
|
||||
raise KeyError(task_name)
|
||||
|
||||
|
||||
def xnli_compute_metrics(task_name, preds, labels):
|
||||
assert len(preds) == len(labels)
|
||||
if task_name == "xnli":
|
||||
return {"acc": simple_accuracy(preds, labels)}
|
||||
else:
|
||||
raise KeyError(task_name)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from .utils import InputExample, InputFeatures, DataProcessor
|
||||
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
|
||||
|
||||
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
|
||||
85
transformers/data/processors/xnli.py
Normal file
85
transformers/data/processors/xnli.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" XNLI utils (dataset loading and evaluation) """
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from .utils import DataProcessor, InputExample
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class XnliProcessor(DataProcessor):
|
||||
"""Processor for the XNLI dataset.
|
||||
Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207"""
|
||||
|
||||
def __init__(self, language, train_language = None):
|
||||
self.language = language
|
||||
self.train_language = train_language
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
lg = self.language if self.train_language is None else self.train_language
|
||||
lines = self._read_tsv(os.path.join(data_dir, "XNLI-MT-1.0/multinli/multinli.train.{}.tsv".format(lg)))
|
||||
examples = []
|
||||
for (i, line) in enumerate(lines):
|
||||
if i == 0:
|
||||
continue
|
||||
guid = "%s-%s" % ('train', i)
|
||||
text_a = line[0]
|
||||
text_b = line[1]
|
||||
label = "contradiction" if line[2] == "contradictory" else line[2]
|
||||
assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||
return examples
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
|
||||
examples = []
|
||||
for (i, line) in enumerate(lines):
|
||||
if i == 0:
|
||||
continue
|
||||
language = line[0]
|
||||
if language != self.language:
|
||||
continue
|
||||
guid = "%s-%s" % ('test', i)
|
||||
text_a = line[6]
|
||||
text_b = line[7]
|
||||
label = line[1]
|
||||
assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
|
||||
examples.append(
|
||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||
return examples
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
return ["contradiction", "entailment", "neutral"]
|
||||
|
||||
xnli_processors = {
|
||||
"xnli": XnliProcessor,
|
||||
}
|
||||
|
||||
xnli_output_modes = {
|
||||
"xnli": "classification",
|
||||
}
|
||||
|
||||
xnli_tasks_num_labels = {
|
||||
"xnli": 3,
|
||||
}
|
||||
@@ -22,6 +22,7 @@ from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
from contextlib import contextmanager
|
||||
|
||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
@@ -152,7 +153,7 @@ def filename_to_url(filename, cache_dir=None):
|
||||
return url, etag
|
||||
|
||||
|
||||
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
|
||||
def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False):
|
||||
"""
|
||||
Given something that might be a URL (or might be a local path),
|
||||
determine which. If it's a URL, download the file and cache it, and
|
||||
@@ -161,6 +162,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
||||
Args:
|
||||
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
||||
force_download: if True, re-dowload the file even if it's already cached in the cache dir.
|
||||
resume_download: if True, resume the download if incompletly recieved file is found.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = TRANSFORMERS_CACHE
|
||||
@@ -173,7 +175,9 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
|
||||
|
||||
if parsed.scheme in ('http', 'https', 's3'):
|
||||
# URL, so get it from the cache (downloading if necessary)
|
||||
return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
return get_from_cache(url_or_filename, cache_dir=cache_dir,
|
||||
force_download=force_download, proxies=proxies,
|
||||
resume_download=resume_download)
|
||||
elif os.path.exists(url_or_filename):
|
||||
# File, and it exists.
|
||||
return url_or_filename
|
||||
@@ -234,19 +238,22 @@ def s3_get(url, temp_file, proxies=None):
|
||||
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
|
||||
|
||||
|
||||
def http_get(url, temp_file, proxies=None):
|
||||
req = requests.get(url, stream=True, proxies=proxies)
|
||||
content_length = req.headers.get('Content-Length')
|
||||
total = int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit="B", total=total)
|
||||
for chunk in req.iter_content(chunk_size=1024):
|
||||
def http_get(url, temp_file, proxies=None, resume_size=0):
|
||||
headers={'Range':'bytes=%d-'%(resume_size,)} if resume_size > 0 else None
|
||||
response = requests.get(url, stream=True, proxies=proxies, headers=headers)
|
||||
if response.status_code == 416: # Range not satisfiable
|
||||
return
|
||||
content_length = response.headers.get('Content-Length')
|
||||
total = resume_size + int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit="B", total=total, initial=resume_size)
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
progress.update(len(chunk))
|
||||
temp_file.write(chunk)
|
||||
progress.close()
|
||||
|
||||
|
||||
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10):
|
||||
def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False):
|
||||
"""
|
||||
Given a URL, look for the corresponding dataset in the local cache.
|
||||
If it's not there, download it. Then return the path to the cached file.
|
||||
@@ -289,17 +296,35 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
|
||||
if matching_files:
|
||||
cache_path = os.path.join(cache_dir, matching_files[-1])
|
||||
|
||||
if resume_download:
|
||||
incomplete_path = cache_path + '.incomplete'
|
||||
@contextmanager
|
||||
def _resumable_file_manager():
|
||||
with open(incomplete_path,'a+b') as f:
|
||||
yield f
|
||||
os.remove(incomplete_path)
|
||||
temp_file_manager = _resumable_file_manager
|
||||
if os.path.exists(incomplete_path):
|
||||
resume_size = os.stat(incomplete_path).st_size
|
||||
else:
|
||||
resume_size = 0
|
||||
else:
|
||||
temp_file_manager = tempfile.NamedTemporaryFile
|
||||
resume_size = 0
|
||||
|
||||
if not os.path.exists(cache_path) or force_download:
|
||||
# Download to temporary file, then copy to cache dir once finished.
|
||||
# Otherwise you get corrupt cache entries if the download gets interrupted.
|
||||
with tempfile.NamedTemporaryFile() as temp_file:
|
||||
with temp_file_manager() as temp_file:
|
||||
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
|
||||
|
||||
# GET file object
|
||||
if url.startswith("s3://"):
|
||||
if resume_download:
|
||||
logger.warn('Warning: resumable downloads are not implemented for "s3://" urls')
|
||||
s3_get(url, temp_file, proxies=proxies)
|
||||
else:
|
||||
http_get(url, temp_file, proxies=proxies)
|
||||
http_get(url, temp_file, proxies=proxies, resume_size=resume_size)
|
||||
|
||||
# we are copying the file before closing it, so flush to avoid truncation
|
||||
temp_file.flush()
|
||||
|
||||
228
transformers/hf_api.py
Normal file
228
transformers/hf_api.py
Normal file
@@ -0,0 +1,228 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
from os.path import expanduser
|
||||
|
||||
import requests
|
||||
import six
|
||||
from requests.exceptions import HTTPError
|
||||
from tqdm import tqdm
|
||||
|
||||
ENDPOINT = "https://huggingface.co"
|
||||
|
||||
class S3Obj:
|
||||
def __init__(
|
||||
self,
|
||||
filename, # type: str
|
||||
LastModified, # type: str
|
||||
ETag, # type: str
|
||||
Size, # type: int
|
||||
**kwargs
|
||||
):
|
||||
self.filename = filename
|
||||
self.LastModified = LastModified
|
||||
self.ETag = ETag
|
||||
self.Size = Size
|
||||
|
||||
|
||||
class PresignedUrl:
|
||||
def __init__(
|
||||
self,
|
||||
write, # type: str
|
||||
access, # type: str
|
||||
type, # type: str
|
||||
**kwargs
|
||||
):
|
||||
self.write = write
|
||||
self.access = access
|
||||
self.type = type # mime-type to send to S3.
|
||||
|
||||
|
||||
class HfApi:
|
||||
def __init__(self, endpoint=None):
|
||||
self.endpoint = endpoint if endpoint is not None else ENDPOINT
|
||||
|
||||
def login(
|
||||
self,
|
||||
username, # type: str
|
||||
password, # type: str
|
||||
):
|
||||
# type: (...) -> str
|
||||
"""
|
||||
Call HF API to sign in a user and get a token if credentials are valid.
|
||||
|
||||
Outputs:
|
||||
token if credentials are valid
|
||||
|
||||
Throws:
|
||||
requests.exceptions.HTTPError if credentials are invalid
|
||||
"""
|
||||
path = "{}/api/login".format(self.endpoint)
|
||||
r = requests.post(path, json={"username": username, "password": password})
|
||||
r.raise_for_status()
|
||||
d = r.json()
|
||||
return d["token"]
|
||||
|
||||
def whoami(
|
||||
self,
|
||||
token, # type: str
|
||||
):
|
||||
# type: (...) -> str
|
||||
"""
|
||||
Call HF API to know "whoami"
|
||||
"""
|
||||
path = "{}/api/whoami".format(self.endpoint)
|
||||
r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
|
||||
r.raise_for_status()
|
||||
d = r.json()
|
||||
return d["user"]
|
||||
|
||||
def logout(self, token):
|
||||
# type: (...) -> void
|
||||
"""
|
||||
Call HF API to log out.
|
||||
"""
|
||||
path = "{}/api/logout".format(self.endpoint)
|
||||
r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
|
||||
r.raise_for_status()
|
||||
|
||||
def presign(self, token, filename):
|
||||
# type: (...) -> PresignedUrl
|
||||
"""
|
||||
Call HF API to get a presigned url to upload `filename` to S3.
|
||||
"""
|
||||
path = "{}/api/presign".format(self.endpoint)
|
||||
r = requests.post(
|
||||
path,
|
||||
headers={"authorization": "Bearer {}".format(token)},
|
||||
json={"filename": filename},
|
||||
)
|
||||
r.raise_for_status()
|
||||
d = r.json()
|
||||
return PresignedUrl(**d)
|
||||
|
||||
def presign_and_upload(self, token, filename, filepath):
|
||||
# type: (...) -> str
|
||||
"""
|
||||
Get a presigned url, then upload file to S3.
|
||||
|
||||
Outputs:
|
||||
url: Read-only url for the stored file on S3.
|
||||
"""
|
||||
urls = self.presign(token, filename=filename)
|
||||
# streaming upload:
|
||||
# https://2.python-requests.org/en/master/user/advanced/#streaming-uploads
|
||||
#
|
||||
# Even though we presign with the correct content-type,
|
||||
# the client still has to specify it when uploading the file.
|
||||
with open(filepath, "rb") as f:
|
||||
pf = TqdmProgressFileReader(f)
|
||||
|
||||
r = requests.put(urls.write, data=f, headers={
|
||||
"content-type": urls.type,
|
||||
})
|
||||
r.raise_for_status()
|
||||
pf.close()
|
||||
return urls.access
|
||||
|
||||
def list_objs(self, token):
|
||||
# type: (...) -> List[S3Obj]
|
||||
"""
|
||||
Call HF API to list all stored files for user.
|
||||
"""
|
||||
path = "{}/api/listObjs".format(self.endpoint)
|
||||
r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
|
||||
r.raise_for_status()
|
||||
d = r.json()
|
||||
return [S3Obj(**x) for x in d]
|
||||
|
||||
|
||||
|
||||
class TqdmProgressFileReader:
|
||||
"""
|
||||
Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
|
||||
and override `f.read()` so as to display a tqdm progress bar.
|
||||
|
||||
see github.com/huggingface/transformers/pull/2078#discussion_r354739608
|
||||
for implementation details.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
f # type: io.BufferedReader
|
||||
):
|
||||
self.f = f
|
||||
self.total_size = os.fstat(f.fileno()).st_size # type: int
|
||||
self.pbar = tqdm(total=self.total_size, leave=False)
|
||||
if six.PY3:
|
||||
# does not work unless PY3
|
||||
# no big deal as the CLI does not currently support PY2 anyways.
|
||||
self.read = f.read
|
||||
f.read = self._read
|
||||
|
||||
def _read(self, n=-1):
|
||||
self.pbar.update(n)
|
||||
return self.read(n)
|
||||
|
||||
def close(self):
|
||||
self.pbar.close()
|
||||
|
||||
|
||||
|
||||
class HfFolder:
|
||||
path_token = expanduser("~/.huggingface/token")
|
||||
|
||||
@classmethod
|
||||
def save_token(cls, token):
|
||||
"""
|
||||
Save token, creating folder as needed.
|
||||
"""
|
||||
if six.PY3:
|
||||
os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
|
||||
else:
|
||||
# Python 2
|
||||
try:
|
||||
os.makedirs(os.path.dirname(cls.path_token))
|
||||
except OSError as e:
|
||||
if e.errno != os.errno.EEXIST:
|
||||
raise e
|
||||
pass
|
||||
with open(cls.path_token, 'w+') as f:
|
||||
f.write(token)
|
||||
|
||||
@classmethod
|
||||
def get_token(cls):
|
||||
"""
|
||||
Get token or None if not existent.
|
||||
"""
|
||||
try:
|
||||
with open(cls.path_token, 'r') as f:
|
||||
return f.read()
|
||||
except:
|
||||
# this is too wide. When Py2 is dead use:
|
||||
# `except FileNotFoundError:` instead
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def delete_token(cls):
|
||||
"""
|
||||
Delete token.
|
||||
Do not fail if token does not exist.
|
||||
"""
|
||||
try:
|
||||
os.remove(cls.path_token)
|
||||
except:
|
||||
return
|
||||
801
transformers/modeling_albert.py
Normal file
801
transformers/modeling_albert.py
Normal file
@@ -0,0 +1,801 @@
|
||||
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch ALBERT model. """
|
||||
|
||||
import os
|
||||
import math
|
||||
import logging
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import CrossEntropyLoss, MSELoss
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
from transformers.configuration_albert import AlbertConfig
|
||||
from transformers.modeling_bert import BertEmbeddings, BertSelfAttention, prune_linear_layer, ACT2FN
|
||||
from .file_utils import add_start_docstrings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin",
|
||||
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin",
|
||||
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin",
|
||||
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin",
|
||||
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin",
|
||||
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin",
|
||||
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin",
|
||||
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin",
|
||||
}
|
||||
|
||||
|
||||
def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
|
||||
""" Load tf checkpoints in a pytorch model."""
|
||||
try:
|
||||
import re
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError:
|
||||
logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
|
||||
"https://www.tensorflow.org/install/ for installation instructions.")
|
||||
raise
|
||||
tf_path = os.path.abspath(tf_checkpoint_path)
|
||||
logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
|
||||
# Load weights from TF model
|
||||
init_vars = tf.train.list_variables(tf_path)
|
||||
names = []
|
||||
arrays = []
|
||||
for name, shape in init_vars:
|
||||
logger.info("Loading TF weight {} with shape {}".format(name, shape))
|
||||
array = tf.train.load_variable(tf_path, name)
|
||||
names.append(name)
|
||||
arrays.append(array)
|
||||
|
||||
for name, array in zip(names, arrays):
|
||||
print(name)
|
||||
|
||||
for name, array in zip(names, arrays):
|
||||
original_name = name
|
||||
|
||||
# If saved from the TF HUB module
|
||||
name = name.replace("module/", "")
|
||||
|
||||
# Renaming and simplifying
|
||||
name = name.replace("ffn_1", "ffn")
|
||||
name = name.replace("bert/", "albert/")
|
||||
name = name.replace("attention_1", "attention")
|
||||
name = name.replace("transform/", "")
|
||||
name = name.replace("LayerNorm_1", "full_layer_layer_norm")
|
||||
name = name.replace("LayerNorm", "attention/LayerNorm")
|
||||
name = name.replace("transformer/", "")
|
||||
|
||||
# The feed forward layer had an 'intermediate' step which has been abstracted away
|
||||
name = name.replace("intermediate/dense/", "")
|
||||
name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")
|
||||
|
||||
# ALBERT attention was split between self and output which have been abstracted away
|
||||
name = name.replace("/output/", "/")
|
||||
name = name.replace("/self/", "/")
|
||||
|
||||
# The pooler is a linear layer
|
||||
name = name.replace("pooler/dense", "pooler")
|
||||
|
||||
# The classifier was simplified to predictions from cls/predictions
|
||||
name = name.replace("cls/predictions", "predictions")
|
||||
name = name.replace("predictions/attention", "predictions")
|
||||
|
||||
# Naming was changed to be more explicit
|
||||
name = name.replace("embeddings/attention", "embeddings")
|
||||
name = name.replace("inner_group_", "albert_layers/")
|
||||
name = name.replace("group_", "albert_layer_groups/")
|
||||
|
||||
# Classifier
|
||||
if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
|
||||
name = "classifier/" + name
|
||||
|
||||
# No ALBERT model currently handles the next sentence prediction task
|
||||
if "seq_relationship" in name:
|
||||
continue
|
||||
|
||||
name = name.split('/')
|
||||
|
||||
# Ignore the gradients applied by the LAMB/ADAM optimizers.
|
||||
if "adam_m" in name or "adam_v" in name or "global_step" in name:
|
||||
logger.info("Skipping {}".format("/".join(name)))
|
||||
continue
|
||||
|
||||
pointer = model
|
||||
for m_name in name:
|
||||
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
|
||||
l = re.split(r'_(\d+)', m_name)
|
||||
else:
|
||||
l = [m_name]
|
||||
|
||||
if l[0] == 'kernel' or l[0] == 'gamma':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif l[0] == 'output_bias' or l[0] == 'beta':
|
||||
pointer = getattr(pointer, 'bias')
|
||||
elif l[0] == 'output_weights':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif l[0] == 'squad':
|
||||
pointer = getattr(pointer, 'classifier')
|
||||
else:
|
||||
try:
|
||||
pointer = getattr(pointer, l[0])
|
||||
except AttributeError:
|
||||
logger.info("Skipping {}".format("/".join(name)))
|
||||
continue
|
||||
if len(l) >= 2:
|
||||
num = int(l[1])
|
||||
pointer = pointer[num]
|
||||
|
||||
if m_name[-11:] == '_embeddings':
|
||||
pointer = getattr(pointer, 'weight')
|
||||
elif m_name == 'kernel':
|
||||
array = np.transpose(array)
|
||||
try:
|
||||
assert pointer.shape == array.shape
|
||||
except AssertionError as e:
|
||||
e.args += (pointer.shape, array.shape)
|
||||
raise
|
||||
print("Initialize PyTorch weight {} from {}".format(name, original_name))
|
||||
pointer.data = torch.from_numpy(array)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
class AlbertEmbeddings(BertEmbeddings):
|
||||
"""
|
||||
Construct the embeddings from word, position and token_type embeddings.
|
||||
"""
|
||||
def __init__(self, config):
|
||||
super(AlbertEmbeddings, self).__init__(config)
|
||||
|
||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
|
||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
|
||||
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
|
||||
self.LayerNorm = torch.nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
|
||||
|
||||
|
||||
class AlbertAttention(BertSelfAttention):
|
||||
def __init__(self, config):
|
||||
super(AlbertAttention, self).__init__(config)
|
||||
|
||||
self.output_attentions = config.output_attentions
|
||||
self.num_attention_heads = config.num_attention_heads
|
||||
self.hidden_size = config.hidden_size
|
||||
self.attention_head_size = config.hidden_size // config.num_attention_heads
|
||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.pruned_heads = set()
|
||||
|
||||
def prune_heads(self, heads):
|
||||
if len(heads) == 0:
|
||||
return
|
||||
mask = torch.ones(self.num_attention_heads, self.attention_head_size)
|
||||
heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads
|
||||
for head in heads:
|
||||
# Compute how many pruned heads are before the head and move the index accordingly
|
||||
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||
mask[head] = 0
|
||||
mask = mask.view(-1).contiguous().eq(1)
|
||||
index = torch.arange(len(mask))[mask].long()
|
||||
|
||||
# Prune linear layers
|
||||
self.query = prune_linear_layer(self.query, index)
|
||||
self.key = prune_linear_layer(self.key, index)
|
||||
self.value = prune_linear_layer(self.value, index)
|
||||
self.dense = prune_linear_layer(self.dense, index, dim=1)
|
||||
|
||||
# Update hyper params and store pruned heads
|
||||
self.num_attention_heads = self.num_attention_heads - len(heads)
|
||||
self.all_head_size = self.attention_head_size * self.num_attention_heads
|
||||
self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def forward(self, input_ids, attention_mask=None, head_mask=None):
|
||||
mixed_query_layer = self.query(input_ids)
|
||||
mixed_key_layer = self.key(input_ids)
|
||||
mixed_value_layer = self.value(input_ids)
|
||||
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer)
|
||||
key_layer = self.transpose_for_scores(mixed_key_layer)
|
||||
value_layer = self.transpose_for_scores(mixed_value_layer)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
||||
if attention_mask is not None:
|
||||
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
||||
attention_scores = attention_scores + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = self.dropout(attention_probs)
|
||||
|
||||
# Mask heads if we want to
|
||||
if head_mask is not None:
|
||||
attention_probs = attention_probs * head_mask
|
||||
|
||||
context_layer = torch.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
||||
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
||||
reshaped_context_layer = context_layer.view(*new_context_layer_shape)
|
||||
|
||||
|
||||
# Should find a better way to do this
|
||||
w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype)
|
||||
b = self.dense.bias.to(context_layer.dtype)
|
||||
|
||||
projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b
|
||||
projected_context_layer_dropout = self.dropout(projected_context_layer)
|
||||
layernormed_context_layer = self.LayerNorm(input_ids + projected_context_layer_dropout)
|
||||
return (layernormed_context_layer, attention_probs) if self.output_attentions else (layernormed_context_layer,)
|
||||
|
||||
|
||||
class AlbertLayer(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(AlbertLayer, self).__init__()
|
||||
|
||||
self.config = config
|
||||
self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.attention = AlbertAttention(config)
|
||||
self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||
self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
|
||||
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||
attention_output = self.attention(hidden_states, attention_mask, head_mask)
|
||||
ffn_output = self.ffn(attention_output[0])
|
||||
ffn_output = self.activation(ffn_output)
|
||||
ffn_output = self.ffn_output(ffn_output)
|
||||
hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0])
|
||||
|
||||
return (hidden_states,) + attention_output[1:] # add attentions if we output them
|
||||
|
||||
|
||||
class AlbertLayerGroup(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(AlbertLayerGroup, self).__init__()
|
||||
|
||||
self.output_attentions = config.output_attentions
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])
|
||||
|
||||
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||
layer_hidden_states = ()
|
||||
layer_attentions = ()
|
||||
|
||||
for layer_index, albert_layer in enumerate(self.albert_layers):
|
||||
layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index])
|
||||
hidden_states = layer_output[0]
|
||||
|
||||
if self.output_attentions:
|
||||
layer_attentions = layer_attentions + (layer_output[1],)
|
||||
|
||||
if self.output_hidden_states:
|
||||
layer_hidden_states = layer_hidden_states + (hidden_states,)
|
||||
|
||||
outputs = (hidden_states,)
|
||||
if self.output_hidden_states:
|
||||
outputs = outputs + (layer_hidden_states,)
|
||||
if self.output_attentions:
|
||||
outputs = outputs + (layer_attentions,)
|
||||
return outputs # last-layer hidden state, (layer hidden states), (layer attentions)
|
||||
|
||||
|
||||
class AlbertTransformer(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(AlbertTransformer, self).__init__()
|
||||
|
||||
self.config = config
|
||||
self.output_attentions = config.output_attentions
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
|
||||
self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])
|
||||
|
||||
def forward(self, hidden_states, attention_mask=None, head_mask=None):
|
||||
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
|
||||
|
||||
all_attentions = ()
|
||||
|
||||
if self.output_hidden_states:
|
||||
all_hidden_states = (hidden_states,)
|
||||
|
||||
for i in range(self.config.num_hidden_layers):
|
||||
# Number of layers in a hidden group
|
||||
layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
|
||||
|
||||
# Index of the hidden group
|
||||
group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
|
||||
|
||||
# Index of the layer inside the group
|
||||
layer_idx = int(i - group_idx * layers_per_group)
|
||||
|
||||
layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group])
|
||||
hidden_states = layer_group_output[0]
|
||||
|
||||
if self.output_attentions:
|
||||
all_attentions = all_attentions + layer_group_output[-1]
|
||||
|
||||
if self.output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
|
||||
outputs = (hidden_states,)
|
||||
if self.output_hidden_states:
|
||||
outputs = outputs + (all_hidden_states,)
|
||||
if self.output_attentions:
|
||||
outputs = outputs + (all_attentions,)
|
||||
return outputs # last-layer hidden state, (all hidden states), (all attentions)
|
||||
|
||||
|
||||
|
||||
class AlbertPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for dowloading and loading pretrained models.
|
||||
"""
|
||||
config_class = AlbertConfig
|
||||
pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
base_model_prefix = "albert"
|
||||
|
||||
def _init_weights(self, module):
|
||||
""" Initialize the weights.
|
||||
"""
|
||||
if isinstance(module, (nn.Linear, nn.Embedding)):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||
if isinstance(module, (nn.Linear)) and module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, nn.LayerNorm):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in
|
||||
`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
|
||||
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
|
||||
two parameter-reduction techniques to lower memory consumption and increase the trainig speed of BERT.
|
||||
|
||||
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
|
||||
refer to the PyTorch documentation for all matter related to general usage and behavior.
|
||||
|
||||
.. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
|
||||
https://arxiv.org/abs/1909.11942
|
||||
|
||||
.. _`torch.nn.Module`:
|
||||
https://pytorch.org/docs/stable/nn.html#module
|
||||
|
||||
Parameters:
|
||||
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
ALBERT_INPUTS_DOCSTRING = r"""
|
||||
Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||
|
||||
(a) For sequence pairs:
|
||||
|
||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||
|
||||
(b) For single sequences:
|
||||
|
||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0``
|
||||
|
||||
Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
|
||||
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
|
||||
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Segment token indices to indicate first and second portions of the inputs.
|
||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||
corresponds to a `sentence B` token
|
||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of positions of each input sequence tokens in the position embeddings.
|
||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||
"""
|
||||
|
||||
@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
|
||||
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||
class AlbertModel(AlbertPreTrainedModel):
|
||||
r"""
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during Bert pretraining. This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
config_class = AlbertConfig
|
||||
pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
load_tf_weights = load_tf_weights_in_albert
|
||||
base_model_prefix = "albert"
|
||||
|
||||
def __init__(self, config):
|
||||
super(AlbertModel, self).__init__(config)
|
||||
|
||||
self.config = config
|
||||
self.embeddings = AlbertEmbeddings(config)
|
||||
self.encoder = AlbertTransformer(config)
|
||||
self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.pooler_activation = nn.Tanh()
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings.word_embeddings
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
old_embeddings = self.embeddings.word_embeddings
|
||||
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
|
||||
self.embeddings.word_embeddings = new_embeddings
|
||||
return self.embeddings.word_embeddings
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
ALBERT has a different architecture in that its layers are shared across groups, which then has inner groups.
|
||||
If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there
|
||||
is a total of 4 different layers.
|
||||
|
||||
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
|
||||
while [2,3] correspond to the two inner groups of the second hidden layer.
|
||||
|
||||
Any layer with in index other than [0,1,2,3] will result in an error.
|
||||
See base class PreTrainedModel for more information about head pruning
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
group_idx = int(layer / self.config.inner_group_num)
|
||||
inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
|
||||
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
|
||||
|
||||
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||
inputs_embeds=None):
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
elif input_ids is not None:
|
||||
input_shape = input_ids.size()
|
||||
elif inputs_embeds is not None:
|
||||
input_shape = inputs_embeds.size()[:-1]
|
||||
else:
|
||||
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||
|
||||
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones(input_shape, device=device)
|
||||
if token_type_ids is None:
|
||||
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
||||
|
||||
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
||||
if head_mask is not None:
|
||||
if head_mask.dim() == 1:
|
||||
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
||||
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
|
||||
elif head_mask.dim() == 2:
|
||||
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
|
||||
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
|
||||
else:
|
||||
head_mask = [None] * self.config.num_hidden_layers
|
||||
|
||||
embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
||||
inputs_embeds=inputs_embeds)
|
||||
encoder_outputs = self.encoder(embedding_output,
|
||||
extended_attention_mask,
|
||||
head_mask=head_mask)
|
||||
|
||||
sequence_output = encoder_outputs[0]
|
||||
|
||||
pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
|
||||
|
||||
outputs = (sequence_output, pooled_output) + encoder_outputs[1:] # add hidden_states and attentions if they are here
|
||||
return outputs
|
||||
|
||||
class AlbertMLMHead(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(AlbertMLMHead, self).__init__()
|
||||
|
||||
self.LayerNorm = nn.LayerNorm(config.embedding_size)
|
||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||
self.dense = nn.Linear(config.hidden_size, config.embedding_size)
|
||||
self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.activation(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
|
||||
prediction_scores = hidden_states + self.bias
|
||||
|
||||
return prediction_scores
|
||||
|
||||
|
||||
@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||
class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
r"""
|
||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super(AlbertForMaskedLM, self).__init__(config)
|
||||
|
||||
self.albert = AlbertModel(config)
|
||||
self.predictions = AlbertMLMHead(config)
|
||||
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||
"""
|
||||
self._tie_or_clone_weights(self.predictions.decoder,
|
||||
self.albert.embeddings.word_embeddings)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.predictions.decoder
|
||||
|
||||
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||
masked_lm_labels=None):
|
||||
outputs = self.albert(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds
|
||||
)
|
||||
sequence_outputs = outputs[0]
|
||||
|
||||
prediction_scores = self.predictions(sequence_outputs)
|
||||
|
||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||
if masked_lm_labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
outputs = (masked_lm_loss,) + outputs
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
|
||||
the pooled output) e.g. for GLUE tasks. """,
|
||||
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||
class AlbertForSequenceClassification(AlbertPreTrainedModel):
|
||||
r"""
|
||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in ``[0, ..., config.num_labels - 1]``.
|
||||
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
|
||||
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Classification (or regression if config.num_labels==1) loss.
|
||||
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
super(AlbertForSequenceClassification, self).__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.albert = AlbertModel(config)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
||||
|
||||
if labels is not None:
|
||||
if self.num_labels == 1:
|
||||
# We are doing regression
|
||||
loss_fct = MSELoss()
|
||||
loss = loss_fct(logits.view(-1), labels.view(-1))
|
||||
else:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
outputs = (loss,) + outputs
|
||||
|
||||
return outputs # (loss), logits, (hidden_states), (attentions)
|
||||
|
||||
|
||||
|
||||
@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||
the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
|
||||
r"""
|
||||
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Position outside of the sequence are not taken into account for computing the loss.
|
||||
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`).
|
||||
Position outside of the sequence are not taken into account for computing the loss.
|
||||
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
|
||||
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
|
||||
Span-start scores (before SoftMax).
|
||||
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
|
||||
Span-end scores (before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
|
||||
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
|
||||
input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
|
||||
input_ids = tokenizer.encode(input_text)
|
||||
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
|
||||
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
|
||||
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
|
||||
# a nice puppet
|
||||
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
super(AlbertForQuestionAnswering, self).__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.albert = AlbertModel(config)
|
||||
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||
inputs_embeds=None, start_positions=None, end_positions=None):
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
logits = self.qa_outputs(sequence_output)
|
||||
start_logits, end_logits = logits.split(1, dim=-1)
|
||||
start_logits = start_logits.squeeze(-1)
|
||||
end_logits = end_logits.squeeze(-1)
|
||||
|
||||
outputs = (start_logits, end_logits,) + outputs[2:]
|
||||
if start_positions is not None and end_positions is not None:
|
||||
# If we are on multi-GPU, split add a dimension
|
||||
if len(start_positions.size()) > 1:
|
||||
start_positions = start_positions.squeeze(-1)
|
||||
if len(end_positions.size()) > 1:
|
||||
end_positions = end_positions.squeeze(-1)
|
||||
# sometimes the start/end positions are outside our model inputs, we ignore these terms
|
||||
ignored_index = start_logits.size(1)
|
||||
start_positions.clamp_(0, ignored_index)
|
||||
end_positions.clamp_(0, ignored_index)
|
||||
|
||||
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
|
||||
start_loss = loss_fct(start_logits, start_positions)
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
outputs = (total_loss,) + outputs
|
||||
|
||||
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
|
||||
@@ -27,6 +27,9 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi
|
||||
from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
|
||||
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
|
||||
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
|
||||
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
|
||||
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
|
||||
|
||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||
|
||||
@@ -48,14 +51,16 @@ class AutoModel(object):
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||
- contains `albert`: AlbertModel (ALBERT model)
|
||||
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||
- contains `bert`: BertModel (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetModel (XLNet model)
|
||||
- contains `xlm`: XLMModel (XLM model)
|
||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
@@ -71,14 +76,16 @@ class AutoModel(object):
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertModel (DistilBERT model)
|
||||
- contains `albert`: AlbertModel (ALBERT model)
|
||||
- contains `camembert`: CamembertModel (CamemBERT model)
|
||||
- contains `roberta`: RobertaModel (RoBERTa model)
|
||||
- contains `bert`: BertModel (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
|
||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetModel (XLNet model)
|
||||
- contains `xlm`: XLMModel (XLM model)
|
||||
- contains `ctrl`: CTRLModel (Salesforce CTRL model)
|
||||
|
||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||
To train the model, you should first set it back in training mode with `model.train()`
|
||||
@@ -112,6 +119,9 @@ class AutoModel(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -138,6 +148,10 @@ class AutoModel(object):
|
||||
"""
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'albert' in pretrained_model_name_or_path:
|
||||
return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'camembert' in pretrained_model_name_or_path:
|
||||
return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
@@ -156,7 +170,7 @@ class AutoModel(object):
|
||||
return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm', 'roberta, 'ctrl'".format(pretrained_model_name_or_path))
|
||||
"'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModelWithLMHead(object):
|
||||
@@ -172,14 +186,16 @@ class AutoModelWithLMHead(object):
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
||||
- contains `bert`: BertForMaskedLM (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
|
||||
- contains `ctrl`: CTRLLMModel (Salesforce CTRL model)
|
||||
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
||||
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
||||
- contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throws an error).
|
||||
"""
|
||||
@@ -198,6 +214,8 @@ class AutoModelWithLMHead(object):
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
|
||||
- contains `albert`: AlbertForMaskedLM (ALBERT model)
|
||||
- contains `camembert`: CamembertForMaskedLM (CamemBERT model)
|
||||
- contains `roberta`: RobertaForMaskedLM (RoBERTa model)
|
||||
- contains `bert`: BertForMaskedLM (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
|
||||
@@ -205,6 +223,7 @@ class AutoModelWithLMHead(object):
|
||||
- contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetLMHeadModel (XLNet model)
|
||||
- contains `xlm`: XLMWithLMHeadModel (XLM model)
|
||||
- contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
|
||||
|
||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||
To train the model, you should first set it back in training mode with `model.train()`
|
||||
@@ -237,6 +256,8 @@ class AutoModelWithLMHead(object):
|
||||
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
@@ -264,6 +285,10 @@ class AutoModelWithLMHead(object):
|
||||
"""
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'albert' in pretrained_model_name_or_path:
|
||||
return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'camembert' in pretrained_model_name_or_path:
|
||||
return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
@@ -282,7 +307,7 @@ class AutoModelWithLMHead(object):
|
||||
return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm', 'roberta','ctrl'".format(pretrained_model_name_or_path))
|
||||
"'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModelForSequenceClassification(object):
|
||||
@@ -298,6 +323,8 @@ class AutoModelForSequenceClassification(object):
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
||||
- contains `albert`: AlbertForSequenceClassification (ALBERT model)
|
||||
- contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
|
||||
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
||||
- contains `bert`: BertForSequenceClassification (Bert model)
|
||||
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
||||
@@ -320,6 +347,8 @@ class AutoModelForSequenceClassification(object):
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
|
||||
- contains `albert`: AlbertForSequenceClassification (ALBERT model)
|
||||
- contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
|
||||
- contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
|
||||
- contains `bert`: BertForSequenceClassification (Bert model)
|
||||
- contains `xlnet`: XLNetForSequenceClassification (XLNet model)
|
||||
@@ -357,6 +386,9 @@ class AutoModelForSequenceClassification(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -383,6 +415,10 @@ class AutoModelForSequenceClassification(object):
|
||||
"""
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'albert' in pretrained_model_name_or_path:
|
||||
return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'camembert' in pretrained_model_name_or_path:
|
||||
return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
@@ -393,7 +429,7 @@ class AutoModelForSequenceClassification(object):
|
||||
return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
|
||||
"'bert', 'xlnet', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModelForQuestionAnswering(object):
|
||||
@@ -409,6 +445,7 @@ class AutoModelForQuestionAnswering(object):
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
||||
- contains `albert`: AlbertForQuestionAnswering (ALBERT model)
|
||||
- contains `bert`: BertForQuestionAnswering (Bert model)
|
||||
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
||||
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
||||
@@ -430,6 +467,7 @@ class AutoModelForQuestionAnswering(object):
|
||||
The model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
|
||||
- contains `albert`: AlbertForQuestionAnswering (ALBERT model)
|
||||
- contains `bert`: BertForQuestionAnswering (Bert model)
|
||||
- contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
|
||||
- contains `xlm`: XLMForQuestionAnswering (XLM model)
|
||||
@@ -492,6 +530,8 @@ class AutoModelForQuestionAnswering(object):
|
||||
"""
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'albert' in pretrained_model_name_or_path:
|
||||
return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'bert' in pretrained_model_name_or_path:
|
||||
return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'xlnet' in pretrained_model_name_or_path:
|
||||
@@ -500,4 +540,4 @@ class AutoModelForQuestionAnswering(object):
|
||||
return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
|
||||
"'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
|
||||
|
||||
@@ -138,7 +138,11 @@ def swish(x):
|
||||
return x * torch.sigmoid(x)
|
||||
|
||||
|
||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
|
||||
def mish(x):
|
||||
return x * torch.tanh(nn.functional.softplus(x))
|
||||
|
||||
|
||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish}
|
||||
|
||||
|
||||
BertLayerNorm = torch.nn.LayerNorm
|
||||
@@ -278,7 +282,7 @@ class BertAttention(nn.Module):
|
||||
if len(heads) == 0:
|
||||
return
|
||||
mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
|
||||
heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads
|
||||
heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
|
||||
for head in heads:
|
||||
# Compute how many pruned heads are before the head and move the index accordingly
|
||||
head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
|
||||
@@ -597,7 +601,7 @@ class BertModel(BertPreTrainedModel):
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertModel.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
@@ -656,8 +660,6 @@ class BertModel(BertPreTrainedModel):
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones(input_shape, device=device)
|
||||
if encoder_attention_mask is None:
|
||||
encoder_attention_mask = torch.ones(input_shape, device=device)
|
||||
if token_type_ids is None:
|
||||
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
|
||||
|
||||
@@ -665,11 +667,10 @@ class BertModel(BertPreTrainedModel):
|
||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
||||
if attention_mask.dim() == 3:
|
||||
extended_attention_mask = attention_mask[:, None, :, :]
|
||||
|
||||
# Provided a padding mask of dimensions [batch_size, seq_length]
|
||||
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
||||
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if attention_mask.dim() == 2:
|
||||
elif attention_mask.dim() == 2:
|
||||
# Provided a padding mask of dimensions [batch_size, seq_length]
|
||||
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
||||
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder:
|
||||
batch_size, seq_length = input_shape
|
||||
seq_ids = torch.arange(seq_length, device=device)
|
||||
@@ -677,6 +678,8 @@ class BertModel(BertPreTrainedModel):
|
||||
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
||||
else:
|
||||
extended_attention_mask = attention_mask[:, None, None, :]
|
||||
else:
|
||||
raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
|
||||
|
||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||
# masked positions, this operation will create a tensor which is 0.0 for
|
||||
@@ -688,13 +691,22 @@ class BertModel(BertPreTrainedModel):
|
||||
|
||||
# If a 2D ou 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
||||
if encoder_attention_mask.dim() == 3:
|
||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
||||
if encoder_attention_mask.dim() == 2:
|
||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
|
||||
if self.config.is_decoder:
|
||||
if encoder_attention_mask is None:
|
||||
encoder_attention_mask = torch.ones(input_shape, device=device)
|
||||
|
||||
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
|
||||
if encoder_attention_mask.dim() == 3:
|
||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
||||
elif encoder_attention_mask.dim() == 2:
|
||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
|
||||
else:
|
||||
raise ValueError("Wrong shape for input_ids (shape {}) or encoder_attention_mask (shape {})".format(input_shape,
|
||||
encoder_attention_mask.shape))
|
||||
|
||||
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
|
||||
else:
|
||||
encoder_extended_attention_mask = None
|
||||
|
||||
# Prepare head mask if needed
|
||||
# 1.0 in head_mask indicate we keep the head
|
||||
@@ -760,7 +772,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
prediction_scores, seq_relationship_scores = outputs[:2]
|
||||
|
||||
@@ -836,7 +848,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
@@ -919,7 +931,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
seq_relationship_scores = outputs[0]
|
||||
|
||||
@@ -984,7 +996,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
@@ -1060,7 +1072,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
|
||||
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
|
||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, classification_scores = outputs[:2]
|
||||
@@ -1134,7 +1146,7 @@ class BertForTokenClassification(BertPreTrainedModel):
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, scores = outputs[:2]
|
||||
|
||||
@@ -63,7 +63,8 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
|
||||
scaled_attention_logits = matmul_qk / np.sqrt(dk)
|
||||
|
||||
if mask is not None:
|
||||
scaled_attention_logits += (mask * -1e4)
|
||||
nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
|
||||
scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4)
|
||||
|
||||
if attention_mask is not None:
|
||||
# Apply the attention mask
|
||||
@@ -251,7 +252,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
@@ -373,7 +374,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
inputs_embeds = self.w(input_ids)
|
||||
# inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
|
||||
seq_len = input_shape[-1]
|
||||
mask = torch.triu(torch.ones(seq_len, seq_len), 1).to(inputs_embeds.device)
|
||||
mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(inputs_embeds.device)
|
||||
|
||||
inputs_embeds *= np.sqrt(self.d_model_size)
|
||||
|
||||
@@ -437,7 +438,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
|
||||
@@ -42,7 +42,9 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",
|
||||
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin",
|
||||
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -329,7 +329,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
@@ -503,7 +503,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
@@ -596,7 +596,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
**mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
|
||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||
**past**:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
|
||||
should not be passed as input ids as they have already been computed.
|
||||
|
||||
@@ -50,8 +50,10 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
|
||||
|
||||
logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
|
||||
|
||||
names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
|
||||
shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
|
||||
with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
|
||||
names = json.load(names_handle)
|
||||
with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
|
||||
shapes = json.load(shapes_handle)
|
||||
offsets = np.cumsum([np.prod(shape) for shape in shapes])
|
||||
init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
|
||||
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
|
||||
|
||||
794
transformers/modeling_tf_albert.py
Normal file
794
transformers/modeling_tf_albert.py
Normal file
@@ -0,0 +1,794 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" TF 2.0 ALBERT model. """
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from .configuration_albert import AlbertConfig
|
||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
|
||||
from .file_utils import add_start_docstrings
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5",
|
||||
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5",
|
||||
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5",
|
||||
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5",
|
||||
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5",
|
||||
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5",
|
||||
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5",
|
||||
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5",
|
||||
}
|
||||
|
||||
|
||||
class TFAlbertEmbeddings(tf.keras.layers.Layer):
|
||||
"""Construct the embeddings from word, position and token_type embeddings.
|
||||
"""
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertEmbeddings, self).__init__(**kwargs)
|
||||
|
||||
self.config = config
|
||||
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
|
||||
config.embedding_size,
|
||||
embeddings_initializer=get_initializer(
|
||||
self.config.initializer_range),
|
||||
name='position_embeddings')
|
||||
self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size,
|
||||
config.embedding_size,
|
||||
embeddings_initializer=get_initializer(
|
||||
self.config.initializer_range),
|
||||
name='token_type_embeddings')
|
||||
|
||||
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
||||
# any TensorFlow checkpoint file
|
||||
self.LayerNorm = tf.keras.layers.LayerNormalization(
|
||||
epsilon=config.layer_norm_eps, name='LayerNorm')
|
||||
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def build(self, input_shape):
|
||||
"""Build shared word embedding layer """
|
||||
with tf.name_scope("word_embeddings"):
|
||||
# Create and initialize weights. The random normal initializer was chosen
|
||||
# arbitrarily, and works well.
|
||||
self.word_embeddings = self.add_weight(
|
||||
"weight",
|
||||
shape=[self.config.vocab_size, self.config.embedding_size],
|
||||
initializer=get_initializer(self.config.initializer_range))
|
||||
super(TFAlbertEmbeddings, self).build(input_shape)
|
||||
|
||||
def call(self, inputs, mode="embedding", training=False):
|
||||
"""Get token embeddings of inputs.
|
||||
Args:
|
||||
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
|
||||
mode: string, a valid value is one of "embedding" and "linear".
|
||||
Returns:
|
||||
outputs: (1) If mode == "embedding", output embedding tensor, float32 with
|
||||
shape [batch_size, length, embedding_size]; (2) mode == "linear", output
|
||||
linear tensor, float32 with shape [batch_size, length, vocab_size].
|
||||
Raises:
|
||||
ValueError: if mode is not valid.
|
||||
|
||||
Shared weights logic adapted from
|
||||
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
|
||||
"""
|
||||
if mode == "embedding":
|
||||
return self._embedding(inputs, training=training)
|
||||
elif mode == "linear":
|
||||
return self._linear(inputs)
|
||||
else:
|
||||
raise ValueError("mode {} is not valid.".format(mode))
|
||||
|
||||
def _embedding(self, inputs, training=False):
|
||||
"""Applies embedding based on inputs tensor."""
|
||||
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
||||
|
||||
if input_ids is not None:
|
||||
input_shape = shape_list(input_ids)
|
||||
else:
|
||||
input_shape = shape_list(inputs_embeds)[:-1]
|
||||
|
||||
seq_length = input_shape[1]
|
||||
if position_ids is None:
|
||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||
if token_type_ids is None:
|
||||
token_type_ids = tf.fill(input_shape, 0)
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
|
||||
position_embeddings = self.position_embeddings(position_ids)
|
||||
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
||||
|
||||
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
|
||||
embeddings = self.LayerNorm(embeddings)
|
||||
embeddings = self.dropout(embeddings, training=training)
|
||||
return embeddings
|
||||
|
||||
def _linear(self, inputs):
|
||||
"""Computes logits by running inputs through a linear layer.
|
||||
Args:
|
||||
inputs: A float32 tensor with shape [batch_size, length, embedding_size]
|
||||
Returns:
|
||||
float32 tensor with shape [batch_size, length, vocab_size].
|
||||
"""
|
||||
batch_size = shape_list(inputs)[0]
|
||||
length = shape_list(inputs)[1]
|
||||
x = tf.reshape(inputs, [-1, self.config.embedding_size])
|
||||
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
||||
return tf.reshape(logits, [batch_size, length, self.config.vocab_size])
|
||||
|
||||
|
||||
class TFAlbertSelfAttention(tf.keras.layers.Layer):
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertSelfAttention, self).__init__(**kwargs)
|
||||
if config.hidden_size % config.num_attention_heads != 0:
|
||||
raise ValueError(
|
||||
"The hidden size (%d) is not a multiple of the number of attention "
|
||||
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
|
||||
self.output_attentions = config.output_attentions
|
||||
|
||||
self.num_attention_heads = config.num_attention_heads
|
||||
assert config.hidden_size % config.num_attention_heads == 0
|
||||
self.attention_head_size = int(
|
||||
config.hidden_size / config.num_attention_heads)
|
||||
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
||||
|
||||
self.query = tf.keras.layers.Dense(self.all_head_size,
|
||||
kernel_initializer=get_initializer(
|
||||
config.initializer_range),
|
||||
name='query')
|
||||
self.key = tf.keras.layers.Dense(self.all_head_size,
|
||||
kernel_initializer=get_initializer(
|
||||
config.initializer_range),
|
||||
name='key')
|
||||
self.value = tf.keras.layers.Dense(self.all_head_size,
|
||||
kernel_initializer=get_initializer(
|
||||
config.initializer_range),
|
||||
name='value')
|
||||
|
||||
self.dropout = tf.keras.layers.Dropout(
|
||||
config.attention_probs_dropout_prob)
|
||||
|
||||
def transpose_for_scores(self, x, batch_size):
|
||||
x = tf.reshape(
|
||||
x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
|
||||
return tf.transpose(x, perm=[0, 2, 1, 3])
|
||||
|
||||
def call(self, inputs, training=False):
|
||||
hidden_states, attention_mask, head_mask = inputs
|
||||
|
||||
batch_size = shape_list(hidden_states)[0]
|
||||
mixed_query_layer = self.query(hidden_states)
|
||||
mixed_key_layer = self.key(hidden_states)
|
||||
mixed_value_layer = self.value(hidden_states)
|
||||
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
|
||||
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
|
||||
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
# (batch size, num_heads, seq_len_q, seq_len_k)
|
||||
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
|
||||
# scale attention_scores
|
||||
dk = tf.cast(shape_list(key_layer)[-1], tf.float32)
|
||||
attention_scores = attention_scores / tf.math.sqrt(dk)
|
||||
|
||||
if attention_mask is not None:
|
||||
# Apply the attention mask is (precomputed for all layers in TFAlbertModel call() function)
|
||||
attention_scores = attention_scores + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
attention_probs = tf.nn.softmax(attention_scores, axis=-1)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = self.dropout(attention_probs, training=training)
|
||||
|
||||
# Mask heads if we want to
|
||||
if head_mask is not None:
|
||||
attention_probs = attention_probs * head_mask
|
||||
|
||||
context_layer = tf.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
|
||||
context_layer = tf.reshape(context_layer,
|
||||
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
|
||||
|
||||
outputs = (context_layer, attention_probs) if self.output_attentions else (
|
||||
context_layer,)
|
||||
return outputs
|
||||
|
||||
|
||||
class TFAlbertSelfOutput(tf.keras.layers.Layer):
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertSelfOutput, self).__init__(**kwargs)
|
||||
self.dense = tf.keras.layers.Dense(config.hidden_size,
|
||||
kernel_initializer=get_initializer(
|
||||
config.initializer_range),
|
||||
name='dense')
|
||||
self.LayerNorm = tf.keras.layers.LayerNormalization(
|
||||
epsilon=config.layer_norm_eps, name='LayerNorm')
|
||||
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def call(self, inputs, training=False):
|
||||
hidden_states, input_tensor = inputs
|
||||
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states, training=training)
|
||||
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class TFAlbertAttention(TFBertSelfAttention):
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertAttention, self).__init__(config, **kwargs)
|
||||
|
||||
self.hidden_size = config.hidden_size
|
||||
self.dense = tf.keras.layers.Dense(config.hidden_size,
|
||||
kernel_initializer=get_initializer(
|
||||
config.initializer_range),
|
||||
name='dense')
|
||||
self.LayerNorm = tf.keras.layers.LayerNormalization(
|
||||
epsilon=config.layer_norm_eps, name='LayerNorm')
|
||||
self.pruned_heads = set()
|
||||
|
||||
def prune_heads(self, heads):
|
||||
raise NotImplementedError
|
||||
|
||||
def call(self, inputs, training=False):
|
||||
input_tensor, attention_mask, head_mask = inputs
|
||||
|
||||
batch_size = shape_list(input_tensor)[0]
|
||||
mixed_query_layer = self.query(input_tensor)
|
||||
mixed_key_layer = self.key(input_tensor)
|
||||
mixed_value_layer = self.value(input_tensor)
|
||||
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
|
||||
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
|
||||
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
# (batch size, num_heads, seq_len_q, seq_len_k)
|
||||
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
|
||||
# scale attention_scores
|
||||
dk = tf.cast(shape_list(key_layer)[-1], tf.float32)
|
||||
attention_scores = attention_scores / tf.math.sqrt(dk)
|
||||
|
||||
if attention_mask is not None:
|
||||
# Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
|
||||
attention_scores = attention_scores + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
attention_probs = tf.nn.softmax(attention_scores, axis=-1)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = self.dropout(attention_probs, training=training)
|
||||
|
||||
# Mask heads if we want to
|
||||
if head_mask is not None:
|
||||
attention_probs = attention_probs * head_mask
|
||||
|
||||
context_layer = tf.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
|
||||
context_layer = tf.reshape(context_layer,
|
||||
(batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size)
|
||||
|
||||
self_outputs = (context_layer, attention_probs) if self.output_attentions else (
|
||||
context_layer,)
|
||||
|
||||
hidden_states = self_outputs[0]
|
||||
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states, training=training)
|
||||
attention_output = self.LayerNorm(hidden_states + input_tensor)
|
||||
|
||||
# add attentions if we output them
|
||||
outputs = (attention_output,) + self_outputs[1:]
|
||||
return outputs
|
||||
|
||||
|
||||
class TFAlbertLayer(tf.keras.layers.Layer):
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertLayer, self).__init__(**kwargs)
|
||||
self.attention = TFAlbertAttention(config, name='attention')
|
||||
|
||||
self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer(
|
||||
config.initializer_range), name='ffn')
|
||||
|
||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.activation = config.hidden_act
|
||||
|
||||
self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
|
||||
config.initializer_range), name='ffn_output')
|
||||
self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
|
||||
epsilon=config.layer_norm_eps, name='full_layer_layer_norm')
|
||||
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def call(self, inputs, training=False):
|
||||
hidden_states, attention_mask, head_mask = inputs
|
||||
|
||||
attention_outputs = self.attention(
|
||||
[hidden_states, attention_mask, head_mask], training=training)
|
||||
ffn_output = self.ffn(attention_outputs[0])
|
||||
ffn_output = self.activation(ffn_output)
|
||||
ffn_output = self.ffn_output(ffn_output)
|
||||
|
||||
hidden_states = self.dropout(hidden_states, training=training)
|
||||
hidden_states = self.full_layer_layer_norm(
|
||||
ffn_output + attention_outputs[0])
|
||||
|
||||
# add attentions if we output them
|
||||
outputs = (hidden_states,) + attention_outputs[1:]
|
||||
return outputs
|
||||
|
||||
|
||||
class TFAlbertLayerGroup(tf.keras.layers.Layer):
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertLayerGroup, self).__init__(**kwargs)
|
||||
|
||||
self.output_attentions = config.output_attentions
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
self.albert_layers = [TFAlbertLayer(config, name="albert_layers_._{}".format(
|
||||
i)) for i in range(config.inner_group_num)]
|
||||
|
||||
def call(self, inputs, training=False):
|
||||
hidden_states, attention_mask, head_mask = inputs
|
||||
|
||||
layer_hidden_states = ()
|
||||
layer_attentions = ()
|
||||
|
||||
for layer_index, albert_layer in enumerate(self.albert_layers):
|
||||
layer_output = albert_layer(
|
||||
[hidden_states, attention_mask, head_mask[layer_index]], training=training)
|
||||
hidden_states = layer_output[0]
|
||||
|
||||
if self.output_attentions:
|
||||
layer_attentions = layer_attentions + (layer_output[1],)
|
||||
|
||||
if self.output_hidden_states:
|
||||
layer_hidden_states = layer_hidden_states + (hidden_states,)
|
||||
|
||||
outputs = (hidden_states,)
|
||||
if self.output_hidden_states:
|
||||
outputs = outputs + (layer_hidden_states,)
|
||||
if self.output_attentions:
|
||||
outputs = outputs + (layer_attentions,)
|
||||
# last-layer hidden state, (layer hidden states), (layer attentions)
|
||||
return outputs
|
||||
|
||||
|
||||
class TFAlbertTransformer(tf.keras.layers.Layer):
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertTransformer, self).__init__(**kwargs)
|
||||
|
||||
self.config = config
|
||||
self.output_attentions = config.output_attentions
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
|
||||
config.initializer_range), name='embedding_hidden_mapping_in')
|
||||
self.albert_layer_groups = [TFAlbertLayerGroup(
|
||||
config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)]
|
||||
|
||||
def call(self, inputs, training=False):
|
||||
hidden_states, attention_mask, head_mask = inputs
|
||||
|
||||
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
|
||||
all_attentions = ()
|
||||
|
||||
if self.output_hidden_states:
|
||||
all_hidden_states = (hidden_states,)
|
||||
|
||||
for i in range(self.config.num_hidden_layers):
|
||||
# Number of layers in a hidden group
|
||||
layers_per_group = int(
|
||||
self.config.num_hidden_layers / self.config.num_hidden_groups)
|
||||
|
||||
# Index of the hidden group
|
||||
group_idx = int(
|
||||
i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
|
||||
|
||||
layer_group_output = self.albert_layer_groups[group_idx](
|
||||
[hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training)
|
||||
hidden_states = layer_group_output[0]
|
||||
|
||||
if self.output_attentions:
|
||||
all_attentions = all_attentions + layer_group_output[-1]
|
||||
|
||||
if self.output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
outputs = (hidden_states,)
|
||||
if self.output_hidden_states:
|
||||
outputs = outputs + (all_hidden_states,)
|
||||
if self.output_attentions:
|
||||
outputs = outputs + (all_attentions,)
|
||||
|
||||
# last-layer hidden state, (all hidden states), (all attentions)
|
||||
return outputs
|
||||
|
||||
|
||||
class TFAlbertPreTrainedModel(TFPreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for dowloading and loading pretrained models.
|
||||
"""
|
||||
config_class = AlbertConfig
|
||||
pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
base_model_prefix = "albert"
|
||||
|
||||
|
||||
class TFAlbertMLMHead(tf.keras.layers.Layer):
|
||||
def __init__(self, config, input_embeddings, **kwargs):
|
||||
super(TFAlbertMLMHead, self).__init__(**kwargs)
|
||||
self.vocab_size = config.vocab_size
|
||||
|
||||
self.dense = tf.keras.layers.Dense(config.embedding_size,
|
||||
kernel_initializer=get_initializer(
|
||||
config.initializer_range),
|
||||
name='dense')
|
||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.activation = config.hidden_act
|
||||
|
||||
self.LayerNorm = tf.keras.layers.LayerNormalization(
|
||||
epsilon=config.layer_norm_eps, name='LayerNorm')
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.decoder = input_embeddings
|
||||
|
||||
def build(self, input_shape):
|
||||
self.bias = self.add_weight(shape=(self.vocab_size,),
|
||||
initializer='zeros',
|
||||
trainable=True,
|
||||
name='bias')
|
||||
self.decoder_bias = self.add_weight(shape=(self.vocab_size,),
|
||||
initializer='zeros',
|
||||
trainable=True,
|
||||
name='decoder/bias')
|
||||
super(TFAlbertMLMHead, self).build(input_shape)
|
||||
|
||||
def call(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.activation(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias
|
||||
hidden_states = hidden_states + self.bias
|
||||
return hidden_states
|
||||
|
||||
|
||||
ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in
|
||||
`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_
|
||||
by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
|
||||
two parameter-reduction techniques to lower memory consumption and increase the trainig speed of BERT.
|
||||
|
||||
This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
|
||||
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
|
||||
|
||||
.. _`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`:
|
||||
https://arxiv.org/abs/1909.11942
|
||||
|
||||
.. _`tf.keras.Model`:
|
||||
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
|
||||
|
||||
Note on the model inputs:
|
||||
TF 2.0 models accepts two formats as inputs:
|
||||
|
||||
- having all inputs as keyword arguments (like PyTorch models), or
|
||||
- having all inputs as a list, tuple or dict in the first positional arguments.
|
||||
|
||||
This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
|
||||
|
||||
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
|
||||
|
||||
- a single Tensor with input_ids only and nothing else: `model(inputs_ids)
|
||||
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
|
||||
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
|
||||
- a dictionary with one or several input Tensors associaed to the input names given in the docstring:
|
||||
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
|
||||
|
||||
Parameters:
|
||||
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
||||
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
ALBERT_INPUTS_DOCSTRING = r"""
|
||||
Inputs:
|
||||
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
To match pre-training, ALBERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
||||
|
||||
(a) For sequence pairs:
|
||||
|
||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
||||
|
||||
(b) For single sequences:
|
||||
|
||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
||||
|
||||
``token_type_ids: 0 0 0 0 0 0 0``
|
||||
|
||||
Albert is a model with absolute position embeddings so it's usually advised to pad the inputs on
|
||||
the right rather than the left.
|
||||
|
||||
Indices can be obtained using :class:`transformers.AlbertTokenizer`.
|
||||
See :func:`transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Segment token indices to indicate first and second portions of the inputs.
|
||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||
corresponds to a `sentence B` token
|
||||
(see `ALBERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of positions of each input sequence tokens in the position embeddings.
|
||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||
"""
|
||||
|
||||
@add_start_docstrings("The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
|
||||
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||
class TFAlbertModel(TFAlbertPreTrainedModel):
|
||||
r"""
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the output of the last layer of the model.
|
||||
**pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
|
||||
Last layer hidden-state of the first token of the sequence (classification token)
|
||||
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||
layer weights are trained from the next sentence prediction (classification)
|
||||
objective during Albert pretraining. This output is usually *not* a good summary
|
||||
of the semantic content of the input, you're often better with averaging or pooling
|
||||
the sequence of hidden-states for the whole input sequence.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
|
||||
import tensorflow as tf
|
||||
from transformers import AlbertTokenizer, TFAlbertModel
|
||||
|
||||
tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = TFAlbertModel.from_pretrained('bert-base-uncased')
|
||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super(TFAlbertModel, self).__init__(config, **kwargs)
|
||||
self.num_hidden_layers = config.num_hidden_layers
|
||||
|
||||
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
|
||||
self.encoder = TFAlbertTransformer(config, name="encoder")
|
||||
self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer(
|
||||
config.initializer_range), activation='tanh', name='pooler')
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
See base class PreTrainedModel
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False):
|
||||
if isinstance(inputs, (tuple, list)):
|
||||
input_ids = inputs[0]
|
||||
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
|
||||
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
|
||||
position_ids = inputs[3] if len(inputs) > 3 else position_ids
|
||||
head_mask = inputs[4] if len(inputs) > 4 else head_mask
|
||||
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
|
||||
assert len(inputs) <= 6, "Too many inputs."
|
||||
elif isinstance(inputs, dict):
|
||||
input_ids = inputs.get('input_ids')
|
||||
attention_mask = inputs.get('attention_mask', attention_mask)
|
||||
token_type_ids = inputs.get('token_type_ids', token_type_ids)
|
||||
position_ids = inputs.get('position_ids', position_ids)
|
||||
head_mask = inputs.get('head_mask', head_mask)
|
||||
inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
|
||||
assert len(inputs) <= 6, "Too many inputs."
|
||||
else:
|
||||
input_ids = inputs
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
elif input_ids is not None:
|
||||
input_shape = shape_list(input_ids)
|
||||
elif inputs_embeds is not None:
|
||||
input_shape = shape_list(inputs_embeds)[:-1]
|
||||
else:
|
||||
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = tf.fill(input_shape, 1)
|
||||
if token_type_ids is None:
|
||||
token_type_ids = tf.fill(input_shape, 0)
|
||||
|
||||
# We create a 3D attention mask from a 2D tensor mask.
|
||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||
# this attention mask is more simple than the triangular masking of causal attention
|
||||
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
|
||||
extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
|
||||
|
||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||
# masked positions, this operation will create a tensor which is 0.0 for
|
||||
# positions we want to attend and -10000.0 for masked positions.
|
||||
# Since we are adding it to the raw scores before the softmax, this is
|
||||
# effectively the same as removing these entirely.
|
||||
|
||||
extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
|
||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
||||
|
||||
# Prepare head mask if needed
|
||||
# 1.0 in head_mask indicate we keep the head
|
||||
# attention_probs has shape bsz x n_heads x N x N
|
||||
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
||||
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
||||
if not head_mask is None:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
head_mask = [None] * self.num_hidden_layers
|
||||
# head_mask = tf.constant([0] * self.num_hidden_layers)
|
||||
|
||||
embedding_output = self.embeddings(
|
||||
[input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
|
||||
encoder_outputs = self.encoder(
|
||||
[embedding_output, extended_attention_mask, head_mask], training=training)
|
||||
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.pooler(sequence_output[:, 0])
|
||||
|
||||
# add hidden_states and attentions if they are here
|
||||
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
|
||||
# sequence_output, pooled_output, (hidden_states), (attentions)
|
||||
return outputs
|
||||
|
||||
|
||||
@add_start_docstrings("""Albert Model with a `language modeling` head on top. """,
|
||||
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||
class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
|
||||
r"""
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
|
||||
import tensorflow as tf
|
||||
from transformers import AlbertTokenizer, TFAlbertForMaskedLM
|
||||
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
|
||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
prediction_scores = outputs[0]
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
|
||||
|
||||
self.albert = TFAlbertModel(config, name='albert')
|
||||
self.predictions = TFAlbertMLMHead(
|
||||
config, self.albert.embeddings, name='predictions')
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.albert.embeddings
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
outputs = self.albert(inputs, **kwargs)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.predictions(
|
||||
sequence_output, training=kwargs.get('training', False))
|
||||
|
||||
# Add hidden states and attention if they are here
|
||||
outputs = (prediction_scores,) + outputs[2:]
|
||||
|
||||
return outputs # prediction_scores, (hidden_states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
|
||||
the pooled output) e.g. for GLUE tasks. """,
|
||||
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
|
||||
r"""
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
|
||||
Classification (or regression if config.num_labels==1) scores (before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
|
||||
import tensorflow as tf
|
||||
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
|
||||
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
|
||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
logits = outputs[0]
|
||||
|
||||
"""
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.albert = TFAlbertModel(config, name='albert')
|
||||
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = tf.keras.layers.Dense(config.num_labels,
|
||||
kernel_initializer=get_initializer(config.initializer_range),
|
||||
name='classifier')
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
outputs = self.albert(inputs, **kwargs)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
|
||||
pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
||||
|
||||
return outputs # logits, (hidden_states), (attentions)
|
||||
@@ -109,6 +109,9 @@ class TFAutoModel(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -237,6 +240,9 @@ class TFAutoModelWithLMHead(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -360,6 +366,9 @@ class TFAutoModelForSequenceClassification(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -472,6 +481,9 @@ class TFAutoModelForQuestionAnswering(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
|
||||
@@ -28,7 +28,7 @@ import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from .configuration_bert import BertConfig
|
||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||
from .file_utils import add_start_docstrings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -145,9 +145,9 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
||||
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
||||
|
||||
if input_ids is not None:
|
||||
input_shape = tf.shape(input_ids)
|
||||
input_shape = shape_list(input_ids)
|
||||
else:
|
||||
input_shape = tf.shape(inputs_embeds)[:-1]
|
||||
input_shape = shape_list(inputs_embeds)[:-1]
|
||||
|
||||
seq_length = input_shape[1]
|
||||
if position_ids is None:
|
||||
@@ -172,8 +172,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
|
||||
Returns:
|
||||
float32 tensor with shape [batch_size, length, vocab_size].
|
||||
"""
|
||||
batch_size = tf.shape(inputs)[0]
|
||||
length = tf.shape(inputs)[1]
|
||||
batch_size = shape_list(inputs)[0]
|
||||
length = shape_list(inputs)[1]
|
||||
|
||||
x = tf.reshape(inputs, [-1, self.hidden_size])
|
||||
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
||||
@@ -214,7 +214,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
||||
def call(self, inputs, training=False):
|
||||
hidden_states, attention_mask, head_mask = inputs
|
||||
|
||||
batch_size = tf.shape(hidden_states)[0]
|
||||
batch_size = shape_list(hidden_states)[0]
|
||||
mixed_query_layer = self.query(hidden_states)
|
||||
mixed_key_layer = self.key(hidden_states)
|
||||
mixed_value_layer = self.value(hidden_states)
|
||||
@@ -225,7 +225,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k)
|
||||
dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) # scale attention_scores
|
||||
dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores
|
||||
attention_scores = attention_scores / tf.math.sqrt(dk)
|
||||
|
||||
if attention_mask is not None:
|
||||
@@ -502,9 +502,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
elif input_ids is not None:
|
||||
input_shape = input_ids.shape
|
||||
input_shape = shape_list(input_ids)
|
||||
elif inputs_embeds is not None:
|
||||
input_shape = inputs_embeds.shape[:-1]
|
||||
input_shape = shape_list(inputs_embeds)[:-1]
|
||||
else:
|
||||
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||
|
||||
@@ -939,11 +939,11 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
|
||||
input_ids = inputs
|
||||
|
||||
if input_ids is not None:
|
||||
num_choices = tf.shape(input_ids)[1]
|
||||
seq_length = tf.shape(input_ids)[2]
|
||||
num_choices = shape_list(input_ids)[1]
|
||||
seq_length = shape_list(input_ids)[2]
|
||||
else:
|
||||
num_choices = tf.shape(inputs_embeds)[1]
|
||||
seq_length = tf.shape(inputs_embeds)[2]
|
||||
num_choices = shape_list(inputs_embeds)[1]
|
||||
seq_length = shape_list(inputs_embeds)[2]
|
||||
|
||||
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
|
||||
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
|
||||
|
||||
@@ -95,7 +95,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
|
||||
|
||||
def call(self, inputs, training=False):
|
||||
v, k, q, mask, layer_past, attention_mask, head_mask = inputs
|
||||
batch_size = q.shape[0]
|
||||
batch_size = shape_list(q)[0]
|
||||
|
||||
q = self.Wq(q)
|
||||
k = self.Wk(k)
|
||||
@@ -400,7 +400,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
|
||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
@@ -462,7 +462,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
|
||||
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
|
||||
@@ -37,7 +37,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5",
|
||||
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5",
|
||||
}
|
||||
|
||||
|
||||
@@ -137,9 +138,9 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
||||
input_ids, position_ids = inputs
|
||||
|
||||
if input_ids is not None:
|
||||
seq_length = tf.shape(input_ids)[1]
|
||||
seq_length = shape_list(input_ids)[1]
|
||||
else:
|
||||
seq_length = tf.shape(inputs_embeds)[1]
|
||||
seq_length = shape_list(inputs_embeds)[1]
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
|
||||
@@ -160,8 +161,8 @@ class TFEmbeddings(tf.keras.layers.Layer):
|
||||
Returns:
|
||||
float32 tensor with shape [batch_size, length, vocab_size].
|
||||
"""
|
||||
batch_size = tf.shape(inputs)[0]
|
||||
length = tf.shape(inputs)[1]
|
||||
batch_size = shape_list(inputs)[0]
|
||||
length = shape_list(inputs)[1]
|
||||
|
||||
x = tf.reshape(inputs, [-1, self.dim])
|
||||
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
|
||||
@@ -703,6 +704,53 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
|
||||
return outputs # logits, (hidden_states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of
|
||||
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
||||
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
||||
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
|
||||
r"""
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
|
||||
Classification scores (before SoftMax).
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
Examples::
|
||||
import tensorflow as tf
|
||||
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
|
||||
tokenizer = DistilBertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = TFDistilBertForTokenClassification.from_pretrained('bert-base-uncased')
|
||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
scores = outputs[0]
|
||||
"""
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.distilbert = TFDistilBertMainLayer(config, name='distilbert')
|
||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||
self.classifier = tf.keras.layers.Dense(config.num_labels,
|
||||
kernel_initializer=get_initializer(config.initializer_range),
|
||||
name='classifier')
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
outputs = self.distilbert(inputs, **kwargs)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
|
||||
|
||||
return outputs # scores, (hidden_states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||
the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
|
||||
|
||||
@@ -92,7 +92,7 @@ class TFAttention(tf.keras.layers.Layer):
|
||||
# q, k, v have shape [batch, heads, sequence, features]
|
||||
w = tf.matmul(q, k, transpose_b=True)
|
||||
if self.scale:
|
||||
dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
|
||||
dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores
|
||||
w = w / tf.math.sqrt(dk)
|
||||
|
||||
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
||||
@@ -436,7 +436,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
|
||||
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||
Sequence of hidden-states at the last layer of the model.
|
||||
**past**:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
@@ -476,7 +476,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
|
||||
**prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
**past**:
|
||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of `tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
@@ -535,7 +535,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
|
||||
**mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)``
|
||||
Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
|
||||
**past**:
|
||||
list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
list of `tf.Tensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
|
||||
@@ -98,7 +98,7 @@ class TFAttention(tf.keras.layers.Layer):
|
||||
# q, k, v have shape [batch, heads, sequence, features]
|
||||
w = tf.matmul(q, k, transpose_b=True)
|
||||
if self.scale:
|
||||
dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
|
||||
dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores
|
||||
w = w / tf.math.sqrt(dk)
|
||||
|
||||
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
|
||||
|
||||
@@ -118,6 +118,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
|
||||
new_key = key.replace('gamma', 'weight')
|
||||
if 'beta' in key:
|
||||
new_key = key.replace('beta', 'bias')
|
||||
# DialoGPT format
|
||||
if key == 'lm_head.decoder.weight':
|
||||
new_key = 'lm_head.weight'
|
||||
if new_key:
|
||||
old_keys.append(key)
|
||||
new_keys.append(new_key)
|
||||
|
||||
@@ -24,7 +24,7 @@ import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
from .configuration_roberta import RobertaConfig
|
||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
|
||||
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
|
||||
from .file_utils import add_start_docstrings
|
||||
|
||||
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
|
||||
@@ -51,9 +51,9 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
|
||||
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
|
||||
|
||||
if input_ids is not None:
|
||||
seq_length = tf.shape(input_ids)[1]
|
||||
seq_length = shape_list(input_ids)[1]
|
||||
else:
|
||||
seq_length = tf.shape(inputs_embeds)[1]
|
||||
seq_length = shape_list(inputs_embeds)[1]
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :]
|
||||
|
||||
@@ -337,7 +337,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
|
||||
emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])
|
||||
|
||||
mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
|
||||
emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64))
|
||||
emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64))
|
||||
|
||||
embed_shape = shape_list(inp) + [self.d_proj]
|
||||
embed = tf.reshape(emb_flat, embed_shape)
|
||||
|
||||
@@ -105,7 +105,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||
|
||||
@staticmethod
|
||||
def _gather_logprob(logprob, target):
|
||||
lp_size = tf.shape(logprob)
|
||||
lp_size = shape_list(logprob)
|
||||
r = tf.range(lp_size[0])
|
||||
idx = tf.stack([r, target], 1)
|
||||
return tf.gather_nd(logprob, idx)
|
||||
@@ -159,7 +159,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
|
||||
cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
|
||||
cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
|
||||
if target is not None:
|
||||
loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64))
|
||||
loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64))
|
||||
out = tf.concat(out, axis=-1)
|
||||
|
||||
if target is not None:
|
||||
|
||||
@@ -51,7 +51,15 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
config_class = None
|
||||
pretrained_model_archive_map = {}
|
||||
base_model_prefix = ""
|
||||
dummy_inputs = tf.constant(DUMMY_INPUTS) # dummy inputs to build the network
|
||||
|
||||
@property
|
||||
def dummy_inputs(self):
|
||||
""" Dummy inputs to build the network.
|
||||
|
||||
Returns:
|
||||
tf.Tensor with dummy inputs
|
||||
"""
|
||||
return tf.constant(DUMMY_INPUTS)
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
|
||||
@@ -191,6 +199,9 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -216,6 +227,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
from_pt = kwargs.pop('from_pt', False)
|
||||
force_download = kwargs.pop('force_download', False)
|
||||
resume_download = kwargs.pop('resume_download', False)
|
||||
proxies = kwargs.pop('proxies', None)
|
||||
|
||||
# Load config
|
||||
@@ -224,6 +236,7 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
pretrained_model_name_or_path, *model_args,
|
||||
cache_dir=cache_dir, return_unused_kwargs=True,
|
||||
force_download=force_download,
|
||||
resume_download=resume_download,
|
||||
**kwargs
|
||||
)
|
||||
else:
|
||||
@@ -251,7 +264,8 @@ class TFPreTrainedModel(tf.keras.Model):
|
||||
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
|
||||
resume_download=resume_download, proxies=proxies)
|
||||
except EnvironmentError as e:
|
||||
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
||||
logger.error(
|
||||
@@ -454,7 +468,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
|
||||
elif self.summary_type == 'first':
|
||||
output = hidden_states[:, 0]
|
||||
elif self.summary_type == 'mean':
|
||||
output = tf.mean(hidden_states, axis=1)
|
||||
output = tf.reduce_mean(hidden_states, axis=1)
|
||||
elif self.summary_type == 'cls_index':
|
||||
hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims]
|
||||
if cls_index is None:
|
||||
|
||||
@@ -112,8 +112,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
||||
def prune_heads(self, heads):
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def rel_shift(x, klen=-1):
|
||||
def rel_shift(self, x, klen=-1):
|
||||
"""perform relative shift to form the relative attention score."""
|
||||
x_size = shape_list(x)
|
||||
|
||||
@@ -135,7 +134,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
||||
|
||||
# position based attention score
|
||||
bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
|
||||
bd = self.rel_shift(bd, klen=ac.shape[1])
|
||||
bd = self.rel_shift(bd, klen=shape_list(ac)[1])
|
||||
|
||||
# segment based attention score
|
||||
if seg_mat is None:
|
||||
@@ -192,7 +191,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
||||
if g is not None:
|
||||
###### Two-stream attention with relative positional encoding.
|
||||
# content based attention score
|
||||
if mems is not None and mems.shape.ndims > 1:
|
||||
if mems is not None and len(shape_list(mems)) > 1:
|
||||
cat = tf.concat([mems, h], axis=0)
|
||||
else:
|
||||
cat = h
|
||||
@@ -252,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
|
||||
|
||||
else:
|
||||
###### Multi-head attention with relative positional encoding
|
||||
if mems is not None and mems.shape.ndims > 1:
|
||||
if mems is not None and len(shape_list(mems)) > 1:
|
||||
cat = tf.concat([mems, h], axis=0)
|
||||
else:
|
||||
cat = h
|
||||
@@ -565,7 +564,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
||||
|
||||
if data_mask is not None:
|
||||
# all mems can be attended to
|
||||
mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
|
||||
mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz],
|
||||
dtype=dtype_float)
|
||||
data_mask = tf.concat([mems_mask, data_mask], axis=1)
|
||||
if attn_mask is None:
|
||||
@@ -590,7 +589,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
|
||||
word_emb_k = self.word_embedding(input_ids)
|
||||
output_h = self.dropout(word_emb_k, training=training)
|
||||
if target_mapping is not None:
|
||||
word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
|
||||
word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1])
|
||||
# else: # We removed the inp_q input which was same as target mapping
|
||||
# inp_q_ext = inp_q[:, :, None]
|
||||
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
|
||||
@@ -939,6 +938,59 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
|
||||
return outputs # return logits, (mems), (hidden states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
|
||||
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
||||
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
|
||||
r"""
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
|
||||
Classification scores (before SoftMax).
|
||||
**mems**: (`optional`, returned when ``config.mem_len > 0``)
|
||||
list of ``tf.Tensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
|
||||
See details in the docstring of the `mems` input above.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
|
||||
import tensorflow as tf
|
||||
from transformers import XLNetTokenizer, TFXLNetForTokenClassification
|
||||
|
||||
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
|
||||
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
scores = outputs[0]
|
||||
|
||||
"""
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.transformer = TFXLNetMainLayer(config, name='transformer')
|
||||
self.classifier = tf.keras.layers.Dense(config.num_labels,
|
||||
kernel_initializer=get_initializer(config.initializer_range),
|
||||
name='classifier')
|
||||
|
||||
def call(self, inputs, **kwargs):
|
||||
transformer_outputs = self.transformer(inputs, **kwargs)
|
||||
output = transformer_outputs[0]
|
||||
|
||||
logits = self.classifier(output)
|
||||
|
||||
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it
|
||||
|
||||
return outputs # return logits, (mems), (hidden states), (attentions)
|
||||
|
||||
|
||||
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
|
||||
# the hidden-states output to compute `span start logits` and `span end logits`). """,
|
||||
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||
|
||||
@@ -291,6 +291,9 @@ class PreTrainedModel(nn.Module):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -315,11 +318,16 @@ class PreTrainedModel(nn.Module):
|
||||
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
|
||||
logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
|
||||
"https://github.com/google-research/google-research/issues/119 for more information.")
|
||||
|
||||
config = kwargs.pop('config', None)
|
||||
state_dict = kwargs.pop('state_dict', None)
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
from_tf = kwargs.pop('from_tf', False)
|
||||
force_download = kwargs.pop('force_download', False)
|
||||
resume_download = kwargs.pop('resume_download', False)
|
||||
proxies = kwargs.pop('proxies', None)
|
||||
output_loading_info = kwargs.pop('output_loading_info', False)
|
||||
|
||||
@@ -329,6 +337,7 @@ class PreTrainedModel(nn.Module):
|
||||
pretrained_model_name_or_path, *model_args,
|
||||
cache_dir=cache_dir, return_unused_kwargs=True,
|
||||
force_download=force_download,
|
||||
resume_download=resume_download,
|
||||
proxies=proxies,
|
||||
**kwargs
|
||||
)
|
||||
@@ -361,7 +370,8 @@ class PreTrainedModel(nn.Module):
|
||||
|
||||
# redirect to the cache, if necessary
|
||||
try:
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
|
||||
proxies=proxies, resume_download=resume_download)
|
||||
except EnvironmentError:
|
||||
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
|
||||
msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
|
||||
@@ -417,6 +427,8 @@ class PreTrainedModel(nn.Module):
|
||||
new_key = key.replace('gamma', 'weight')
|
||||
if 'beta' in key:
|
||||
new_key = key.replace('beta', 'bias')
|
||||
if key == 'lm_head.decoder.weight':
|
||||
new_key = 'lm_head.weight'
|
||||
if new_key:
|
||||
old_keys.append(key)
|
||||
new_keys.append(new_key)
|
||||
@@ -728,7 +740,7 @@ class SequenceSummary(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(SequenceSummary, self).__init__()
|
||||
|
||||
self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
|
||||
self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
|
||||
if self.summary_type == 'attn':
|
||||
# We should use a standard multi-head attention module with absolute positional embedding for that.
|
||||
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
|
||||
|
||||
@@ -583,6 +583,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -878,7 +879,11 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
|
||||
outputs = outputs + (hidden_states,)
|
||||
if self.output_attentions:
|
||||
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
|
||||
if target_mapping is not None:
|
||||
# when target_mapping is provided, there are 2-tuple of attentions
|
||||
attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
|
||||
else:
|
||||
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
|
||||
outputs = outputs + (attentions,)
|
||||
|
||||
return outputs # outputs, (new_mems), (hidden_states), (attentions)
|
||||
@@ -913,6 +918,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -995,6 +1001,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -1046,6 +1053,106 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
|
||||
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
|
||||
|
||||
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
|
||||
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
|
||||
XLNET_START_DOCSTRING,
|
||||
XLNET_INPUTS_DOCSTRING)
|
||||
class XLNetForTokenClassification(XLNetPreTrainedModel):
|
||||
r"""
|
||||
Inputs:
|
||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
The second dimension of the input (`num_choices`) indicates the number of choices to scores.
|
||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Segment token indices to indicate first and second portions of the inputs.
|
||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||
than the model's internal embedding lookup matrix.
|
||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
|
||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Classification loss.
|
||||
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
|
||||
Classification scores (before SoftMax).
|
||||
**mems**: (`optional`, returned when ``config.mem_len > 0``)
|
||||
list of ``torch.FloatTensor`` (one for each layer):
|
||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
|
||||
See details in the docstring of the `mems` input above.
|
||||
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
|
||||
Examples::
|
||||
|
||||
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
scores = outputs[0]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
super(XLNetForTokenClassification, self).__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.transformer = XLNetModel(config)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||
|
||||
outputs = self.transformer(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
mems=mems,
|
||||
perm_mask=perm_mask,
|
||||
target_mapping=target_mapping,
|
||||
token_type_ids=token_type_ids,
|
||||
input_mask=input_mask,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if there are in it
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
# Only keep active parts of the loss
|
||||
if attention_mask is not None:
|
||||
active_loss = attention_mask.view(-1) == 1
|
||||
active_logits = logits.view(-1, self.num_labels)[active_loss]
|
||||
active_labels = labels.view(-1)[active_loss]
|
||||
loss = loss_fct(active_logits, active_labels)
|
||||
else:
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
outputs = (loss,) + outputs
|
||||
|
||||
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
|
||||
|
||||
|
||||
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
|
||||
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
|
||||
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
|
||||
@@ -1095,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -1180,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
@@ -1294,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
|
||||
|
||||
Examples::
|
||||
|
||||
|
||||
254
transformers/optimization_tf.py
Normal file
254
transformers/optimization_tf.py
Normal file
@@ -0,0 +1,254 @@
|
||||
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Functions and classes related to optimization (weight updates)."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
|
||||
"""Applys a warmup schedule on a given learning rate decay schedule."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
initial_learning_rate,
|
||||
decay_schedule_fn,
|
||||
warmup_steps,
|
||||
power=1.0,
|
||||
name=None):
|
||||
super(WarmUp, self).__init__()
|
||||
self.initial_learning_rate = initial_learning_rate
|
||||
self.warmup_steps = warmup_steps
|
||||
self.power = power
|
||||
self.decay_schedule_fn = decay_schedule_fn
|
||||
self.name = name
|
||||
|
||||
def __call__(self, step):
|
||||
with tf.name_scope(self.name or 'WarmUp') as name:
|
||||
# Implements polynomial warmup. i.e., if global_step < warmup_steps, the
|
||||
# learning rate will be `global_step/num_warmup_steps * init_lr`.
|
||||
global_step_float = tf.cast(step, tf.float32)
|
||||
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
|
||||
warmup_percent_done = global_step_float / warmup_steps_float
|
||||
warmup_learning_rate = (
|
||||
self.initial_learning_rate *
|
||||
tf.math.pow(warmup_percent_done, self.power))
|
||||
return tf.cond(global_step_float < warmup_steps_float,
|
||||
lambda: warmup_learning_rate,
|
||||
lambda: self.decay_schedule_fn(step),
|
||||
name=name)
|
||||
|
||||
def get_config(self):
|
||||
return {
|
||||
'initial_learning_rate': self.initial_learning_rate,
|
||||
'decay_schedule_fn': self.decay_schedule_fn,
|
||||
'warmup_steps': self.warmup_steps,
|
||||
'power': self.power,
|
||||
'name': self.name
|
||||
}
|
||||
|
||||
|
||||
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
|
||||
"""Creates an optimizer with learning rate schedule."""
|
||||
# Implements linear decay of the learning rate.
|
||||
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
|
||||
initial_learning_rate=init_lr,
|
||||
decay_steps=num_train_steps,
|
||||
end_learning_rate=0.0)
|
||||
if num_warmup_steps:
|
||||
learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
|
||||
decay_schedule_fn=learning_rate_fn,
|
||||
warmup_steps=num_warmup_steps)
|
||||
optimizer = AdamWeightDecay(
|
||||
learning_rate=learning_rate_fn,
|
||||
weight_decay_rate=0.01,
|
||||
beta_1=0.9,
|
||||
beta_2=0.999,
|
||||
epsilon=1e-6,
|
||||
exclude_from_weight_decay=['layer_norm', 'bias'])
|
||||
return optimizer
|
||||
|
||||
|
||||
class AdamWeightDecay(tf.keras.optimizers.Adam):
|
||||
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
|
||||
|
||||
Just adding the square of the weights to the loss function is *not* the
|
||||
correct way of using L2 regularization/weight decay with Adam, since that will
|
||||
interact with the m and v parameters in strange ways.
|
||||
|
||||
Instead we want ot decay the weights in a manner that doesn't interact with
|
||||
the m/v parameters. This is equivalent to adding the square of the weights to
|
||||
the loss with plain (non-momentum) SGD.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
learning_rate=0.001,
|
||||
beta_1=0.9,
|
||||
beta_2=0.999,
|
||||
epsilon=1e-7,
|
||||
amsgrad=False,
|
||||
weight_decay_rate=0.0,
|
||||
include_in_weight_decay=None,
|
||||
exclude_from_weight_decay=None,
|
||||
name='AdamWeightDecay',
|
||||
**kwargs):
|
||||
super(AdamWeightDecay, self).__init__(
|
||||
learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
|
||||
self.weight_decay_rate = weight_decay_rate
|
||||
self._include_in_weight_decay = include_in_weight_decay
|
||||
self._exclude_from_weight_decay = exclude_from_weight_decay
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, config):
|
||||
"""Creates an optimizer from its config with WarmUp custom object."""
|
||||
custom_objects = {'WarmUp': WarmUp}
|
||||
return super(AdamWeightDecay, cls).from_config(
|
||||
config, custom_objects=custom_objects)
|
||||
|
||||
def _prepare_local(self, var_device, var_dtype, apply_state):
|
||||
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
|
||||
apply_state)
|
||||
apply_state['weight_decay_rate'] = tf.constant(
|
||||
self.weight_decay_rate, name='adam_weight_decay_rate')
|
||||
|
||||
def _decay_weights_op(self, var, learning_rate, apply_state):
|
||||
do_decay = self._do_use_weight_decay(var.name)
|
||||
if do_decay:
|
||||
return var.assign_sub(
|
||||
learning_rate * var *
|
||||
apply_state['weight_decay_rate'],
|
||||
use_locking=self._use_locking)
|
||||
return tf.no_op()
|
||||
|
||||
def apply_gradients(self, grads_and_vars, clip_norm, name=None):
|
||||
grads, tvars = list(zip(*grads_and_vars))
|
||||
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
|
||||
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
|
||||
|
||||
def _get_lr(self, var_device, var_dtype, apply_state):
|
||||
"""Retrieves the learning rate with the given state."""
|
||||
if apply_state is None:
|
||||
return self._decayed_lr_t[var_dtype], {}
|
||||
|
||||
apply_state = apply_state or {}
|
||||
coefficients = apply_state.get((var_device, var_dtype))
|
||||
if coefficients is None:
|
||||
coefficients = self._fallback_apply_state(var_device, var_dtype)
|
||||
apply_state[(var_device, var_dtype)] = coefficients
|
||||
|
||||
return coefficients['lr_t'], dict(apply_state=apply_state)
|
||||
|
||||
def _resource_apply_dense(self, grad, var, apply_state=None):
|
||||
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
|
||||
decay = self._decay_weights_op(var, lr_t, apply_state)
|
||||
with tf.control_dependencies([decay]):
|
||||
return super(AdamWeightDecay, self)._resource_apply_dense(
|
||||
grad, var, **kwargs)
|
||||
|
||||
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
|
||||
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
|
||||
decay = self._decay_weights_op(var, lr_t, apply_state)
|
||||
with tf.control_dependencies([decay]):
|
||||
return super(AdamWeightDecay, self)._resource_apply_sparse(
|
||||
grad, var, indices, **kwargs)
|
||||
|
||||
def get_config(self):
|
||||
config = super(AdamWeightDecay, self).get_config()
|
||||
config.update({
|
||||
'weight_decay_rate': self.weight_decay_rate,
|
||||
})
|
||||
return config
|
||||
|
||||
def _do_use_weight_decay(self, param_name):
|
||||
"""Whether to use L2 weight decay for `param_name`."""
|
||||
if self.weight_decay_rate == 0:
|
||||
return False
|
||||
|
||||
if self._include_in_weight_decay:
|
||||
for r in self._include_in_weight_decay:
|
||||
if re.search(r, param_name) is not None:
|
||||
return True
|
||||
|
||||
if self._exclude_from_weight_decay:
|
||||
for r in self._exclude_from_weight_decay:
|
||||
if re.search(r, param_name) is not None:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
|
||||
class GradientAccumulator(object):
|
||||
"""Distribution strategies-aware gradient accumulation utility."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initializes the accumulator."""
|
||||
self._gradients = []
|
||||
self._accum_steps = tf.Variable(
|
||||
initial_value=0,
|
||||
dtype=tf.int64,
|
||||
trainable=False,
|
||||
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
|
||||
|
||||
@property
|
||||
def step(self):
|
||||
"""Number of accumulated steps."""
|
||||
return self._accum_steps.value()
|
||||
|
||||
@property
|
||||
def gradients(self):
|
||||
"""The accumulated gradients."""
|
||||
return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
|
||||
|
||||
def __call__(self, gradients):
|
||||
"""Accumulates :obj:`gradients`."""
|
||||
if not self._gradients:
|
||||
self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
|
||||
|
||||
if len(gradients) != len(self._gradients):
|
||||
raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
|
||||
|
||||
for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
|
||||
if accum_gradient is not None:
|
||||
accum_gradient.assign_add(gradient)
|
||||
|
||||
self._accum_steps.assign_add(1)
|
||||
|
||||
def reset(self):
|
||||
"""Resets the accumulated gradients."""
|
||||
if self._gradients:
|
||||
self._accum_steps.assign(0)
|
||||
|
||||
for gradient in self._get_replica_gradients():
|
||||
if gradient is not None:
|
||||
gradient.assign(tf.zeros_like(gradient))
|
||||
|
||||
def _get_replica_gradients(self):
|
||||
if tf.distribute.has_strategy():
|
||||
# In a replica context, we want to accumulate gradients on each replica
|
||||
# without synchronization, so we directly assign the value of the
|
||||
# current replica.
|
||||
replica_context = tf.distribute.get_replica_context()
|
||||
|
||||
if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
|
||||
return self._gradients
|
||||
|
||||
return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
|
||||
else:
|
||||
return self._gradients
|
||||
@@ -1,31 +0,0 @@
|
||||
# content of conftest.py
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--runslow", action="store_true", default=False, help="run slow tests"
|
||||
)
|
||||
parser.addoption(
|
||||
"--use_cuda", action="store_true", default=False, help="run tests on gpu"
|
||||
)
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
config.addinivalue_line("markers", "slow: mark test as slow to run")
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
|
||||
if config.getoption("--runslow"):
|
||||
# --runslow given in cli: do not skip slow tests
|
||||
return
|
||||
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
|
||||
for item in items:
|
||||
if "slow" in item.keywords:
|
||||
item.add_marker(skip_slow)
|
||||
|
||||
@pytest.fixture
|
||||
def use_cuda(request):
|
||||
""" Run test on gpu """
|
||||
return request.config.getoption("--use_cuda")
|
||||
BIN
transformers/tests/fixtures/spiece.model
vendored
Normal file
BIN
transformers/tests/fixtures/spiece.model
vendored
Normal file
Binary file not shown.
102
transformers/tests/hf_api_test.py
Normal file
102
transformers/tests/hf_api_test.py
Normal file
@@ -0,0 +1,102 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019-present, the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
import six
|
||||
import time
|
||||
import unittest
|
||||
|
||||
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
|
||||
|
||||
USER = "__DUMMY_TRANSFORMERS_USER__"
|
||||
PASS = "__DUMMY_TRANSFORMERS_PASS__"
|
||||
FILE_KEY = "Test-{}.txt".format(int(time.time()))
|
||||
FILE_PATH = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
|
||||
)
|
||||
|
||||
|
||||
|
||||
class HfApiCommonTest(unittest.TestCase):
|
||||
_api = HfApi(endpoint="https://moon-staging.huggingface.co")
|
||||
|
||||
|
||||
class HfApiLoginTest(HfApiCommonTest):
|
||||
def test_login_invalid(self):
|
||||
with self.assertRaises(HTTPError):
|
||||
self._api.login(username=USER, password="fake")
|
||||
|
||||
def test_login_valid(self):
|
||||
token = self._api.login(username=USER, password=PASS)
|
||||
self.assertIsInstance(token, six.string_types)
|
||||
|
||||
|
||||
class HfApiEndpointsTest(HfApiCommonTest):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
"""
|
||||
Share this valid token in all tests below.
|
||||
"""
|
||||
cls._token = cls._api.login(username=USER, password=PASS)
|
||||
|
||||
def test_whoami(self):
|
||||
user = self._api.whoami(token=self._token)
|
||||
self.assertEqual(user, USER)
|
||||
|
||||
def test_presign(self):
|
||||
urls = self._api.presign(token=self._token, filename=FILE_KEY)
|
||||
self.assertIsInstance(urls, PresignedUrl)
|
||||
self.assertEqual(urls.type, "text/plain")
|
||||
|
||||
def test_presign_and_upload(self):
|
||||
access_url = self._api.presign_and_upload(
|
||||
token=self._token, filename=FILE_KEY, filepath=FILE_PATH
|
||||
)
|
||||
self.assertIsInstance(access_url, six.string_types)
|
||||
|
||||
def test_list_objs(self):
|
||||
objs = self._api.list_objs(token=self._token)
|
||||
self.assertIsInstance(objs, list)
|
||||
if len(objs) > 0:
|
||||
o = objs[-1]
|
||||
self.assertIsInstance(o, S3Obj)
|
||||
|
||||
|
||||
|
||||
class HfFolderTest(unittest.TestCase):
|
||||
def test_token_workflow(self):
|
||||
"""
|
||||
Test the whole token save/get/delete workflow,
|
||||
with the desired behavior with respect to non-existent tokens.
|
||||
"""
|
||||
token = "token-{}".format(int(time.time()))
|
||||
HfFolder.save_token(token)
|
||||
self.assertEqual(
|
||||
HfFolder.get_token(),
|
||||
token
|
||||
)
|
||||
HfFolder.delete_token()
|
||||
HfFolder.delete_token()
|
||||
# ^^ not an error, we test that the
|
||||
# second call does not fail.
|
||||
self.assertEqual(
|
||||
HfFolder.get_token(),
|
||||
None
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
240
transformers/tests/modeling_albert_test.py
Normal file
240
transformers/tests/modeling_albert_test.py
Normal file
@@ -0,0 +1,240 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
|
||||
AlbertForSequenceClassification, AlbertForQuestionAnswering,
|
||||
)
|
||||
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
|
||||
@require_torch
|
||||
class AlbertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
|
||||
|
||||
class AlbertModelTester(object):
|
||||
|
||||
def __init__(self,
|
||||
parent,
|
||||
batch_size=13,
|
||||
seq_length=7,
|
||||
is_training=True,
|
||||
use_input_mask=True,
|
||||
use_token_type_ids=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
embedding_size=16,
|
||||
hidden_size=36,
|
||||
num_hidden_layers=6,
|
||||
num_hidden_groups=6,
|
||||
num_attention_heads=6,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=512,
|
||||
type_vocab_size=16,
|
||||
type_sequence_label_size=2,
|
||||
initializer_range=0.02,
|
||||
num_labels=3,
|
||||
num_choices=4,
|
||||
scope=None,
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.is_training = is_training
|
||||
self.use_input_mask = use_input_mask
|
||||
self.use_token_type_ids = use_token_type_ids
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.embedding_size = embedding_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.type_sequence_label_size = type_sequence_label_size
|
||||
self.initializer_range = initializer_range
|
||||
self.num_labels = num_labels
|
||||
self.num_choices = num_choices
|
||||
self.scope = scope
|
||||
self.num_hidden_groups = num_hidden_groups
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
if self.use_input_mask:
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||
|
||||
token_type_ids = None
|
||||
if self.use_token_type_ids:
|
||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
sequence_labels = None
|
||||
token_labels = None
|
||||
choice_labels = None
|
||||
if self.use_labels:
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = AlbertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
intermediate_size=self.intermediate_size,
|
||||
hidden_act=self.hidden_act,
|
||||
hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
num_hidden_groups=self.num_hidden_groups)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
def check_loss_output(self, result):
|
||||
self.parent.assertListEqual(
|
||||
list(result["loss"].size()),
|
||||
[])
|
||||
|
||||
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = AlbertModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids)
|
||||
|
||||
result = {
|
||||
"sequence_output": sequence_output,
|
||||
"pooled_output": pooled_output,
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["sequence_output"].size()),
|
||||
[self.batch_size, self.seq_length, self.hidden_size])
|
||||
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
|
||||
|
||||
|
||||
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = AlbertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
"loss": loss,
|
||||
"prediction_scores": prediction_scores,
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["prediction_scores"].size()),
|
||||
[self.batch_size, self.seq_length, self.vocab_size])
|
||||
self.check_loss_output(result)
|
||||
|
||||
def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = AlbertForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||
result = {
|
||||
"loss": loss,
|
||||
"start_logits": start_logits,
|
||||
"end_logits": end_logits,
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["start_logits"].size()),
|
||||
[self.batch_size, self.seq_length])
|
||||
self.parent.assertListEqual(
|
||||
list(result["end_logits"].size()),
|
||||
[self.batch_size, self.seq_length])
|
||||
self.check_loss_output(result)
|
||||
|
||||
|
||||
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = AlbertForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||
result = {
|
||||
"loss": loss,
|
||||
"logits": logits,
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].size()),
|
||||
[self.batch_size, self.num_labels])
|
||||
self.check_loss_output(result)
|
||||
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids, token_type_ids, input_mask,
|
||||
sequence_labels, token_labels, choice_labels) = config_and_inputs
|
||||
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
|
||||
return config, inputs_dict
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = AlbertModelTest.AlbertModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_albert_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_model(*config_and_inputs)
|
||||
|
||||
def test_for_masked_lm(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
|
||||
|
||||
def test_for_question_answering(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)
|
||||
|
||||
def test_for_sequence_classification(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
|
||||
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
shutil.rmtree(cache_dir)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -18,11 +18,12 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .utils import require_torch, slow
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (AutoConfig, BertConfig,
|
||||
AutoModel, BertModel,
|
||||
@@ -33,12 +34,11 @@ if is_torch_available():
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@require_torch
|
||||
class AutoModelTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
|
||||
for value in loading_info.values():
|
||||
self.assertEqual(len(value), 0)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_lmhead_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForMaskedLM)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_classification_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertForSequenceClassification)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_question_answering_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,12 +18,12 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import (BertConfig, BertModel, BertForMaskedLM,
|
||||
@@ -31,11 +31,9 @@ if is_torch_available():
|
||||
BertForQuestionAnswering, BertForSequenceClassification,
|
||||
BertForTokenClassification, BertForMultipleChoice)
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("use_cuda")
|
||||
@require_torch
|
||||
class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
|
||||
@@ -67,7 +65,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
num_labels=3,
|
||||
num_choices=4,
|
||||
scope=None,
|
||||
device='cpu',
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
@@ -91,26 +88,25 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
self.num_labels = num_labels
|
||||
self.num_choices = num_choices
|
||||
self.scope = scope
|
||||
self.device = device
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(self.device)
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
if self.use_input_mask:
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(self.device)
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
|
||||
|
||||
token_type_ids = None
|
||||
if self.use_token_type_ids:
|
||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size).to(self.device)
|
||||
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
sequence_labels = None
|
||||
token_labels = None
|
||||
choice_labels = None
|
||||
if self.use_labels:
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(self.device)
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(self.device)
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices).to(self.device)
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = BertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
@@ -144,7 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertModel(config=config)
|
||||
model.to(input_ids.device)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||
@@ -161,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
||||
model = BertModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
|
||||
@@ -177,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -190,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
|
||||
model = BertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
|
||||
@@ -204,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForNextSentencePrediction(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
|
||||
result = {
|
||||
@@ -217,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForPreTraining(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
|
||||
@@ -235,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = BertForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||
@@ -254,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = BertForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||
result = {
|
||||
@@ -268,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = BertForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||
result = {
|
||||
@@ -282,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_choices = self.num_choices
|
||||
model = BertForMultipleChoice(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||
@@ -313,10 +318,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_bert_model(self, use_cuda=False):
|
||||
# ^^ This could be a real fixture
|
||||
if use_cuda:
|
||||
self.model_tester.device = "cuda"
|
||||
def test_bert_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_bert_model(*config_and_inputs)
|
||||
|
||||
@@ -356,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -27,10 +27,11 @@ import uuid
|
||||
|
||||
import unittest
|
||||
import logging
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
import numpy as np
|
||||
@@ -38,8 +39,6 @@ if is_torch_available():
|
||||
from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
|
||||
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
@@ -65,6 +64,7 @@ def _config_zero_init(config):
|
||||
|
||||
class CommonTestCases:
|
||||
|
||||
@require_torch
|
||||
class CommonModelTester(unittest.TestCase):
|
||||
|
||||
model_tester = None
|
||||
@@ -79,6 +79,7 @@ class CommonTestCases:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -86,12 +87,13 @@ class CommonTestCases:
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model = model_class.from_pretrained(tmpdirname)
|
||||
model.to(torch_device)
|
||||
with torch.no_grad():
|
||||
after_outputs = model(**inputs_dict)
|
||||
|
||||
# Make sure we don't have nans
|
||||
out_1 = after_outputs[0].numpy()
|
||||
out_2 = outputs[0].numpy()
|
||||
out_1 = after_outputs[0].cpu().numpy()
|
||||
out_2 = outputs[0].cpu().numpy()
|
||||
out_1 = out_1[~np.isnan(out_1)]
|
||||
out_2 = out_2[~np.isnan(out_2)]
|
||||
max_diff = np.amax(np.abs(out_1 - out_2))
|
||||
@@ -113,6 +115,7 @@ class CommonTestCases:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
|
||||
self.assertEqual(first.ne(second).sum().item(), 0)
|
||||
@@ -125,6 +128,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
@@ -142,6 +146,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = True
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(**inputs_dict)
|
||||
self.assertEqual(out_len+1, len(outputs))
|
||||
@@ -181,6 +186,7 @@ class CommonTestCases:
|
||||
configs_no_init.torchscript = True
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config=configs_no_init)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
|
||||
|
||||
@@ -201,7 +207,10 @@ class CommonTestCases:
|
||||
except ValueError:
|
||||
self.fail("Couldn't load module.")
|
||||
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loaded_model.to(torch_device)
|
||||
loaded_model.eval()
|
||||
|
||||
model_params = model.parameters()
|
||||
@@ -228,11 +237,12 @@ class CommonTestCases:
|
||||
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config=configs_no_init)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
# Prepare head_mask
|
||||
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
||||
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
||||
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
|
||||
head_mask[0, 0] = 0
|
||||
head_mask[-1, :-1] = 0
|
||||
head_mask.requires_grad_(requires_grad=True)
|
||||
@@ -282,6 +292,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
-1: [0]}
|
||||
@@ -310,6 +321,7 @@ class CommonTestCases:
|
||||
config.output_attentions = True
|
||||
config.output_hidden_states = False
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
-1: [0]}
|
||||
@@ -319,6 +331,7 @@ class CommonTestCases:
|
||||
os.makedirs(directory)
|
||||
model.save_pretrained(directory)
|
||||
model = model_class.from_pretrained(directory)
|
||||
model.to(torch_device)
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
attentions = outputs[-1]
|
||||
@@ -346,6 +359,7 @@ class CommonTestCases:
|
||||
config.pruned_heads = heads_to_prune
|
||||
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -372,6 +386,7 @@ class CommonTestCases:
|
||||
config.pruned_heads = heads_to_prune
|
||||
|
||||
model = model_class(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -388,6 +403,7 @@ class CommonTestCases:
|
||||
os.makedirs(directory)
|
||||
model.save_pretrained(directory)
|
||||
model = model_class.from_pretrained(directory)
|
||||
model.to(torch_device)
|
||||
shutil.rmtree(directory)
|
||||
|
||||
outputs = model(**inputs_dict)
|
||||
@@ -419,6 +435,7 @@ class CommonTestCases:
|
||||
config.output_hidden_states = True
|
||||
config.output_attentions = False
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(**inputs_dict)
|
||||
hidden_states = outputs[-1]
|
||||
@@ -538,6 +555,7 @@ class CommonTestCases:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
wte = model.get_input_embeddings()
|
||||
@@ -628,6 +646,7 @@ class CommonTestCases:
|
||||
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = self.base_model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids, position_ids, token_type_ids)
|
||||
@@ -643,6 +662,7 @@ class CommonTestCases:
|
||||
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = self.lm_head_model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||
loss, lm_logits = outputs[:2]
|
||||
@@ -659,6 +679,7 @@ class CommonTestCases:
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids)
|
||||
presents = outputs[-1]
|
||||
@@ -671,6 +692,7 @@ class CommonTestCases:
|
||||
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
|
||||
mc_labels, lm_labels, mc_token_ids):
|
||||
model = self.double_head_model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
|
||||
token_type_ids=token_type_ids, position_ids=position_ids)
|
||||
@@ -716,7 +738,7 @@ class CommonTestCases:
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
self.create_and_check_presents(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def run_slow_tests(self):
|
||||
self.create_and_check_model_from_pretrained()
|
||||
|
||||
@@ -770,7 +792,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
|
||||
for _ in range(total_dims):
|
||||
values.append(rng.randint(0, vocab_size - 1))
|
||||
|
||||
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
|
||||
return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
|
||||
|
||||
|
||||
def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
||||
@@ -786,11 +808,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
|
||||
for _ in range(total_dims):
|
||||
values.append(rng.random() * scale)
|
||||
|
||||
return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
|
||||
return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
|
||||
|
||||
|
||||
@require_torch
|
||||
class ModelUtilsTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -16,7 +16,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
import pdb
|
||||
|
||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
||||
if is_torch_available():
|
||||
from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
CTRLLMHeadModel)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
|
||||
@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = CTRLModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||
@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = CTRLLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||
@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,7 +17,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -25,13 +24,13 @@ if is_torch_available():
|
||||
from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
|
||||
DistilBertForTokenClassification,
|
||||
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
|
||||
@@ -126,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = DistilBertModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
(sequence_output,) = model(input_ids, input_mask)
|
||||
(sequence_output,) = model(input_ids)
|
||||
@@ -139,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = DistilBertForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -152,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = DistilBertForQuestionAnswering(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
|
||||
result = {
|
||||
@@ -170,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = DistilBertForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
|
||||
result = {
|
||||
@@ -184,6 +187,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = DistilBertForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
|
||||
@@ -229,7 +233,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
|
||||
|
||||
# @pytest.mark.slow
|
||||
# @slow
|
||||
# def test_model_from_pretrained(self):
|
||||
# cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -15,19 +15,18 @@
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from .utils import require_torch, slow
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import BertModel, BertForMaskedLM, Model2Model
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
|
||||
@require_torch
|
||||
class EncoderDecoderModelTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model2model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,7 +17,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
|
||||
from transformers import is_torch_available
|
||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
||||
if is_torch_available():
|
||||
from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
|
||||
@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = GPT2Model(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||
@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
|
||||
model = GPT2LMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||
@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
|
||||
model = GPT2DoubleHeadsModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
|
||||
@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,7 +17,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
import shutil
|
||||
|
||||
from transformers import is_torch_available
|
||||
@@ -25,13 +24,13 @@ from transformers import is_torch_available
|
||||
if is_torch_available():
|
||||
from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
|
||||
@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||
model = OpenAIGPTModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
|
||||
@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||
model = OpenAIGPTLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
|
||||
@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
|
||||
model = OpenAIGPTDoubleHeadsModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
|
||||
@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -27,13 +26,13 @@ if is_torch_available():
|
||||
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
|
||||
RobertaForSequenceClassification, RobertaForTokenClassification)
|
||||
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
|
||||
@@ -129,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
||||
token_labels, choice_labels):
|
||||
model = RobertaModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||
@@ -146,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
|
||||
token_labels, choice_labels):
|
||||
model = RobertaForMaskedLM(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||
result = {
|
||||
@@ -161,6 +162,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = RobertaForTokenClassification(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||
labels=token_labels)
|
||||
@@ -195,7 +197,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -207,10 +209,10 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
class RobertaModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_masked_lm(self):
|
||||
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = torch.Size((1, 11, 50265))
|
||||
@@ -228,10 +230,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
|
||||
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_no_head(self):
|
||||
model = RobertaModel.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
# compare the actual values for a slice.
|
||||
@@ -244,10 +246,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
|
||||
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_classification_head(self):
|
||||
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
||||
|
||||
|
||||
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = torch.Size((1, 3))
|
||||
|
||||
230
transformers/tests/modeling_tf_albert_test.py
Normal file
230
transformers/tests/modeling_tf_albert_test.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import AlbertConfig, is_tf_available
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
|
||||
TFAlbertForSequenceClassification,
|
||||
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (
|
||||
TFAlbertModel,
|
||||
TFAlbertForMaskedLM,
|
||||
TFAlbertForSequenceClassification
|
||||
) if is_tf_available() else ()
|
||||
|
||||
class TFAlbertModelTester(object):
|
||||
|
||||
def __init__(self,
|
||||
parent,
|
||||
batch_size=13,
|
||||
seq_length=7,
|
||||
is_training=True,
|
||||
use_input_mask=True,
|
||||
use_token_type_ids=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
embedding_size=16,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=512,
|
||||
type_vocab_size=16,
|
||||
type_sequence_label_size=2,
|
||||
initializer_range=0.02,
|
||||
num_labels=3,
|
||||
num_choices=4,
|
||||
scope=None,
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.is_training = is_training
|
||||
self.use_input_mask = use_input_mask
|
||||
self.use_token_type_ids = use_token_type_ids
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.embedding_size = embedding_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.type_sequence_label_size = type_sequence_label_size
|
||||
self.initializer_range = initializer_range
|
||||
self.num_labels = num_labels
|
||||
self.num_choices = num_choices
|
||||
self.scope = scope
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = ids_tensor(
|
||||
[self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
if self.use_input_mask:
|
||||
input_mask = ids_tensor(
|
||||
[self.batch_size, self.seq_length], vocab_size=2)
|
||||
|
||||
token_type_ids = None
|
||||
if self.use_token_type_ids:
|
||||
token_type_ids = ids_tensor(
|
||||
[self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
sequence_labels = None
|
||||
token_labels = None
|
||||
choice_labels = None
|
||||
if self.use_labels:
|
||||
sequence_labels = ids_tensor(
|
||||
[self.batch_size], self.type_sequence_label_size)
|
||||
token_labels = ids_tensor(
|
||||
[self.batch_size, self.seq_length], self.num_labels)
|
||||
choice_labels = ids_tensor([self.batch_size], self.num_choices)
|
||||
|
||||
config = AlbertConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
intermediate_size=self.intermediate_size,
|
||||
hidden_act=self.hidden_act,
|
||||
hidden_dropout_prob=self.hidden_dropout_prob,
|
||||
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = TFAlbertModel(config=config)
|
||||
# inputs = {'input_ids': input_ids,
|
||||
# 'attention_mask': input_mask,
|
||||
# 'token_type_ids': token_type_ids}
|
||||
# sequence_output, pooled_output = model(**inputs)
|
||||
inputs = {'input_ids': input_ids,
|
||||
'attention_mask': input_mask,
|
||||
'token_type_ids': token_type_ids}
|
||||
sequence_output, pooled_output = model(inputs)
|
||||
|
||||
inputs = [input_ids, input_mask]
|
||||
sequence_output, pooled_output = model(inputs)
|
||||
|
||||
sequence_output, pooled_output = model(input_ids)
|
||||
|
||||
result = {
|
||||
"sequence_output": sequence_output.numpy(),
|
||||
"pooled_output": pooled_output.numpy(),
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["sequence_output"].shape),
|
||||
[self.batch_size, self.seq_length, self.hidden_size])
|
||||
self.parent.assertListEqual(list(result["pooled_output"].shape), [
|
||||
self.batch_size, self.hidden_size])
|
||||
|
||||
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
model = TFAlbertForMaskedLM(config=config)
|
||||
inputs = {'input_ids': input_ids,
|
||||
'attention_mask': input_mask,
|
||||
'token_type_ids': token_type_ids}
|
||||
prediction_scores, = model(inputs)
|
||||
result = {
|
||||
"prediction_scores": prediction_scores.numpy(),
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["prediction_scores"].shape),
|
||||
[self.batch_size, self.seq_length, self.vocab_size])
|
||||
|
||||
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||
config.num_labels = self.num_labels
|
||||
model = TFAlbertForSequenceClassification(config=config)
|
||||
inputs = {'input_ids': input_ids,
|
||||
'attention_mask': input_mask,
|
||||
'token_type_ids': token_type_ids}
|
||||
logits, = model(inputs)
|
||||
result = {
|
||||
"logits": logits.numpy(),
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].shape),
|
||||
[self.batch_size, self.num_labels])
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids, token_type_ids, input_mask,
|
||||
sequence_labels, token_labels, choice_labels) = config_and_inputs
|
||||
inputs_dict = {'input_ids': input_ids,
|
||||
'token_type_ids': token_type_ids, 'attention_mask': input_mask}
|
||||
return config, inputs_dict
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
|
||||
self.config_tester = ConfigTester(
|
||||
self, config_class=AlbertConfig, hidden_size=37)
|
||||
|
||||
def test_config(self):
|
||||
self.config_tester.run_common_tests()
|
||||
|
||||
def test_albert_model(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_model(*config_and_inputs)
|
||||
|
||||
def test_for_masked_lm(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_for_masked_lm(
|
||||
*config_and_inputs)
|
||||
|
||||
def test_for_sequence_classification(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_albert_for_sequence_classification(
|
||||
*config_and_inputs)
|
||||
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
for model_name in ['albert-base-uncased']:
|
||||
model = TFAlbertModel.from_pretrained(
|
||||
model_name, cache_dir=cache_dir)
|
||||
shutil.rmtree(cache_dir)
|
||||
self.assertIsNotNone(model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -18,11 +18,12 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from transformers import is_tf_available
|
||||
|
||||
from .utils import require_tf, slow
|
||||
|
||||
if is_tf_available():
|
||||
from transformers import (AutoConfig, BertConfig,
|
||||
TFAutoModel, TFBertModel,
|
||||
@@ -33,11 +34,11 @@ if is_tf_available():
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFAutoModelTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
import h5py
|
||||
self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
|
||||
@@ -53,6 +54,7 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, TFBertModel)
|
||||
|
||||
@slow
|
||||
def test_lmhead_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -65,6 +67,7 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, TFBertForMaskedLM)
|
||||
|
||||
@slow
|
||||
def test_sequence_classification_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -77,6 +80,7 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, TFBertForSequenceClassification)
|
||||
|
||||
@slow
|
||||
def test_question_answering_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import BertConfig, is_tf_available
|
||||
|
||||
@@ -36,10 +36,9 @@ if is_tf_available():
|
||||
TFBertForTokenClassification,
|
||||
TFBertForQuestionAnswering,
|
||||
TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
|
||||
@@ -309,7 +308,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -25,18 +25,17 @@ import unittest
|
||||
import uuid
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from transformers import is_tf_available, is_torch_available
|
||||
|
||||
from .utils import require_tf, slow
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
from transformers import TFPreTrainedModel
|
||||
# from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
@@ -62,6 +61,7 @@ def _config_zero_init(config):
|
||||
|
||||
class TFCommonTestCases:
|
||||
|
||||
@require_tf
|
||||
class TFCommonModelTester(unittest.TestCase):
|
||||
|
||||
model_tester = None
|
||||
@@ -164,7 +164,7 @@ class TFCommonTestCases:
|
||||
for model_class in self.all_model_classes:
|
||||
# Prepare our model
|
||||
model = model_class(config)
|
||||
|
||||
|
||||
# Let's load it from the disk to be sure we can use pretrained weights
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
outputs = model(inputs_dict) # build the model
|
||||
@@ -233,80 +233,6 @@ class TFCommonTestCases:
|
||||
self.model_tester.seq_length,
|
||||
self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
|
||||
|
||||
def test_headmasking(self):
|
||||
pass
|
||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
# config.output_attentions = True
|
||||
# config.output_hidden_states = True
|
||||
# configs_no_init = _config_zero_init(config) # To be sure we have no Nan
|
||||
# for model_class in self.all_model_classes:
|
||||
# model = model_class(config=configs_no_init)
|
||||
# model.eval()
|
||||
|
||||
# # Prepare head_mask
|
||||
# # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
|
||||
# head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
|
||||
# head_mask[0, 0] = 0
|
||||
# head_mask[-1, :-1] = 0
|
||||
# head_mask.requires_grad_(requires_grad=True)
|
||||
# inputs = inputs_dict.copy()
|
||||
# inputs['head_mask'] = head_mask
|
||||
|
||||
# outputs = model(**inputs)
|
||||
|
||||
# # Test that we can get a gradient back for importance score computation
|
||||
# output = sum(t.sum() for t in outputs[0])
|
||||
# output = output.sum()
|
||||
# output.backward()
|
||||
# multihead_outputs = head_mask.grad
|
||||
|
||||
# attentions = outputs[-1]
|
||||
# hidden_states = outputs[-2]
|
||||
|
||||
# # Remove Nan
|
||||
|
||||
# self.assertIsNotNone(multihead_outputs)
|
||||
# self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
|
||||
# self.assertAlmostEqual(
|
||||
# attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertNotEqual(
|
||||
# attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertNotEqual(
|
||||
# attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertAlmostEqual(
|
||||
# attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
|
||||
# self.assertNotEqual(
|
||||
# attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
|
||||
|
||||
|
||||
def test_head_pruning(self):
|
||||
pass
|
||||
# if not self.test_pruning:
|
||||
# return
|
||||
|
||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
# for model_class in self.all_model_classes:
|
||||
# config.output_attentions = True
|
||||
# config.output_hidden_states = False
|
||||
# model = model_class(config=config)
|
||||
# model.eval()
|
||||
# heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
|
||||
# -1: [0]}
|
||||
# model.prune_heads(heads_to_prune)
|
||||
# outputs = model(**inputs_dict)
|
||||
|
||||
# attentions = outputs[-1]
|
||||
|
||||
# self.assertEqual(
|
||||
# attentions[0].shape[-3], 1)
|
||||
# self.assertEqual(
|
||||
# attentions[1].shape[-3], self.model_tester.num_attention_heads)
|
||||
# self.assertEqual(
|
||||
# attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
|
||||
|
||||
|
||||
def test_hidden_states_output(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -323,43 +249,6 @@ class TFCommonTestCases:
|
||||
list(hidden_states[0].shape[-2:]),
|
||||
[self.model_tester.seq_length, self.model_tester.hidden_size])
|
||||
|
||||
|
||||
def test_resize_tokens_embeddings(self):
|
||||
pass
|
||||
# original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
# if not self.test_resize_embeddings:
|
||||
# return
|
||||
|
||||
# for model_class in self.all_model_classes:
|
||||
# config = copy.deepcopy(original_config)
|
||||
# model = model_class(config)
|
||||
|
||||
# model_vocab_size = config.vocab_size
|
||||
# # Retrieve the embeddings and clone theme
|
||||
# model_embed = model.resize_token_embeddings(model_vocab_size)
|
||||
# cloned_embeddings = model_embed.weight.clone()
|
||||
|
||||
# # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
|
||||
# model_embed = model.resize_token_embeddings(model_vocab_size + 10)
|
||||
# self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
|
||||
# # Check that it actually resizes the embeddings matrix
|
||||
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
|
||||
|
||||
# # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
|
||||
# model_embed = model.resize_token_embeddings(model_vocab_size - 15)
|
||||
# self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
|
||||
# # Check that it actually resizes the embeddings matrix
|
||||
# self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
|
||||
|
||||
# # Check that adding and removing tokens has not modified the first part of the embedding matrix.
|
||||
# models_equal = True
|
||||
# for p1, p2 in zip(cloned_embeddings, model_embed.weight):
|
||||
# if p1.data.ne(p2.data).sum() > 0:
|
||||
# models_equal = False
|
||||
|
||||
# self.assertTrue(models_equal)
|
||||
|
||||
|
||||
def test_model_common_attributes(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -369,40 +258,6 @@ class TFCommonTestCases:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None or isinstance(x, tf.keras.layers.Layer)
|
||||
|
||||
|
||||
def test_tie_model_weights(self):
|
||||
pass
|
||||
# config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
# def check_same_values(layer_1, layer_2):
|
||||
# equal = True
|
||||
# for p1, p2 in zip(layer_1.weight, layer_2.weight):
|
||||
# if p1.data.ne(p2.data).sum() > 0:
|
||||
# equal = False
|
||||
# return equal
|
||||
|
||||
# for model_class in self.all_model_classes:
|
||||
# if not hasattr(model_class, 'tie_weights'):
|
||||
# continue
|
||||
|
||||
# config.torchscript = True
|
||||
# model_not_tied = model_class(config)
|
||||
# params_not_tied = list(model_not_tied.parameters())
|
||||
|
||||
# config_tied = copy.deepcopy(config)
|
||||
# config_tied.torchscript = False
|
||||
# model_tied = model_class(config_tied)
|
||||
# params_tied = list(model_tied.parameters())
|
||||
|
||||
# # Check that the embedding layer and decoding layer are the same in size and in value
|
||||
# self.assertGreater(len(params_not_tied), len(params_tied))
|
||||
|
||||
# # Check that after resize they remain tied.
|
||||
# model_tied.resize_token_embeddings(config.vocab_size + 10)
|
||||
# params_tied_2 = list(model_tied.parameters())
|
||||
# self.assertGreater(len(params_not_tied), len(params_tied))
|
||||
# self.assertEqual(len(params_tied_2), len(params_tied))
|
||||
|
||||
def test_determinism(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
@@ -426,9 +281,17 @@ class TFCommonTestCases:
|
||||
try:
|
||||
x = wte([input_ids], mode="embedding")
|
||||
except:
|
||||
x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
|
||||
try:
|
||||
x = wte([input_ids, None, None, None], mode="embedding")
|
||||
except:
|
||||
if hasattr(self.model_tester, "embedding_size"):
|
||||
x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
|
||||
else:
|
||||
x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
|
||||
# ^^ In our TF models, the input_embeddings can take slightly different forms,
|
||||
# so we try two of them and fall back to just synthetically creating a dummy tensor of ones.
|
||||
# so we try a few of them.
|
||||
# We used to fall back to just synthetically creating a dummy tensor of ones:
|
||||
#
|
||||
inputs_dict["inputs_embeds"] = x
|
||||
outputs = model(inputs_dict)
|
||||
|
||||
@@ -453,29 +316,5 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
|
||||
return output
|
||||
|
||||
|
||||
class TFModelUtilsTest(unittest.TestCase):
|
||||
@pytest.mark.skipif('tensorflow' not in sys.modules, reason="requires TensorFlow")
|
||||
def test_model_from_pretrained(self):
|
||||
pass
|
||||
# logging.basicConfig(level=logging.INFO)
|
||||
# for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
# config = BertConfig.from_pretrained(model_name)
|
||||
# self.assertIsNotNone(config)
|
||||
# self.assertIsInstance(config, PretrainedConfig)
|
||||
|
||||
# model = BertModel.from_pretrained(model_name)
|
||||
# model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
|
||||
# self.assertIsNotNone(model)
|
||||
# self.assertIsInstance(model, PreTrainedModel)
|
||||
# for value in loading_info.values():
|
||||
# self.assertEqual(len(value), 0)
|
||||
|
||||
# config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
||||
# model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
||||
# self.assertEqual(model.config.output_attentions, True)
|
||||
# self.assertEqual(model.config.output_hidden_states, True)
|
||||
# self.assertEqual(model.config, config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import CTRLConfig, is_tf_available
|
||||
|
||||
@@ -30,10 +30,9 @@ if is_tf_available():
|
||||
import tensorflow as tf
|
||||
from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
|
||||
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
|
||||
@@ -188,7 +187,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -17,10 +17,10 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import DistilBertConfig, is_tf_available
|
||||
|
||||
@@ -30,10 +30,9 @@ if is_tf_available():
|
||||
TFDistilBertForMaskedLM,
|
||||
TFDistilBertForQuestionAnswering,
|
||||
TFDistilBertForSequenceClassification)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
|
||||
@@ -210,7 +209,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
|
||||
|
||||
# @pytest.mark.slow
|
||||
# @slow
|
||||
# def test_model_from_pretrained(self):
|
||||
# cache_dir = "/tmp/transformers_test/"
|
||||
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import GPT2Config, is_tf_available
|
||||
|
||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
||||
from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
|
||||
TFGPT2DoubleHeadsModel,
|
||||
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
|
||||
@@ -219,7 +218,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,11 +18,11 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import OpenAIGPTConfig, is_tf_available
|
||||
|
||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
||||
from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
||||
TFOpenAIGPTDoubleHeadsModel,
|
||||
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
|
||||
@@ -218,7 +217,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,10 +18,10 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import RobertaConfig, is_tf_available
|
||||
|
||||
@@ -32,10 +32,9 @@ if is_tf_available():
|
||||
TFRobertaForSequenceClassification,
|
||||
TFRobertaForTokenClassification,
|
||||
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM,
|
||||
@@ -191,7 +190,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
@@ -203,10 +202,10 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
class TFRobertaModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_masked_lm(self):
|
||||
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = [1, 11, 50265]
|
||||
@@ -224,10 +223,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
|
||||
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_no_head(self):
|
||||
model = TFRobertaModel.from_pretrained('roberta-base')
|
||||
|
||||
|
||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
# compare the actual values for a slice.
|
||||
@@ -240,10 +239,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
|
||||
numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_inference_classification_head(self):
|
||||
model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
|
||||
|
||||
|
||||
input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
output = model(input_ids)[0]
|
||||
expected_shape = [1, 3]
|
||||
|
||||
@@ -19,10 +19,10 @@ from __future__ import print_function
|
||||
import unittest
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
from transformers import TransfoXLConfig, is_tf_available
|
||||
|
||||
@@ -31,10 +31,9 @@ if is_tf_available():
|
||||
from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
|
||||
TFTransfoXLLMHeadModel,
|
||||
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
|
||||
@@ -204,7 +203,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_tf_available
|
||||
|
||||
@@ -29,13 +28,13 @@ if is_tf_available():
|
||||
TFXLMForSequenceClassification,
|
||||
TFXLMForQuestionAnsweringSimple,
|
||||
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
|
||||
@@ -251,7 +250,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -21,7 +21,6 @@ import unittest
|
||||
import json
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import XLNetConfig, is_tf_available
|
||||
|
||||
@@ -30,18 +29,21 @@ if is_tf_available():
|
||||
|
||||
from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
|
||||
TFXLNetForSequenceClassification,
|
||||
TFXLNetForTokenClassification,
|
||||
TFXLNetForQuestionAnsweringSimple,
|
||||
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require TensorFlow")
|
||||
|
||||
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_tf, slow
|
||||
|
||||
|
||||
@require_tf
|
||||
class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
|
||||
all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel,
|
||||
TFXLNetForSequenceClassification,
|
||||
TFXLNetForTokenClassification,
|
||||
TFXLNetForQuestionAnsweringSimple) if is_tf_available() else ()
|
||||
test_pruning = False
|
||||
|
||||
@@ -258,6 +260,26 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
list(list(mem.shape) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_for_token_classification(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
config.num_labels = input_ids_1.shape[1]
|
||||
model = TFXLNetForTokenClassification(config)
|
||||
inputs = {'input_ids': input_ids_1,
|
||||
'attention_mask': input_mask,
|
||||
# 'token_type_ids': token_type_ids
|
||||
}
|
||||
logits, mems_1 = model(inputs)
|
||||
result = {
|
||||
"mems_1": [mem.numpy() for mem in mems_1],
|
||||
"logits": logits.numpy(),
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].shape),
|
||||
[self.batch_size, self.seq_length, config.num_labels])
|
||||
self.parent.assertListEqual(
|
||||
list(list(mem.shape) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
@@ -282,19 +304,23 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
|
||||
def test_xlnet_lm_head(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
|
||||
def test_xlnet_sequence_classif(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
|
||||
|
||||
def test_xlnet_token_classification(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_for_token_classification(*config_and_inputs)
|
||||
|
||||
def test_xlnet_qa(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -19,7 +19,6 @@ from __future__ import print_function
|
||||
import unittest
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -27,12 +26,13 @@ if is_torch_available():
|
||||
import torch
|
||||
from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||
from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
|
||||
@@ -111,6 +111,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
|
||||
model = TransfoXLModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
hidden_states_1, mems_1 = model(input_ids_1)
|
||||
@@ -140,6 +141,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
|
||||
model = TransfoXLLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
lm_logits_1, mems_1 = model(input_ids_1)
|
||||
@@ -204,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
|
||||
output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
|
||||
self.model_tester.check_transfo_xl_lm_head_output(output_result)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -26,13 +25,13 @@ if is_torch_available():
|
||||
from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
|
||||
XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
|
||||
from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
|
||||
@@ -148,6 +147,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
|
||||
outputs = model(input_ids, langs=token_type_ids)
|
||||
@@ -163,6 +163,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMWithLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
|
||||
@@ -182,6 +183,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMForQuestionAnsweringSimple(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids)
|
||||
@@ -206,6 +208,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMForQuestionAnswering(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids)
|
||||
@@ -260,6 +263,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
|
||||
model = XLMForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
(logits,) = model(input_ids)
|
||||
@@ -312,7 +316,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -21,24 +21,25 @@ import unittest
|
||||
import json
|
||||
import random
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
|
||||
from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification,
|
||||
XLNetForTokenClassification, XLNetForQuestionAnswering)
|
||||
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ids_tensor)
|
||||
from .configuration_common_test import ConfigTester
|
||||
from .utils import require_torch, slow, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
all_model_classes=(XLNetModel, XLNetLMHeadModel,
|
||||
all_model_classes=(XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification,
|
||||
XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
|
||||
@@ -99,18 +100,20 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
|
||||
|
||||
input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
|
||||
perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
|
||||
perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
|
||||
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
|
||||
target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
|
||||
target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
|
||||
target_mapping[:, 0, -1] = 1.0 # predict last token
|
||||
|
||||
sequence_labels = None
|
||||
lm_labels = None
|
||||
is_impossible_labels = None
|
||||
token_labels = None
|
||||
if self.use_labels:
|
||||
lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
|
||||
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
|
||||
token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
config = XLNetConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
@@ -129,15 +132,16 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
num_labels=self.type_sequence_label_size)
|
||||
|
||||
return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels)
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels)
|
||||
|
||||
def set_seed(self):
|
||||
random.seed(self.seed)
|
||||
torch.manual_seed(self.seed)
|
||||
|
||||
def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
_, _ = model(input_ids_1, input_mask=input_mask)
|
||||
@@ -152,6 +156,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
|
||||
config.mem_len = 0
|
||||
model = XLNetModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
no_mems_outputs = model(input_ids_1)
|
||||
self.parent.assertEqual(len(no_mems_outputs), 1)
|
||||
@@ -163,9 +168,23 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
list(list(mem.size()) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
_, _, attentions = model(input_ids_1, target_mapping=target_mapping)
|
||||
|
||||
self.parent.assertEqual(len(attentions), config.n_layer)
|
||||
self.parent.assertIsInstance(attentions[0], tuple)
|
||||
self.parent.assertEqual(len(attentions[0]), 2)
|
||||
self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape)
|
||||
|
||||
def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetLMHeadModel(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
|
||||
@@ -204,8 +223,9 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetForQuestionAnswering(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs = model(input_ids_1)
|
||||
@@ -261,9 +281,43 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
list(list(mem.size()) for mem in result["mems"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetForTokenClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
logits, mems_1 = model(input_ids_1)
|
||||
loss, logits, mems_1 = model(input_ids_1, labels=token_labels)
|
||||
|
||||
result = {
|
||||
"loss": loss,
|
||||
"mems_1": mems_1,
|
||||
"logits": logits,
|
||||
}
|
||||
|
||||
self.parent.assertListEqual(
|
||||
list(result["loss"].size()),
|
||||
[])
|
||||
self.parent.assertListEqual(
|
||||
list(result["logits"].size()),
|
||||
[self.batch_size, self.seq_length, self.type_sequence_label_size])
|
||||
self.parent.assertListEqual(
|
||||
list(list(mem.size()) for mem in result["mems_1"]),
|
||||
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels,
|
||||
sequence_labels, is_impossible_labels) = config_and_inputs
|
||||
inputs_dict = {'input_ids': input_ids_1}
|
||||
return config, inputs_dict
|
||||
|
||||
def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
|
||||
target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
|
||||
model = XLNetForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
logits, mems_1 = model(input_ids_1)
|
||||
@@ -289,7 +343,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
|
||||
target_mapping, segment_ids, lm_labels,
|
||||
sequence_labels, is_impossible_labels) = config_and_inputs
|
||||
sequence_labels, is_impossible_labels, token_labels) = config_and_inputs
|
||||
inputs_dict = {'input_ids': input_ids_1}
|
||||
return config, inputs_dict
|
||||
|
||||
@@ -306,22 +360,33 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
|
||||
|
||||
def test_xlnet_base_model_with_att_output(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
config_and_inputs[0].output_attentions = True
|
||||
self.model_tester.create_and_check_xlnet_base_model_with_att_output(*config_and_inputs)
|
||||
|
||||
def test_xlnet_lm_head(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
|
||||
|
||||
def test_xlnet_sequence_classif(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
|
||||
|
||||
def test_xlnet_token_classif(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_token_classif(*config_and_inputs)
|
||||
|
||||
def test_xlnet_qa(self):
|
||||
self.model_tester.set_seed()
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_model_from_pretrained(self):
|
||||
cache_dir = "/tmp/transformers_test/"
|
||||
for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -18,7 +18,6 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from transformers import is_torch_available
|
||||
|
||||
@@ -31,10 +30,9 @@ if is_torch_available():
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
get_linear_schedule_with_warmup)
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch")
|
||||
|
||||
from .tokenization_tests_commons import TemporaryDirectory
|
||||
from .utils import require_torch
|
||||
|
||||
|
||||
def unwrap_schedule(scheduler, num_steps=10):
|
||||
@@ -58,6 +56,7 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
|
||||
scheduler.load_state_dict(state_dict)
|
||||
return lrs
|
||||
|
||||
@require_torch
|
||||
class OptimizationTest(unittest.TestCase):
|
||||
|
||||
def assertListAlmostEqual(self, list1, list2, tol):
|
||||
@@ -80,6 +79,7 @@ class OptimizationTest(unittest.TestCase):
|
||||
self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
|
||||
|
||||
|
||||
@require_torch
|
||||
class ScheduleInitTest(unittest.TestCase):
|
||||
m = torch.nn.Linear(50, 50) if is_torch_available() else None
|
||||
optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None
|
||||
|
||||
90
transformers/tests/optimization_tf_test.py
Normal file
90
transformers/tests/optimization_tf_test.py
Normal file
@@ -0,0 +1,90 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import is_tf_available
|
||||
|
||||
from .utils import require_tf
|
||||
|
||||
if is_tf_available():
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import ops
|
||||
from transformers import (create_optimizer, GradientAccumulator)
|
||||
|
||||
|
||||
@require_tf
|
||||
class OptimizationFTest(unittest.TestCase):
|
||||
def assertListAlmostEqual(self, list1, list2, tol):
|
||||
self.assertEqual(len(list1), len(list2))
|
||||
for a, b in zip(list1, list2):
|
||||
self.assertAlmostEqual(a, b, delta=tol)
|
||||
|
||||
def testGradientAccumulator(self):
|
||||
accumulator = GradientAccumulator()
|
||||
accumulator([tf.constant([1.0, 2.0])])
|
||||
accumulator([tf.constant([-2.0, 1.0])])
|
||||
accumulator([tf.constant([-1.0, 2.0])])
|
||||
with self.assertRaises(ValueError):
|
||||
accumulator([tf.constant([1.0, 1.0]), tf.constant([2.0, 2.0])])
|
||||
self.assertEqual(accumulator.step, 3)
|
||||
self.assertEqual(len(accumulator.gradients), 1)
|
||||
self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [-2.0, 5.0], tol=1e-2)
|
||||
accumulator.reset()
|
||||
self.assertEqual(accumulator.step, 0)
|
||||
self.assertListAlmostEqual(accumulator.gradients[0].numpy().tolist(), [0.0, 0.0], tol=1e-2)
|
||||
|
||||
def testGradientAccumulatorDistributionStrategy(self):
|
||||
context._context = None
|
||||
ops.enable_eager_execution_internal()
|
||||
physical_devices = tf.config.experimental.list_physical_devices("CPU")
|
||||
tf.config.experimental.set_virtual_device_configuration(
|
||||
physical_devices[0],
|
||||
[tf.config.experimental.VirtualDeviceConfiguration(),
|
||||
tf.config.experimental.VirtualDeviceConfiguration()])
|
||||
|
||||
devices = tf.config.experimental.list_logical_devices(device_type="CPU")
|
||||
strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices])
|
||||
|
||||
with strategy.scope():
|
||||
accumulator = GradientAccumulator()
|
||||
variable = tf.Variable([4.0, 3.0])
|
||||
optimizer = create_optimizer(5e-5, 10, 5)
|
||||
gradient_placeholder = tf.Variable([0.0, 0.0], trainable=False)
|
||||
|
||||
def accumulate_on_replica(gradient):
|
||||
accumulator([gradient])
|
||||
|
||||
def apply_on_replica():
|
||||
optimizer.apply_gradients(list(zip(accumulator.gradients, [variable])), 1.0)
|
||||
|
||||
@tf.function
|
||||
def accumulate(grad1, grad2):
|
||||
with strategy.scope():
|
||||
gradient_placeholder.values[0].assign(grad1)
|
||||
gradient_placeholder.values[1].assign(grad2)
|
||||
strategy.experimental_run_v2(accumulate_on_replica, args=(gradient_placeholder,))
|
||||
|
||||
@tf.function
|
||||
def apply_grad():
|
||||
with strategy.scope():
|
||||
strategy.experimental_run_v2(apply_on_replica)
|
||||
|
||||
accumulate([1.0, 2.0], [-1.0, 1.0])
|
||||
accumulate([3.0, -1.0], [-1.0, -1.0])
|
||||
accumulate([-2.0, 2.0], [3.0, -2.0])
|
||||
self.assertEqual(accumulator.step, 3)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [2.0, 3.0], tol=1e-2)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [1.0, -2.0], tol=1e-2)
|
||||
apply_grad()
|
||||
self.assertListAlmostEqual(variable.value().numpy().tolist(), [4.0, 3.0], tol=1e-2)
|
||||
accumulator.reset()
|
||||
self.assertEqual(accumulator.step, 0)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[0].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
|
||||
self.assertListAlmostEqual(accumulator._gradients[0].values[1].value().numpy().tolist(), [0.0, 0.0], tol=1e-2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
78
transformers/tests/tokenization_albert_test.py
Normal file
78
transformers/tests/tokenization_albert_test.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019 Hugging Face inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE)
|
||||
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
|
||||
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
|
||||
'fixtures/spiece.model')
|
||||
|
||||
class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
tokenizer_class = AlbertTokenizer
|
||||
|
||||
def setUp(self):
|
||||
super(AlbertTokenizationTest, self).setUp()
|
||||
|
||||
# We have a SentencePiece fixture for testing
|
||||
tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"this is a test"
|
||||
output_text = u"this is a test"
|
||||
return input_text, output_text
|
||||
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
|
||||
tokens = tokenizer.tokenize(u'This is a test')
|
||||
self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])
|
||||
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
|
||||
|
||||
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
|
||||
self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'])
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
|
||||
|
||||
back_tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||
self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', '<unk>', '.'])
|
||||
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
|
||||
|
||||
text = tokenizer.encode("sequence builders")
|
||||
text_2 = tokenizer.encode("multi-sequence build")
|
||||
|
||||
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||
|
||||
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
|
||||
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id]
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -18,15 +18,16 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
|
||||
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
from .utils import slow
|
||||
|
||||
|
||||
class AutoTokenizerTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_tokenizer_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
|
||||
|
||||
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
|
||||
import os
|
||||
import unittest
|
||||
import pytest
|
||||
from io import open
|
||||
|
||||
from transformers.tokenization_bert import (BasicTokenizer,
|
||||
@@ -26,6 +25,7 @@ from transformers.tokenization_bert import (BasicTokenizer,
|
||||
_is_whitespace, VOCAB_FILES_NAMES)
|
||||
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
from .utils import slow
|
||||
|
||||
class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
@@ -126,7 +126,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
self.assertFalse(_is_punctuation(u"A"))
|
||||
self.assertFalse(_is_punctuation(u" "))
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
|
||||
|
||||
|
||||
@@ -16,13 +16,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
|
||||
import os
|
||||
import unittest
|
||||
import pytest
|
||||
from io import open
|
||||
|
||||
from transformers.tokenization_distilbert import (DistilBertTokenizer)
|
||||
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
from .tokenization_bert_test import BertTokenizationTest
|
||||
from .utils import slow
|
||||
|
||||
class DistilBertTokenizationTest(BertTokenizationTest):
|
||||
|
||||
@@ -31,7 +31,7 @@ class DistilBertTokenizationTest(BertTokenizationTest):
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
||||
|
||||
|
||||
@@ -17,11 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
import os
|
||||
import json
|
||||
import unittest
|
||||
import pytest
|
||||
from io import open
|
||||
|
||||
from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
from .utils import slow
|
||||
|
||||
|
||||
class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
@@ -79,7 +79,7 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
|
||||
)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
||||
|
||||
|
||||
@@ -102,14 +102,48 @@ class CommonTestCases:
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
|
||||
filename = os.path.join(tmpdirname, u"tokenizer.bin")
|
||||
pickle.dump(tokenizer, open(filename, "wb"))
|
||||
with open(filename, "wb") as handle:
|
||||
pickle.dump(tokenizer, handle)
|
||||
|
||||
tokenizer_new = pickle.load(open(filename, "rb"))
|
||||
with open(filename, "rb") as handle:
|
||||
tokenizer_new = pickle.load(handle)
|
||||
|
||||
subwords_loaded = tokenizer_new.tokenize(text)
|
||||
|
||||
self.assertListEqual(subwords, subwords_loaded)
|
||||
|
||||
def test_added_tokens_do_lower_case(self):
|
||||
tokenizer = self.get_tokenizer(do_lower_case=True)
|
||||
|
||||
special_token = tokenizer.all_special_tokens[0]
|
||||
|
||||
text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
|
||||
text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
|
||||
|
||||
toks0 = tokenizer.tokenize(text) # toks before adding new_toks
|
||||
|
||||
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD']
|
||||
added = tokenizer.add_tokens(new_toks)
|
||||
self.assertEqual(added, 2)
|
||||
|
||||
toks = tokenizer.tokenize(text)
|
||||
toks2 = tokenizer.tokenize(text2)
|
||||
|
||||
self.assertEqual(len(toks), len(toks2))
|
||||
self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer
|
||||
self.assertListEqual(toks, toks2)
|
||||
|
||||
tokenizer = self.get_tokenizer(do_lower_case=False)
|
||||
|
||||
added = tokenizer.add_tokens(new_toks)
|
||||
self.assertEqual(added, 4)
|
||||
|
||||
toks = tokenizer.tokenize(text)
|
||||
toks2 = tokenizer.tokenize(text2)
|
||||
|
||||
self.assertEqual(len(toks), len(toks2)) # Length should still be the same
|
||||
self.assertNotEqual(len(toks), len(toks0))
|
||||
self.assertNotEqual(toks[1], toks2[1]) # But at least the first non-special tokens should differ
|
||||
|
||||
def test_add_tokens_tokenizer(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
@@ -243,7 +277,11 @@ class CommonTestCases:
|
||||
sequence = tokenizer.encode(seq_0, add_special_tokens=False)
|
||||
num_added_tokens = tokenizer.num_added_tokens()
|
||||
total_length = len(sequence) + num_added_tokens
|
||||
information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
|
||||
information = tokenizer.encode_plus(seq_0,
|
||||
max_length=total_length - 2,
|
||||
add_special_tokens=True,
|
||||
stride=stride,
|
||||
return_overflowing_tokens=True)
|
||||
|
||||
truncated_sequence = information["input_ids"]
|
||||
overflowing_tokens = information["overflowing_tokens"]
|
||||
@@ -270,10 +308,12 @@ class CommonTestCases:
|
||||
)
|
||||
|
||||
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
|
||||
stride=stride, truncation_strategy='only_second')
|
||||
stride=stride, truncation_strategy='only_second',
|
||||
return_overflowing_tokens=True)
|
||||
information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
|
||||
add_special_tokens=True, stride=stride,
|
||||
truncation_strategy='only_first')
|
||||
truncation_strategy='only_first',
|
||||
return_overflowing_tokens=True)
|
||||
|
||||
truncated_sequence = information["input_ids"]
|
||||
overflowing_tokens = information["overflowing_tokens"]
|
||||
@@ -305,7 +345,7 @@ class CommonTestCases:
|
||||
|
||||
# Testing single inputs
|
||||
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
|
||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True)
|
||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||
@@ -317,7 +357,8 @@ class CommonTestCases:
|
||||
# Testing inputs pairs
|
||||
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1,
|
||||
add_special_tokens=False)
|
||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True,
|
||||
return_special_tokens_mask=True)
|
||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||
@@ -329,7 +370,9 @@ class CommonTestCases:
|
||||
# Testing with already existing special tokens
|
||||
if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
|
||||
tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
|
||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0,
|
||||
add_special_tokens=True,
|
||||
return_special_tokens_mask=True)
|
||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||
special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
|
||||
special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
|
||||
|
||||
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
|
||||
import os
|
||||
import unittest
|
||||
import pytest
|
||||
from io import open
|
||||
|
||||
from transformers import is_torch_available
|
||||
@@ -24,11 +23,12 @@ from transformers import is_torch_available
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
|
||||
else:
|
||||
pytestmark = pytest.mark.skip("Require Torch") # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save
|
||||
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
from .utils import require_torch
|
||||
|
||||
|
||||
@require_torch
|
||||
class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
|
||||
|
||||
@@ -18,13 +18,14 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import six
|
||||
import pytest
|
||||
|
||||
from transformers import PreTrainedTokenizer
|
||||
from transformers.tokenization_gpt2 import GPT2Tokenizer
|
||||
|
||||
from .utils import slow
|
||||
|
||||
class TokenizerUtilsTest(unittest.TestCase):
|
||||
@pytest.mark.slow
|
||||
|
||||
def check_tokenizer_from_pretrained(self, tokenizer_class):
|
||||
s3_models = list(tokenizer_class.max_model_input_sizes.keys())
|
||||
for model_name in s3_models[:1]:
|
||||
@@ -41,6 +42,7 @@ class TokenizerUtilsTest(unittest.TestCase):
|
||||
special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
|
||||
self.assertIsInstance(special_tok_id, int)
|
||||
|
||||
@slow
|
||||
def test_pretrained_tokenizers(self):
|
||||
self.check_tokenizer_from_pretrained(GPT2Tokenizer)
|
||||
|
||||
|
||||
@@ -17,11 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
import os
|
||||
import unittest
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
|
||||
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
from .utils import slow
|
||||
|
||||
class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
@@ -67,7 +67,7 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
|
||||
|
||||
|
||||
@@ -16,11 +16,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
|
||||
import os
|
||||
import unittest
|
||||
import pytest
|
||||
|
||||
from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
|
||||
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
from .utils import slow
|
||||
|
||||
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
|
||||
'fixtures/test_sentencepiece.model')
|
||||
@@ -90,7 +90,7 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
|
||||
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
|
||||
|
||||
@pytest.mark.slow
|
||||
@slow
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
|
||||
|
||||
|
||||
64
transformers/tests/utils.py
Normal file
64
transformers/tests/utils.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from distutils.util import strtobool
|
||||
|
||||
from transformers.file_utils import _tf_available, _torch_available
|
||||
|
||||
|
||||
try:
|
||||
run_slow = os.environ["RUN_SLOW"]
|
||||
except KeyError:
|
||||
# RUN_SLOW isn't set, default to skipping slow tests.
|
||||
_run_slow_tests = False
|
||||
else:
|
||||
# RUN_SLOW is set, convert it to True or False.
|
||||
try:
|
||||
_run_slow_tests = strtobool(run_slow)
|
||||
except ValueError:
|
||||
# More values are supported, but let's keep the message simple.
|
||||
raise ValueError("If set, RUN_SLOW must be yes or no.")
|
||||
|
||||
|
||||
def slow(test_case):
|
||||
"""
|
||||
Decorator marking a test as slow.
|
||||
|
||||
Slow tests are skipped by default. Set the RUN_SLOW environment variable
|
||||
to a truthy value to run them.
|
||||
|
||||
"""
|
||||
if not _run_slow_tests:
|
||||
test_case = unittest.skip("test is slow")(test_case)
|
||||
return test_case
|
||||
|
||||
|
||||
def require_torch(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires PyTorch.
|
||||
|
||||
These tests are skipped when PyTorch isn't installed.
|
||||
|
||||
"""
|
||||
if not _torch_available:
|
||||
test_case = unittest.skip("test requires PyTorch")(test_case)
|
||||
return test_case
|
||||
|
||||
|
||||
def require_tf(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires TensorFlow.
|
||||
|
||||
These tests are skipped when TensorFlow isn't installed.
|
||||
|
||||
"""
|
||||
if not _tf_available:
|
||||
test_case = unittest.skip("test requires TensorFlow")(test_case)
|
||||
return test_case
|
||||
|
||||
|
||||
if _torch_available:
|
||||
# Set the USE_CUDA environment variable to select a GPU.
|
||||
torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
|
||||
else:
|
||||
torch_device = None
|
||||
252
transformers/tokenization_albert.py
Normal file
252
transformers/tokenization_albert.py
Normal file
@@ -0,0 +1,252 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Tokenization classes for ALBERT model."""
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
import logging
|
||||
import unicodedata
|
||||
import six
|
||||
import os
|
||||
from shutil import copyfile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
'vocab_file':
|
||||
{
|
||||
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
|
||||
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
|
||||
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
|
||||
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
|
||||
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
|
||||
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
|
||||
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
|
||||
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
|
||||
}
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
'albert-base-v1': 512,
|
||||
'albert-large-v1': 512,
|
||||
'albert-xlarge-v1': 512,
|
||||
'albert-xxlarge-v1': 512,
|
||||
'albert-base-v2': 512,
|
||||
'albert-large-v2': 512,
|
||||
'albert-xlarge-v2': 512,
|
||||
'albert-xxlarge-v2': 512,
|
||||
}
|
||||
|
||||
SPIECE_UNDERLINE = u'▁'
|
||||
|
||||
class AlbertTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
SentencePiece based tokenizer. Peculiarities:
|
||||
|
||||
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
||||
"""
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
|
||||
def __init__(self, vocab_file,
|
||||
do_lower_case=True, remove_space=True, keep_accents=False,
|
||||
bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
|
||||
pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
|
||||
super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
|
||||
unk_token=unk_token, sep_token=sep_token,
|
||||
pad_token=pad_token, cls_token=cls_token,
|
||||
mask_token=mask_token, **kwargs)
|
||||
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
|
||||
try:
|
||||
import sentencepiece as spm
|
||||
except ImportError:
|
||||
logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
|
||||
"pip install sentencepiece")
|
||||
|
||||
self.do_lower_case = do_lower_case
|
||||
self.remove_space = remove_space
|
||||
self.keep_accents = keep_accents
|
||||
self.vocab_file = vocab_file
|
||||
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return len(self.sp_model)
|
||||
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
return state
|
||||
|
||||
def __setstate__(self, d):
|
||||
self.__dict__ = d
|
||||
try:
|
||||
import sentencepiece as spm
|
||||
except ImportError:
|
||||
logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
|
||||
"pip install sentencepiece")
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(self.vocab_file)
|
||||
|
||||
def preprocess_text(self, inputs):
|
||||
if self.remove_space:
|
||||
outputs = ' '.join(inputs.strip().split())
|
||||
else:
|
||||
outputs = inputs
|
||||
outputs = outputs.replace("``", '"').replace("''", '"')
|
||||
|
||||
if six.PY2 and isinstance(outputs, str):
|
||||
outputs = outputs.decode('utf-8')
|
||||
|
||||
if not self.keep_accents:
|
||||
outputs = unicodedata.normalize('NFKD', outputs)
|
||||
outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
|
||||
if self.do_lower_case:
|
||||
outputs = outputs.lower()
|
||||
|
||||
return outputs
|
||||
|
||||
def _tokenize(self, text, return_unicode=True, sample=False):
|
||||
""" Tokenize a string.
|
||||
return_unicode is used only for py2
|
||||
"""
|
||||
text = self.preprocess_text(text)
|
||||
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
|
||||
if six.PY2 and isinstance(text, unicode):
|
||||
text = text.encode('utf-8')
|
||||
|
||||
if not sample:
|
||||
pieces = self.sp_model.EncodeAsPieces(text)
|
||||
else:
|
||||
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
|
||||
new_pieces = []
|
||||
for piece in pieces:
|
||||
if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
|
||||
cur_pieces = self.sp_model.EncodeAsPieces(
|
||||
piece[:-1].replace(SPIECE_UNDERLINE, ''))
|
||||
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
|
||||
if len(cur_pieces[0]) == 1:
|
||||
cur_pieces = cur_pieces[1:]
|
||||
else:
|
||||
cur_pieces[0] = cur_pieces[0][1:]
|
||||
cur_pieces.append(piece[-1])
|
||||
new_pieces.extend(cur_pieces)
|
||||
else:
|
||||
new_pieces.append(piece)
|
||||
|
||||
# note(zhiliny): convert back to unicode for py2
|
||||
if six.PY2 and return_unicode:
|
||||
ret_pieces = []
|
||||
for piece in new_pieces:
|
||||
if isinstance(piece, str):
|
||||
piece = piece.decode('utf-8')
|
||||
ret_pieces.append(piece)
|
||||
new_pieces = ret_pieces
|
||||
|
||||
return new_pieces
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
return self.sp_model.PieceToId(token)
|
||||
|
||||
def _convert_id_to_token(self, index, return_unicode=True):
|
||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
||||
token = self.sp_model.IdToPiece(index)
|
||||
if six.PY2 and return_unicode and isinstance(token, str):
|
||||
token = token.decode('utf-8')
|
||||
return token
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
||||
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
|
||||
return out_string
|
||||
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||
by concatenating and adding special tokens.
|
||||
An ALBERT sequence has the following format:
|
||||
single sequence: [CLS] X [SEP]
|
||||
pair of sequences: [CLS] A [SEP] B [SEP]
|
||||
"""
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
if token_ids_1 is None:
|
||||
return cls + token_ids_0 + sep
|
||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||
|
||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||
"""
|
||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||
|
||||
Args:
|
||||
token_ids_0: list of ids (must not contain special tokens)
|
||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||
for sequence pairs
|
||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
||||
special tokens for the model
|
||||
|
||||
Returns:
|
||||
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
|
||||
"""
|
||||
|
||||
if already_has_special_tokens:
|
||||
if token_ids_1 is not None:
|
||||
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||
"ids is already formated with special tokens for the model.")
|
||||
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||
|
||||
if token_ids_1 is not None:
|
||||
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||
|
||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||
"""
|
||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||
An ALBERT sequence pair mask has the following format:
|
||||
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
|
||||
| first sequence | second sequence
|
||||
|
||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||
"""
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
|
||||
if token_ids_1 is None:
|
||||
return len(cls + token_ids_0 + sep) * [0]
|
||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
||||
to a directory.
|
||||
"""
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||
return
|
||||
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
|
||||
|
||||
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
||||
copyfile(self.vocab_file, out_vocab_file)
|
||||
|
||||
return (out_vocab_file,)
|
||||
@@ -28,6 +28,7 @@ from .tokenization_xlm import XLMTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
from .tokenization_distilbert import DistilBertTokenizer
|
||||
from .tokenization_camembert import CamembertTokenizer
|
||||
from .tokenization_albert import AlbertTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -42,16 +43,17 @@ class AutoTokenizer(object):
|
||||
|
||||
The tokenizer class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `camembert`: CamembertTokenizer (CamemBERT model)
|
||||
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
|
||||
- contains `albert`: AlbertTokenizer (ALBERT model)
|
||||
- contains `camembert`: CamembertTokenizer (CamemBERT model)
|
||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
|
||||
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
||||
- contains `xlm`: XLMTokenizer (XLM model)
|
||||
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
"""
|
||||
@@ -66,16 +68,17 @@ class AutoTokenizer(object):
|
||||
|
||||
The tokenizer class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `camembert`: CamembertTokenizer (CamemBERT model)
|
||||
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
|
||||
- contains `albert`: AlbertTokenizer (ALBERT model)
|
||||
- contains `camembert`: CamembertTokenizer (CamemBERT model)
|
||||
- contains `roberta`: RobertaTokenizer (RoBERTa model)
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
|
||||
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
||||
- contains `xlm`: XLMTokenizer (XLM model)
|
||||
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
|
||||
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
@@ -90,6 +93,9 @@ class AutoTokenizer(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the vocabulary files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -106,6 +112,8 @@ class AutoTokenizer(object):
|
||||
"""
|
||||
if 'distilbert' in pretrained_model_name_or_path:
|
||||
return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'albert' in pretrained_model_name_or_path:
|
||||
return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'camembert' in pretrained_model_name_or_path:
|
||||
return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'roberta' in pretrained_model_name_or_path:
|
||||
@@ -126,4 +134,4 @@ class AutoTokenizer(object):
|
||||
return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))
|
||||
"'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
|
||||
|
||||
@@ -51,7 +51,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
|
||||
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
|
||||
additional_special_tokens=['<s>NOTUSED', '<s>NOTUSED'], **kwargs):
|
||||
additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
|
||||
super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
|
||||
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
|
||||
mask_token=mask_token, additional_special_tokens=additional_special_tokens,
|
||||
@@ -125,7 +125,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.fairseq_offset + len(self.sp_model)
|
||||
return len(self.fairseq_tokens_to_ids) + len(self.sp_model)
|
||||
|
||||
def _tokenize(self, text):
|
||||
return self.sp_model.EncodeAsPieces(text)
|
||||
@@ -134,6 +134,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
if token in self.fairseq_tokens_to_ids:
|
||||
return self.fairseq_tokens_to_ids[token]
|
||||
elif self.sp_model.PieceToId(token) == 0:
|
||||
# Convert sentence piece unk token to fairseq unk token index
|
||||
return self.unk_token_id
|
||||
return self.fairseq_offset + self.sp_model.PieceToId(token)
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
|
||||
@@ -133,9 +133,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
||||
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
with open(merges_file, encoding='utf-8') as merges_handle:
|
||||
merges = merges_handle.read().split('\n')[1:-1]
|
||||
merges = [tuple(merge.split()) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
@@ -192,9 +194,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
split_tokens = []
|
||||
|
||||
text = text.split(' ')
|
||||
words = re.findall(r'\S+\n?', text)
|
||||
|
||||
for token in text:
|
||||
for token in words:
|
||||
split_tokens.extend([t for t in self.bpe(token).split(' ')])
|
||||
return split_tokens
|
||||
|
||||
|
||||
@@ -33,12 +33,16 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
{
|
||||
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
|
||||
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
|
||||
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
|
||||
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
|
||||
}
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
'distilbert-base-uncased': 512,
|
||||
'distilbert-base-uncased-distilled-squad': 512,
|
||||
'distilbert-base-german-cased': 512,
|
||||
'distilbert-base-multilingual-cased': 512,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings.
|
||||
We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
|
||||
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
@@ -107,10 +107,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
GPT-2 BPE tokenizer. Peculiarities:
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => the encoding methods should be called with the
|
||||
- Requires a space to start the input string => the encoding and tokenize methods should be called with the
|
||||
``add_prefix_space`` flag set to ``True``.
|
||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
||||
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
|
||||
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
|
||||
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
|
||||
"""
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
@@ -122,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||
with open(merges_file, encoding='utf-8') as merges_handle:
|
||||
bpe_merges = merges_handle.read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
self.cache = {}
|
||||
|
||||
@@ -184,7 +186,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
""" Tokenize a string.
|
||||
Args:
|
||||
- add_prefix_space (boolean, default False):
|
||||
Begin the sentence with at least one space toto get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
|
||||
Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
|
||||
"""
|
||||
if add_prefix_space:
|
||||
text = ' ' + text
|
||||
@@ -234,4 +236,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||
index += 1
|
||||
|
||||
return vocab_file, merge_file
|
||||
return vocab_file, merge_file
|
||||
|
||||
@@ -101,9 +101,11 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
||||
self.nlp = BasicTokenizer(do_lower_case=True)
|
||||
self.fix_text = None
|
||||
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
with open(merges_file, encoding='utf-8') as merges_handle:
|
||||
merges = merges_handle.read().split('\n')[1:-1]
|
||||
merges = [tuple(merge.split()) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
@@ -22,6 +22,7 @@ import json
|
||||
import six
|
||||
import copy
|
||||
import itertools
|
||||
import re
|
||||
from io import open
|
||||
|
||||
from .file_utils import cached_path, is_tf_available, is_torch_available
|
||||
@@ -263,6 +264,9 @@ class PreTrainedTokenizer(object):
|
||||
force_download: (`optional`) boolean, default False:
|
||||
Force to (re-)download the vocabulary files and override the cached versions if they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
|
||||
The proxies are used on each request.
|
||||
@@ -298,6 +302,7 @@ class PreTrainedTokenizer(object):
|
||||
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
force_download = kwargs.pop('force_download', False)
|
||||
resume_download = kwargs.pop('resume_download', False)
|
||||
proxies = kwargs.pop('proxies', None)
|
||||
|
||||
s3_models = list(cls.max_model_input_sizes.keys())
|
||||
@@ -354,7 +359,7 @@ class PreTrainedTokenizer(object):
|
||||
"We assumed '{}' was a path or url to a directory containing vocabulary files "
|
||||
"named {} but couldn't find such vocabulary files at this path or url.".format(
|
||||
pretrained_model_name_or_path, ', '.join(s3_models),
|
||||
pretrained_model_name_or_path,
|
||||
pretrained_model_name_or_path,
|
||||
list(cls.vocab_files_names.values())))
|
||||
|
||||
# Get files from url, cache, or disk depending on the case
|
||||
@@ -364,7 +369,7 @@ class PreTrainedTokenizer(object):
|
||||
if file_path is None:
|
||||
resolved_vocab_files[file_id] = None
|
||||
else:
|
||||
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download)
|
||||
except EnvironmentError:
|
||||
if pretrained_model_name_or_path in s3_models:
|
||||
msg = "Couldn't reach server at '{}' to download vocabulary files."
|
||||
@@ -389,7 +394,8 @@ class PreTrainedTokenizer(object):
|
||||
# Did we saved some inputs and kwargs to reload ?
|
||||
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
|
||||
if tokenizer_config_file is not None:
|
||||
init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
|
||||
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
|
||||
init_kwargs = json.load(tokenizer_config_handle)
|
||||
saved_init_inputs = init_kwargs.pop('init_inputs', ())
|
||||
if not init_inputs:
|
||||
init_inputs = saved_init_inputs
|
||||
@@ -414,7 +420,8 @@ class PreTrainedTokenizer(object):
|
||||
if args_name not in init_kwargs:
|
||||
init_kwargs[args_name] = file_path
|
||||
if special_tokens_map_file is not None:
|
||||
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
|
||||
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
|
||||
special_tokens_map = json.load(special_tokens_map_handle)
|
||||
for key, value in special_tokens_map.items():
|
||||
if key not in init_kwargs:
|
||||
init_kwargs[key] = value
|
||||
@@ -428,7 +435,8 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
# Add supplementary tokens.
|
||||
if added_tokens_file is not None:
|
||||
added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
|
||||
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
|
||||
added_tok_encoder = json.load(added_tokens_handle)
|
||||
added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
|
||||
tokenizer.added_tokens_encoder.update(added_tok_encoder)
|
||||
tokenizer.added_tokens_decoder.update(added_tok_decoder)
|
||||
@@ -524,6 +532,8 @@ class PreTrainedTokenizer(object):
|
||||
to_add_tokens = []
|
||||
for token in new_tokens:
|
||||
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
|
||||
if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens:
|
||||
token = token.lower()
|
||||
if token != self.unk_token and \
|
||||
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
|
||||
token not in to_add_tokens:
|
||||
@@ -621,6 +631,19 @@ class PreTrainedTokenizer(object):
|
||||
return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
|
||||
**kwargs: passed to the child `self.tokenize()` method
|
||||
"""
|
||||
def lowercase_text(t):
|
||||
# convert non-special tokens to lowercase
|
||||
escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
|
||||
pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
|
||||
r'(.+?)'
|
||||
return re.sub(
|
||||
pattern,
|
||||
lambda m: m.groups()[0] or m.groups()[1].lower(),
|
||||
t)
|
||||
|
||||
if self.init_kwargs.get('do_lower_case', False):
|
||||
text = lowercase_text(text)
|
||||
|
||||
def split_on_token(tok, text):
|
||||
result = []
|
||||
split_text = text.split(tok)
|
||||
@@ -823,7 +846,6 @@ class PreTrainedTokenizer(object):
|
||||
``input_ids``: list of token ids to be fed to a model
|
||||
``token_type_ids``: list of token type ids to be fed to a model
|
||||
``attention_mask``: list of indices specifying which tokens should be attended to by the model
|
||||
|
||||
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
|
||||
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
|
||||
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
|
||||
@@ -904,15 +926,18 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
{
|
||||
input_ids: list[int],
|
||||
overflowing_tokens: list[int] if a ``max_length`` is specified, else None
|
||||
special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
|
||||
token_type_ids: list[int] if return_token_type_ids is True (default)
|
||||
overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
|
||||
num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
|
||||
special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True
|
||||
}
|
||||
|
||||
With the fields:
|
||||
``input_ids``: list of tokens to be fed to a model
|
||||
``input_ids``: list of token ids to be fed to a model
|
||||
``token_type_ids``: list of token type ids to be fed to a model
|
||||
|
||||
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
|
||||
|
||||
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
|
||||
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
|
||||
tokens and 1 specifying sequence tokens.
|
||||
"""
|
||||
@@ -921,23 +946,31 @@ class PreTrainedTokenizer(object):
|
||||
len_pair_ids = len(pair_ids) if pair else 0
|
||||
|
||||
encoded_inputs = {}
|
||||
|
||||
# Handle max sequence length
|
||||
total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
|
||||
if max_length and total_len > max_length:
|
||||
ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
|
||||
num_tokens_to_remove=total_len-max_length,
|
||||
truncation_strategy=truncation_strategy,
|
||||
stride=stride)
|
||||
encoded_inputs["overflowing_tokens"] = overflowing_tokens
|
||||
encoded_inputs["num_truncated_tokens"] = total_len - max_length
|
||||
if return_overflowing_tokens:
|
||||
encoded_inputs["overflowing_tokens"] = overflowing_tokens
|
||||
encoded_inputs["num_truncated_tokens"] = total_len - max_length
|
||||
|
||||
# Handle special_tokens
|
||||
if add_special_tokens:
|
||||
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
|
||||
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
|
||||
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
|
||||
special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
|
||||
else:
|
||||
sequence = ids + pair_ids if pair else ids
|
||||
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
|
||||
special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
|
||||
if return_special_tokens_mask:
|
||||
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
|
||||
|
||||
# Prepare inputs as tensors if asked
|
||||
if return_tensors == 'tf' and is_tf_available():
|
||||
sequence = tf.constant([sequence])
|
||||
token_type_ids = tf.constant([token_type_ids])
|
||||
@@ -948,12 +981,15 @@ class PreTrainedTokenizer(object):
|
||||
logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
|
||||
|
||||
encoded_inputs["input_ids"] = sequence
|
||||
encoded_inputs["token_type_ids"] = token_type_ids
|
||||
if return_token_type_ids:
|
||||
encoded_inputs["token_type_ids"] = token_type_ids
|
||||
|
||||
if max_length and len(encoded_inputs["input_ids"]) > max_length:
|
||||
encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
|
||||
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
|
||||
if return_token_type_ids:
|
||||
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
|
||||
if return_special_tokens_mask:
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
|
||||
|
||||
if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
|
||||
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
|
||||
@@ -995,7 +1031,7 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
elif return_attention_mask:
|
||||
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
|
||||
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
|
||||
@@ -1039,7 +1075,6 @@ class PreTrainedTokenizer(object):
|
||||
return (ids, pair_ids, overflowing_tokens)
|
||||
|
||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||
logger.warning("This tokenizer does not make use of special tokens.")
|
||||
if token_ids_1 is None:
|
||||
return len(token_ids_0) * [0]
|
||||
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|
||||
@@ -1052,7 +1087,6 @@ class PreTrainedTokenizer(object):
|
||||
single sequence: <s> X </s>
|
||||
pair of sequences: <s> A </s></s> B </s>
|
||||
"""
|
||||
logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
|
||||
if token_ids_1 is None:
|
||||
return token_ids_0
|
||||
return token_ids_0 + token_ids_1
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for OpenAI GPT."""
|
||||
"""Tokenization classes for XLM."""
|
||||
from __future__ import (absolute_import, division, print_function,
|
||||
unicode_literals)
|
||||
|
||||
@@ -524,7 +524,7 @@ class XLMTokenizer(PreTrainedTokenizer):
|
||||
|
||||
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
||||
(ex: "__classify__") to a vocabulary
|
||||
|
||||
|
||||
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
|
||||
|
||||
- `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
|
||||
@@ -564,9 +564,11 @@ class XLMTokenizer(PreTrainedTokenizer):
|
||||
self.ja_word_tokenizer = None
|
||||
self.zh_word_tokenizer = None
|
||||
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
|
||||
with open(merges_file, encoding='utf-8') as merges_handle:
|
||||
merges = merges_handle.read().split('\n')[:-1]
|
||||
merges = [tuple(merge.split()[:2]) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
@@ -758,9 +760,9 @@ class XLMTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||
by concatenating and adding special tokens.
|
||||
A RoBERTa sequence has the following format:
|
||||
A XLM sequence has the following format:
|
||||
single sequence: <s> X </s>
|
||||
pair of sequences: <s> A </s></s> B </s>
|
||||
pair of sequences: <s> A </s> B </s>
|
||||
"""
|
||||
if token_ids_1 is None:
|
||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||
|
||||
@@ -143,7 +143,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
|
||||
new_pieces = []
|
||||
for piece in pieces:
|
||||
if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
|
||||
if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
|
||||
cur_pieces = self.sp_model.EncodeAsPieces(
|
||||
piece[:-1].replace(SPIECE_UNDERLINE, ''))
|
||||
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
|
||||
@@ -187,9 +187,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||
by concatenating and adding special tokens.
|
||||
A RoBERTa sequence has the following format:
|
||||
single sequence: <s> X </s>
|
||||
pair of sequences: <s> A </s></s> B </s>
|
||||
An XLNet sequence has the following format:
|
||||
single sequence: X <sep> <cls>
|
||||
pair of sequences: A <sep> B <sep> <cls>
|
||||
"""
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
@@ -226,10 +226,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||
"""
|
||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||
A BERT sequence pair mask has the following format:
|
||||
An XLNet sequence pair mask has the following format:
|
||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
||||
| first sequence | second sequence | CLS segment ID
|
||||
|
||||
|
||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||
"""
|
||||
sep = [self.sep_token_id]
|
||||
|
||||
Reference in New Issue
Block a user