[fsmt] rewrite SinusoidalPositionalEmbedding + USE_CUDA test fixes + new TranslationPipeline test (#7224)

* fix USE_CUDA, add pipeline * USE_CUDA fix * recode SinusoidalPositionalEmbedding into nn.Embedding subclass was needed for torchscript to work - this is now part of the state_dict, so will have to remove these keys during save_pretrained * back out (ci debug) * restore * slow last? * facilitate not saving certain keys and test * remove no longer used keys * style * fix logging import * cleanup * Update src/transformers/modeling_utils.py Co-authored-by: Sam Shleifer <sshleifer@gmail.com> * fix bug in max_positional_embeddings * rename keys to keys_to_never_save per suggestion, improve the setup * Update src/transformers/modeling_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sam Shleifer <sshleifer@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-21 06:13:35 -07:00
parent 67c4b0c517
commit 8ff88d25e9
4 changed files with 129 additions and 96 deletions
--- a/src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
@@ -31,16 +31,14 @@ import torch
 from fairseq import hub_utils
 from fairseq.data.dictionary import Dictionary
-from transformers import WEIGHTS_NAME
+from transformers import WEIGHTS_NAME, logging
 from transformers.configuration_fsmt import FSMTConfig
 from transformers.modeling_fsmt import FSMTForConditionalGeneration
 from transformers.tokenization_fsmt import VOCAB_FILES_NAMES
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from .utils import logging
-
+logging.set_verbosity_warning()
 logging.set_verbosity_info()
 json_indent = 2
@@ -229,6 +227,8 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
        "model.decoder.version",
        "model.encoder_embed_tokens.weight",
        "model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)
--- a/src/transformers/modeling_fsmt.py
+++ b/src/transformers/modeling_fsmt.py
@@ -397,24 +397,18 @@ class FSMTEncoder(nn.Module):
    def __init__(self, config: FSMTConfig, embed_tokens):
        super().__init__()
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
-
+        self.padding_idx = embed_tokens.padding_idx
        self.embed_tokens = embed_tokens
        embed_dim = embed_tokens.embedding_dim
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = config.max_position_embeddings
        self.embed_tokens = embed_tokens
        # print(config.max_position_embeddings, embed_dim, self.padding_idx)
        num_embeddings = config.src_vocab_size
        self.embed_positions = SinusoidalPositionalEmbedding(
-            embed_dim,
+            config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
            self.padding_idx,
            init_size=num_embeddings + self.padding_idx + 1,  # removed: config.max_position_embeddings
        )
-        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layers = nn.ModuleList(
            [EncoderLayer(config) for _ in range(config.encoder_layers)]
        )  # type: List[EncoderLayer]
    def forward(
        self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False
@@ -570,15 +564,11 @@ class FSMTDecoder(nn.Module):
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.embed_tokens = embed_tokens
        embed_dim = embed_tokens.embedding_dim
        num_embeddings = config.tgt_vocab_size
        self.embed_positions = SinusoidalPositionalEmbedding(
-            embed_dim,
+            config.max_position_embeddings + self.padding_idx + 1, embed_dim, self.padding_idx
            self.padding_idx,
            init_size=num_embeddings + self.padding_idx + 1,
        )
        self.layers = nn.ModuleList(
            [DecoderLayer(config) for _ in range(config.decoder_layers)]
@@ -1003,6 +993,14 @@ class FSMTModel(PretrainedFSMTModel):
 )
 class FSMTForConditionalGeneration(PretrainedFSMTModel):
    base_model_prefix = "model"
    authorized_missing_keys = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
    keys_to_never_save = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
    def __init__(self, config: FSMTConfig):
        super().__init__(config)
@@ -1137,36 +1135,34 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
        return self.model.decoder.embed_tokens
-def make_positions(tensor, padding_idx: int):
+class SinusoidalPositionalEmbedding(nn.Embedding):
    """Replace non-padding symbols with their position numbers.
    Position numbers begin at padding_idx+1. Padding symbols are ignored.
    """
-    # The series of casts and type-conversions here are carefully
+    This module produces sinusoidal positional embeddings of any length.
    # balanced to both work with ONNX export and XLA. In particular XLA
    # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
    # how to handle the dtype kwarg in cumsum.
    mask = tensor.ne(padding_idx).int()
    return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
-
+    We don't want to save the weight of this embedding since it's not trained
-class SinusoidalPositionalEmbedding(nn.Module):
+    (deterministic) and it can be huge.
    """This module produces sinusoidal positional embeddings of any length.
    Padding symbols are ignored.
    These embeddings get automatically extended in forward if more positions is needed.
    """
-    def __init__(self, embedding_dim, padding_idx, init_size=1024):
+    def __init__(self, num_positions, embedding_dim, padding_idx):
-        super().__init__()
+        self.make_weight(num_positions, embedding_dim, padding_idx)
-        self.embedding_dim = embedding_dim
+
-        self.padding_idx = padding_idx
+    def make_weight(self, num_positions, embedding_dim, padding_idx, device=None):
-        self.weights = SinusoidalPositionalEmbedding.get_embedding(init_size, embedding_dim, padding_idx)
+        weight = self.get_embedding(num_positions, embedding_dim, padding_idx)
-        self.register_buffer("_float_tensor", torch.zeros(1))  # used for getting the right device
+        if device is not None:
-        self.max_positions = int(1e5)
+            weight = weight.to(device)
        if not hasattr(self, "weight"):
            super().__init__(num_positions, embedding_dim, padding_idx, _weight=weight)
        else:
            self.weight = nn.Parameter(weight)
        self.weight.detach_()
        self.weight.requires_grad = False
    # XXX: bart uses s/num_embeddings/num_positions/, s/weights/weight/ - could make those match
    @staticmethod
-    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+    def get_embedding(num_embeddings, embedding_dim, padding_idx):
        """Build sinusoidal embeddings.
        This matches the implementation in tensor2tensor, but differs slightly
@@ -1184,28 +1180,30 @@ class SinusoidalPositionalEmbedding(nn.Module):
            emb[padding_idx, :] = 0
        return emb
    @staticmethod
    def make_positions(tensor, padding_idx: int):
        """Replace non-padding symbols with their position numbers.
        Position numbers begin at padding_idx+1. Padding symbols are ignored.
        """
        # The series of casts and type-conversions here are carefully
        # balanced to both work with ONNX export and XLA. In particular XLA
        # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
        # how to handle the dtype kwarg in cumsum.
        mask = tensor.ne(padding_idx).int()
        return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
    def forward(
        self,
        input,
        incremental_state: Optional[Any] = None,
        timestep: Optional[Tensor] = None,
        positions: Optional[Any] = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        # bspair = torch.onnx.operators.shape_as_tensor(input)
        # bsz, seq_len = bspair[0], bspair[1]
        bsz, seq_len = input.shape[:2]
        max_pos = self.padding_idx + 1 + seq_len
-        if self.weights is None or max_pos > self.weights.size(0):
+        if max_pos > self.weight.size(0):
-            # recompute/expand embeddings if needed
+            # expand embeddings if needed
-            self.weights = SinusoidalPositionalEmbedding.get_embedding(max_pos, self.embedding_dim, self.padding_idx)
+            self.make_weight(max_pos, self.embedding_dim, self.padding_idx, device=input.device)
-        self.weights = self.weights.to(self._float_tensor)
+        positions = self.make_positions(input, self.padding_idx)
-
+        return super().forward(positions)
        if incremental_state is not None:
            # positions is the same for every token when decoding a single step
            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
            return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)
        positions = make_positions(input, self.padding_idx)
        return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -391,10 +391,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
          derived classes of the same architecture adding modules on top of the base model.
        - **authorized_missing_keys** (:obj:`Optional[List[str]]`) -- A list of re pattern of tensor names to ignore
          when loading the model (and avoid unnecessary warnings).
        - **keys_to_never_save** (:obj:`Optional[List[str]]`) -- A list of of tensor names to ignore
          when saving the model (useful for keys that aren't trained, but which are deterministic)
    """
    config_class = None
    base_model_prefix = ""
    authorized_missing_keys = None
    keys_to_never_save = None
    @property
    def dummy_inputs(self) -> Dict[str, torch.Tensor]:
@@ -688,6 +692,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
        # Attach architecture to the config
        model_to_save.config.architectures = [model_to_save.__class__.__name__]
        state_dict = model_to_save.state_dict()
        # Handle the case where some state_dict keys shouldn't be saved
        if self.keys_to_never_save is not None:
            state_dict = {k: v for k, v in state_dict.items() if k not in self.keys_to_never_save}
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
@@ -698,10 +708,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
                # Save configuration file
                model_to_save.config.save_pretrained(save_directory)
            # xm.save takes care of saving only from master
-            xm.save(model_to_save.state_dict(), output_model_file)
+            xm.save(state_dict, output_model_file)
        else:
            model_to_save.config.save_pretrained(save_directory)
-            torch.save(model_to_save.state_dict(), output_model_file)
+            torch.save(state_dict, output_model_file)
        logger.info("Model weights saved in {}".format(output_model_file))
--- a/tests/test_modeling_fsmt.py
+++ b/tests/test_modeling_fsmt.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import tempfile
 import unittest
@@ -20,7 +21,7 @@ import timeout_decorator  # noqa
 from parameterized import parameterized
 from transformers import is_torch_available
-from transformers.file_utils import cached_property
+from transformers.file_utils import WEIGHTS_NAME, cached_property
 from transformers.testing_utils import require_torch, slow, torch_device
 from .test_configuration_common import ConfigTester
@@ -37,6 +38,7 @@ if is_torch_available():
        invert_mask,
        shift_tokens_right,
    )
    from transformers.pipelines import TranslationPipeline
@require_torch
@@ -207,6 +209,27 @@ class FSMTModelTest(ModelTesterMixin, unittest.TestCase):
                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
            self.assertEqual(info["missing_keys"], [])
    def test_save_load_no_save_keys(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            model = model_class(config)
            state_dict_no_save_keys = getattr(model, "state_dict_no_save_keys", None)
            if state_dict_no_save_keys is None:
                continue
            # check the keys are in the original state_dict
            for k in state_dict_no_save_keys:
                self.assertIn(k, model.state_dict())
            # check that certain keys didn't get saved with the model
            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                output_model_file = os.path.join(tmpdirname, WEIGHTS_NAME)
                state_dict_saved = torch.load(output_model_file)
                for k in state_dict_no_save_keys:
                    self.assertNotIn(k, state_dict_saved)
    @unittest.skip("can't be implemented for FSMT due to dual vocab.")
    def test_resize_tokens_embeddings(self):
        pass
@@ -219,14 +242,6 @@ class FSMTModelTest(ModelTesterMixin, unittest.TestCase):
    def test_tie_model_weights(self):
        pass
    @unittest.skip("failing on CI - needs review")
    def test_torchscript_output_attentions(self):
        pass
    @unittest.skip("failing on CI - needs review")
    def test_torchscript_output_hidden_state(self):
        pass
    # def test_auto_model(self):
    #     # XXX: add a tiny model to s3?
    #     model_name = "facebook/wmt19-ru-en-tiny"
@@ -366,6 +381,14 @@ def _long_tensor(tok_lst):
 TOLERANCE = 1e-4
 pairs = [
    ["en-ru"],
    ["ru-en"],
    ["en-de"],
    ["de-en"],
 ]
@require_torch
 class FSMTModelIntegrationTests(unittest.TestCase):
    tokenizers_cache = {}
@@ -399,7 +422,7 @@ class FSMTModelIntegrationTests(unittest.TestCase):
        src_text = "My friend computer will translate this for me"
        input_ids = tokenizer([src_text], return_tensors="pt")["input_ids"]
-        input_ids = _long_tensor(input_ids)
+        input_ids = _long_tensor(input_ids).to(torch_device)
        inputs_dict = prepare_fsmt_inputs_dict(model.config, input_ids)
        with torch.no_grad():
            output = model(**inputs_dict)[0]
@@ -409,19 +432,10 @@ class FSMTModelIntegrationTests(unittest.TestCase):
        # may have to adjust if switched to a different checkpoint
        expected_slice = torch.tensor(
            [[-1.5753, -1.5753, 2.8975], [-0.9540, -0.9540, 1.0299], [-3.3131, -3.3131, 0.5219]]
-        )
+        ).to(torch_device)
        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
-    @parameterized.expand(
+    def translation_setup(self, pair):
        [
            ["en-ru"],
            ["ru-en"],
            ["en-de"],
            ["de-en"],
        ]
    )
    @slow
    def test_translation(self, pair):
        text = {
            "en": "Machine learning is great, isn't it?",
            "ru": "Машинное обучение - это здорово, не так ли?",
@@ -432,16 +446,32 @@ class FSMTModelIntegrationTests(unittest.TestCase):
        print(f"Testing {src} -> {tgt}")
        mname = f"facebook/wmt19-{pair}"
-        src_sentence = text[src]
+        src_text = text[src]
-        tgt_sentence = text[tgt]
+        tgt_text = text[tgt]
        tokenizer = self.get_tokenizer(mname)
        model = self.get_model(mname)
        return tokenizer, model, src_text, tgt_text
    @parameterized.expand(pairs)
    @slow
    def test_translation_direct(self, pair):
        tokenizer, model, src_text, tgt_text = self.translation_setup(pair)
        input_ids = tokenizer.encode(src_text, return_tensors="pt").to(torch_device)
        input_ids = tokenizer.encode(src_sentence, return_tensors="pt")
        outputs = model.generate(input_ids)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        assert decoded == tgt_sentence, f"\n\ngot: {decoded}\nexp: {tgt_sentence}\n"
+        assert decoded == tgt_text, f"\n\ngot: {decoded}\nexp: {tgt_text}\n"
    @parameterized.expand(pairs)
    @slow
    def test_translation_pipeline(self, pair):
        tokenizer, model, src_text, tgt_text = self.translation_setup(pair)
        device = 0 if torch_device == "cuda" else -1
        pipeline = TranslationPipeline(model, tokenizer, framework="pt", device=device)
        output = pipeline([src_text])
        self.assertEqual([tgt_text], [x["translation_text"] for x in output])
@require_torch
@@ -449,10 +479,9 @@ class TestSinusoidalPositionalEmbeddings(unittest.TestCase):
    padding_idx = 1
    tolerance = 1e-4
    @unittest.skip("failing on CI - needs review")
    def test_basic(self):
        input_ids = torch.tensor([[4, 10]], dtype=torch.long, device=torch_device)
-        emb1 = SinusoidalPositionalEmbedding(embedding_dim=6, padding_idx=self.padding_idx, init_size=6).to(
+        emb1 = SinusoidalPositionalEmbedding(num_positions=6, embedding_dim=6, padding_idx=self.padding_idx).to(
            torch_device
        )
        emb = emb1(input_ids)
@@ -461,7 +490,7 @@ class TestSinusoidalPositionalEmbeddings(unittest.TestCase):
                [9.0930e-01, 1.9999e-02, 2.0000e-04, -4.1615e-01, 9.9980e-01, 1.0000e00],
                [1.4112e-01, 2.9995e-02, 3.0000e-04, -9.8999e-01, 9.9955e-01, 1.0000e00],
            ]
-        )
+        ).to(torch_device)
        self.assertTrue(
            torch.allclose(emb[0], desired_weights, atol=self.tolerance),
            msg=f"\nexp:\n{desired_weights}\ngot:\n{emb[0]}\n",
@@ -469,14 +498,10 @@ class TestSinusoidalPositionalEmbeddings(unittest.TestCase):
    def test_odd_embed_dim(self):
        # odd embedding_dim  is allowed
-        SinusoidalPositionalEmbedding.get_embedding(
+        SinusoidalPositionalEmbedding(num_positions=4, embedding_dim=5, padding_idx=self.padding_idx).to(torch_device)
            num_embeddings=4, embedding_dim=5, padding_idx=self.padding_idx
        ).to(torch_device)
        # odd num_embeddings is allowed
-        SinusoidalPositionalEmbedding.get_embedding(
+        SinusoidalPositionalEmbedding(num_positions=5, embedding_dim=4, padding_idx=self.padding_idx).to(torch_device)
            num_embeddings=5, embedding_dim=4, padding_idx=self.padding_idx
        ).to(torch_device)
    @unittest.skip("different from marian (needs more research)")
    def test_positional_emb_weights_against_marian(self):
@@ -488,7 +513,7 @@ class TestSinusoidalPositionalEmbeddings(unittest.TestCase):
                [0.90929741, 0.93651021, 0.95829457, 0.97505713, 0.98720258],
            ]
        )
-        emb1 = SinusoidalPositionalEmbedding(init_size=512, embedding_dim=512, padding_idx=self.padding_idx).to(
+        emb1 = SinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512, padding_idx=self.padding_idx).to(
            torch_device
        )
        weights = emb1.weights.data[:3, :5]