From 6303b5a7185fba43830db0cbb06c61861f57ddff Mon Sep 17 00:00:00 2001 From: Huang Lianzhe Date: Wed, 23 Sep 2020 00:31:21 +0800 Subject: [PATCH] [Bug Fix] The actual batch_size is inconsistent with the settings. (#7235) * [bug fix] fixed the bug that the actual batch_size is inconsistent with the parameter settings * reformat * reformat * reformat * add support for dict and BatchEncoding * add support for dict and BatchEncoding * add documentation for DataCollatorForNextSentencePrediction * Some more nits for the docstring Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Some more nits for the docstring Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Some more nits for the docstring Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Some more nits for the docstring Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Some more nits for the docstring Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * rename variables Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/data/data_collator.py | 152 +++++------------- .../data/datasets/language_modeling.py | 101 +++++++++++- 2 files changed, 136 insertions(+), 117 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 9768c7d1f3..485045b609 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -1,4 +1,3 @@ -import random from dataclasses import dataclass from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union @@ -402,8 +401,8 @@ class DataCollatorForPermutationLanguageModeling: @dataclass class DataCollatorForNextSentencePrediction: """ - Data collator used for language modeling. - - collates batches of tensors, honoring their tokenizer's pad_token + Data collator used for next sentence prediction. + - collates examples which contains pre-generated negative examples - preprocesses batches for masked language modeling """ @@ -414,21 +413,30 @@ class DataCollatorForNextSentencePrediction: nsp_probability: float = 0.5 mlm_probability: float = 0.15 - def __call__(self, examples: List[Union[List[List[int]], Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]: - if isinstance(examples[0], (dict, BatchEncoding)): - examples = [e["input_ids"] for e in examples] + def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: + """ + The input should contain negative examples, :class:`~transformers.DataCollatorForNextSentencePrediction` will not generate any negative examples. + Args: + examples (:obj:`List[Dict]`): Each dictionary should have the following keys: + - ``tokens_a``: A sequence of tokens, which should appear before ``tokens_b`` in the text. + - ``tokens_b``: A sequence of tokens, which should appear after ``tokens_a`` in the text. + - ``is_random_next``: 1 if this pair is generated randomly, else 0. + """ + + tokens_a = [e["tokens_a"] for e in examples] + tokens_b = [e["tokens_b"] for e in examples] + nsp_labels = [1 if e["is_random_next"] else 0 for e in examples] input_ids = [] segment_ids = [] attention_masks = [] - nsp_labels = [] - for i, doc in enumerate(examples): - input_id, segment_id, attention_mask, label = self.create_examples_from_document(doc, i, examples) - input_ids.extend(input_id) - segment_ids.extend(segment_id) - attention_masks.extend(attention_mask) - nsp_labels.extend(label) + assert len(tokens_a) == len(tokens_b) + for i in range(len(tokens_a)): + input_id, attention_mask, segment_id = self.create_features_from_example(tokens_a[i], tokens_b[i]) + input_ids.append(input_id) + segment_ids.append(segment_id) + attention_masks.append(attention_mask) if self.mlm: input_ids, mlm_labels = self.mask_tokens(self._tensorize_batch(input_ids)) else: @@ -438,6 +446,7 @@ class DataCollatorForNextSentencePrediction: "input_ids": input_ids, "attention_mask": self._tensorize_batch(attention_masks), "token_type_ids": self._tensorize_batch(segment_ids), + "masked_lm_labels": mlm_labels if self.mlm else None, "next_sentence_label": torch.tensor(nsp_labels), } if self.mlm: @@ -457,111 +466,34 @@ class DataCollatorForNextSentencePrediction: ) return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) - def create_examples_from_document( - self, document: List[List[int]], doc_index: int, examples: List[List[List[int]]] - ): + def create_features_from_example(self, tokens_a, tokens_b): """Creates examples for a single document.""" max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True) - # We *usually* want to fill up the entire sequence since we are padding - # to `block_size` anyways, so short sequences are generally wasted - # computation. However, we *sometimes* - # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter - # sequences to minimize the mismatch between pre-training and fine-tuning. - # The `target_seq_length` is just a rough target however, whereas - # `block_size` is a hard limit. - target_seq_length = max_num_tokens - if random.random() < self.short_seq_probability: - target_seq_length = random.randint(2, max_num_tokens) + tokens_a, tokens_b, _ = self.tokenizer.truncate_sequences( + tokens_a, + tokens_b, + num_tokens_to_remove=len(tokens_a) + len(tokens_b) - max_num_tokens, + truncation_strategy="longest_first", + ) - current_chunk = [] # a buffer stored current working segments - current_length = 0 - i = 0 - input_ids = [] - segment_ids = [] - attention_masks = [] - labels = [] - while i < len(document): - segment = document[i] - current_chunk.append(segment) - current_length += len(segment) - if i == len(document) - 1 or current_length >= target_seq_length: - if current_chunk: - # `a_end` is how many segments from `current_chunk` go into the `A` - # (first) sentence. - a_end = 1 - if len(current_chunk) >= 2: - a_end = random.randint(1, len(current_chunk) - 1) + input_id = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b) + attention_mask = [1] * len(input_id) + segment_id = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b) + assert len(input_id) <= self.block_size - tokens_a = [] - for j in range(a_end): - tokens_a.extend(current_chunk[j]) + # pad + while len(input_id) < self.block_size: + input_id.append(0) + attention_mask.append(0) + segment_id.append(0) - tokens_b = [] + input_id = torch.tensor(input_id) + attention_mask = torch.tensor(attention_mask) + segment_id = torch.tensor(segment_id) - if len(current_chunk) == 1 or random.random() < self.nsp_probability: - is_random_next = True - target_b_length = target_seq_length - len(tokens_a) - - # This should rarely go for more than one iteration for large - # corpora. However, just to be careful, we try to make sure that - # the random document is not the same as the document - # we're processing. Also check to make sure that the random document - # is not empty. - for _ in range(10): - random_document_index = random.randint(0, len(examples) - 1) - if random_document_index != doc_index and len(examples[random_document_index]) > 0: - break - - random_document = examples[random_document_index] - random_start = random.randint(0, len(random_document) - 1) - for j in range(random_start, len(random_document)): - tokens_b.extend(random_document[j]) - if len(tokens_b) >= target_b_length: - break - # We didn't actually use these segments so we "put them back" so - # they don't go to waste. - num_unused_segments = len(current_chunk) - a_end - i -= num_unused_segments - # Actual next - else: - is_random_next = False - for j in range(a_end, len(current_chunk)): - tokens_b.extend(current_chunk[j]) - - assert len(tokens_a) >= 1 - assert len(tokens_b) >= 1 - - tokens_a, tokens_b, _ = self.tokenizer.truncate_sequences( - tokens_a, - tokens_b, - num_tokens_to_remove=len(tokens_a) + len(tokens_b) - max_num_tokens, - truncation_strategy="longest_first", - ) - - input_id = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b) - attention_mask = [1] * len(input_id) - segment_id = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b) - assert len(input_id) <= self.block_size - - # pad - while len(input_id) < self.block_size: - input_id.append(0) - attention_mask.append(0) - segment_id.append(0) - - input_ids.append(torch.tensor(input_id)) - segment_ids.append(torch.tensor(segment_id)) - attention_masks.append(torch.tensor(attention_mask)) - labels.append(torch.tensor(1 if is_random_next else 0)) - - current_chunk = [] - current_length = 0 - - i += 1 - - return input_ids, segment_ids, attention_masks, labels + return input_id, attention_mask, segment_id def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 982ea8786e..17f4ae0a50 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -2,7 +2,7 @@ import os import pickle import random import time -from typing import Dict, Optional +from typing import Dict, List, Optional import torch from torch.utils.data.dataset import Dataset @@ -267,10 +267,14 @@ class TextDatasetForNextSentencePrediction(Dataset): file_path: str, block_size: int, overwrite_cache=False, + short_seq_probability=0.1, + nsp_probability=0.5, ): assert os.path.isfile(file_path), f"Input file path {file_path} not found" - block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True) + self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True) + self.short_seq_probability = short_seq_probability + self.nsp_probability = nsp_probability directory, filename = os.path.split(file_path) cached_features_file = os.path.join( @@ -283,7 +287,6 @@ class TextDatasetForNextSentencePrediction(Dataset): ) self.tokenizer = tokenizer - self.examples = [] # Make sure only the first process in distributed training processes the dataset, # and the others will use the cache. @@ -313,7 +316,7 @@ class TextDatasetForNextSentencePrediction(Dataset): else: logger.info(f"Creating features from dataset file at {directory}") - self.examples = [[]] + self.documents = [[]] with open(file_path, encoding="utf-8") as f: while True: line = f.readline() @@ -322,12 +325,17 @@ class TextDatasetForNextSentencePrediction(Dataset): line = line.strip() # Empty lines are used as document delimiters - if not line and len(self.examples[-1]) != 0: - self.examples.append([]) + if not line and len(self.documents[-1]) != 0: + self.documents.append([]) tokens = tokenizer.tokenize(line) tokens = tokenizer.convert_tokens_to_ids(tokens) if tokens: - self.examples[-1].append(tokens) + self.documents[-1].append(tokens) + + logger.info(f"Creating examples from {len(self.documents)} documents.") + self.examples = [] + for doc_index, document in enumerate(self.documents): + self.create_examples_from_document(document, doc_index) start = time.time() with open(cached_features_file, "wb") as handle: @@ -336,6 +344,85 @@ class TextDatasetForNextSentencePrediction(Dataset): "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start ) + def create_examples_from_document(self, document: List[List[int]], doc_index: int): + """Creates examples for a single document.""" + + max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True) + + # We *usually* want to fill up the entire sequence since we are padding + # to `block_size` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `block_size` is a hard limit. + target_seq_length = max_num_tokens + if random.random() < self.short_seq_probability: + target_seq_length = random.randint(2, max_num_tokens) + + current_chunk = [] # a buffer stored current working segments + current_length = 0 + i = 0 + + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = random.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + + if len(current_chunk) == 1 or random.random() < self.nsp_probability: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = random.randint(0, len(self.documents) - 1) + if random_document_index != doc_index: + break + + random_document = self.documents[random_document_index] + random_start = random.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + self.examples.append( + {"tokens_a": tokens_a, "tokens_b": tokens_b, "is_random_next": is_random_next} + ) + + current_chunk = [] + current_length = 0 + + i += 1 + def __len__(self): return len(self.examples)