Fix doc errors and typos across the board (#8139)
* Fix doc errors and typos across the board * Fix a typo * Fix the CI * Fix more typos * Fix CI * More fixes * Fix CI * More fixes * More fixes
This commit is contained in:
@@ -291,10 +291,9 @@ def hans_convert_examples_to_features(
|
||||
|
||||
Args:
|
||||
examples: List of ``InputExamples`` containing the examples.
|
||||
tokenizer: Instance of a tokenizer that will tokenize the examples.
|
||||
max_length: Maximum example length.
|
||||
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
|
||||
output_mode: String indicating the output mode. Either ``regression`` or ``classification``.
|
||||
max_length: Maximum example length.
|
||||
tokenizer: Instance of a tokenizer that will tokenize the examples.
|
||||
|
||||
Returns:
|
||||
A list of task-specific ``InputFeatures`` which can be fed to the model.
|
||||
|
||||
@@ -155,7 +155,7 @@ class BertModelWithPabee(BertModel):
|
||||
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
|
||||
|
||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
||||
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||
|
||||
@@ -198,7 +198,7 @@ class DeeBertModel(BertPreTrainedModel):
|
||||
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
|
||||
|
||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
||||
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if encoder_attention_mask.dim() == 3:
|
||||
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
|
||||
if encoder_attention_mask.dim() == 2:
|
||||
@@ -260,7 +260,7 @@ class BertHighway(nn.Module):
|
||||
|
||||
# BertModel
|
||||
bmodel_output = (pooler_input, pooler_output) + encoder_outputs[1:]
|
||||
# "return" bodel_output
|
||||
# "return" bmodel_output
|
||||
|
||||
# Dropout and classification
|
||||
pooled_output = bmodel_output[1]
|
||||
|
||||
@@ -265,7 +265,7 @@ class Distiller:
|
||||
-------
|
||||
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
|
||||
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
|
||||
clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
|
||||
clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
|
||||
"""
|
||||
token_ids, lengths = batch
|
||||
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
|
||||
@@ -401,9 +401,9 @@ class Distiller:
|
||||
# https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
|
||||
# https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
|
||||
if self.params.restrict_ce_to_mask:
|
||||
mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_length, voc_size)
|
||||
else:
|
||||
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
|
||||
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_length, voc_size)
|
||||
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
|
||||
t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
|
||||
|
||||
@@ -61,7 +61,7 @@ class LmSeqsDataset(Dataset):
|
||||
|
||||
def remove_long_sequences(self):
|
||||
"""
|
||||
Sequences that are too long are splitted by chunk of max_model_input_size.
|
||||
Sequences that are too long are split by chunk of max_model_input_size.
|
||||
"""
|
||||
max_len = self.params.max_model_input_size
|
||||
indices = self.lengths > max_len
|
||||
@@ -138,8 +138,8 @@ class LmSeqsDataset(Dataset):
|
||||
# logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
|
||||
|
||||
# unk_idx = self.params.special_tok_ids['unk_token']
|
||||
# nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
|
||||
# logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
|
||||
# nb_unknown = sum([(t==unk_idx).sum() for t in self.token_ids])
|
||||
# logger.info(f'{nb_unknown} unknown tokens (covering {100*nb_unknown/data_len:.2f}% of the data)')
|
||||
|
||||
def batch_sequences(self, batch):
|
||||
"""
|
||||
|
||||
@@ -96,7 +96,7 @@ if __name__ == "__main__":
|
||||
compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]
|
||||
|
||||
print(f"N layers selected for distillation: {std_idx}")
|
||||
print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
|
||||
print(f"Number of params transferred for distillation: {len(compressed_sd.keys())}")
|
||||
|
||||
print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
|
||||
print(f"Save transferred checkpoint to {args.dump_checkpoint}.")
|
||||
torch.save(compressed_sd, args.dump_checkpoint)
|
||||
|
||||
@@ -266,14 +266,14 @@ def find_top_rpn_proposals(
|
||||
):
|
||||
"""Args:
|
||||
proposals (list[Tensor]): (L, N, Hi*Wi*A, 4).
|
||||
pred_objectness_logits: tensors of lenngth L.
|
||||
pred_objectness_logits: tensors of length L.
|
||||
nms_thresh (float): IoU threshold to use for NMS
|
||||
pre_nms_topk (int): before nms
|
||||
post_nms_topk (int): after nms
|
||||
min_box_side_len (float): minimum proposal box side
|
||||
training (bool): True if proposals are to be used in training,
|
||||
Returns:
|
||||
resuls (List[Dict]): stores post_nms_topk object proposals for image i.
|
||||
results (List[Dict]): stores post_nms_topk object proposals for image i.
|
||||
"""
|
||||
num_images = len(images)
|
||||
device = proposals[0].device
|
||||
@@ -648,7 +648,7 @@ class RPNOutputs(object):
|
||||
images (ImageList): :class:`ImageList` instance representing N input images
|
||||
pred_objectness_logits (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A, Hi, Wi)
|
||||
pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A*4, Hi, Wi)
|
||||
anchors (list[torch.Tensor]): nested list ofboxes. anchors[i][j] at (n, l) stores anchor array for feature map l
|
||||
anchors (list[torch.Tensor]): nested list of boxes. anchors[i][j] at (n, l) stores anchor array for feature map l
|
||||
boundary_threshold (int): if >= 0, then anchors that extend beyond the image boundary by more than boundary_threshold are not used in training.
|
||||
gt_boxes (list[Boxes], optional): A list of N elements.
|
||||
smooth_l1_beta (float): The transition point between L1 and L2 loss. When set to 0, the loss becomes L1. When +inf, it is ignored
|
||||
@@ -1186,7 +1186,7 @@ class ROIOutputs(object):
|
||||
attr_probs_all, attrs_all = self._predict_attrs(attr_logits, preds_per_image)
|
||||
features = features.split(preds_per_image, dim=0)
|
||||
|
||||
# fun for each image too, also I can expirement and do multiple images
|
||||
# fun for each image too, also I can experiment and do multiple images
|
||||
final_results = []
|
||||
zipped = zip(boxes_all, obj_scores_all, attr_probs_all, attrs_all, sizes)
|
||||
for i, (boxes, obj_scores, attr_probs, attrs, size) in enumerate(zipped):
|
||||
@@ -1412,7 +1412,7 @@ class AnchorGenerator(nn.Module):
|
||||
|
||||
def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
|
||||
"""
|
||||
anchors are continious geometric rectangles
|
||||
anchors are continuous geometric rectangles
|
||||
centered on one feature map point sample.
|
||||
We can later build the set of anchors
|
||||
for the entire feature map by tiling these tensors
|
||||
@@ -1865,7 +1865,7 @@ class GeneralizedRCNN(nn.Module):
|
||||
scales_yx=None,
|
||||
**kwargs,
|
||||
):
|
||||
# run images through bacbone
|
||||
# run images through backbone
|
||||
original_sizes = image_shapes * scales_yx
|
||||
features = self.backbone(images)
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ class Preprocess:
|
||||
images = self.aug(images)
|
||||
# transpose images and convert to torch tensors
|
||||
# images = [torch.as_tensor(i.astype("float32")).permute(2, 0, 1).to(self.device) for i in images]
|
||||
# now normalize before pad to aoid useless arithmatic
|
||||
# now normalize before pad to avoid useless arithmetic
|
||||
images = [self.normalizer(x) for x in images]
|
||||
# now pad them to do the following operations
|
||||
images, sizes = self.pad(images)
|
||||
|
||||
@@ -236,7 +236,7 @@ def compare(in_tensor):
|
||||
), f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} % element-wise mismatch"
|
||||
raise Exception("tensors are all good")
|
||||
|
||||
# Hugging face functiions below
|
||||
# Hugging face functions below
|
||||
|
||||
|
||||
def is_remote_url(url_or_filename):
|
||||
@@ -520,7 +520,7 @@ def get_image_from_url(url):
|
||||
return img
|
||||
|
||||
|
||||
# to load legace frcnn checkpoint from detectron
|
||||
# to load legacy frcnn checkpoint from detectron
|
||||
def load_frcnn_pkl_from_url(url):
|
||||
fn = url.split("/")[-1]
|
||||
if fn not in os.listdir(os.getcwd()):
|
||||
|
||||
@@ -33,7 +33,7 @@ def main(args):
|
||||
remaining_count = 0 # Number of remaining (not pruned) params in the encoder
|
||||
encoder_count = 0 # Number of params in the encoder
|
||||
|
||||
print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
|
||||
print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
|
||||
for name, param in st.items():
|
||||
if "encoder" not in name:
|
||||
continue
|
||||
|
||||
@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
|
||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
||||
|
||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
||||
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||
@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
|
||||
) # We can specify head_mask for each layer
|
||||
head_mask = head_mask.to(
|
||||
dtype=next(self.parameters()).dtype
|
||||
) # switch to fload if need + fp16 compatibility
|
||||
) # switch to float if needed + fp16 compatibility
|
||||
else:
|
||||
head_mask = [None] * self.config.num_hidden_layers
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
desc="Epoch",
|
||||
disable=args.local_rank not in [-1, 0],
|
||||
)
|
||||
set_seed(args) # Added here for reproductibility
|
||||
set_seed(args) # Added here for reproducibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
@@ -705,7 +705,7 @@ def main():
|
||||
"--final_lambda",
|
||||
default=0.0,
|
||||
type=float,
|
||||
help="Regularization intensity (used in conjunction with `regulariation`.",
|
||||
help="Regularization intensity (used in conjunction with `regularization`).",
|
||||
)
|
||||
|
||||
parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
|
||||
@@ -816,7 +816,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
# Added here for reproductibility
|
||||
# Added here for reproducibility
|
||||
set_seed(args)
|
||||
|
||||
for _ in train_iterator:
|
||||
@@ -824,7 +824,7 @@ def main():
|
||||
"--final_lambda",
|
||||
default=0.0,
|
||||
type=float,
|
||||
help="Regularization intensity (used in conjunction with `regulariation`.",
|
||||
help="Regularization intensity (used in conjunction with `regularization`).",
|
||||
)
|
||||
|
||||
parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
|
||||
@@ -977,7 +977,7 @@ def main():
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
|
||||
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
|
||||
class RagPyTorchDistributedRetriever(RagRetriever):
|
||||
"""
|
||||
A distributed retriever built on top of the ``torch.distributed`` communication package. During training all workers
|
||||
initalize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
|
||||
initialize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
|
||||
in cpu memory. The index will also work well in a non-distributed setup.
|
||||
|
||||
Args:
|
||||
@@ -45,7 +45,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
|
||||
|
||||
def init_retrieval(self, distributed_port: int):
|
||||
"""
|
||||
Retriever initalization function, needs to be called from the training process. The function sets some common parameters
|
||||
Retriever initialization function, needs to be called from the training process. The function sets some common parameters
|
||||
and environment variables. On top of that, (only) the main process in the process group loads the index into memory.
|
||||
|
||||
Args:
|
||||
@@ -56,7 +56,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
|
||||
|
||||
logger.info("initializing retrieval")
|
||||
|
||||
# initializing a separate process group for retrievel as the default
|
||||
# initializing a separate process group for retrieval as the default
|
||||
# nccl backend doesn't support gather/scatter operations while gloo
|
||||
# is too slow to replace nccl for the core gpu communication
|
||||
if dist.is_initialized():
|
||||
@@ -101,7 +101,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
|
||||
n_docs (:obj:`int`):
|
||||
The number of docs retrieved per query.
|
||||
|
||||
Ouput:
|
||||
Output:
|
||||
retrieved_doc_embeds (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`
|
||||
The retrieval embeddings of the retrieved docs per query.
|
||||
doc_ids (:obj:`np.ndarray` of shape :obj:`batch_size, n_docs`)
|
||||
|
||||
@@ -176,7 +176,7 @@ def get_args():
|
||||
choices=["e2e", "retrieval"],
|
||||
default="e2e",
|
||||
type=str,
|
||||
help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calulates precision@k.",
|
||||
help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calculates precision@k.",
|
||||
)
|
||||
parser.add_argument("--k", default=1, type=int, help="k for the precision@k calculation")
|
||||
parser.add_argument(
|
||||
@@ -206,7 +206,7 @@ def get_args():
|
||||
"--predictions_path",
|
||||
type=str,
|
||||
default="predictions.txt",
|
||||
help="Name of the predictions file, to be stored in the checkpoints directry",
|
||||
help="Name of the predictions file, to be stored in the checkpoints directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
|
||||
@@ -26,7 +26,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
|
||||
def split_text(text: str, n=100, character=" ") -> List[str]:
|
||||
"""Split the text every ``n``-th occurence of ``character``"""
|
||||
"""Split the text every ``n``-th occurrence of ``character``"""
|
||||
text = text.split(character)
|
||||
return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ class BertAbsConfig(PretrainedConfig):
|
||||
enc_ff_size: int
|
||||
The size of the encoder's feed-forward layers.
|
||||
enc_dropout: int
|
||||
The dropout probabilitiy for all fully connected layers in the
|
||||
The dropout probability for all fully connected layers in the
|
||||
embeddings, layers, pooler and also the attention probabilities in
|
||||
the encoder.
|
||||
dec_layer: int
|
||||
@@ -56,7 +56,7 @@ class BertAbsConfig(PretrainedConfig):
|
||||
dec_ff_size: int
|
||||
The size of the decoder's feed-forward layers.
|
||||
dec_dropout: int
|
||||
The dropout probabilitiy for all fully connected layers in the
|
||||
The dropout probability for all fully connected layers in the
|
||||
embeddings, layers, pooler and also the attention probabilities in
|
||||
the decoder.
|
||||
"""
|
||||
|
||||
@@ -152,7 +152,7 @@ class TransformerDecoder(nn.Module):
|
||||
dropout (float): dropout parameters
|
||||
embeddings (:obj:`onmt.modules.Embeddings`):
|
||||
embeddings to use, should have positional encodings
|
||||
attn_type (str): if using a seperate copy attention
|
||||
attn_type (str): if using a separate copy attention
|
||||
"""
|
||||
|
||||
def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
|
||||
@@ -817,11 +817,7 @@ class Translator(object):
|
||||
|
||||
Args:
|
||||
batch (:obj:`Batch`): a batch from a dataset object
|
||||
data (:obj:`Dataset`): the dataset object
|
||||
fast (bool): enables fast beam search (may not support all features)
|
||||
|
||||
Todo:
|
||||
Shouldn't need the original dataset.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
|
||||
|
||||
@@ -12,7 +12,7 @@ def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None
|
||||
state_dict = torch.load(src_path, map_location=map_location)
|
||||
for k, v in tqdm(state_dict.items()):
|
||||
if not isinstance(v, torch.Tensor):
|
||||
raise TypeError("FP16 conversion only works on paths that are saved state dics, like pytorch_model.bin")
|
||||
raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
|
||||
state_dict[k] = v.half()
|
||||
if save_path is None: # overwrite src_path
|
||||
save_path = src_path
|
||||
|
||||
Reference in New Issue
Block a user