From 44e3e3fb4930298f092f336c2b7add3ebf051928 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:49:21 +0100 Subject: [PATCH] prepare for "__floordiv__ is deprecated and its behavior will change in a future version of pytorch" (#20211) * rounding_mode = "floor" instead of // to prevent behavioral change * add other TODO * use `torch_int_div` from pytrch_utils * same for tests * fix copies * style * use relative imports when needed * Co-authored-by: sgugger --- .../models/big_bird/modeling_big_bird.py | 9 +++------ .../bigbird_pegasus/modeling_bigbird_pegasus.py | 8 +++----- .../image_processing_conditional_detr.py | 6 ++++-- .../conditional_detr/modeling_conditional_detr.py | 2 +- .../image_processing_deformable_detr.py | 6 ++++-- .../deformable_detr/modeling_deformable_detr.py | 6 +++--- .../models/deta/image_processing_deta.py | 4 +++- src/transformers/models/deta/modeling_deta.py | 6 +++--- .../mask2former/image_processing_mask2former.py | 4 +++- .../maskformer/image_processing_maskformer.py | 4 +++- .../models/oneformer/image_processing_oneformer.py | 4 +++- .../models/reformer/modeling_reformer.py | 5 ++--- src/transformers/models/tapas/modeling_tapas.py | 14 ++++++++------ tests/generation/test_beam_search.py | 13 +++++-------- tests/models/wav2vec2/test_modeling_wav2vec2.py | 13 +++++-------- 15 files changed, 53 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 1ab66edf5c..172b2a78e8 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -37,7 +37,7 @@ from ...modeling_outputs import ( TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward +from ...pytorch_utils import apply_chunking_to_forward, torch_int_div from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -971,11 +971,8 @@ class BigBirdBlockSparseAttention(nn.Module): num_indices_to_gather = indices.shape[-2] * indices.shape[-1] num_indices_to_pick_from = params.shape[2] - indices_shift = ( - torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) - // num_indices_to_gather - * num_indices_to_pick_from - ) + shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) + indices_shift = torch_int_div(shift, num_indices_to_gather) * num_indices_to_pick_from flattened_indices = indices.view(-1) + indices_shift flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index f8182e8b6f..fb76051890 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -36,6 +36,7 @@ from ...modeling_outputs import ( Seq2SeqSequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import torch_int_div from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -789,11 +790,8 @@ class BigBirdPegasusBlockSparseAttention(nn.Module): num_indices_to_gather = indices.shape[-2] * indices.shape[-1] num_indices_to_pick_from = params.shape[2] - indices_shift = ( - torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) - // num_indices_to_gather - * num_indices_to_pick_from - ) + shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) + indices_shift = torch_int_div(shift, num_indices_to_gather) * num_indices_to_pick_from flattened_indices = indices.view(-1) + indices_shift flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 0d7e9aa0da..2a70e45edf 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -67,6 +67,8 @@ if is_torch_available(): import torch from torch import nn + from transformers.pytorch_utils import torch_int_div + if is_vision_available(): import PIL @@ -1311,7 +1313,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): prob = out_logits.sigmoid() topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 300, dim=1) scores = topk_values - topk_boxes = topk_indexes // out_logits.shape[2] + topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2]) labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) @@ -1357,7 +1359,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): prob = out_logits.sigmoid() topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) scores = topk_values - topk_boxes = topk_indexes // out_logits.shape[2] + topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2]) labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 5068c003f9..36e6e94215 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -504,7 +504,7 @@ def build_position_encoding(config): def gen_sine_position_embeddings(pos_tensor): scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) - dim_t = 10000 ** (2 * (dim_t // 2) / 128) + dim_t = 10000 ** (2 * torch_int_div(dim_t, 2) / 128) x_embed = pos_tensor[:, :, 0] * scale y_embed = pos_tensor[:, :, 1] * scale pos_x = x_embed[:, :, None] / dim_t diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 5b6d9839e9..4db2e27647 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -67,6 +67,8 @@ if is_torch_available(): import torch from torch import nn + from ...pytorch_utils import torch_int_div + if is_vision_available(): import PIL @@ -1309,7 +1311,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): prob = out_logits.sigmoid() topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) scores = topk_values - topk_boxes = topk_indexes // out_logits.shape[2] + topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2]) labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) @@ -1354,7 +1356,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): prob = out_logits.sigmoid() topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) scores = topk_values - topk_boxes = topk_indexes // out_logits.shape[2] + topk_boxes = torch_int_div(topk_indexes, out_logits.shape[2]) labels = topk_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index a7ee782501..630e30b7ea 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -41,7 +41,7 @@ from ...file_utils import ( ) from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import meshgrid +from ...pytorch_utils import meshgrid, torch_int_div from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_deformable_detr import DeformableDetrConfig @@ -497,7 +497,7 @@ class DeformableDetrSinePositionEmbedding(nn.Module): x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.embedding_dim) + dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2 / self.embedding_dim)) pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t @@ -1552,7 +1552,7 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel): scale = 2 * math.pi dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + dim_t = temperature ** (2 * torch.div(dim_t, 2) / num_pos_feats) # batch_size, num_queries, 4 proposals = proposals.sigmoid() * scale # batch_size, num_queries, 4, 128 diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index 717fbfdd54..8afc69fd31 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -63,6 +63,8 @@ from ...utils.generic import ExplicitEnum, TensorType if is_torch_available(): import torch + from ...pytorch_utils import torch_int_div + if is_torchvision_available(): from torchvision.ops.boxes import batched_nms @@ -965,7 +967,7 @@ class DetaImageProcessor(BaseImageProcessor): all_scores = prob.view(batch_size, num_queries * num_labels).to(out_logits.device) all_indexes = torch.arange(num_queries * num_labels)[None].repeat(batch_size, 1).to(out_logits.device) - all_boxes = all_indexes // out_logits.shape[2] + all_boxes = torch_int_div(all_indexes, out_logits.shape[2]) all_labels = all_indexes % out_logits.shape[2] boxes = center_to_corners_format(out_bbox) diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index eb77604fbf..8ae5fbbca4 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -36,7 +36,7 @@ from ...file_utils import ( ) from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import meshgrid +from ...pytorch_utils import meshgrid, torch_int_div from ...utils import is_torchvision_available, logging, requires_backends from ..auto import AutoBackbone from .configuration_deta import DetaConfig @@ -399,7 +399,7 @@ class DetaSinePositionEmbedding(nn.Module): x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.embedding_dim) + dim_t = self.temperature ** (2 * torch_int_div(dim_t, 2 / self.embedding_dim)) pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t @@ -1463,7 +1463,7 @@ class DetaModel(DetaPreTrainedModel): scale = 2 * math.pi dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + dim_t = temperature ** (2 * torch.div(dim_t, 2) / num_pos_feats) # batch_size, num_queries, 4 proposals = proposals.sigmoid() * scale # batch_size, num_queries, 4, 128 diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 501c4ccce7..b27ef5207b 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -57,6 +57,8 @@ if is_torch_available(): import torch from torch import nn + from ...pytorch_utils import torch_int_div + # Copied from transformers.models.detr.image_processing_detr.max_across_indices def max_across_indices(values: Iterable[Any]) -> List[Any]: @@ -1007,7 +1009,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor): scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False) labels_per_image = labels[topk_indices] - topk_indices = topk_indices // num_classes + topk_indices = torch_int_div(topk_indices, num_classes) mask_pred = mask_pred[topk_indices] pred_masks = (mask_pred > 0).float() diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 7457d1eacd..373bee1ab2 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -61,6 +61,8 @@ if is_torch_available(): import torch from torch import nn + from ...pytorch_utils import torch_int_div + # Copied from transformers.models.detr.image_processing_detr.max_across_indices def max_across_indices(values: Iterable[Any]) -> List[Any]: @@ -1075,7 +1077,7 @@ class MaskFormerImageProcessor(BaseImageProcessor): scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False) labels_per_image = labels[topk_indices] - topk_indices = topk_indices // num_classes + topk_indices = torch_int_div(topk_indices, num_classes) mask_pred = mask_pred[topk_indices] pred_masks = (mask_pred > 0).float() diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 2573844995..237a3dbad4 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -58,6 +58,8 @@ if is_torch_available(): import torch from torch import nn + from ...pytorch_utils import torch_int_div + # Copied from transformers.models.detr.image_processing_detr.max_across_indices def max_across_indices(values: Iterable[Any]) -> List[Any]: @@ -1120,7 +1122,7 @@ class OneFormerImageProcessor(BaseImageProcessor): scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False) labels_per_image = labels[topk_indices] - topk_indices = topk_indices // num_classes + topk_indices = torch_int_div(topk_indices, num_classes) # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1) mask_pred = masks_queries_logits[i][topk_indices] diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index fb3100f88b..4bd29e78ee 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -909,10 +909,9 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin): relevant_bucket_idx_chunk = bucket_idx[tuple(relevant_bucket_idx_chunk.transpose(0, 1))] # adapt bucket_idx for batch and hidden states for index select + offset = torch.arange(relevant_bucket_idx_chunk.shape[-1], device=hidden_states.device, dtype=torch.long) bucket_idx_batch_offset = sequence_length * ( - batch_size - * torch.arange(relevant_bucket_idx_chunk.shape[-1], device=hidden_states.device, dtype=torch.long) - // relevant_bucket_idx_chunk.shape[-1] + batch_size * torch.div(offset, relevant_bucket_idx_chunk.shape[-1], rounding_mode="floor") ) # add batch offset diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 83a3f9fefc..2bb4e2baae 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -29,7 +29,12 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...pytorch_utils import ( + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, + torch_int_div, +) from ...utils import ( ModelOutput, add_start_docstrings, @@ -1632,11 +1637,8 @@ class ProductIndexMap(IndexMap): def project_outer(self, index): """Projects an index with the same index set onto the outer components.""" - return IndexMap( - indices=(index.indices // self.inner_index.num_segments).type(torch.float).floor().type(torch.long), - num_segments=self.outer_index.num_segments, - batch_dims=index.batch_dims, - ) + indices = torch_int_div(index.indices, self.inner_index.num_segments).type(torch.long) + return IndexMap(indices=indices, num_segments=self.outer_index.num_segments, batch_dims=index.batch_dims) def project_inner(self, index): """Projects an index with the same index set onto the inner components.""" diff --git a/tests/generation/test_beam_search.py b/tests/generation/test_beam_search.py index 72202ae2da..e35e8d9b81 100644 --- a/tests/generation/test_beam_search.py +++ b/tests/generation/test_beam_search.py @@ -32,6 +32,7 @@ if is_torch_available(): DisjunctiveConstraint, PhrasalConstraint, ) + from transformers.pytorch_utils import torch_int_div class BeamSearchTester: @@ -160,10 +161,8 @@ class BeamSearchTester: expected_output_scores = cut_expected_tensor(next_scores) # add num_beams * batch_idx - expected_output_indices = ( - cut_expected_tensor(next_indices) - + (torch.arange(self.num_beams * self.batch_size, device=torch_device) // self.num_beams) * self.num_beams - ) + offset = torch_int_div(torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams) + expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist()) self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist()) @@ -399,10 +398,8 @@ class ConstrainedBeamSearchTester: expected_output_scores = cut_expected_tensor(next_scores) # add num_beams * batch_idx - expected_output_indices = ( - cut_expected_tensor(next_indices) - + (torch.arange(self.num_beams * self.batch_size, device=torch_device) // self.num_beams) * self.num_beams - ) + offset = torch_int_div(torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams) + expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist()) self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist()) diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index a29dbda2f1..cd98d41034 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -71,7 +71,7 @@ if is_torch_available(): _compute_mask_indices, _sample_negative_indices, ) - from transformers.pytorch_utils import is_torch_less_than_1_9 + from transformers.pytorch_utils import is_torch_less_than_1_9, torch_int_div else: is_torch_less_than_1_9 = True @@ -1217,10 +1217,8 @@ class Wav2Vec2UtilsTest(unittest.TestCase): sequence_length = 10 hidden_size = 4 num_negatives = 3 - - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value + sequence = torch_int_div(torch.arange(sequence_length * hidden_size, device=torch_device), hidden_size) + features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() # sample negative indices @@ -1247,9 +1245,8 @@ class Wav2Vec2UtilsTest(unittest.TestCase): mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) mask[-1, sequence_length // 2 :] = 0 - features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value + sequence = torch_int_div(torch.arange(sequence_length * hidden_size, device=torch_device), hidden_size) + features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous() # replace masked feature vectors with -100 to test that those are not sampled