diff --git a/docs/source/en/model_doc/conditional_detr.md b/docs/source/en/model_doc/conditional_detr.md index 4dff1d0e35..8993fb3843 100644 --- a/docs/source/en/model_doc/conditional_detr.md +++ b/docs/source/en/model_doc/conditional_detr.md @@ -43,7 +43,6 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The o [[autodoc]] ConditionalDetrImageProcessor - preprocess - - pad_and_create_pixel_mask - post_process_object_detection - post_process_instance_segmentation - post_process_semantic_segmentation @@ -53,7 +52,6 @@ This model was contributed by [DepuMeng](https://huggingface.co/DepuMeng). The o [[autodoc]] ConditionalDetrFeatureExtractor - __call__ - - pad_and_create_pixel_mask - post_process_object_detection - post_process_instance_segmentation - post_process_semantic_segmentation diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md index a9b1267c15..0bceb0bdf3 100644 --- a/docs/source/en/model_doc/deformable_detr.md +++ b/docs/source/en/model_doc/deformable_detr.md @@ -52,14 +52,12 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DeformableDetrImageProcessor - preprocess - - pad_and_create_pixel_mask - post_process_object_detection ## DeformableDetrFeatureExtractor [[autodoc]] DeformableDetrFeatureExtractor - __call__ - - pad_and_create_pixel_mask - post_process_object_detection ## DeformableDetrConfig diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md index a83f3097bf..2c03a0f8b8 100644 --- a/docs/source/en/model_doc/detr.md +++ b/docs/source/en/model_doc/detr.md @@ -190,7 +190,6 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DetrFeatureExtractor - __call__ - - pad_and_create_pixel_mask - post_process_object_detection - post_process_semantic_segmentation - post_process_instance_segmentation diff --git a/docs/source/en/preprocessing.md b/docs/source/en/preprocessing.md index 1f8eb4b154..c90c6c2a22 100644 --- a/docs/source/en/preprocessing.md +++ b/docs/source/en/preprocessing.md @@ -62,8 +62,8 @@ Then pass your text to the tokenizer: ```py >>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") >>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} ``` @@ -93,14 +93,14 @@ If there are several sentences you want to preprocess, pass them as a list to th ... ] >>> encoded_inputs = tokenizer(batch_sentences) >>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]} ``` @@ -118,14 +118,14 @@ Set the `padding` parameter to `True` to pad the shorter sequences in the batch ... ] >>> encoded_input = tokenizer(batch_sentences, padding=True) >>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} ``` @@ -145,14 +145,14 @@ Set the `truncation` parameter to `True` to truncate a sequence to the maximum l ... ] >>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) >>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} ``` @@ -181,10 +181,10 @@ Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for Tenso >>> print(encoded_input) {'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} @@ -203,11 +203,11 @@ Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for Tenso array([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - dtype=int32)>, + dtype=int32)>, 'token_type_ids': , + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': >> def collate_fn(batch): ... pixel_values = [item["pixel_values"] for item in batch] -... encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt") +... encoding = image_processor.pad(pixel_values, return_tensors="pt") ... labels = [item["labels"] for item in batch] ... batch = {} ... batch["pixel_values"] = encoding["pixel_values"] diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 56d6a1d8c4..457d96bfd3 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -305,7 +305,7 @@ to indicate which pixels are real (1) and which are padding (0). ```py >>> def collate_fn(batch): ... pixel_values = [item["pixel_values"] for item in batch] -... encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt") +... encoding = image_processor.pad(pixel_values, return_tensors="pt") ... labels = [item["labels"] for item in batch] ... batch = {} ... batch["pixel_values"] = encoding["pixel_values"] diff --git a/docs/source/ko/preprocessing.md b/docs/source/ko/preprocessing.md index a7597f23a0..7a9d298738 100644 --- a/docs/source/ko/preprocessing.md +++ b/docs/source/ko/preprocessing.md @@ -62,8 +62,8 @@ pip install datasets ```py >>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") >>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], +{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} ``` @@ -93,14 +93,14 @@ pip install datasets ... ] >>> encoded_inputs = tokenizer(batch_sentences) >>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]} ``` @@ -118,14 +118,14 @@ pip install datasets ... ] >>> encoded_input = tokenizer(batch_sentences, padding=True) >>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} ``` @@ -145,14 +145,14 @@ pip install datasets ... ] >>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) >>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} ``` @@ -181,10 +181,10 @@ pip install datasets >>> print(encoded_input) {'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} @@ -203,11 +203,11 @@ pip install datasets array([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - dtype=int32)>, + dtype=int32)>, 'token_type_ids': , + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': @@ -335,17 +335,17 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ๋Š” ์ด๋ฏธ์ง€ ์ฆ๊ฐ• ๊ธฐ๋ฒ•์„ ๋ช‡ ๊ฐ€์ง€ ์ ์šฉํ•œ ๋’ค์— ํ•  ์ˆ˜๋„ ์žˆ์Šต๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ ๋ฐ ์ด๋ฏธ์ง€ ์ฆ๊ฐ•์€ ๋ชจ๋‘ ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ๋ฅผ ๋ณ€ํ˜•ํ•˜์ง€๋งŒ, ์„œ๋กœ ๋‹ค๋ฅธ ๋ชฉ์ ์„ ๊ฐ€์ง€๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค: -* ์ด๋ฏธ์ง€ ์ฆ๊ฐ•์€ ๊ณผ์ ํ•ฉ(over-fitting)์„ ๋ฐฉ์ง€ํ•˜๊ณ  ๋ชจ๋ธ์˜ ๊ฒฌ๊ณ ํ•จ(resiliency)์„ ๋†’์ด๋Š” ๋ฐ ๋„์›€์ด ๋˜๋Š” ๋ฐฉ์‹์œผ๋กœ ์ด๋ฏธ์ง€๋ฅผ ์ˆ˜์ •ํ•ฉ๋‹ˆ๋‹ค. -๋ฐ๊ธฐ์™€ ์ƒ‰์ƒ ์กฐ์ •, ์ž๋ฅด๊ธฐ, ํšŒ์ „, ํฌ๊ธฐ ์กฐ์ •, ํ™•๋Œ€/์ถ•์†Œ ๋“ฑ ๋‹ค์–‘ํ•œ ๋ฐฉ๋ฒ•์œผ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ์ฆ๊ฐ•ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +* ์ด๋ฏธ์ง€ ์ฆ๊ฐ•์€ ๊ณผ์ ํ•ฉ(over-fitting)์„ ๋ฐฉ์ง€ํ•˜๊ณ  ๋ชจ๋ธ์˜ ๊ฒฌ๊ณ ํ•จ(resiliency)์„ ๋†’์ด๋Š” ๋ฐ ๋„์›€์ด ๋˜๋Š” ๋ฐฉ์‹์œผ๋กœ ์ด๋ฏธ์ง€๋ฅผ ์ˆ˜์ •ํ•ฉ๋‹ˆ๋‹ค. +๋ฐ๊ธฐ์™€ ์ƒ‰์ƒ ์กฐ์ •, ์ž๋ฅด๊ธฐ, ํšŒ์ „, ํฌ๊ธฐ ์กฐ์ •, ํ™•๋Œ€/์ถ•์†Œ ๋“ฑ ๋‹ค์–‘ํ•œ ๋ฐฉ๋ฒ•์œผ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ์ฆ๊ฐ•ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์ฆ๊ฐ•์œผ๋กœ ์ด๋ฏธ์ง€์˜ ์˜๋ฏธ๊ฐ€ ๋ฐ”๋€Œ์ง€ ์•Š๋„๋ก ์ฃผ์˜ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. -* ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ๋Š” ์ด๋ฏธ์ง€๊ฐ€ ๋ชจ๋ธ์ด ์˜ˆ์ƒํ•˜๋Š” ์ž…๋ ฅ ํ˜•์‹๊ณผ ์ผ์น˜ํ•˜๋„๋ก ๋ณด์žฅํ•ฉ๋‹ˆ๋‹ค. +* ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ๋Š” ์ด๋ฏธ์ง€๊ฐ€ ๋ชจ๋ธ์ด ์˜ˆ์ƒํ•˜๋Š” ์ž…๋ ฅ ํ˜•์‹๊ณผ ์ผ์น˜ํ•˜๋„๋ก ๋ณด์žฅํ•ฉ๋‹ˆ๋‹ค. ์ปดํ“จํ„ฐ ๋น„์ „ ๋ชจ๋ธ์„ ๋ฏธ์„ธ ์กฐ์ •ํ•  ๋•Œ ์ด๋ฏธ์ง€๋Š” ๋ชจ๋ธ์ด ์ดˆ๊ธฐ์— ํ›ˆ๋ จ๋  ๋•Œ์™€ ์ •ํ™•ํžˆ ๊ฐ™์€ ๋ฐฉ์‹์œผ๋กœ ์ „์ฒ˜๋ฆฌ๋˜์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€ ์ฆ๊ฐ•์—๋Š” ์›ํ•˜๋Š” ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ๋ฌด์—‡์ด๋“  ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ์—๋Š” ๋ชจ๋ธ๊ณผ ์—ฐ๊ฒฐ๋œ `ImageProcessor`๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. -[food101](https://huggingface.co/datasets/food101) ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๊ฐ€์ ธ์™€์„œ ์ปดํ“จํ„ฐ ๋น„์ „ ๋ฐ์ดํ„ฐ ์„ธํŠธ์—์„œ ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ๋ฅผ ์–ด๋–ป๊ฒŒ ์‚ฌ์šฉํ•˜๋Š”์ง€ ์•Œ์•„๋ณด์„ธ์š”. +[food101](https://huggingface.co/datasets/food101) ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๊ฐ€์ ธ์™€์„œ ์ปดํ“จํ„ฐ ๋น„์ „ ๋ฐ์ดํ„ฐ ์„ธํŠธ์—์„œ ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ๋ฅผ ์–ด๋–ป๊ฒŒ ์‚ฌ์šฉํ•˜๋Š”์ง€ ์•Œ์•„๋ณด์„ธ์š”. ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๋ถˆ๋Ÿฌ์˜ค๋Š” ๋ฐฉ๋ฒ•์€ ๐Ÿค— [๋ฐ์ดํ„ฐ ์„ธํŠธ ํŠœํ† ๋ฆฌ์–ผ](https://huggingface.co/docs/datasets/load_hub.html)์„ ์ฐธ๊ณ ํ•˜์„ธ์š”. @@ -382,7 +382,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], ๋‹ค๋ฅธ ๋ฐ์ดํ„ฐ ์ฆ๊ฐ• ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ์‚ฌ์šฉํ•ด๋ณด๊ณ  ์‹ถ๋‹ค๋ฉด, [Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) ๋˜๋Š” [Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)์—์„œ ์–ด๋–ป๊ฒŒ ์‚ฌ์šฉํ•˜๋Š”์ง€ ๋ฐฐ์šธ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. 1. [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html)๋กœ [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html)์™€ [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html) ๋“ฑ ๋ณ€ํ™˜์„ ๋ช‡ ๊ฐ€์ง€ ์—ฐ๊ฒฐํ•˜์„ธ์š”. -์ฐธ๊ณ ๋กœ ํฌ๊ธฐ ์กฐ์ •์— ํ•„์š”ํ•œ ์ด๋ฏธ์ง€์˜ ํฌ๊ธฐ ์š”๊ตฌ์‚ฌํ•ญ์€ `image_processor`์—์„œ ๊ฐ€์ ธ์˜ฌ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +์ฐธ๊ณ ๋กœ ํฌ๊ธฐ ์กฐ์ •์— ํ•„์š”ํ•œ ์ด๋ฏธ์ง€์˜ ํฌ๊ธฐ ์š”๊ตฌ์‚ฌํ•ญ์€ `image_processor`์—์„œ ๊ฐ€์ ธ์˜ฌ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ผ๋ถ€ ๋ชจ๋ธ์€ ์ •ํ™•ํ•œ ๋†’์ด์™€ ๋„ˆ๋น„๋ฅผ ์š”๊ตฌํ•˜์ง€๋งŒ, ์ œ์ผ ์งง์€ ๋ณ€์˜ ๊ธธ์ด(`shortest_edge`)๋งŒ ์ •์˜๋œ ๋ชจ๋ธ๋„ ์žˆ์Šต๋‹ˆ๋‹ค. ```py @@ -397,8 +397,8 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], >>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)]) ``` -2. ๋ชจ๋ธ์€ ์ž…๋ ฅ์œผ๋กœ [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values)๋ฅผ ๋ฐ›์Šต๋‹ˆ๋‹ค. -`ImageProcessor`๋Š” ์ด๋ฏธ์ง€ ์ •๊ทœํ™” ๋ฐ ์ ์ ˆํ•œ ํ…์„œ ์ƒ์„ฑ์„ ์ฒ˜๋ฆฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +2. ๋ชจ๋ธ์€ ์ž…๋ ฅ์œผ๋กœ [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values)๋ฅผ ๋ฐ›์Šต๋‹ˆ๋‹ค. +`ImageProcessor`๋Š” ์ด๋ฏธ์ง€ ์ •๊ทœํ™” ๋ฐ ์ ์ ˆํ•œ ํ…์„œ ์ƒ์„ฑ์„ ์ฒ˜๋ฆฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๋ฐฐ์น˜ ์ด๋ฏธ์ง€์— ๋Œ€ํ•œ ์ด๋ฏธ์ง€ ์ฆ๊ฐ• ๋ฐ ์ด๋ฏธ์ง€ ์ „์ฒ˜๋ฆฌ๋ฅผ ๊ฒฐํ•ฉํ•˜๊ณ  `pixel_values`๋ฅผ ์ƒ์„ฑํ•˜๋Š” ํ•จ์ˆ˜๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค: ```py @@ -410,9 +410,9 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], -์œ„์˜ ์˜ˆ์—์„œ๋Š” ์ด๋ฏธ์ง€ ์ฆ๊ฐ• ์ค‘์— ์ด๋ฏธ์ง€ ํฌ๊ธฐ๋ฅผ ์กฐ์ •ํ–ˆ๊ธฐ ๋•Œ๋ฌธ์— `do_resize=False`๋กœ ์„ค์ •ํ•˜๊ณ , ํ•ด๋‹น `image_processor`์—์„œ `size` ์†์„ฑ์„ ํ™œ์šฉํ–ˆ์Šต๋‹ˆ๋‹ค. -์ด๋ฏธ์ง€ ์ฆ๊ฐ• ์ค‘์— ์ด๋ฏธ์ง€ ํฌ๊ธฐ๋ฅผ ์กฐ์ •ํ•˜์ง€ ์•Š์€ ๊ฒฝ์šฐ ์ด ๋งค๊ฐœ๋ณ€์ˆ˜๋ฅผ ์ƒ๋žตํ•˜์„ธ์š”. -๊ธฐ๋ณธ์ ์œผ๋กœ๋Š” `ImageProcessor`๊ฐ€ ํฌ๊ธฐ ์กฐ์ •์„ ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค. +์œ„์˜ ์˜ˆ์—์„œ๋Š” ์ด๋ฏธ์ง€ ์ฆ๊ฐ• ์ค‘์— ์ด๋ฏธ์ง€ ํฌ๊ธฐ๋ฅผ ์กฐ์ •ํ–ˆ๊ธฐ ๋•Œ๋ฌธ์— `do_resize=False`๋กœ ์„ค์ •ํ•˜๊ณ , ํ•ด๋‹น `image_processor`์—์„œ `size` ์†์„ฑ์„ ํ™œ์šฉํ–ˆ์Šต๋‹ˆ๋‹ค. +์ด๋ฏธ์ง€ ์ฆ๊ฐ• ์ค‘์— ์ด๋ฏธ์ง€ ํฌ๊ธฐ๋ฅผ ์กฐ์ •ํ•˜์ง€ ์•Š์€ ๊ฒฝ์šฐ ์ด ๋งค๊ฐœ๋ณ€์ˆ˜๋ฅผ ์ƒ๋žตํ•˜์„ธ์š”. +๊ธฐ๋ณธ์ ์œผ๋กœ๋Š” `ImageProcessor`๊ฐ€ ํฌ๊ธฐ ์กฐ์ •์„ ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค. ์ฆ๊ฐ• ๋ณ€ํ™˜ ๊ณผ์ •์—์„œ ์ด๋ฏธ์ง€๋ฅผ ์ •๊ทœํ™”ํ•˜๋ ค๋ฉด `image_processor.image_mean` ๋ฐ `image_processor.image_std` ๊ฐ’์„ ์‚ฌ์šฉํ•˜์„ธ์š”. @@ -424,7 +424,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], >>> dataset.set_transform(transforms) ``` -4. ์ด์ œ ์ด๋ฏธ์ง€์— ์ ‘๊ทผํ•˜๋ฉด ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ๊ฐ€ `pixel_values`๋ฅผ ์ถ”๊ฐ€ํ•œ ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +4. ์ด์ œ ์ด๋ฏธ์ง€์— ์ ‘๊ทผํ•˜๋ฉด ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ๊ฐ€ `pixel_values`๋ฅผ ์ถ”๊ฐ€ํ•œ ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๋“œ๋””์–ด ์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๋ชจ๋ธ์— ์ „๋‹ฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค! ```py @@ -447,21 +447,21 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], -`ImageProcessor`๋Š” ๊ฐ์ฒด ๊ฐ์ง€, ์‹œ๋งจํ‹ฑ ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜(semantic segmentation), ์ธ์Šคํ„ด์Šค ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜(instance segmentation), ํŒŒ๋†‰ํ‹ฑ ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜(panoptic segmentation)๊ณผ ๊ฐ™์€ ์ž‘์—…์— ๋Œ€ํ•œ ํ›„์ฒ˜๋ฆฌ ๋ฐฉ๋ฒ•์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค. +`ImageProcessor`๋Š” ๊ฐ์ฒด ๊ฐ์ง€, ์‹œ๋งจํ‹ฑ ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜(semantic segmentation), ์ธ์Šคํ„ด์Šค ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜(instance segmentation), ํŒŒ๋†‰ํ‹ฑ ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜(panoptic segmentation)๊ณผ ๊ฐ™์€ ์ž‘์—…์— ๋Œ€ํ•œ ํ›„์ฒ˜๋ฆฌ ๋ฐฉ๋ฒ•์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค. ์ด๋Ÿฌํ•œ ๋ฐฉ๋ฒ•์€ ๋ชจ๋ธ์˜ ์›์‹œ ์ถœ๋ ฅ์„ ๊ฒฝ๊ณ„ ์ƒ์ž๋‚˜ ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜ ๋งต๊ณผ ๊ฐ™์€ ์˜๋ฏธ ์žˆ๋Š” ์˜ˆ์ธก์œผ๋กœ ๋ณ€ํ™˜ํ•ด์ค๋‹ˆ๋‹ค. ### ํŒจ๋”ฉ[[pad]] -์˜ˆ๋ฅผ ๋“ค์–ด, [DETR](./model_doc/detr)์™€ ๊ฐ™์€ ๊ฒฝ์šฐ์—๋Š” ๋ชจ๋ธ์ด ํ›ˆ๋ จํ•  ๋•Œ ํฌ๊ธฐ ์กฐ์ • ์ฆ๊ฐ•์„ ์ ์šฉํ•ฉ๋‹ˆ๋‹ค. -์ด๋กœ ์ธํ•ด ๋ฐฐ์น˜ ๋‚ด ์ด๋ฏธ์ง€ ํฌ๊ธฐ๊ฐ€ ๋‹ฌ๋ผ์งˆ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. -[`DetrImageProcessor`]์˜ [`DetrImageProcessor.pad_and_create_pixel_mask`]๋ฅผ ์‚ฌ์šฉํ•˜๊ณ  ์‚ฌ์šฉ์ž ์ •์˜ `collate_fn`์„ ์ •์˜ํ•ด์„œ ๋ฐฐ์น˜ ์ด๋ฏธ์ง€๋ฅผ ์ฒ˜๋ฆฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +์˜ˆ๋ฅผ ๋“ค์–ด, [DETR](./model_doc/detr)์™€ ๊ฐ™์€ ๊ฒฝ์šฐ์—๋Š” ๋ชจ๋ธ์ด ํ›ˆ๋ จํ•  ๋•Œ ํฌ๊ธฐ ์กฐ์ • ์ฆ๊ฐ•์„ ์ ์šฉํ•ฉ๋‹ˆ๋‹ค. +์ด๋กœ ์ธํ•ด ๋ฐฐ์น˜ ๋‚ด ์ด๋ฏธ์ง€ ํฌ๊ธฐ๊ฐ€ ๋‹ฌ๋ผ์งˆ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +[`DetrImageProcessor`]์˜ [`DetrImageProcessor.pad`]๋ฅผ ์‚ฌ์šฉํ•˜๊ณ  ์‚ฌ์šฉ์ž ์ •์˜ `collate_fn`์„ ์ •์˜ํ•ด์„œ ๋ฐฐ์น˜ ์ด๋ฏธ์ง€๋ฅผ ์ฒ˜๋ฆฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ```py >>> def collate_fn(batch): ... pixel_values = [item["pixel_values"] for item in batch] -... encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt") +... encoding = image_processor.pad(pixel_values, return_tensors="pt") ... labels = [item["labels"] for item in batch] ... batch = {} ... batch["pixel_values"] = encoding["pixel_values"] @@ -472,10 +472,10 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], ## ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ[[multimodal]] -๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ์ž…๋ ฅ์ด ํ•„์š”ํ•œ ์ž‘์—…์˜ ๊ฒฝ์šฐ, ๋ชจ๋ธ์— ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ์ค€๋น„ํ•˜๊ธฐ ์œ„ํ•œ [ํ”„๋กœ์„ธ์„œ](main_classes/processors)๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. +๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ ์ž…๋ ฅ์ด ํ•„์š”ํ•œ ์ž‘์—…์˜ ๊ฒฝ์šฐ, ๋ชจ๋ธ์— ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ์ค€๋น„ํ•˜๊ธฐ ์œ„ํ•œ [ํ”„๋กœ์„ธ์„œ](main_classes/processors)๊ฐ€ ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. ํ”„๋กœ์„ธ์„œ๋Š” ํ† ํฌ๋‚˜์ด์ €์™€ ํŠน์„ฑ ์ถ”์ถœ๊ธฐ์™€ ๊ฐ™์€ ๋‘ ๊ฐ€์ง€ ์ฒ˜๋ฆฌ ๊ฐ์ฒด๋ฅผ ๊ฒฐํ•ฉํ•ฉ๋‹ˆ๋‹ค. -[LJ Speech](https://huggingface.co/datasets/lj_speech) ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๊ฐ€์ ธ์™€์„œ ์ž๋™ ์Œ์„ฑ ์ธ์‹(ASR)์„ ์œ„ํ•œ ํ”„๋กœ์„ธ์„œ๋ฅผ ์‚ฌ์šฉํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ํ™•์ธํ•˜์„ธ์š”. +[LJ Speech](https://huggingface.co/datasets/lj_speech) ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๊ฐ€์ ธ์™€์„œ ์ž๋™ ์Œ์„ฑ ์ธ์‹(ASR)์„ ์œ„ํ•œ ํ”„๋กœ์„ธ์„œ๋ฅผ ์‚ฌ์šฉํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ํ™•์ธํ•˜์„ธ์š”. (๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๊ฐ€์ ธ์˜ค๋Š” ๋ฐฉ๋ฒ•์— ๋Œ€ํ•œ ์ž์„ธํ•œ ๋‚ด์šฉ์€ ๐Ÿค— [๋ฐ์ดํ„ฐ ์„ธํŠธ ํŠœํ† ๋ฆฌ์–ผ](https://huggingface.co/docs/datasets/load_hub.html)์—์„œ ๋ณผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.) ```py @@ -517,7 +517,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") ``` -1. `array`์— ๋“ค์–ด ์žˆ๋Š” ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ๋ฅผ `input_values`๋กœ ๋ณ€ํ™˜ํ•˜๊ณ  `text`๋ฅผ ํ† ํฐํ™”ํ•˜์—ฌ `labels`๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ํ•จ์ˆ˜๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค. +1. `array`์— ๋“ค์–ด ์žˆ๋Š” ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ๋ฅผ `input_values`๋กœ ๋ณ€ํ™˜ํ•˜๊ณ  `text`๋ฅผ ํ† ํฐํ™”ํ•˜์—ฌ `labels`๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ํ•จ์ˆ˜๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค. ๋ชจ๋ธ์˜ ์ž…๋ ฅ์€ ๋‹ค์Œ๊ณผ ๊ฐ™์Šต๋‹ˆ๋‹ค: ```py @@ -535,5 +535,5 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], >>> prepare_dataset(lj_speech[0]) ``` -์ด์ œ ํ”„๋กœ์„ธ์„œ๊ฐ€ `input_values`์™€ `labels`๋ฅผ ์ถ”๊ฐ€ํ•˜๊ณ , ์ƒ˜ํ”Œ๋ง ๋ ˆ์ดํŠธ๋„ ์˜ฌ๋ฐ”๋ฅด๊ฒŒ 16kHz๋กœ ๋‹ค์šด์ƒ˜ํ”Œ๋งํ–ˆ์Šต๋‹ˆ๋‹ค. -๋“œ๋””์–ด ์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๋ชจ๋ธ์— ์ „๋‹ฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค! \ No newline at end of file +์ด์ œ ํ”„๋กœ์„ธ์„œ๊ฐ€ `input_values`์™€ `labels`๋ฅผ ์ถ”๊ฐ€ํ•˜๊ณ , ์ƒ˜ํ”Œ๋ง ๋ ˆ์ดํŠธ๋„ ์˜ฌ๋ฐ”๋ฅด๊ฒŒ 16kHz๋กœ ๋‹ค์šด์ƒ˜ํ”Œ๋งํ–ˆ์Šต๋‹ˆ๋‹ค. +๋“œ๋””์–ด ์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๋ชจ๋ธ์— ์ „๋‹ฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค! diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md index 9eb3f4f74a..cb573ed4e7 100644 --- a/docs/source/ko/tasks/object_detection.md +++ b/docs/source/ko/tasks/object_detection.md @@ -18,10 +18,10 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -๊ฐ์ฒด ํƒ์ง€๋Š” ์ด๋ฏธ์ง€์—์„œ ์ธ์Šคํ„ด์Šค(์˜ˆ: ์‚ฌ๋žŒ, ๊ฑด๋ฌผ ๋˜๋Š” ์ž๋™์ฐจ)๋ฅผ ๊ฐ์ง€ํ•˜๋Š” ์ปดํ“จํ„ฐ ๋น„์ „ ์ž‘์—…์ž…๋‹ˆ๋‹ค. ๊ฐ์ฒด ํƒ์ง€ ๋ชจ๋ธ์€ ์ด๋ฏธ์ง€๋ฅผ ์ž…๋ ฅ์œผ๋กœ ๋ฐ›๊ณ  ํƒ์ง€๋œ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค์˜ ์ขŒํ‘œ์™€ ๊ด€๋ จ๋œ ๋ ˆ์ด๋ธ”์„ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค. -ํ•˜๋‚˜์˜ ์ด๋ฏธ์ง€์—๋Š” ์—ฌ๋Ÿฌ ๊ฐ์ฒด๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์œผ๋ฉฐ ๊ฐ๊ฐ์€ ์ž์ฒด์ ์ธ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค์™€ ๋ ˆ์ด๋ธ”์„ ๊ฐ€์งˆ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค(์˜ˆ: ์ฐจ์™€ ๊ฑด๋ฌผ์ด ์žˆ๋Š” ์ด๋ฏธ์ง€). -๋˜ํ•œ ๊ฐ ๊ฐ์ฒด๋Š” ์ด๋ฏธ์ง€์˜ ๋‹ค๋ฅธ ๋ถ€๋ถ„์— ์กด์žฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค(์˜ˆ: ์ด๋ฏธ์ง€์— ์—ฌ๋Ÿฌ ๋Œ€์˜ ์ฐจ๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์Œ). -์ด ์ž‘์—…์€ ๋ณดํ–‰์ž, ๋„๋กœ ํ‘œ์ง€ํŒ, ์‹ ํ˜ธ๋“ฑ๊ณผ ๊ฐ™์€ ๊ฒƒ๋“ค์„ ๊ฐ์ง€ํ•˜๋Š” ์ž์œจ ์ฃผํ–‰์— ์ผ๋ฐ˜์ ์œผ๋กœ ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค. +๊ฐ์ฒด ํƒ์ง€๋Š” ์ด๋ฏธ์ง€์—์„œ ์ธ์Šคํ„ด์Šค(์˜ˆ: ์‚ฌ๋žŒ, ๊ฑด๋ฌผ ๋˜๋Š” ์ž๋™์ฐจ)๋ฅผ ๊ฐ์ง€ํ•˜๋Š” ์ปดํ“จํ„ฐ ๋น„์ „ ์ž‘์—…์ž…๋‹ˆ๋‹ค. ๊ฐ์ฒด ํƒ์ง€ ๋ชจ๋ธ์€ ์ด๋ฏธ์ง€๋ฅผ ์ž…๋ ฅ์œผ๋กœ ๋ฐ›๊ณ  ํƒ์ง€๋œ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค์˜ ์ขŒํ‘œ์™€ ๊ด€๋ จ๋œ ๋ ˆ์ด๋ธ”์„ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค. +ํ•˜๋‚˜์˜ ์ด๋ฏธ์ง€์—๋Š” ์—ฌ๋Ÿฌ ๊ฐ์ฒด๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์œผ๋ฉฐ ๊ฐ๊ฐ์€ ์ž์ฒด์ ์ธ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค์™€ ๋ ˆ์ด๋ธ”์„ ๊ฐ€์งˆ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค(์˜ˆ: ์ฐจ์™€ ๊ฑด๋ฌผ์ด ์žˆ๋Š” ์ด๋ฏธ์ง€). +๋˜ํ•œ ๊ฐ ๊ฐ์ฒด๋Š” ์ด๋ฏธ์ง€์˜ ๋‹ค๋ฅธ ๋ถ€๋ถ„์— ์กด์žฌํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค(์˜ˆ: ์ด๋ฏธ์ง€์— ์—ฌ๋Ÿฌ ๋Œ€์˜ ์ฐจ๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์Œ). +์ด ์ž‘์—…์€ ๋ณดํ–‰์ž, ๋„๋กœ ํ‘œ์ง€ํŒ, ์‹ ํ˜ธ๋“ฑ๊ณผ ๊ฐ™์€ ๊ฒƒ๋“ค์„ ๊ฐ์ง€ํ•˜๋Š” ์ž์œจ ์ฃผํ–‰์— ์ผ๋ฐ˜์ ์œผ๋กœ ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค. ๋‹ค๋ฅธ ์‘์šฉ ๋ถ„์•ผ๋กœ๋Š” ์ด๋ฏธ์ง€ ๋‚ด ๊ฐ์ฒด ์ˆ˜ ๊ณ„์‚ฐ ๋ฐ ์ด๋ฏธ์ง€ ๊ฒ€์ƒ‰ ๋“ฑ์ด ์žˆ์Šต๋‹ˆ๋‹ค. ์ด ๊ฐ€์ด๋“œ์—์„œ ๋‹ค์Œ์„ ๋ฐฐ์šธ ๊ฒƒ์ž…๋‹ˆ๋‹ค: @@ -45,7 +45,7 @@ rendered properly in your Markdown viewer. pip install -q datasets transformers evaluate timm albumentations ``` -ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์—์„œ ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๊ฐ€์ ธ์˜ค๊ธฐ ์œ„ํ•œ ๐Ÿค— Datasets๊ณผ ๋ชจ๋ธ์„ ํ•™์Šตํ•˜๊ธฐ ์œ„ํ•œ ๐Ÿค— Transformers, ๋ฐ์ดํ„ฐ๋ฅผ ์ฆ๊ฐ•ํ•˜๊ธฐ ์œ„ํ•œ `albumentations`๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. +ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์—์„œ ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๊ฐ€์ ธ์˜ค๊ธฐ ์œ„ํ•œ ๐Ÿค— Datasets๊ณผ ๋ชจ๋ธ์„ ํ•™์Šตํ•˜๊ธฐ ์œ„ํ•œ ๐Ÿค— Transformers, ๋ฐ์ดํ„ฐ๋ฅผ ์ฆ๊ฐ•ํ•˜๊ธฐ ์œ„ํ•œ `albumentations`๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. DETR ๋ชจ๋ธ์˜ ํ•ฉ์„ฑ๊ณฑ ๋ฐฑ๋ณธ์„ ๊ฐ€์ ธ์˜ค๊ธฐ ์œ„ํ•ด์„œ๋Š” ํ˜„์žฌ `timm`์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. ์ปค๋ฎค๋‹ˆํ‹ฐ์— ๋ชจ๋ธ์„ ์—…๋กœ๋“œํ•˜๊ณ  ๊ณต์œ ํ•  ์ˆ˜ ์žˆ๋„๋ก Hugging Face ๊ณ„์ •์— ๋กœ๊ทธ์ธํ•˜๋Š” ๊ฒƒ์„ ๊ถŒ์žฅํ•ฉ๋‹ˆ๋‹ค. ํ”„๋กฌํ”„ํŠธ๊ฐ€ ๋‚˜ํƒ€๋‚˜๋ฉด ํ† ํฐ์„ ์ž…๋ ฅํ•˜์—ฌ ๋กœ๊ทธ์ธํ•˜์„ธ์š”: @@ -110,7 +110,7 @@ DatasetDict({ - `bbox`: ๊ฐ์ฒด์˜ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค ([COCO ํฌ๋งท](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco)์œผ๋กœ) - `category`: ๊ฐ์ฒด์˜ ์นดํ…Œ๊ณ ๋ฆฌ, ๊ฐ€๋Šฅํ•œ ๊ฐ’์œผ๋กœ๋Š” `Coverall (0)`, `Face_Shield (1)`, `Gloves (2)`, `Goggles (3)` ๋ฐ `Mask (4)` ๊ฐ€ ํฌํ•จ๋ฉ๋‹ˆ๋‹ค. -`bbox` ํ•„๋“œ๊ฐ€ DETR ๋ชจ๋ธ์ด ์š”๊ตฌํ•˜๋Š” COCO ํ˜•์‹์„ ๋”ฐ๋ฅธ๋‹ค๋Š” ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +`bbox` ํ•„๋“œ๊ฐ€ DETR ๋ชจ๋ธ์ด ์š”๊ตฌํ•˜๋Š” COCO ํ˜•์‹์„ ๋”ฐ๋ฅธ๋‹ค๋Š” ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ `objects` ๋‚ด๋ถ€์˜ ํ•„๋“œ ๊ทธ๋ฃน์€ DETR์ด ์š”๊ตฌํ•˜๋Š” ์–ด๋…ธํ…Œ์ด์…˜ ํ˜•์‹๊ณผ ๋‹ค๋ฆ…๋‹ˆ๋‹ค. ๋”ฐ๋ผ์„œ ์ด ๋ฐ์ดํ„ฐ๋ฅผ ํ•™์Šต์— ์‚ฌ์šฉํ•˜๊ธฐ ์ „์— ์ „์ฒ˜๋ฆฌ๋ฅผ ์ ์šฉํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋ฐ์ดํ„ฐ๋ฅผ ๋” ์ž˜ ์ดํ•ดํ•˜๊ธฐ ์œ„ํ•ด์„œ ๋ฐ์ดํ„ฐ ์„ธํŠธ์—์„œ ํ•œ ๊ฐ€์ง€ ์˜ˆ์‹œ๋ฅผ ์‹œ๊ฐํ™”ํ•˜์„ธ์š”. @@ -143,13 +143,13 @@ DatasetDict({ CPPE-5 Image Example -๋ฐ”์šด๋”ฉ ๋ฐ•์Šค์™€ ์—ฐ๊ฒฐ๋œ ๋ ˆ์ด๋ธ”์„ ์‹œ๊ฐํ™”ํ•˜๋ ค๋ฉด ๋ฐ์ดํ„ฐ ์„ธํŠธ์˜ ๋ฉ”ํƒ€ ๋ฐ์ดํ„ฐ, ํŠนํžˆ `category` ํ•„๋“œ์—์„œ ๋ ˆ์ด๋ธ”์„ ๊ฐ€์ ธ์™€์•ผ ํ•ฉ๋‹ˆ๋‹ค. -๋˜ํ•œ ๋ ˆ์ด๋ธ” ID๋ฅผ ๋ ˆ์ด๋ธ” ํด๋ž˜์Šค์— ๋งคํ•‘ํ•˜๋Š” `id2label`๊ณผ ๋ฐ˜๋Œ€๋กœ ๋งคํ•‘ํ•˜๋Š” `label2id` ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ๋งŒ๋“ค์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +๋ฐ”์šด๋”ฉ ๋ฐ•์Šค์™€ ์—ฐ๊ฒฐ๋œ ๋ ˆ์ด๋ธ”์„ ์‹œ๊ฐํ™”ํ•˜๋ ค๋ฉด ๋ฐ์ดํ„ฐ ์„ธํŠธ์˜ ๋ฉ”ํƒ€ ๋ฐ์ดํ„ฐ, ํŠนํžˆ `category` ํ•„๋“œ์—์„œ ๋ ˆ์ด๋ธ”์„ ๊ฐ€์ ธ์™€์•ผ ํ•ฉ๋‹ˆ๋‹ค. +๋˜ํ•œ ๋ ˆ์ด๋ธ” ID๋ฅผ ๋ ˆ์ด๋ธ” ํด๋ž˜์Šค์— ๋งคํ•‘ํ•˜๋Š” `id2label`๊ณผ ๋ฐ˜๋Œ€๋กœ ๋งคํ•‘ํ•˜๋Š” `label2id` ๋”•์…”๋„ˆ๋ฆฌ๋ฅผ ๋งŒ๋“ค์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋ชจ๋ธ์„ ์„ค์ •ํ•  ๋•Œ ์ด๋Ÿฌํ•œ ๋งคํ•‘์„ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ด๋Ÿฌํ•œ ๋งคํ•‘์€ ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์—์„œ ๋ชจ๋ธ์„ ๊ณต์œ ํ–ˆ์„ ๋•Œ ๋‹ค๋ฅธ ์‚ฌ๋žŒ๋“ค์ด ์žฌ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. -๋ฐ์ดํ„ฐ๋ฅผ ๋” ์ž˜ ์ดํ•ดํ•˜๊ธฐ ์œ„ํ•œ ์ตœ์ข… ๋‹จ๊ณ„๋กœ, ์ž ์žฌ์ ์ธ ๋ฌธ์ œ๋ฅผ ์ฐพ์•„๋ณด์„ธ์š”. -๊ฐ์ฒด ๊ฐ์ง€๋ฅผ ์œ„ํ•œ ๋ฐ์ดํ„ฐ ์„ธํŠธ์—์„œ ์ž์ฃผ ๋ฐœ์ƒํ•˜๋Š” ๋ฌธ์ œ ์ค‘ ํ•˜๋‚˜๋Š” ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค๊ฐ€ ์ด๋ฏธ์ง€์˜ ๊ฐ€์žฅ์ž๋ฆฌ๋ฅผ ๋„˜์–ด๊ฐ€๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. -์ด๋Ÿฌํ•œ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค๋ฅผ "๋„˜์–ด๊ฐ€๋Š” ๊ฒƒ(run away)"์€ ํ›ˆ๋ จ ์ค‘์— ์˜ค๋ฅ˜๋ฅผ ๋ฐœ์ƒ์‹œํ‚ฌ ์ˆ˜ ์žˆ๊ธฐ์— ์ด ๋‹จ๊ณ„์—์„œ ์ฒ˜๋ฆฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +๋ฐ์ดํ„ฐ๋ฅผ ๋” ์ž˜ ์ดํ•ดํ•˜๊ธฐ ์œ„ํ•œ ์ตœ์ข… ๋‹จ๊ณ„๋กœ, ์ž ์žฌ์ ์ธ ๋ฌธ์ œ๋ฅผ ์ฐพ์•„๋ณด์„ธ์š”. +๊ฐ์ฒด ๊ฐ์ง€๋ฅผ ์œ„ํ•œ ๋ฐ์ดํ„ฐ ์„ธํŠธ์—์„œ ์ž์ฃผ ๋ฐœ์ƒํ•˜๋Š” ๋ฌธ์ œ ์ค‘ ํ•˜๋‚˜๋Š” ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค๊ฐ€ ์ด๋ฏธ์ง€์˜ ๊ฐ€์žฅ์ž๋ฆฌ๋ฅผ ๋„˜์–ด๊ฐ€๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. +์ด๋Ÿฌํ•œ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค๋ฅผ "๋„˜์–ด๊ฐ€๋Š” ๊ฒƒ(run away)"์€ ํ›ˆ๋ จ ์ค‘์— ์˜ค๋ฅ˜๋ฅผ ๋ฐœ์ƒ์‹œํ‚ฌ ์ˆ˜ ์žˆ๊ธฐ์— ์ด ๋‹จ๊ณ„์—์„œ ์ฒ˜๋ฆฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ์ด ๋ฐ์ดํ„ฐ ์„ธํŠธ์—๋„ ๊ฐ™์€ ๋ฌธ์ œ๊ฐ€ ์žˆ๋Š” ๋ช‡ ๊ฐ€์ง€ ์˜ˆ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค. ์ด ๊ฐ€์ด๋“œ์—์„œ๋Š” ๊ฐ„๋‹จํ•˜๊ฒŒํ•˜๊ธฐ ์œ„ํ•ด ๋ฐ์ดํ„ฐ์—์„œ ์ด๋Ÿฌํ•œ ์ด๋ฏธ์ง€๋ฅผ ์ œ๊ฑฐํ•ฉ๋‹ˆ๋‹ค. ```py @@ -160,15 +160,15 @@ DatasetDict({ ## ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌํ•˜๊ธฐ [[preprocess-the-data]] -๋ชจ๋ธ์„ ๋ฏธ์„ธ ์กฐ์ • ํ•˜๋ ค๋ฉด, ๋ฏธ๋ฆฌ ํ•™์Šต๋œ ๋ชจ๋ธ์—์„œ ์‚ฌ์šฉํ•œ ์ „์ฒ˜๋ฆฌ ๋ฐฉ์‹๊ณผ ์ •ํ™•ํ•˜๊ฒŒ ์ผ์น˜ํ•˜๋„๋ก ์‚ฌ์šฉํ•  ๋ฐ์ดํ„ฐ๋ฅผ ์ „์ฒ˜๋ฆฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. -[`AutoImageProcessor`]๋Š” ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ๋ฅผ ์ฒ˜๋ฆฌํ•˜์—ฌ DETR ๋ชจ๋ธ์ด ํ•™์Šต์— ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋Š” `pixel_values`, `pixel_mask`, ๊ทธ๋ฆฌ๊ณ  `labels`๋ฅผ ์ƒ์„ฑํ•˜๋Š” ์ž‘์—…์„ ๋‹ด๋‹นํ•ฉ๋‹ˆ๋‹ค. +๋ชจ๋ธ์„ ๋ฏธ์„ธ ์กฐ์ • ํ•˜๋ ค๋ฉด, ๋ฏธ๋ฆฌ ํ•™์Šต๋œ ๋ชจ๋ธ์—์„œ ์‚ฌ์šฉํ•œ ์ „์ฒ˜๋ฆฌ ๋ฐฉ์‹๊ณผ ์ •ํ™•ํ•˜๊ฒŒ ์ผ์น˜ํ•˜๋„๋ก ์‚ฌ์šฉํ•  ๋ฐ์ดํ„ฐ๋ฅผ ์ „์ฒ˜๋ฆฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +[`AutoImageProcessor`]๋Š” ์ด๋ฏธ์ง€ ๋ฐ์ดํ„ฐ๋ฅผ ์ฒ˜๋ฆฌํ•˜์—ฌ DETR ๋ชจ๋ธ์ด ํ•™์Šต์— ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋Š” `pixel_values`, `pixel_mask`, ๊ทธ๋ฆฌ๊ณ  `labels`๋ฅผ ์ƒ์„ฑํ•˜๋Š” ์ž‘์—…์„ ๋‹ด๋‹นํ•ฉ๋‹ˆ๋‹ค. ์ด ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ์—๋Š” ๊ฑฑ์ •ํ•˜์ง€ ์•Š์•„๋„ ๋˜๋Š” ๋ช‡ ๊ฐ€์ง€ ์†์„ฑ์ด ์žˆ์Šต๋‹ˆ๋‹ค: - `image_mean = [0.485, 0.456, 0.406 ]` - `image_std = [0.229, 0.224, 0.225]` -์ด ๊ฐ’๋“ค์€ ๋ชจ๋ธ ์‚ฌ์ „ ํ›ˆ๋ จ ์ค‘ ์ด๋ฏธ์ง€๋ฅผ ์ •๊ทœํ™”ํ•˜๋Š” ๋ฐ ์‚ฌ์šฉ๋˜๋Š” ํ‰๊ท ๊ณผ ํ‘œ์ค€ ํŽธ์ฐจ์ž…๋‹ˆ๋‹ค. +์ด ๊ฐ’๋“ค์€ ๋ชจ๋ธ ์‚ฌ์ „ ํ›ˆ๋ จ ์ค‘ ์ด๋ฏธ์ง€๋ฅผ ์ •๊ทœํ™”ํ•˜๋Š” ๋ฐ ์‚ฌ์šฉ๋˜๋Š” ํ‰๊ท ๊ณผ ํ‘œ์ค€ ํŽธ์ฐจ์ž…๋‹ˆ๋‹ค. ์ด ๊ฐ’๋“ค์€ ์ถ”๋ก  ๋˜๋Š” ์‚ฌ์ „ ํ›ˆ๋ จ๋œ ์ด๋ฏธ์ง€ ๋ชจ๋ธ์„ ์„ธ๋ฐ€ํ•˜๊ฒŒ ์กฐ์ •ํ•  ๋•Œ ๋ณต์ œํ•ด์•ผ ํ•˜๋Š” ์ค‘์š”ํ•œ ๊ฐ’์ž…๋‹ˆ๋‹ค. ์‚ฌ์ „ ํ›ˆ๋ จ๋œ ๋ชจ๋ธ๊ณผ ๋™์ผํ•œ ์ฒดํฌํฌ์ธํŠธ์—์„œ ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ๋ฅผ ์ธ์Šคํ„ด์Šคํ™”ํ•ฉ๋‹ˆ๋‹ค. @@ -187,7 +187,7 @@ DatasetDict({ ์ฒซ์งธ๋กœ, ๋ชจ๋ธ์ด ํ•™์Šต ๋ฐ์ดํ„ฐ์— ๊ณผ์ ํ•ฉ ๋˜์ง€ ์•Š๋„๋ก ๋ฐ์ดํ„ฐ ์ฆ๊ฐ• ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์ค‘ ์•„๋ฌด๊ฑฐ๋‚˜ ์‚ฌ์šฉํ•˜์—ฌ ๋ณ€ํ™˜์„ ์ ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์—ฌ๊ธฐ์—์„œ๋Š” [Albumentations](https://albumentations.ai/docs/) ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค... ์ด ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋Š” ๋ณ€ํ™˜์„ ์ด๋ฏธ์ง€์— ์ ์šฉํ•˜๊ณ  ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค๋ฅผ ์ ์ ˆํ•˜๊ฒŒ ์—…๋ฐ์ดํŠธํ•˜๋„๋ก ๋ณด์žฅํ•ฉ๋‹ˆ๋‹ค. -๐Ÿค— Datasets ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ๋ฌธ์„œ์—๋Š” [๊ฐ์ฒด ํƒ์ง€๋ฅผ ์œ„ํ•ด ์ด๋ฏธ์ง€๋ฅผ ๋ณด๊ฐ•ํ•˜๋Š” ๋ฐฉ๋ฒ•์— ๋Œ€ํ•œ ์ž์„ธํ•œ ๊ฐ€์ด๋“œ](https://huggingface.co/docs/datasets/object_detection)๊ฐ€ ์žˆ์œผ๋ฉฐ, +๐Ÿค— Datasets ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ๋ฌธ์„œ์—๋Š” [๊ฐ์ฒด ํƒ์ง€๋ฅผ ์œ„ํ•ด ์ด๋ฏธ์ง€๋ฅผ ๋ณด๊ฐ•ํ•˜๋Š” ๋ฐฉ๋ฒ•์— ๋Œ€ํ•œ ์ž์„ธํ•œ ๊ฐ€์ด๋“œ](https://huggingface.co/docs/datasets/object_detection)๊ฐ€ ์žˆ์œผ๋ฉฐ, ์ด ์˜ˆ์ œ์™€ ์ •ํ™•ํžˆ ๋™์ผํ•œ ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. ์—ฌ๊ธฐ์„œ๋Š” ๊ฐ ์ด๋ฏธ์ง€๋ฅผ (480, 480) ํฌ๊ธฐ๋กœ ์กฐ์ •ํ•˜๊ณ , ์ขŒ์šฐ๋กœ ๋’ค์ง‘๊ณ , ๋ฐ๊ธฐ๋ฅผ ๋†’์ด๋Š” ๋™์ผํ•œ ์ ‘๊ทผ๋ฒ•์„ ์ ์šฉํ•ฉ๋‹ˆ๋‹ค: @@ -290,14 +290,14 @@ DatasetDict({ 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}} ``` -๊ฐ๊ฐ์˜ ์ด๋ฏธ์ง€๋ฅผ ์„ฑ๊ณต์ ์œผ๋กœ ์ฆ๊ฐ•ํ•˜๊ณ  ์ด๋ฏธ์ง€์˜ ์–ด๋…ธํ…Œ์ด์…˜์„ ์ค€๋น„ํ–ˆ์Šต๋‹ˆ๋‹ค. +๊ฐ๊ฐ์˜ ์ด๋ฏธ์ง€๋ฅผ ์„ฑ๊ณต์ ์œผ๋กœ ์ฆ๊ฐ•ํ•˜๊ณ  ์ด๋ฏธ์ง€์˜ ์–ด๋…ธํ…Œ์ด์…˜์„ ์ค€๋น„ํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์ „์ฒ˜๋ฆฌ๋Š” ์•„์ง ๋๋‚˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ๋งˆ์ง€๋ง‰ ๋‹จ๊ณ„๋กœ, ์ด๋ฏธ์ง€๋ฅผ ๋ฐฐ์น˜๋กœ ๋งŒ๋“ค ์‚ฌ์šฉ์ž ์ •์˜ `collate_fn`์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค. ํ•ด๋‹น ๋ฐฐ์น˜์—์„œ ๊ฐ€์žฅ ํฐ ์ด๋ฏธ์ง€์— ์ด๋ฏธ์ง€(ํ˜„์žฌ `pixel_values` ์ธ)๋ฅผ ํŒจ๋“œํ•˜๊ณ , ์‹ค์ œ ํ”ฝ์…€(1)๊ณผ ํŒจ๋”ฉ(0)์„ ๋‚˜ํƒ€๋‚ด๊ธฐ ์œ„ํ•ด ๊ทธ์— ํ•ด๋‹นํ•˜๋Š” ์ƒˆ๋กœ์šด `pixel_mask`๋ฅผ ์ƒ์„ฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ```py >>> def collate_fn(batch): ... pixel_values = [item["pixel_values"] for item in batch] -... encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt") +... encoding = image_processor.pad(pixel_values, return_tensors="pt") ... labels = [item["labels"] for item in batch] ... batch = {} ... batch["pixel_values"] = encoding["pixel_values"] @@ -318,7 +318,7 @@ DatasetDict({ 3. ๋ชจ๋ธ, ๋ฐ์ดํ„ฐ ์„ธํŠธ, ์ด๋ฏธ์ง€ ํ”„๋กœ์„ธ์„œ ๋ฐ ๋ฐ์ดํ„ฐ ์ฝœ๋ ˆ์ดํ„ฐ์™€ ํ•จ๊ป˜ [`Trainer`]์— ํ›ˆ๋ จ ์ธ์ˆ˜๋ฅผ ์ „๋‹ฌํ•ฉ๋‹ˆ๋‹ค. 4. [`~Trainer.train`]๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ๋ชจ๋ธ์„ ๋ฏธ์„ธ ์กฐ์ • ํ•ฉ๋‹ˆ๋‹ค. -์ „์ฒ˜๋ฆฌ์— ์‚ฌ์šฉํ•œ ์ฒดํฌํฌ์ธํŠธ์™€ ๋™์ผํ•œ ์ฒดํฌํฌ์ธํŠธ์—์„œ ๋ชจ๋ธ์„ ๊ฐ€์ ธ์˜ฌ ๋•Œ, ๋ฐ์ดํ„ฐ ์„ธํŠธ์˜ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์—์„œ ๋งŒ๋“  `label2id`์™€ `id2label` ๋งคํ•‘์„ ์ „๋‹ฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +์ „์ฒ˜๋ฆฌ์— ์‚ฌ์šฉํ•œ ์ฒดํฌํฌ์ธํŠธ์™€ ๋™์ผํ•œ ์ฒดํฌํฌ์ธํŠธ์—์„œ ๋ชจ๋ธ์„ ๊ฐ€์ ธ์˜ฌ ๋•Œ, ๋ฐ์ดํ„ฐ ์„ธํŠธ์˜ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ์—์„œ ๋งŒ๋“  `label2id`์™€ `id2label` ๋งคํ•‘์„ ์ „๋‹ฌํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋˜ํ•œ, `ignore_mismatched_sizes=True`๋ฅผ ์ง€์ •ํ•˜์—ฌ ๊ธฐ์กด ๋ถ„๋ฅ˜ ํ—ค๋“œ(๋ชจ๋ธ์—์„œ ๋ถ„๋ฅ˜์— ์‚ฌ์šฉ๋˜๋Š” ๋งˆ์ง€๋ง‰ ๋ ˆ์ด์–ด)๋ฅผ ์ƒˆ ๋ถ„๋ฅ˜ ํ—ค๋“œ๋กœ ๋Œ€์ฒดํ•ฉ๋‹ˆ๋‹ค. ```py @@ -333,7 +333,7 @@ DatasetDict({ ``` [`TrainingArguments`]์—์„œ `output_dir`์„ ์‚ฌ์šฉํ•˜์—ฌ ๋ชจ๋ธ์„ ์ €์žฅํ•  ์œ„์น˜๋ฅผ ์ง€์ •ํ•œ ๋‹ค์Œ, ํ•„์š”์— ๋”ฐ๋ผ ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ๊ตฌ์„ฑํ•˜์„ธ์š”. -์‚ฌ์šฉํ•˜์ง€ ์•Š๋Š” ์—ด์„ ์ œ๊ฑฐํ•˜์ง€ ์•Š๋„๋ก ์ฃผ์˜ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋งŒ์•ฝ `remove_unused_columns`๊ฐ€ `True`์ผ ๊ฒฝ์šฐ ์ด๋ฏธ์ง€ ์—ด์ด ์‚ญ์ œ๋ฉ๋‹ˆ๋‹ค. +์‚ฌ์šฉํ•˜์ง€ ์•Š๋Š” ์—ด์„ ์ œ๊ฑฐํ•˜์ง€ ์•Š๋„๋ก ์ฃผ์˜ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋งŒ์•ฝ `remove_unused_columns`๊ฐ€ `True`์ผ ๊ฒฝ์šฐ ์ด๋ฏธ์ง€ ์—ด์ด ์‚ญ์ œ๋ฉ๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€ ์—ด์ด ์—†๋Š” ๊ฒฝ์šฐ `pixel_values`๋ฅผ ์ƒ์„ฑํ•  ์ˆ˜ ์—†๊ธฐ ๋•Œ๋ฌธ์— `remove_unused_columns`๋ฅผ `False`๋กœ ์„ค์ •ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋ชจ๋ธ์„ Hub์— ์—…๋กœ๋“œํ•˜์—ฌ ๊ณต์œ ํ•˜๋ ค๋ฉด `push_to_hub`๋ฅผ `True`๋กœ ์„ค์ •ํ•˜์‹ญ์‹œ์˜ค(ํ—ˆ๊น…ํŽ˜์ด์Šค์— ๋กœ๊ทธ์ธํ•˜์—ฌ ๋ชจ๋ธ์„ ์—…๋กœ๋“œํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค). @@ -372,7 +372,7 @@ DatasetDict({ >>> trainer.train() ``` -`training_args`์—์„œ `push_to_hub`๋ฅผ `True`๋กœ ์„ค์ •ํ•œ ๊ฒฝ์šฐ, ํ•™์Šต ์ฒดํฌํฌ์ธํŠธ๋Š” ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์— ์—…๋กœ๋“œ๋ฉ๋‹ˆ๋‹ค. +`training_args`์—์„œ `push_to_hub`๋ฅผ `True`๋กœ ์„ค์ •ํ•œ ๊ฒฝ์šฐ, ํ•™์Šต ์ฒดํฌํฌ์ธํŠธ๋Š” ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์— ์—…๋กœ๋“œ๋ฉ๋‹ˆ๋‹ค. ํ•™์Šต ์™„๋ฃŒ ํ›„, [`~transformers.Trainer.push_to_hub`] ๋ฉ”์†Œ๋“œ๋ฅผ ํ˜ธ์ถœํ•˜์—ฌ ์ตœ์ข… ๋ชจ๋ธ์„ ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์— ์—…๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค. ```py @@ -381,14 +381,14 @@ DatasetDict({ ## ํ‰๊ฐ€ํ•˜๊ธฐ [[evaluate]] -๊ฐ์ฒด ํƒ์ง€ ๋ชจ๋ธ์€ ์ผ๋ฐ˜์ ์œผ๋กœ ์ผ๋ จ์˜ COCO-์Šคํƒ€์ผ ์ง€ํ‘œ๋กœ ํ‰๊ฐ€๋ฉ๋‹ˆ๋‹ค. +๊ฐ์ฒด ํƒ์ง€ ๋ชจ๋ธ์€ ์ผ๋ฐ˜์ ์œผ๋กœ ์ผ๋ จ์˜ COCO-์Šคํƒ€์ผ ์ง€ํ‘œ๋กœ ํ‰๊ฐ€๋ฉ๋‹ˆ๋‹ค. ๊ธฐ์กด์— ๊ตฌํ˜„๋œ ํ‰๊ฐ€ ์ง€ํ‘œ ์ค‘ ํ•˜๋‚˜๋ฅผ ์‚ฌ์šฉํ•  ์ˆ˜๋„ ์žˆ์ง€๋งŒ, ์—ฌ๊ธฐ์—์„œ๋Š” ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์— ํ‘ธ์‹œํ•œ ์ตœ์ข… ๋ชจ๋ธ์„ ํ‰๊ฐ€ํ•˜๋Š” ๋ฐ `torchvision`์—์„œ ์ œ๊ณตํ•˜๋Š” ํ‰๊ฐ€ ์ง€ํ‘œ๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. -`torchvision` ํ‰๊ฐ€์ž(evaluator)๋ฅผ ์‚ฌ์šฉํ•˜๋ ค๋ฉด ์‹ค์ธก๊ฐ’์ธ COCO ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ์ค€๋น„ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. -COCO ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๋นŒ๋“œํ•˜๋Š” API๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ํŠน์ • ํ˜•์‹์œผ๋กœ ์ €์žฅํ•ด์•ผ ํ•˜๋ฏ€๋กœ, ๋จผ์ € ์ด๋ฏธ์ง€์™€ ์–ด๋…ธํ…Œ์ด์…˜์„ ๋””์Šคํฌ์— ์ €์žฅํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +`torchvision` ํ‰๊ฐ€์ž(evaluator)๋ฅผ ์‚ฌ์šฉํ•˜๋ ค๋ฉด ์‹ค์ธก๊ฐ’์ธ COCO ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ์ค€๋น„ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. +COCO ๋ฐ์ดํ„ฐ ์„ธํŠธ๋ฅผ ๋นŒ๋“œํ•˜๋Š” API๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ํŠน์ • ํ˜•์‹์œผ๋กœ ์ €์žฅํ•ด์•ผ ํ•˜๋ฏ€๋กœ, ๋จผ์ € ์ด๋ฏธ์ง€์™€ ์–ด๋…ธํ…Œ์ด์…˜์„ ๋””์Šคํฌ์— ์ €์žฅํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ํ•™์Šต์„ ์œ„ํ•ด ๋ฐ์ดํ„ฐ๋ฅผ ์ค€๋น„ํ•  ๋•Œ์™€ ๋งˆ์ฐฌ๊ฐ€์ง€๋กœ, cppe5["test"]์—์„œ์˜ ์–ด๋…ธํ…Œ์ด์…˜์€ ํฌ๋งท์„ ๋งž์ถฐ์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์ด๋ฏธ์ง€๋Š” ๊ทธ๋Œ€๋กœ ์œ ์ง€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. -ํ‰๊ฐ€ ๋‹จ๊ณ„๋Š” ์•ฝ๊ฐ„์˜ ์ž‘์—…์ด ํ•„์š”ํ•˜์ง€๋งŒ, ํฌ๊ฒŒ ์„ธ ๊ฐ€์ง€ ์ฃผ์š” ๋‹จ๊ณ„๋กœ ๋‚˜๋ˆŒ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +ํ‰๊ฐ€ ๋‹จ๊ณ„๋Š” ์•ฝ๊ฐ„์˜ ์ž‘์—…์ด ํ•„์š”ํ•˜์ง€๋งŒ, ํฌ๊ฒŒ ์„ธ ๊ฐ€์ง€ ์ฃผ์š” ๋‹จ๊ณ„๋กœ ๋‚˜๋ˆŒ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๋จผ์ €, `cppe5["test"]` ์„ธํŠธ๋ฅผ ์ค€๋น„ํ•ฉ๋‹ˆ๋‹ค: ์–ด๋…ธํ…Œ์ด์…˜์„ ํฌ๋งท์— ๋งž๊ฒŒ ๋งŒ๋“ค๊ณ  ๋ฐ์ดํ„ฐ๋ฅผ ๋””์Šคํฌ์— ์ €์žฅํ•ฉ๋‹ˆ๋‹ค. ```py @@ -532,9 +532,9 @@ IoU metric: bbox ## ์ถ”๋ก ํ•˜๊ธฐ [[inference]] -DETR ๋ชจ๋ธ์„ ๋ฏธ์„ธ ์กฐ์ • ๋ฐ ํ‰๊ฐ€ํ•˜๊ณ , ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์— ์—…๋กœ๋“œ ํ–ˆ์œผ๋ฏ€๋กœ ์ถ”๋ก ์— ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. +DETR ๋ชจ๋ธ์„ ๋ฏธ์„ธ ์กฐ์ • ๋ฐ ํ‰๊ฐ€ํ•˜๊ณ , ํ—ˆ๊น…ํŽ˜์ด์Šค ํ—ˆ๋ธŒ์— ์—…๋กœ๋“œ ํ–ˆ์œผ๋ฏ€๋กœ ์ถ”๋ก ์— ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. -๋ฏธ์„ธ ์กฐ์ •๋œ ๋ชจ๋ธ์„ ์ถ”๋ก ์— ์‚ฌ์šฉํ•˜๋Š” ๊ฐ€์žฅ ๊ฐ„๋‹จํ•œ ๋ฐฉ๋ฒ•์€ [`pipeline`]์—์„œ ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. +๋ฏธ์„ธ ์กฐ์ •๋œ ๋ชจ๋ธ์„ ์ถ”๋ก ์— ์‚ฌ์šฉํ•˜๋Š” ๊ฐ€์žฅ ๊ฐ„๋‹จํ•œ ๋ฐฉ๋ฒ•์€ [`pipeline`]์—์„œ ๋ชจ๋ธ์„ ์‚ฌ์šฉํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. ๋ชจ๋ธ๊ณผ ํ•จ๊ป˜ ๊ฐ์ฒด ํƒ์ง€๋ฅผ ์œ„ํ•œ ํŒŒ์ดํ”„๋ผ์ธ์„ ์ธ์Šคํ„ด์Šคํ™”ํ•˜๊ณ , ์ด๋ฏธ์ง€๋ฅผ ์ „๋‹ฌํ•˜์„ธ์š”: ```py diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 0893792e2d..63585d5e03 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -24,7 +24,6 @@ from .image_utils import ( get_channel_dimension_axis, get_image_size, infer_channel_dimension_format, - to_numpy_array, ) from .utils import ExplicitEnum, TensorType, is_jax_tensor, is_tf_tensor, is_torch_tensor from .utils.import_utils import ( @@ -345,18 +344,6 @@ def normalize( data_format (`ChannelDimension`, *optional*): The channel dimension format of the output image. If unset, will use the inferred format from the input. """ - requires_backends(normalize, ["vision"]) - - if isinstance(image, PIL.Image.Image): - warnings.warn( - "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, - ) - # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - - # casting to numpy array and dividing by 255. - image = to_numpy_array(image) - image = rescale(image, scale=1 / 255) - if not isinstance(image, np.ndarray): raise ValueError("image must be a numpy array") @@ -418,15 +405,10 @@ def center_crop( """ requires_backends(center_crop, ["vision"]) - if isinstance(image, PIL.Image.Image): - warnings.warn( - "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, - ) - image = to_numpy_array(image) - return_numpy = False if return_numpy is None else return_numpy - else: - return_numpy = True if return_numpy is None else return_numpy + if return_numpy is not None: + warnings.warn("return_numpy is deprecated and will be removed in v.4.33", FutureWarning) + + return_numpy = True if return_numpy is None else return_numpy if not isinstance(image, np.ndarray): raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 4fafc5fda6..dd2ce35936 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -128,15 +128,6 @@ class BeitImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_reduce_labels = do_reduce_labels - @property - def reduce_labels(self) -> bool: - warnings.warn( - "The `reduce_labels` property is deprecated and will be removed in v4.27. Please use" - " `do_reduce_labels` instead.", - FutureWarning, - ) - return self.do_reduce_labels - @classmethod def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 76e0876c76..ede52b6fe5 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -14,7 +14,6 @@ # limitations under the License. """Image processor class for BridgeTower.""" -import warnings from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -352,42 +351,6 @@ class BridgeTowerImageProcessor(BaseImageProcessor): return BatchFeature(data=data, tensor_type=return_tensors) - # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad_and_create_pixel_mask - def pad_and_create_pixel_mask( - self, - pixel_values_list: List[ImageInput], - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - ) -> BatchFeature: - """ - Pads a batch of images with zeros to the size of largest height and width in the batch and returns their - corresponding pixel mask. - - Args: - images (`List[np.ndarray]`): - Batch of images to pad. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - warnings.warn( - "This method is deprecated and will be removed in v4.26.0. Please use pad instead.", FutureWarning - ) - # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors - images = [to_numpy_array(image) for image in pixel_values_list] - return self.pad( - images=images, - return_pixel_mask=True, - return_tensors=return_tensors, - data_format=data_format, - ) - def preprocess( self, images: ImageInput, diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 52b46471dd..3de243cd86 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -820,15 +820,6 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad - @property - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.max_size - def max_size(self): - logger.warning( - "The `max_size` parameter is deprecated and will be removed in v4.27. " - "Please specify in `size['longest_edge'] instead`.", - ) - return self.size["longest_edge"] - @classmethod # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->ConditionalDetr def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): @@ -873,7 +864,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): logger.warning_once( - "The `prepare` method is deprecated and will be removed in a future version. " + "The `prepare` method is deprecated and will be removed in a v4.33. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", ) @@ -882,23 +873,17 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - logger.warning_once( - "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ") return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection with DETR->ConditionalDetr def prepare_coco_detection(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ") return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ") return prepare_coco_panoptic_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize @@ -979,40 +964,6 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): """ return normalize_annotation(annotation, image_size=image_size) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad_and_create_pixel_mask - def pad_and_create_pixel_mask( - self, - pixel_values_list: List[ImageInput], - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - ) -> BatchFeature: - """ - Pads a batch of images with zeros to the size of largest height and width in the batch and returns their - corresponding pixel mask. - - Args: - images (`List[np.ndarray]`): - Batch of images to pad. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - logger.warning_once("This method is deprecated and will be removed in v4.27.0. Please use pad instead.") - # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors - images = [to_numpy_array(image) for image in pixel_values_list] - return self.pad( - images=images, - return_pixel_mask=True, - return_tensors=return_tensors, - data_format=data_format, - ) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image def _pad_image( self, diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 908cc148c2..6aa3d5a82f 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -818,15 +818,6 @@ class DeformableDetrImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad - @property - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.max_size - def max_size(self): - logger.warning( - "The `max_size` parameter is deprecated and will be removed in v4.27. " - "Please specify in `size['longest_edge'] instead`.", - ) - return self.size["longest_edge"] - @classmethod # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): @@ -871,7 +862,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): logger.warning_once( - "The `prepare` method is deprecated and will be removed in a future version. " + "The `prepare` method is deprecated and will be removed in a v4.33. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", ) @@ -880,23 +871,17 @@ class DeformableDetrImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - logger.warning_once( - "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ") return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection def prepare_coco_detection(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ") return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ") return prepare_coco_panoptic_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize @@ -977,40 +962,6 @@ class DeformableDetrImageProcessor(BaseImageProcessor): """ return normalize_annotation(annotation, image_size=image_size) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad_and_create_pixel_mask - def pad_and_create_pixel_mask( - self, - pixel_values_list: List[ImageInput], - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - ) -> BatchFeature: - """ - Pads a batch of images with zeros to the size of largest height and width in the batch and returns their - corresponding pixel mask. - - Args: - images (`List[np.ndarray]`): - Batch of images to pad. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - logger.warning_once("This method is deprecated and will be removed in v4.27.0. Please use pad instead.") - # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors - images = [to_numpy_array(image) for image in pixel_values_list] - return self.pad( - images=images, - return_pixel_mask=True, - return_tensors=return_tensors, - data_format=data_format, - ) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image def _pad_image( self, diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index d60f6f838c..573c71dafc 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -544,7 +544,7 @@ class DetaImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): logger.warning_once( - "The `prepare` method is deprecated and will be removed in a future version. " + "The `prepare` method is deprecated and will be removed in a v4.33. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", ) @@ -553,23 +553,17 @@ class DetaImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - logger.warning_once( - "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ") return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection def prepare_coco_detection(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ") return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ") return prepare_coco_panoptic_annotation(*args, **kwargs) def resize( @@ -641,40 +635,6 @@ class DetaImageProcessor(BaseImageProcessor): """ return normalize_annotation(annotation, image_size=image_size) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad_and_create_pixel_mask - def pad_and_create_pixel_mask( - self, - pixel_values_list: List[ImageInput], - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - ) -> BatchFeature: - """ - Pads a batch of images with zeros to the size of largest height and width in the batch and returns their - corresponding pixel mask. - - Args: - images (`List[np.ndarray]`): - Batch of images to pad. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - logger.warning_once("This method is deprecated and will be removed in v4.27.0. Please use pad instead.") - # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors - images = [to_numpy_array(image) for image in pixel_values_list] - return self.pad( - images=images, - return_pixel_mask=True, - return_tensors=return_tensors, - data_format=data_format, - ) - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image def _pad_image( self, diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index c8ea81eac7..5983506eff 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -802,14 +802,6 @@ class DetrImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad - @property - def max_size(self): - logger.warning( - "The `max_size` parameter is deprecated and will be removed in v4.27. " - "Please specify in `size['longest_edge'] instead`.", - ) - return self.size["longest_edge"] - @classmethod def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ @@ -851,7 +843,7 @@ class DetrImageProcessor(BaseImageProcessor): def prepare(self, image, target, return_segmentation_masks=None, masks_path=None): logger.warning_once( - "The `prepare` method is deprecated and will be removed in a future version. " + "The `prepare` method is deprecated and will be removed in a v4.33. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", ) @@ -859,21 +851,15 @@ class DetrImageProcessor(BaseImageProcessor): return image, target def convert_coco_poly_to_mask(self, *args, **kwargs): - logger.warning_once( - "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ") return convert_coco_poly_to_mask(*args, **kwargs) def prepare_coco_detection(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ") return prepare_coco_detection_annotation(*args, **kwargs) def prepare_coco_panoptic(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ") return prepare_coco_panoptic_annotation(*args, **kwargs) def resize( @@ -949,39 +935,6 @@ class DetrImageProcessor(BaseImageProcessor): """ return normalize_annotation(annotation, image_size=image_size) - def pad_and_create_pixel_mask( - self, - pixel_values_list: List[ImageInput], - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - ) -> BatchFeature: - """ - Pads a batch of images with zeros to the size of largest height and width in the batch and returns their - corresponding pixel mask. - - Args: - images (`List[np.ndarray]`): - Batch of images to pad. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - logger.warning_once("This method is deprecated and will be removed in v4.27.0. Please use pad instead.") - # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors - images = [to_numpy_array(image) for image in pixel_values_list] - return self.pad( - images=images, - return_pixel_mask=True, - return_tensors=return_tensors, - data_format=data_format, - ) - def _pad_image( self, image: np.ndarray, diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index c949274729..294d42580d 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -151,12 +151,6 @@ class DonutImageProcessor(BaseImageProcessor): return image - def rotate_image(self, *args, **kwargs): - logger.info( - "rotate_image is deprecated and will be removed in version 4.27. Please use align_long_axis instead." - ) - return self.align_long_axis(*args, **kwargs) - def pad_image( self, image: np.ndarray, diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 234e784bd6..59cf037925 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -29,7 +29,6 @@ from ...image_transforms import ( rescale, resize, to_channel_dimension_format, - to_numpy_array, ) from ...image_utils import ( ChannelDimension, @@ -38,6 +37,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_batched, + to_numpy_array, valid_images, ) from ...utils import ( @@ -441,24 +441,6 @@ class Mask2FormerImageProcessor(BaseImageProcessor): image_processor_dict["size_divisibility"] = kwargs.pop("size_divisibility") return super().from_dict(image_processor_dict, **kwargs) - @property - def size_divisibility(self): - warnings.warn( - "The `size_divisibility` property is deprecated and will be removed in v4.27. Please use " - "`size_divisor` instead.", - FutureWarning, - ) - return self.size_divisor - - @property - def max_size(self): - warnings.warn( - "The `max_size` property is deprecated and will be removed in v4.27. Please use size['longest_edge']" - " instead.", - FutureWarning, - ) - return self.size["longest_edge"] - def resize( self, image: np.ndarray, @@ -789,7 +771,6 @@ class Mask2FormerImageProcessor(BaseImageProcessor): ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. @@ -840,12 +821,6 @@ class Mask2FormerImageProcessor(BaseImageProcessor): """ ignore_index = self.ignore_index if ignore_index is None else ignore_index reduce_labels = self.reduce_labels if reduce_labels is None else reduce_labels - - if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( - "The `pad_and_return_pixel_mask` argument has no effect and will be removed in v4.27", FutureWarning - ) - pixel_values_list = [to_numpy_array(pixel_values) for pixel_values in pixel_values_list] encoded_inputs = self.pad(pixel_values_list, return_tensors=return_tensors) diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index bfdc07431e..c40212a75a 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -29,7 +29,6 @@ from ...image_transforms import ( rescale, resize, to_channel_dimension_format, - to_numpy_array, ) from ...image_utils import ( ChannelDimension, @@ -38,6 +37,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, make_list_of_images, + to_numpy_array, valid_images, ) from ...utils import ( @@ -452,33 +452,6 @@ class MaskFormerImageProcessor(BaseImageProcessor): image_processor_dict["size_divisibility"] = kwargs.pop("size_divisibility") return super().from_dict(image_processor_dict, **kwargs) - @property - def size_divisibility(self): - warnings.warn( - "The `size_divisibility` property is deprecated and will be removed in v4.27. Please use " - "`size_divisor` instead.", - FutureWarning, - ) - return self.size_divisor - - @property - def max_size(self): - warnings.warn( - "The `max_size` property is deprecated and will be removed in v4.27. Please use size['longest_edge']" - " instead.", - FutureWarning, - ) - return self.size["longest_edge"] - - @property - def reduce_labels(self): - warnings.warn( - "The `reduce_labels` property is deprecated and will be removed in v4.27. Please use " - "`do_reduce_labels` instead.", - FutureWarning, - ) - return self.do_reduce_labels - def resize( self, image: np.ndarray, @@ -820,7 +793,6 @@ class MaskFormerImageProcessor(BaseImageProcessor): ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. @@ -869,10 +841,6 @@ class MaskFormerImageProcessor(BaseImageProcessor): `annotations` are provided). They identify the labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`. """ - if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( - "The `pad_and_return_pixel_mask` argument has no effect and will be removed in v4.27", FutureWarning - ) ignore_index = self.ignore_index if ignore_index is None else ignore_index reduce_labels = self.do_reduce_labels if reduce_labels is None else reduce_labels diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index a99a7182b8..6f59cb661c 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -30,7 +30,6 @@ from ...image_transforms import ( rescale, resize, to_channel_dimension_format, - to_numpy_array, ) from ...image_utils import ( ChannelDimension, @@ -39,6 +38,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, make_list_of_images, + to_numpy_array, valid_images, ) from ...utils import ( @@ -881,7 +881,6 @@ class OneFormerImageProcessor(BaseImageProcessor): ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. @@ -935,11 +934,6 @@ class OneFormerImageProcessor(BaseImageProcessor): - **text_inputs** -- Optional list of text string entries to be fed to a model (when `annotations` are provided). They identify the binary masks present in the image. """ - if "pad_and_return_pixel_mask" in kwargs: - warnings.warn( - "The `pad_and_return_pixel_mask` argument has no effect and will be removed in v4.27", FutureWarning - ) - ignore_index = self.ignore_index if ignore_index is None else ignore_index reduce_labels = self.do_reduce_labels if reduce_labels is None else reduce_labels pixel_values_list = [to_numpy_array(pixel_values) for pixel_values in pixel_values_list] diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py index 2d86599fff..ea5cd7b776 100644 --- a/src/transformers/models/owlvit/image_processing_owlvit.py +++ b/src/transformers/models/owlvit/image_processing_owlvit.py @@ -27,7 +27,6 @@ from ...image_transforms import ( rescale, resize, to_channel_dimension_format, - to_numpy_array, ) from ...image_utils import ( OPENAI_CLIP_MEAN, @@ -36,6 +35,7 @@ from ...image_utils import ( ImageInput, PILImageResampling, make_list_of_images, + to_numpy_array, valid_images, ) from ...utils import TensorType, is_torch_available, logging diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index 36d171f8e2..df92c940ce 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -116,15 +116,6 @@ class SegformerImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_reduce_labels = do_reduce_labels - @property - def reduce_labels(self): - warnings.warn( - "The `reduce_labels` property is deprecated and will be removed in a v4.27. Please use " - "`do_reduce_labels` instead.", - FutureWarning, - ) - return self.do_reduce_labels - @classmethod def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index 87b6e682e9..5377ffbb7c 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -14,7 +14,6 @@ # limitations under the License. """Image processor class for Vilt.""" -import warnings from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -331,41 +330,6 @@ class ViltImageProcessor(BaseImageProcessor): return BatchFeature(data=data, tensor_type=return_tensors) - def pad_and_create_pixel_mask( - self, - pixel_values_list: List[ImageInput], - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = None, - ) -> BatchFeature: - """ - Pads a batch of images with zeros to the size of largest height and width in the batch and returns their - corresponding pixel mask. - - Args: - images (`List[np.ndarray]`): - Batch of images to pad. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - """ - warnings.warn( - "This method is deprecated and will be removed in v4.26.0. Please use pad instead.", FutureWarning - ) - # pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors - images = [to_numpy_array(image) for image in pixel_values_list] - return self.pad( - images=images, - return_pixel_mask=True, - return_tensors=return_tensors, - data_format=data_format, - ) - def preprocess( self, images: ImageInput, diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 1aa37fec42..a472674171 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -729,15 +729,6 @@ class YolosImageProcessor(BaseImageProcessor): self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad - @property - # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.max_size - def max_size(self): - logger.warning( - "The `max_size` parameter is deprecated and will be removed in v4.27. " - "Please specify in `size['longest_edge'] instead`.", - ) - return self.size["longest_edge"] - @classmethod # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->Yolos def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): @@ -782,7 +773,7 @@ class YolosImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare def prepare(self, image, target, return_segmentation_masks=False, masks_path=None): logger.warning_once( - "The `prepare` method is deprecated and will be removed in a future version. " + "The `prepare` method is deprecated and will be removed in a v4.33. " "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method " "does not return the image anymore.", ) @@ -791,23 +782,17 @@ class YolosImageProcessor(BaseImageProcessor): # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask def convert_coco_poly_to_mask(self, *args, **kwargs): - logger.warning_once( - "The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ") return convert_coco_poly_to_mask(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection with DETR->Yolos def prepare_coco_detection(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_detection` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ") return prepare_coco_detection_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic def prepare_coco_panoptic(self, *args, **kwargs): - logger.warning_once( - "The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. " - ) + logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ") return prepare_coco_panoptic_annotation(*args, **kwargs) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize @@ -1010,7 +995,7 @@ class YolosImageProcessor(BaseImageProcessor): """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( - "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in v4.33, " "use `do_pad` instead.", ) do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -1018,7 +1003,7 @@ class YolosImageProcessor(BaseImageProcessor): max_size = None if "max_size" in kwargs: logger.warning_once( - "The `max_size` argument is deprecated and will be removed in a future version, use" + "The `max_size` argument is deprecated and will be removed in v4.33, use" " `size['longest_edge']` instead.", ) size = kwargs.pop("max_size") diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py index 9a9bff9fc3..80ea966c2a 100644 --- a/tests/models/bridgetower/test_image_processing_bridgetower.py +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -236,23 +236,3 @@ class BridgeTowerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te expected_width, ), ) - - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image processors - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index ba77431467..98510a3c00 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -244,26 +244,6 @@ class ConditionalDetrImageProcessingTest(ImageProcessingSavingTestMixin, unittes ), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - @slow def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index c0d927b9c9..40bf405eeb 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -246,27 +246,6 @@ class DeformableDetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest ), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) - - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - @slow def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py index e6e9847665..b3e550fc4c 100644 --- a/tests/models/deta/test_image_processing_deta.py +++ b/tests/models/deta/test_image_processing_deta.py @@ -240,27 +240,6 @@ class DetaImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase) ), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) - - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - @slow def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index d6354de43d..f923cb6726 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -247,26 +247,6 @@ class DetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase) ), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - @slow def test_call_pytorch_with_coco_detection_annotations(self): # prepare image and target diff --git a/tests/models/mask2former/test_image_processing_mask2former.py b/tests/models/mask2former/test_image_processing_mask2former.py index c750edd2e2..4ba6389b7f 100644 --- a/tests/models/mask2former/test_image_processing_mask2former.py +++ b/tests/models/mask2former/test_image_processing_mask2former.py @@ -147,7 +147,6 @@ class Mask2FormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "do_resize")) self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "max_size")) self.assertTrue(hasattr(image_processing, "ignore_index")) self.assertTrue(hasattr(image_processing, "num_labels")) @@ -263,28 +262,6 @@ class Mask2FormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Te ), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class( - do_resize=False, do_normalize=False, do_rescale=False, num_labels=self.image_processor_tester.num_classes - ) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.encode_inputs(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - def comm_get_image_processing_inputs( self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np" ): diff --git a/tests/models/maskformer/test_image_processing_maskformer.py b/tests/models/maskformer/test_image_processing_maskformer.py index 3fff1022c1..535582bc1f 100644 --- a/tests/models/maskformer/test_image_processing_maskformer.py +++ b/tests/models/maskformer/test_image_processing_maskformer.py @@ -147,7 +147,6 @@ class MaskFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Tes self.assertTrue(hasattr(image_processing, "do_normalize")) self.assertTrue(hasattr(image_processing, "do_resize")) self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "max_size")) self.assertTrue(hasattr(image_processing, "ignore_index")) self.assertTrue(hasattr(image_processing, "num_labels")) @@ -263,28 +262,6 @@ class MaskFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Tes ), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class( - do_resize=False, do_normalize=False, do_rescale=False, num_labels=self.image_processor_tester.num_classes - ) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.encode_inputs(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - def comm_get_image_processing_inputs( self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np" ): diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py index d6fc5e228b..864c803f35 100644 --- a/tests/models/oneformer/test_image_processing_oneformer.py +++ b/tests/models/oneformer/test_image_processing_oneformer.py @@ -286,36 +286,6 @@ class OneFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.Test ), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processors - image_processor_1 = self.image_processing_class(**self.image_processor_dict) - image_processor_2 = self.image_processing_class( - do_resize=False, - do_normalize=False, - do_rescale=False, - num_labels=self.image_processing_tester.num_classes, - class_info_file="ade20k_panoptic.json", - num_text=self.image_processing_tester.num_text, - repo_path="shi-labs/oneformer_demo", - ) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processing_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processor_1.encode_inputs( - image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt" - ) - encoded_images = image_processor_2(image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - def comm_get_image_processor_inputs( self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np" ): diff --git a/tests/models/oneformer/test_processor_oneformer.py b/tests/models/oneformer/test_processor_oneformer.py index e97895d7ce..c65807ca91 100644 --- a/tests/models/oneformer/test_processor_oneformer.py +++ b/tests/models/oneformer/test_processor_oneformer.py @@ -355,41 +355,6 @@ class OneFormerProcessingTest(unittest.TestCase): (self.processing_tester.batch_size, expected_sequence_length), ) - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize processors - processor_1 = self.processing_class(**self.processor_dict) - - image_processor = OneFormerImageProcessor( - do_resize=False, - do_normalize=False, - do_rescale=False, - num_labels=self.processing_tester.num_classes, - class_info_file="ade20k_panoptic.json", - num_text=self.processing_tester.num_text, - ) - tokenizer = CLIPTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny") - processor_2 = self.processing_class( - image_processor=image_processor, tokenizer=tokenizer, max_seq_length=77, task_seq_length=77 - ) - - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.processing_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = processor_1.encode_inputs( - image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt" - ) - encoded_images = processor_2(image_inputs, ["semantic"] * len(image_inputs), return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - ) - def comm_get_processor_inputs(self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np"): processor = self.processing_class(**self.processor_dict) # prepare image and target diff --git a/tests/models/vilt/test_image_processing_vilt.py b/tests/models/vilt/test_image_processing_vilt.py index f33492d102..28cf9f2fe6 100644 --- a/tests/models/vilt/test_image_processing_vilt.py +++ b/tests/models/vilt/test_image_processing_vilt.py @@ -237,23 +237,3 @@ class ViltImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase) expected_width, ), ) - - def test_equivalence_pad_and_create_pixel_mask(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) - # create random PyTorch tensors - image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, torch.Tensor) - - # Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") - encoded_images = image_processing_2(image_inputs, return_tensors="pt") - - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4) - ) - self.assertTrue( - torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4) - )