Apply ruff flake8-comprehensions (#21694)
This commit is contained in:
@@ -29,7 +29,7 @@ def get_min_hash(tokens: List[str]) -> Optional[MinHash]:
|
||||
|
||||
def get_tokens(code: str) -> Set[str]:
|
||||
"""Tokenize a code snippet."""
|
||||
- return set([t for t in NON_ALPHA.split(code) if len(t.strip()) > 0])
|
||||
+ return {t for t in NON_ALPHA.split(code) if len(t.strip()) > 0}
|
||||
|
||||
|
||||
class DuplicationIndex:
|
||||
@@ -243,7 +243,7 @@ def deduplicate_dataset(
|
||||
>>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85)
|
||||
"""
|
||||
duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold)
|
||||
- duplicate_indices = set(x["base_index"] for cluster in duplicate_clusters for x in cluster)
|
||||
+ duplicate_indices = {x["base_index"] for cluster in duplicate_clusters for x in cluster}
|
||||
extreme_dict = {}
|
||||
extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold)
|
||||
for extremes in extremes_clusters:
|
||||
|
||||
@@ -114,7 +114,7 @@ def char_token_ratio(example):
|
||||
|
||||
def preprocess(example):
|
||||
"""Chain all preprocessing steps into one function to not fill cache."""
|
||||
- results = dict()
|
||||
+ results = {}
|
||||
results.update(get_hash(example))
|
||||
results.update(line_stats(example))
|
||||
results.update(alpha_stats(example))
|
||||
|
||||
@@ -8,7 +8,7 @@ from transformers import AutoTokenizer, HfArgumentParser
|
||||
|
||||
|
||||
def tokenize(example):
|
||||
- output = dict()
|
||||
+ output = {}
|
||||
output["input_ids"] = tokenizer(example["content"], truncation=False)["input_ids"]
|
||||
output["ratio_char_token"] = len(example["content"]) / len(output["input_ids"])
|
||||
return output
|
||||
|
||||
Reference in New Issue
Block a user