Apply ruff flake8-comprehensions (#21694)

This commit is contained in:
Aaron Gokaslan
2023-02-22 03:14:54 -05:00
committed by GitHub
parent df06fb1f0b
commit 5e8c8eb5ba
230 changed files with 971 additions and 955 deletions

View File

@@ -29,7 +29,7 @@ def get_min_hash(tokens: List[str]) -> Optional[MinHash]:
def get_tokens(code: str) -> Set[str]:
"""Tokenize a code snippet."""
- return set([t for t in NON_ALPHA.split(code) if len(t.strip()) > 0])
+ return {t for t in NON_ALPHA.split(code) if len(t.strip()) > 0}
class DuplicationIndex:
@@ -243,7 +243,7 @@ def deduplicate_dataset(
>>> ds_dedup, duplicate_clusters = deduplicate_dataset(ds, jaccard_threshold=0.85)
"""
duplicate_clusters = make_duplicate_clusters(dataset, jaccard_threshold)
- duplicate_indices = set(x["base_index"] for cluster in duplicate_clusters for x in cluster)
+ duplicate_indices = {x["base_index"] for cluster in duplicate_clusters for x in cluster}
extreme_dict = {}
extremes_clusters = find_extremes(duplicate_clusters, dataset, jaccard_threshold)
for extremes in extremes_clusters:

View File

@@ -114,7 +114,7 @@ def char_token_ratio(example):
def preprocess(example):
"""Chain all preprocessing steps into one function to not fill cache."""
- results = dict()
+ results = {}
results.update(get_hash(example))
results.update(line_stats(example))
results.update(alpha_stats(example))

View File

@@ -8,7 +8,7 @@ from transformers import AutoTokenizer, HfArgumentParser
def tokenize(example):
- output = dict()
+ output = {}
output["input_ids"] = tokenizer(example["content"], truncation=False)["input_ids"]
output["ratio_char_token"] = len(example["content"]) / len(output["input_ids"])
return output