[pack_dataset] don't sort before packing, only pack train (#5954)

This commit is contained in:
Sam Shleifer
2020-07-27 12:14:23 -04:00
committed by GitHub
parent c8bdf7f4ec
commit 4302ace5bd
3 changed files with 27 additions and 12 deletions

View File

@@ -0,0 +1,19 @@
from pathlib import Path
import fire
def minify(src_dir: str, dest_dir: str, n: int):
    """Write the first n lines of each file f in src_dir to dest_dir/f.

    Trailing whitespace (including the newline) is stripped from every kept
    line, and the lines are re-joined with "\\n" without a final newline.

    Args:
        src_dir: Directory whose regular files are truncated and copied.
        dest_dir: Output directory; created if it does not exist (its parent
            must already exist).
        n: Number of leading lines to keep from each file (list-slice
            semantics, i.e. ``lines[:n]``).
    """
    src_dir = Path(src_dir)
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(exist_ok=True)
    for path in src_dir.iterdir():
        # iterdir() also yields subdirectories, which cannot be opened as
        # text files; only regular files are minified.
        if not path.is_file():
            continue
        # Context managers close both handles deterministically instead of
        # relying on garbage collection to flush and release them.
        with path.open() as src_file:
            new = [x.rstrip() for x in src_file.readlines()][:n]
        dest_path = dest_dir.joinpath(path.name)
        print(dest_path)
        with dest_path.open("w") as dest_file:
            dest_file.write("\n".join(new))
if __name__ == "__main__":
    # Script entry point: hand minify to Fire so it can be invoked from the
    # command line.
    fire.Fire(minify)