From 2ae98336d17fceea7506af9880b862b6252a38f6 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Tue, 18 Feb 2020 16:17:35 +0000
Subject: [PATCH] fix vocab size in binarized_data (distil): uint16 vs int32

---
 examples/distillation/scripts/binarized_data.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py
index 7590cfcbcf..2dcca18396 100644
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -75,13 +75,17 @@ def main():
         iter += 1
         if iter % interval == 0:
             end = time.time()
-            logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
+            logger.info(f"{iter} examples processed. - {(end-start):.2f}s/{interval}expl")
             start = time.time()
     logger.info("Finished binarization")
     logger.info(f"{len(data)} examples processed.")
 
     dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
-    rslt_ = [np.uint16(d) for d in rslt]
+    vocab_size = tokenizer.vocab_size
+    if vocab_size < (1 << 16):
+        rslt_ = [np.uint16(d) for d in rslt]
+    else:
+        rslt_ = [np.int32(d) for d in rslt]
     random.shuffle(rslt_)
     logger.info(f"Dump to {dp_file}")
     with open(dp_file, "wb") as handle: