Tokenization behaves the same as the original XLM preprocessing for most languages except zh, ja and th; change API to allow specifying language in tokenize

2019-08-23 14:40:17 -04:00
parent df9d6effae
commit 436ce07218
3 changed files with 135 additions and 20 deletions
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,8 @@ setup(
                      'requests',
                      'tqdm',
                      'regex',
-                      'sentencepiece'],
+                      'sentencepiece',
+                      'sacremoses'],
    entry_points={
      'console_scripts': [
        "pytorch_transformers=pytorch_transformers.__main__:main",