fix: switch from slow to generic tokenizer class (#15122)
This commit is contained in:
committed by
GitHub
parent
27b819b0e3
commit
aa0135f2e0
@@ -2,7 +2,7 @@ from datasets import load_dataset
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from arguments import TokenizerTrainingArguments
|
from arguments import TokenizerTrainingArguments
|
||||||
from transformers import GPT2Tokenizer, HfArgumentParser
|
from transformers import AutoTokenizer, HfArgumentParser
|
||||||
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
||||||
|
|
||||||
|
|
||||||
@@ -17,7 +17,7 @@ parser = HfArgumentParser(TokenizerTrainingArguments)
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Base tokenizer
|
# Base tokenizer
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained(args.base_tokenizer)
|
tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer)
|
||||||
base_vocab = list(bytes_to_unicode().values())
|
base_vocab = list(bytes_to_unicode().values())
|
||||||
|
|
||||||
# Load dataset
|
# Load dataset
|
||||||
|
|||||||
Reference in New Issue
Block a user