[Marian] documentation and AutoModel support (#4152)
- Rename MarianSentencePieceTokenizer -> MarianTokenizer - Start using the unk token - Add docs page - Add better generation params to MarianConfig - Add more conversion utilities
This commit is contained in:
@@ -248,7 +248,7 @@ if is_torch_available():
|
||||
BART_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
from .modeling_marian import MarianMTModel
|
||||
from .tokenization_marian import MarianSentencePieceTokenizer
|
||||
from .tokenization_marian import MarianTokenizer
|
||||
from .modeling_roberta import (
|
||||
RobertaForMaskedLM,
|
||||
RobertaModel,
|
||||
|
||||
@@ -28,6 +28,7 @@ from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, Electr
|
||||
from .configuration_encoder_decoder import EncoderDecoderConfig
|
||||
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
|
||||
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
|
||||
from .configuration_marian import MarianConfig
|
||||
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
|
||||
from .configuration_reformer import ReformerConfig
|
||||
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
|
||||
@@ -73,6 +74,7 @@ CONFIG_MAPPING = OrderedDict(
|
||||
("albert", AlbertConfig,),
|
||||
("camembert", CamembertConfig,),
|
||||
("xlm-roberta", XLMRobertaConfig,),
|
||||
("marian", MarianConfig,),
|
||||
("bart", BartConfig,),
|
||||
("reformer", ReformerConfig,),
|
||||
("roberta", RobertaConfig,),
|
||||
|
||||
@@ -23,4 +23,5 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
|
||||
|
||||
class MarianConfig(BartConfig):
    """Configuration class for Marian models.

    Everything is inherited from ``BartConfig``; this subclass only overrides
    the ``model_type`` tag and the pretrained-config archive map.
    """

    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "marian"
|
||||
|
||||
@@ -11,7 +11,8 @@ import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import MarianConfig, MarianMTModel, MarianSentencePieceTokenizer
|
||||
from transformers import MarianConfig, MarianMTModel, MarianTokenizer
|
||||
from transformers.hf_api import HfApi
|
||||
|
||||
|
||||
def remove_prefix(text: str, prefix: str):
|
||||
@@ -38,6 +39,19 @@ def load_layers_(layer_lst: torch.nn.ModuleList, opus_state: dict, converter, is
|
||||
layer.load_state_dict(sd, strict=True)
|
||||
|
||||
|
||||
def find_pretrained_model(src_lang: str, tgt_lang: str) -> List[str]:
    """Find models that can accept src_lang as input and return tgt_lang as output.

    Only hub models named ``Helsinki-NLP/opus-mt-<src>-<tgt>`` are considered.
    Matching is a case-insensitive substring test of ``src_lang`` against
    ``<src>`` and of ``tgt_lang`` against ``<tgt>``.

    Args:
        src_lang: source language code to look for, e.g. ``"fr"``.
        tgt_lang: target language code to look for, e.g. ``"en"``.

    Returns:
        The matching model ids exactly as listed by the hub (original casing),
        so they can be passed straight to ``from_pretrained``.
    """
    prefix = "Helsinki-NLP/opus-mt-"
    wanted_src, wanted_tgt = src_lang.lower(), tgt_lang.lower()
    matching = []
    for model in HfApi().model_list():
        model_id = model.modelId
        # '+' (language group) checkpoints cant be loaded.
        if not model_id.startswith(prefix) or "+" in model_id:
            continue
        parts = model_id[len(prefix):].split("-")
        if len(parts) != 2:
            # Not a plain `<src>-<tgt>` name; skip it instead of failing to unpack.
            continue
        src, tgt = parts
        if wanted_src in src.lower() and wanted_tgt in tgt.lower():
            matching.append(model_id)
    return matching
|
||||
|
||||
|
||||
def add_emb_entries(wemb, final_bias, n_special_tokens=1):
|
||||
vsize, d_model = wemb.shape
|
||||
embs_to_add = np.zeros((n_special_tokens, d_model))
|
||||
@@ -81,7 +95,12 @@ def find_model_file(dest_dir): # this one better
|
||||
return model_file
|
||||
|
||||
|
||||
def parse_readmes(repo_path):
|
||||
def make_registry(repo_path="Opus-MT-train/models"):
|
||||
if not (Path(repo_path) / "fr-en" / "README.md").exists():
|
||||
raise ValueError(
|
||||
f"repo_path:{repo_path} does not exist: "
|
||||
"You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling."
|
||||
)
|
||||
results = {}
|
||||
for p in Path(repo_path).ls():
|
||||
n_dash = p.name.count("-")
|
||||
@@ -90,22 +109,53 @@ def parse_readmes(repo_path):
|
||||
else:
|
||||
lns = list(open(p / "README.md").readlines())
|
||||
results[p.name] = _parse_readme(lns)
|
||||
return results
|
||||
return [(k, v["pre-processing"], v["download"]) for k, v in results.items()]
|
||||
|
||||
|
||||
def download_all_sentencepiece_models(repo_path="Opus-MT-train/models"):
|
||||
CH_GROUP = "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh"
|
||||
|
||||
|
||||
def convert_all_sentencepiece_models(model_list=None, repo_path=None):
    """Download and convert every SentencePiece-based OPUS-MT checkpoint. Requires 300GB.

    Raw checkpoints are stored under ``marian_ckpt/<pair>`` and converted models
    are written to ``marian_converted/opus-mt-<pair>``.

    Args:
        model_list: iterable of ``(pair_name, pre_processing, download_url)``
            tuples. When None, it is built with ``make_registry``.
        repo_path: path to the ``models`` directory of a local Opus-MT-train
            clone. Only used when ``model_list`` is None; when it is also None,
            ``make_registry``'s own default path is used.
    """
    save_dir = Path("marian_ckpt")
    dest_dir = Path("marian_converted")
    dest_dir.mkdir(exist_ok=True)
    if model_list is None:
        # make_registry builds a Path from repo_path, so None must not be forwarded.
        model_list = make_registry() if repo_path is None else make_registry(repo_path=repo_path)
    for k, prepro, download in tqdm(model_list):
        if "SentencePiece" not in prepro:  # dont convert BPE models.
            continue
        if not os.path.exists(save_dir / k / "pytorch_model.bin"):
            download_and_unzip(download, save_dir / k)
        # The Chinese language group name is shortened in the converted model name.
        pair_name = k.replace(CH_GROUP, "ch_group")
        convert(save_dir / k, dest_dir / f"opus-mt-{pair_name}")
|
||||
|
||||
|
||||
def lmap(f, x) -> List:
    """Apply ``f`` to every item of ``x`` and return the results as a list."""
    return [f(item) for item in x]
|
||||
|
||||
|
||||
def fetch_test_set(readmes_raw, pair):
    """Download the OPUS test set for ``pair`` and split it into its parts.

    The test-set URL is derived from ``readmes_raw[pair]["download"]`` by
    replacing its last four characters with ``.test.txt``. The file is read in
    groups of four lines: line 0 is the source sentence, line 1 the reference
    translation and line 2 the Marian model output.

    Args:
        readmes_raw: mapping from language pair to parsed README entries; each
            entry must have a ``"download"`` key.
        pair: language pair name used as key into ``readmes_raw``.

    Returns:
        Tuple ``(src, mar_model, gold)`` of equally long lists of stripped lines.
    """
    import wget

    download_url = readmes_raw[pair]["download"]
    test_set_url = download_url[:-4] + ".test.txt"
    fname = wget.download(test_set_url, f"opus_test_{pair}.txt")
    try:
        # Close the handle deterministically instead of leaking it.
        with open(fname) as f:
            lns = f.readlines()
    finally:
        # Always clean up the temporary download, even if reading fails.
        os.remove(fname)
    src = lmap(str.strip, lns[::4])
    gold = lmap(str.strip, lns[1::4])
    mar_model = lmap(str.strip, lns[2::4])
    assert len(gold) == len(mar_model) == len(src)
    return src, mar_model, gold
|
||||
|
||||
|
||||
def convert_whole_dir(path=Path("marian_ckpt/")):
    """Convert every checkpoint directory found directly under ``path``.

    Each ``<path>/<name>`` directory is converted into ``marian_converted/<name>``.
    Entries whose destination already contains ``pytorch_model.bin`` are skipped.

    Args:
        path: directory holding one sub-directory per downloaded checkpoint.
    """
    out_root = Path("marian_converted")
    # convert() creates only the leaf directory, so the parent must exist first.
    out_root.mkdir(exist_ok=True)
    for subdir in tqdm(list(Path(path).iterdir())):
        if not subdir.is_dir():
            continue  # stray files are not checkpoints
        dest_dir = out_root / subdir.name
        if (dest_dir / "pytorch_model.bin").exists():
            continue
        convert(subdir, dest_dir)
|
||||
|
||||
|
||||
def _parse_readme(lns):
|
||||
@@ -131,7 +181,7 @@ def _parse_readme(lns):
|
||||
return subres
|
||||
|
||||
|
||||
def write_metadata(dest_dir: Path):
|
||||
def save_tokenizer_config(dest_dir: Path):
|
||||
dname = dest_dir.name.split("-")
|
||||
dct = dict(target_lang=dname[-1], source_lang="-".join(dname[:-1]))
|
||||
save_json(dct, dest_dir / "tokenizer_config.json")
|
||||
@@ -148,13 +198,17 @@ def add_to_vocab_(vocab: Dict[str, int], special_tokens: List[str]):
|
||||
return added
|
||||
|
||||
|
||||
def find_vocab_file(model_dir):
    """Return the path of the ``*vocab.yml`` file inside ``model_dir``.

    Args:
        model_dir: directory (``str`` or ``Path``) to search, non-recursively.

    Returns:
        The matching ``Path``. If several files match, the lexicographically
        first one is returned so the choice does not depend on filesystem order.

    Raises:
        FileNotFoundError: if no file matching ``*vocab.yml`` exists.
    """
    candidates = sorted(Path(model_dir).glob("*vocab.yml"))
    if not candidates:
        raise FileNotFoundError(f"No file matching '*vocab.yml' found in {model_dir}")
    return candidates[0]
|
||||
|
||||
|
||||
def add_special_tokens_to_vocab(model_dir: Path) -> None:
    """Build ``vocab.json`` and the tokenizer config for a downloaded checkpoint.

    Loads the ``*vocab.yml`` file located by ``find_vocab_file``, casts all ids
    to ``int``, adds the ``<pad>`` token, and writes the result to
    ``model_dir / "vocab.json"``. Then writes the tokenizer config via
    ``save_tokenizer_config``.

    Args:
        model_dir: directory of the downloaded Marian checkpoint.
    """
    vocab = load_yaml(find_vocab_file(model_dir))
    vocab = {k: int(v) for k, v in vocab.items()}
    num_added = add_to_vocab_(vocab, ["<pad>"])
    print(f"added {num_added} tokens to vocab")
    save_json(vocab, model_dir / "vocab.json")
    save_tokenizer_config(model_dir)
|
||||
|
||||
|
||||
def save_tokenizer(self, save_directory):
|
||||
@@ -251,7 +305,6 @@ class OpusState:
|
||||
|
||||
# Process decoder.yml
|
||||
decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
|
||||
# TODO: what are normalize and word-penalty?
|
||||
check_marian_cfg_assumptions(cfg)
|
||||
self.hf_config = MarianConfig(
|
||||
vocab_size=cfg["vocab_size"],
|
||||
@@ -273,6 +326,9 @@ class OpusState:
|
||||
dropout=0.1, # see opus-mt-train repo/transformer-dropout param.
|
||||
# default: add_final_layer_norm=False,
|
||||
num_beams=decoder_yml["beam-size"],
|
||||
decoder_start_token_id=self.pad_token_id,
|
||||
bad_words_ids=[[self.pad_token_id]],
|
||||
max_length=512,
|
||||
)
|
||||
|
||||
def _check_layer_entries(self):
|
||||
@@ -349,12 +405,12 @@ def download_and_unzip(url, dest_dir):
|
||||
os.remove(filename)
|
||||
|
||||
|
||||
def main(source_dir, dest_dir):
|
||||
def convert(source_dir: Path, dest_dir):
|
||||
dest_dir = Path(dest_dir)
|
||||
dest_dir.mkdir(exist_ok=True)
|
||||
|
||||
add_special_tokens_to_vocab(source_dir)
|
||||
tokenizer = MarianSentencePieceTokenizer.from_pretrained(str(source_dir))
|
||||
tokenizer = MarianTokenizer.from_pretrained(str(source_dir))
|
||||
save_tokenizer(tokenizer, dest_dir)
|
||||
|
||||
opus_state = OpusState(source_dir)
|
||||
@@ -377,7 +433,7 @@ if __name__ == "__main__":
|
||||
source_dir = Path(args.src)
|
||||
assert source_dir.exists()
|
||||
dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest
|
||||
main(source_dir, dest_dir)
|
||||
convert(source_dir, dest_dir)
|
||||
|
||||
|
||||
def load_yaml(path):
|
||||
|
||||
@@ -39,6 +39,7 @@ from .configuration_auto import (
|
||||
XLMRobertaConfig,
|
||||
XLNetConfig,
|
||||
)
|
||||
from .configuration_marian import MarianConfig
|
||||
from .configuration_utils import PretrainedConfig
|
||||
from .modeling_albert import (
|
||||
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
@@ -98,6 +99,7 @@ from .modeling_flaubert import (
|
||||
FlaubertWithLMHeadModel,
|
||||
)
|
||||
from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2Model
|
||||
from .modeling_marian import MarianMTModel
|
||||
from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTModel
|
||||
from .modeling_reformer import ReformerModel, ReformerModelWithLMHead
|
||||
from .modeling_roberta import (
|
||||
@@ -214,6 +216,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
(AlbertConfig, AlbertForMaskedLM),
|
||||
(CamembertConfig, CamembertForMaskedLM),
|
||||
(XLMRobertaConfig, XLMRobertaForMaskedLM),
|
||||
(MarianConfig, MarianMTModel),
|
||||
(BartConfig, BartForConditionalGeneration),
|
||||
(RobertaConfig, RobertaForMaskedLM),
|
||||
(BertConfig, BertForMaskedLM),
|
||||
|
||||
@@ -18,16 +18,30 @@
|
||||
from transformers.modeling_bart import BartForConditionalGeneration
|
||||
|
||||
|
||||
PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
"opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/pytorch_model.bin",
|
||||
}
|
||||
|
||||
|
||||
class MarianMTModel(BartForConditionalGeneration):
|
||||
"""Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
|
||||
Model API is identical to BartForConditionalGeneration"""
|
||||
r"""
|
||||
Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
|
||||
Model API is identical to BartForConditionalGeneration.
|
||||
Available models are listed at `Model List <https://huggingface.co/models?search=Helsinki-NLP>`__
|
||||
|
||||
pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
Examples::
|
||||
|
||||
from transformers import MarianTokenizer, MarianMTModel
|
||||
from typing import List
|
||||
src = 'fr' # source language
|
||||
trg = 'en' # target language
|
||||
sample_text = "où est l'arrêt de bus ?"
|
||||
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' # `Model List`__
|
||||
|
||||
model = MarianMTModel.from_pretrained(mname)
|
||||
tok = MarianTokenizer.from_pretrained(mname)
|
||||
batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference
|
||||
gen = model.generate(**batch) # for forward pass: model(**batch)
|
||||
words: List[str] = tok.batch_decode(gen, skip_special_tokens=True)  # returns "Where is the bus stop ?"
|
||||
|
||||
"""
|
||||
|
||||
pretrained_model_archive_map = {} # see https://huggingface.co/models?search=Helsinki-NLP
|
||||
|
||||
def prepare_scores_for_generation(self, scores, cur_len, max_length):
|
||||
if cur_len == max_length - 1 and self.config.eos_token_id is not None:
|
||||
|
||||
@@ -38,6 +38,7 @@ from .configuration_auto import (
|
||||
XLMRobertaConfig,
|
||||
XLNetConfig,
|
||||
)
|
||||
from .configuration_marian import MarianConfig
|
||||
from .configuration_utils import PretrainedConfig
|
||||
from .tokenization_albert import AlbertTokenizer
|
||||
from .tokenization_bart import BartTokenizer
|
||||
@@ -49,6 +50,7 @@ from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFas
|
||||
from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
|
||||
from .tokenization_flaubert import FlaubertTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
|
||||
from .tokenization_marian import MarianTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
|
||||
from .tokenization_reformer import ReformerTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
|
||||
@@ -69,6 +71,7 @@ TOKENIZER_MAPPING = OrderedDict(
|
||||
(AlbertConfig, (AlbertTokenizer, None)),
|
||||
(CamembertConfig, (CamembertTokenizer, None)),
|
||||
(XLMRobertaConfig, (XLMRobertaTokenizer, None)),
|
||||
(MarianConfig, (MarianTokenizer, None)),
|
||||
(BartConfig, (BartTokenizer, None)),
|
||||
(RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
|
||||
(ReformerConfig, (ReformerTokenizer, None)),
|
||||
|
||||
@@ -22,7 +22,21 @@ PRETRAINED_VOCAB_FILES_MAP = {
|
||||
# Example URL https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json
|
||||
|
||||
|
||||
class MarianSentencePieceTokenizer(PreTrainedTokenizer):
|
||||
class MarianTokenizer(PreTrainedTokenizer):
|
||||
"""Sentencepiece tokenizer for marian. Source and target languages have different SPM models.
|
||||
The logic is to use the relevant source_spm or target_spm to encode text as pieces, then look up each piece in a vocab dictionary.
|
||||
|
||||
Examples::
|
||||
|
||||
from transformers import MarianTokenizer
|
||||
tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
|
||||
src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
|
||||
tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
|
||||
batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
|
||||
# keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
|
||||
# model(**batch) should work
|
||||
"""
|
||||
|
||||
vocab_files_names = vocab_files_names
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = {m: 512 for m in MODEL_NAMES}
|
||||
@@ -49,6 +63,8 @@ class MarianSentencePieceTokenizer(PreTrainedTokenizer):
|
||||
pad_token=pad_token,
|
||||
)
|
||||
self.encoder = load_json(vocab)
|
||||
if self.unk_token not in self.encoder:
|
||||
raise KeyError("<unk> token must be in vocab")
|
||||
assert self.pad_token in self.encoder
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
|
||||
@@ -64,8 +80,11 @@ class MarianSentencePieceTokenizer(PreTrainedTokenizer):
|
||||
self.spm_target = sentencepiece.SentencePieceProcessor()
|
||||
self.spm_target.Load(target_spm)
|
||||
|
||||
# Note(SS): splitter would require lots of book-keeping.
|
||||
# self.sentence_splitter = MosesSentenceSplitter(source_lang)
|
||||
# Multilingual target side: default to using first supported language code.
|
||||
self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]
|
||||
self.tgt_lang_id = None # will not be used unless it is set through prepare_translation_batch
|
||||
|
||||
# Note(SS): sentence_splitter would require lots of book-keeping.
|
||||
try:
|
||||
from mosestokenizer import MosesPunctuationNormalizer
|
||||
|
||||
@@ -75,11 +94,10 @@ class MarianSentencePieceTokenizer(PreTrainedTokenizer):
|
||||
self.punc_normalizer = lambda x: x
|
||||
|
||||
def _convert_token_to_id(self, token):
    """Convert a token (str) to its id using the encoder.

    Tokens missing from ``self.encoder`` map to the id of ``self.unk_token``
    instead of raising ``KeyError``.
    """
    return self.encoder.get(token, self.encoder[self.unk_token])
|
||||
|
||||
def _tokenize(self, text: str, src=True) -> List[str]:
|
||||
spm = self.spm_source if src else self.spm_target
|
||||
return spm.EncodeAsPieces(text)
|
||||
def _tokenize(self, text: str) -> List[str]:
|
||||
return self.current_spm.EncodeAsPieces(text)
|
||||
|
||||
def _convert_id_to_token(self, index: int) -> str:
|
||||
"""Converts an index (integer) in a token (str) using the encoder."""
|
||||
@@ -89,10 +107,6 @@ class MarianSentencePieceTokenizer(PreTrainedTokenizer):
|
||||
"""Uses target language sentencepiece model"""
|
||||
return self.spm_target.DecodePieces(tokens)
|
||||
|
||||
def _append_special_tokens_and_truncate(self, tokens: str, max_length: int,) -> List[int]:
|
||||
ids: list = self.convert_tokens_to_ids(tokens)[:max_length]
|
||||
return ids + [self.eos_token_id]
|
||||
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
|
||||
"""Build model inputs from a sequence by appending eos_token_id."""
|
||||
if token_ids_1 is None:
|
||||
@@ -100,7 +114,7 @@ class MarianSentencePieceTokenizer(PreTrainedTokenizer):
|
||||
# We don't expect to process pairs, but leave the pair logic for API consistency
|
||||
return token_ids_0 + token_ids_1 + [self.eos_token_id]
|
||||
|
||||
def decode_batch(self, token_ids, **kwargs) -> List[str]:
|
||||
def batch_decode(self, token_ids, **kwargs) -> List[str]:
|
||||
return [self.decode(ids, **kwargs) for ids in token_ids]
|
||||
|
||||
def prepare_translation_batch(
|
||||
@@ -114,40 +128,38 @@ class MarianSentencePieceTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
Arguments:
|
||||
src_texts: list of src language texts
|
||||
src_lang: default en_XX (english)
|
||||
tgt_texts: list of tgt language texts
|
||||
tgt_lang: default ro_RO (romanian)
|
||||
max_length: (None) defer to config (1024 for mbart-large-en-ro)
|
||||
pad_to_max_length: (bool)
|
||||
return_tensors: (str) default "pt" returns pytorch tensors, pass None to return lists.
|
||||
|
||||
Returns:
|
||||
BatchEncoding: with keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]
|
||||
all shaped bs, seq_len. (BatchEncoding is a dict of string -> tensor or lists)
|
||||
|
||||
Examples:
|
||||
from transformers import MarianS
|
||||
all shaped bs, seq_len. (BatchEncoding is a dict of string -> tensor or lists).
|
||||
If no tgt_text is specified, the only keys will be input_ids and attention_mask.
|
||||
"""
|
||||
self.current_spm = self.spm_source
|
||||
model_inputs: BatchEncoding = self.batch_encode_plus(
|
||||
src_texts,
|
||||
add_special_tokens=True,
|
||||
return_tensors=return_tensors,
|
||||
max_length=max_length,
|
||||
pad_to_max_length=pad_to_max_length,
|
||||
src=True,
|
||||
)
|
||||
if tgt_texts is None:
|
||||
return model_inputs
|
||||
|
||||
self.current_spm = self.spm_target
|
||||
decoder_inputs: BatchEncoding = self.batch_encode_plus(
|
||||
tgt_texts,
|
||||
add_special_tokens=True,
|
||||
return_tensors=return_tensors,
|
||||
max_length=max_length,
|
||||
pad_to_max_length=pad_to_max_length,
|
||||
src=False,
|
||||
)
|
||||
for k, v in decoder_inputs.items():
|
||||
model_inputs[f"decoder_{k}"] = v
|
||||
self.current_spm = self.spm_source
|
||||
return model_inputs
|
||||
|
||||
@property
|
||||
|
||||
Reference in New Issue
Block a user