From 22e7c4edaf007d92912df54c336d10078ac7d565 Mon Sep 17 00:00:00 2001 From: erenup Date: Thu, 3 Oct 2019 18:33:53 +0800 Subject: [PATCH 1/8] fixing for roberta tokenizer decoding --- examples/run_squad.py | 4 ++-- examples/utils_squad.py | 37 ++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 0c0fbf2963..8a9f123d20 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -263,7 +263,7 @@ def evaluate(args, model, tokenizer, prefix=""): write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, args.model_type) # Evaluate with the official SQuAD script evaluate_options = EVAL_OPTS(data_file=args.predict_file, @@ -296,7 +296,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, - is_training=not evaluate) + is_training=not evaluate, add_prefix_space=True if args.model_type == 'roberta' else False) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/examples/utils_squad.py b/examples/utils_squad.py index b990ecc842..82a4b96b79 100644 --- a/examples/utils_squad.py +++ b/examples/utils_squad.py @@ -25,6 +25,7 @@ import collections from io import open from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize +from transformers.tokenization_roberta import RobertaTokenizer # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) from utils_squad_evaluate import 
find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores @@ -192,7 +193,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, cls_token='[CLS]', sep_token='[SEP]', pad_token=0, sequence_a_segment_id=0, sequence_b_segment_id=1, cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True): + mask_padding_with_zero=True, add_prefix_space=False): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -205,8 +206,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # if example_index % 100 == 0: # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) - - query_tokens = tokenizer.tokenize(example.question_text) + query_tokens = tokenizer.tokenize(example.question_text, add_prefix_space=add_prefix_space) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] @@ -216,7 +216,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) + sub_tokens = tokenizer.tokenize(token, add_prefix_space=add_prefix_space) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) @@ -234,7 +234,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) + example.orig_answer_text, add_prefix_space) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 @@ -398,7 +398,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): + orig_answer_text, add_prefix_space): 
"""Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. We first project them to @@ -423,7 +423,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, # the word "Japanese". Since our WordPiece tokenizer does not split # "Japanese", we just use "Japanese" as the annotation. This is fairly rare # in SQuAD, but does happen. - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text, add_prefix_space=add_prefix_space)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): @@ -477,7 +477,7 @@ RawResult = collections.namedtuple("RawResult", def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): + version_2_with_negative, null_score_diff_threshold, tokenizer, mode_type='bert'): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -576,15 +576,22 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. 
- tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") + if mode_type == 'roberta': + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + tok_text = tok_text.replace("##", "") + tok_text = " ".join(tok_text.strip().split()) + orig_text = " ".join(orig_tokens) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, None) + else: + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue From b5d73976ad7e701e912664deacc4d44d0adefd05 Mon Sep 17 00:00:00 2001 From: erenup Date: Thu, 3 Oct 2019 20:48:17 +0800 Subject: [PATCH 2/8] Revert "fixing for roberta tokenizer decoding" This reverts commit 22e7c4edaf007d92912df54c336d10078ac7d565. 
--- examples/run_squad.py | 4 ++-- examples/utils_squad.py | 37 +++++++++++++++---------------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 8a9f123d20..0c0fbf2963 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -263,7 +263,7 @@ def evaluate(args, model, tokenizer, prefix=""): write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, args.model_type) + args.version_2_with_negative, args.null_score_diff_threshold) # Evaluate with the official SQuAD script evaluate_options = EVAL_OPTS(data_file=args.predict_file, @@ -296,7 +296,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, - is_training=not evaluate, add_prefix_space=True if args.model_type == 'roberta' else False) + is_training=not evaluate) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/examples/utils_squad.py b/examples/utils_squad.py index 82a4b96b79..b990ecc842 100644 --- a/examples/utils_squad.py +++ b/examples/utils_squad.py @@ -25,7 +25,6 @@ import collections from io import open from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize -from transformers.tokenization_roberta import RobertaTokenizer # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores @@ -193,7 +192,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, cls_token='[CLS]', sep_token='[SEP]', 
pad_token=0, sequence_a_segment_id=0, sequence_b_segment_id=1, cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True, add_prefix_space=False): + mask_padding_with_zero=True): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -206,7 +205,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # if example_index % 100 == 0: # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) - query_tokens = tokenizer.tokenize(example.question_text, add_prefix_space=add_prefix_space) + + query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] @@ -216,7 +216,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token, add_prefix_space=add_prefix_space) + sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) @@ -234,7 +234,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text, add_prefix_space) + example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 @@ -398,7 +398,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text, add_prefix_space): + orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. 
We first project them to @@ -423,7 +423,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, # the word "Japanese". Since our WordPiece tokenizer does not split # "Japanese", we just use "Japanese" as the annotation. This is fairly rare # in SQuAD, but does happen. - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text, add_prefix_space=add_prefix_space)) + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): @@ -477,7 +477,7 @@ RawResult = collections.namedtuple("RawResult", def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold, tokenizer, mode_type='bert'): + version_2_with_negative, null_score_diff_threshold): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -576,22 +576,15 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. 
- if mode_type == 'roberta': - tok_text = tokenizer.convert_tokens_to_string(tok_tokens) - tok_text = tok_text.replace("##", "") - tok_text = " ".join(tok_text.strip().split()) - orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, None) - else: - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue From 9b312f9d41b85ed3a4cf68b8bb3c5126c6df2259 Mon Sep 17 00:00:00 2001 From: erenup Date: Fri, 13 Dec 2019 14:51:40 +0800 Subject: [PATCH 3/8] initial version for roberta squad --- examples/run_squad.py | 19 ++--- transformers/__init__.py | 2 +- transformers/data/metrics/squad_metrics.py | 14 ++-- transformers/data/processors/squad.py | 4 +- transformers/modeling_roberta.py | 86 ++++++++++++++++++++++ 5 files changed, 106 insertions(+), 19 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 117b86e32c..d124d07eb5 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -39,6 +39,7 @@ from tqdm import tqdm, trange from transformers import (WEIGHTS_NAME, BertConfig, BertForQuestionAnswering, BertTokenizer, + RobertaForQuestionAnswering, RobertaTokenizer, RobertaConfig, XLMConfig, XLMForQuestionAnswering, XLMTokenizer, XLNetConfig, XLNetForQuestionAnswering, @@ -53,10 +54,11 @@ from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_e logger = logging.getLogger(__name__) ALL_MODELS = 
sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, XLNetConfig, XLMConfig)), ()) + for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), ()) MODEL_CLASSES = { 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), + 'roberta': (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer), 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), @@ -141,13 +143,11 @@ def train(args, train_dataset, model, tokenizer): inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], + 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], 'start_positions': batch[3], - 'end_positions': batch[4] + 'end_positions': batch[4], } - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] - if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) @@ -241,12 +241,9 @@ def evaluate(args, model, tokenizer, prefix=""): with torch.no_grad(): inputs = { 'input_ids': batch[0], - 'attention_mask': batch[1] + 'attention_mask': batch[1], + 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], } - - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids - example_indices = batch[3] # XLNet and XLM use more arguments for their predictions @@ -311,7 +308,7 @@ def evaluate(args, model, tokenizer, prefix=""): predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + args.version_2_with_negative, 
args.null_score_diff_threshold, tokenizer) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) diff --git a/transformers/__init__.py b/transformers/__init__.py index 5d7b0b772c..5353551e3e 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -99,7 +99,7 @@ if is_torch_available(): XLM_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, RobertaForMultipleChoice, - RobertaForTokenClassification, + RobertaForTokenClassification, RobertaForQuestionAnswering, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 7b03255f49..acbb884fb8 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -377,7 +377,8 @@ def compute_predictions_logits( output_null_log_odds_file, verbose_logging, version_2_with_negative, - null_score_diff_threshold + null_score_diff_threshold, + tokenizer, ): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) @@ -474,11 +475,14 @@ def compute_predictions_logits( orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # tok_text = " ".join(tok_tokens) + # + # # De-tokenize WordPieces that have been split off. 
+ # tok_text = tok_text.replace(" ##", "") + # tok_text = tok_text.replace("##", "") # Clean whitespace tok_text = tok_text.strip() diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 9bc4375684..3f5fd46382 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -140,7 +140,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) - if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: @@ -155,7 +154,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) - sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ + if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index fc27353d37..2f6f634fa6 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -555,3 +555,89 @@ class RobertaClassificationHead(nn.Module): x = self.dropout(x) x = self.out_proj(x) return x + + +@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). 
""", + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class RobertaForQuestionAnswering(BertPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Examples:: + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForQuestionAnswering.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:3] + """ + config_class = RobertaConfig + pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "roberta" + + def __init__(self, config): + super(RobertaForQuestionAnswering, self).__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + start_positions=None, end_positions=None): + + outputs = self.roberta(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: +
end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) \ No newline at end of file From 8e9526b4b56486606979f1c47d3317b0b22340fe Mon Sep 17 00:00:00 2001 From: erenup Date: Sat, 14 Dec 2019 08:43:58 +0800 Subject: [PATCH 4/8] add multiple processing --- examples/run_squad.py | 5 +- transformers/data/processors/squad.py | 342 ++++++++++++++------------ 2 files changed, 187 insertions(+), 160 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index d124d07eb5..b8883b8852 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -360,7 +360,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, - return_dataset='pt' + return_dataset='pt', + threads=args.threads, ) if args.local_rank in [-1, 0]: @@ -478,6 +479,8 @@ def main(): "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + + parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features') args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: diff --git a/transformers/data/processors/squad.py 
b/transformers/data/processors/squad.py index 3f5fd46382..d24775996e 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -4,6 +4,9 @@ import logging import os import json import numpy as np +from multiprocessing import Pool +from multiprocessing import cpu_count +from functools import partial from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures @@ -76,9 +79,168 @@ def _is_whitespace(c): return True return False +def squad_convert_example_to_features(example, max_seq_length, + doc_stride, max_query_length, is_training): + features = [] + if is_training and not example.is_impossible: + # Get start and end position + start_position = example.start_position + end_position = example.end_position + + # If the answer cannot be found in the text, then skip this example. + actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) + return [] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + spans = [] + + truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ + if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair + + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + + encoded_dict = tokenizer.encode_plus( + truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, + span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, + max_length=max_seq_length, + return_overflowing_tokens=True, + pad_to_max_length=True, + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' + ) + + paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + + if tokenizer.pad_token_id in encoded_dict['input_ids']: + non_padded_ids = 
encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + else: + non_padded_ids = encoded_dict['input_ids'] + + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + + if "overflowing_tokens" not in encoded_dict: + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = j if tokenizer.padding_side == "left" else spans[doc_span_index][ + "truncated_query_with_special_tokens_length"] + j + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + for span in spans: + # Identify the position of the CLS token + cls_index = span['input_ids'].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) 
+ p_mask = np.array(span['token_type_ids']) + + p_mask = np.minimum(p_mask, 1) + + if tokenizer.padding_side == "right": + # Limit positive values to one + p_mask = 1 - p_mask + + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 + + # Set the CLS index to '0' + p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if tokenizer.padding_side == "left": + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + features.append(SquadFeatures( + span['input_ids'], + span['attention_mask'], + span['token_type_ids'], + cls_index, + p_mask.tolist(), + example_index=0, + unique_id=0, + paragraph_len=span['paragraph_len'], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + + start_position=start_position, + end_position=end_position + )) + return features + +def squad_convert_example_to_features_init(tokenizer_for_convert): + global tokenizer + tokenizer = tokenizer_for_convert + def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, - return_dataset=False): + return_dataset=False, threads=1): """ Converts a list of examples into a list of features that can be directly given as input to a model. 
It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. @@ -93,6 +255,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, return_dataset: Default False. Either 'pt' or 'tf'. if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset + threads: multiple processing threadsa-smi + Returns: list of :class:`~transformers.data.processors.squad.SquadFeatures` @@ -113,165 +277,26 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, """ # Defining helper methods - unique_id = 1000000000 - features = [] - for (example_index, example) in enumerate(tqdm(examples)): - if is_training and not example.is_impossible: - # Get start and end position - start_position = example.start_position - end_position = example.end_position - - # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) - continue - - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text - ) - - spans = [] - - truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) - sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ - if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence - sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair - - span_doc_tokens = all_doc_tokens - while len(spans) * doc_stride < len(all_doc_tokens): - - encoded_dict = tokenizer.encode_plus( - truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, - span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, - max_length=max_seq_length, - return_overflowing_tokens=True, - pad_to_max_length=True, - stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' - ) - - paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) - - if tokenizer.pad_token_id in encoded_dict['input_ids']: - non_padded_ids = 
encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] - else: - non_padded_ids = encoded_dict['input_ids'] - - tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) - - token_to_orig_map = {} - for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i - token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] - - encoded_dict["paragraph_len"] = paragraph_len - encoded_dict["tokens"] = tokens - encoded_dict["token_to_orig_map"] = token_to_orig_map - encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens - encoded_dict["token_is_max_context"] = {} - encoded_dict["start"] = len(spans) * doc_stride - encoded_dict["length"] = paragraph_len - - spans.append(encoded_dict) - - if "overflowing_tokens" not in encoded_dict: - break - span_doc_tokens = encoded_dict["overflowing_tokens"] - - for doc_span_index in range(len(spans)): - for j in range(spans[doc_span_index]["paragraph_len"]): - is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j - spans[doc_span_index]["token_is_max_context"][index] = is_max_context - - for span in spans: - # Identify the position of the CLS token - cls_index = span['input_ids'].index(tokenizer.cls_token_id) - - # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - # Original TF implem also keep the classification token (set to 0) (not sure why...) 
- p_mask = np.array(span['token_type_ids']) - - p_mask = np.minimum(p_mask, 1) - - if tokenizer.padding_side == "right": - # Limit positive values to one - p_mask = 1 - p_mask - - p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 - - # Set the CLS index to '0' - p_mask[cls_index] = 0 - - - span_is_impossible = example.is_impossible - start_position = 0 - end_position = 0 - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = span["start"] - doc_end = span["start"] + span["length"] - 1 - out_of_span = False - - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - - if out_of_span: - start_position = cls_index - end_position = cls_index - span_is_impossible = True - else: - if tokenizer.padding_side == "left": - doc_offset = 0 - else: - doc_offset = len(truncated_query) + sequence_added_tokens - - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - - features.append(SquadFeatures( - span['input_ids'], - span['attention_mask'], - span['token_type_ids'], - cls_index, - p_mask.tolist(), - - example_index=example_index, - unique_id=unique_id, - paragraph_len=span['paragraph_len'], - token_is_max_context=span["token_is_max_context"], - tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"], - - start_position=start_position, - end_position=end_position - )) - + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length, + doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training) + features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to 
features')) + new_features = [] + unique_id = 1000000000 + example_index = 0 + for example_features in tqdm(features, total=len(features), desc='add example index and unique id'): + if not example_features: + continue + for example_feature in example_features: + example_feature.example_index = example_index + example_feature.unique_id = unique_id + new_features.append(example_feature) unique_id += 1 - + example_index += 1 + features = new_features + del new_features if return_dataset == 'pt': if not is_torch_available(): raise ImportError("Pytorch must be installed to return a pytorch dataset.") @@ -295,7 +320,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, all_cls_index, all_p_mask) return features, dataset - return features From a1faaf99625cb05820e7cc6b958f2aa3540778e3 Mon Sep 17 00:00:00 2001 From: erenup Date: Sat, 14 Dec 2019 08:57:13 +0800 Subject: [PATCH 5/8] deleted useless file --- srl_label.txt | 130 -------------------------------------------------- 1 file changed, 130 deletions(-) delete mode 100644 srl_label.txt diff --git a/srl_label.txt b/srl_label.txt deleted file mode 100644 index b6a7f3ad95..0000000000 --- a/srl_label.txt +++ /dev/null @@ -1,130 +0,0 @@ -O -I-ARG1 -I-ARG2 -I-ARG0 -B-V -B-ARG1 -B-ARG0 -I-ARGM-ADV -I-ARGM-TMP -B-ARG2 -I-ARGM-LOC -I-ARGM-MNR -B-ARGM-TMP -I-ARGM-CAU -I-ARGM-PRP -B-ARGM-MOD -I-C-ARG1 -B-ARGM-ADV -I-ARGM-PRD -B-ARGM-DIS -I-ARG3 -I-V -I-ARG4 -B-ARGM-MNR -B-ARGM-LOC -I-ARGM-NEG -B-ARGM-NEG -B-R-ARG0 -I-ARGM-DIR -I-ARGM-DIS -I-ARGM-PNC -I-ARGM-ADJ -B-R-ARG1 -B-ARG3 -B-ARGM-PRP -B-ARG4 -I-ARGM-GOL -I-R-ARG0 -B-ARGM-CAU -B-ARGM-DIR -B-ARGM-PRD -I-ARGM-EXT -B-C-ARG1 -B-ARGM-ADJ -I-C-ARG0 -B-ARGM-EXT -I-C-ARG2 -I-ARGM-COM -I-R-ARG1 -I-ARGM-MOD -B-ARGM-GOL -B-ARGM-PNC -B-R-ARGM-LOC -B-R-ARGM-TMP -B-ARGM-LVB -B-ARGM-COM -B-R-ARG2 -I-C-ARGM-MNR -B-C-ARG0 -I-R-ARGM-LOC -B-C-ARG2 -I-C-ARGM-EXT -I-C-ARG4 -B-ARGM-REC -I-R-ARG2 -I-C-ARGM-TMP -I-ARG5 -I-C-ARG3 -I-C-ARGM-ADV -B-ARG5 -B-R-ARGM-MNR 
-I-ARGM-DSP -I-C-ARGM-LOC -B-R-ARG3 -I-ARGA -I-R-ARGM-MNR -B-R-ARGM-CAU -I-R-ARGM-TMP -B-C-ARGM-MNR -B-ARGA -I-C-ARGM-DSP -B-C-ARGM-ADV -I-R-ARG3 -B-R-ARGM-ADV -B-C-ARG4 -I-C-ARGM-CAU -B-C-ARGM-EXT -B-C-ARGM-TMP -B-R-ARGM-DIR -B-R-ARG4 -I-R-ARGM-ADV -I-ARGM-REC -B-C-ARG3 -B-C-ARGM-LOC -B-R-ARGM-EXT -B-ARGM-PRR -B-R-ARGM-PRP -B-ARGM-PRX -I-R-ARGM-DIR -I-R-ARGM-EXT -I-C-ARGM-NEG -B-ARGM-DSP -B-R-ARGM-GOL -I-R-ARGM-GOL -I-R-ARGM-PNC -I-C-ARGM-PRP -B-R-ARGM-COM -I-R-ARGM-PRP -I-C-ARGM-COM -B-C-ARGM-CAU -B-C-ARGM-DSP -I-R-ARGM-COM -I-R-ARGM-CAU -B-R-ARGM-PNC -I-C-ARGM-DIS -I-C-ARGM-DIR -I-R-ARG4 -B-R-ARGM-PRD -I-R-ARGM-PRD -B-C-ARGM-PRP -B-R-ARG5 -B-C-ARGM-MOD -I-C-ARGM-MOD -B-C-ARGM-ADJ -I-C-ARGM-ADJ -B-C-ARGM-DIS -B-C-ARGM-NEG -B-C-ARGM-COM -B-C-ARGM-DIR -B-R-ARGM-MOD From 3c6efd0ca367063b8b3883020b54aa22fc4abb27 Mon Sep 17 00:00:00 2001 From: erenup Date: Tue, 17 Dec 2019 11:18:12 +0800 Subject: [PATCH 6/8] updated usage example in modeling_roberta for question and answering --- transformers/modeling_roberta.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 2f6f634fa6..ea9211cbb9 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -585,13 +585,16 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
Examples:: - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - model = RobertaForMultipleChoice.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - start_positions = torch.tensor([1]) - end_positions = torch.tensor([3]) - outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) - loss, start_scores, end_scores = outputs[:2] + tokenizer = RobertaTokenizer.from_pretrained('roberta-large') + model = RobertaForQuestionAnswering.from_pretrained('roberta-large') + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_ids = tokenizer.encode(question, text) + start_scores, end_scores = model(torch.tensor([input_ids])) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) + # a nice puppet + # Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' + to be uploaded. """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP From d000195ee683c4cba15a739202140c04cdcc19bd Mon Sep 17 00:00:00 2001 From: erenup Date: Tue, 17 Dec 2019 11:28:34 +0800 Subject: [PATCH 7/8] add comment for example_index and unique_id in single process --- transformers/data/processors/squad.py | 2 +- transformers/modeling_roberta.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index b4cd8dedac..f028141924 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -225,7 +225,7 @@ def squad_convert_example_to_features(example, max_seq_length, span['token_type_ids'], cls_index, p_mask.tolist(), - example_index=0, + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. 
unique_id=0, paragraph_len=span['paragraph_len'], token_is_max_context=span["token_is_max_context"], diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index ea9211cbb9..05b7ad345b 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -592,8 +592,8 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): start_scores, end_scores = model(torch.tensor([input_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) - # a nice puppet - # Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' + a nice puppet + Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' to be uploaded. """ config_class = RobertaConfig From 805c21aebacc00f447affd35dbc159d7f64d10dd Mon Sep 17 00:00:00 2001 From: erenup Date: Tue, 17 Dec 2019 11:36:00 +0800 Subject: [PATCH 8/8] tried to fix the failed checks --- transformers/modeling_roberta.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 05b7ad345b..85f9e6e461 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -591,10 +591,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): input_ids = tokenizer.encode(question, text) start_scores, end_scores = model(torch.tensor([input_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) - print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) - a nice puppet - Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' - to be uploaded. 
+ answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP