From 52d250f6aa14844024806e5e4dd1c7882bbd8dd5 Mon Sep 17 00:00:00 2001 From: Pedro Lima Date: Tue, 15 Sep 2020 13:54:12 +0100 Subject: [PATCH] [model_cards] pvl/labse_bert model card From **Language-Agnostic BERT Sentence Embedding** https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html --- model_cards/pvl/labse_bert/README.md | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 model_cards/pvl/labse_bert/README.md diff --git a/model_cards/pvl/labse_bert/README.md b/model_cards/pvl/labse_bert/README.md new file mode 100644 index 0000000000..56898861f3 --- /dev/null +++ b/model_cards/pvl/labse_bert/README.md @@ -0,0 +1,47 @@ +--- +language: en +thumbnail: +tags: +- bert +- embeddings +license: Apache-2.0 +--- + +# LABSE BERT + +## Model description + +Model for the "Language-agnostic BERT Sentence Embedding" paper by Fangxiaoyu Feng, Yinfei Yang, Daniel Cer, Naveen Arivazhagan, and Wei Wang. The model is available on [TensorFlow Hub](https://tfhub.dev/google/LaBSE/1).
+
+## Intended uses & limitations
+
+#### How to use
+
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+# Pooling helper taken from sentence-transformers.
+# NOTE(review): the LaBSE paper takes sentence embeddings from the [CLS] token; confirm mean pooling is intended.
+def mean_pooling(model_output, attention_mask):
+    """Return the mean of the token embeddings along dim 1, weighted by attention_mask."""
+    per_token = model_output[0]  # first element of model_output holds all token embeddings
+    mask = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
+    masked_total = (per_token * mask).sum(1)
+    token_count = mask.sum(1).clamp(min=1e-9)  # floor avoids dividing by zero
+    return masked_total / token_count
+
+tokenizer = AutoTokenizer.from_pretrained("pvl/labse_bert", do_lower_case=False)
+model = AutoModel.from_pretrained("pvl/labse_bert")
+
+sentences = ['This framework generates embeddings for each input sentence',
+             'Sentences are passed as a list of string.',
+             'The quick brown fox jumps over the lazy dog.']
+
+encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
+
+with torch.no_grad():  # inference only, so skip gradient tracking
+    model_output = model(**encoded_input)
+
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+```