Update all references to canonical models (#29001)

* Script & Manual edition * Update
2024-02-16 08:16:58 +01:00
parent 1e402b957d
commit f497f564bb
561 changed files with 2682 additions and 2687 deletions
--- a/examples/legacy/token-classification/README.md
+++ b/examples/legacy/token-classification/README.md
@@ -34,7 +34,7 @@ Let's define some variables that we need for further pre-processing steps and tr

 ```bash
 export MAX_LENGTH=128
-export BERT_MODEL=bert-base-multilingual-cased
+export BERT_MODEL=google-bert/bert-base-multilingual-cased
 ```

 Run the pre-processing script on training, dev and test datasets:
@@ -92,7 +92,7 @@ Instead of passing all parameters via commandline arguments, the `run_ner.py` sc
 {
    "data_dir": ".",
    "labels": "./labels.txt",
-    "model_name_or_path": "bert-base-multilingual-cased",
+    "model_name_or_path": "google-bert/bert-base-multilingual-cased",
    "output_dir": "germeval-model",
    "max_seq_length": 128,
    "num_train_epochs": 3,
@@ -222,7 +222,7 @@ Let's define some variables that we need for further pre-processing steps:

 ```bash
 export MAX_LENGTH=128
-export BERT_MODEL=bert-large-cased
+export BERT_MODEL=google-bert/bert-large-cased
 ```

 Here we use the English BERT large model for fine-tuning.
@@ -250,7 +250,7 @@ This configuration file looks like:
 {
    "data_dir": "./data_wnut_17",
    "labels": "./data_wnut_17/labels.txt",
-    "model_name_or_path": "bert-large-cased",
+    "model_name_or_path": "google-bert/bert-large-cased",
    "output_dir": "wnut-17-model-1",
    "max_seq_length": 128,
    "num_train_epochs": 3,
--- a/examples/legacy/token-classification/utils_ner.py
+++ b/examples/legacy/token-classification/utils_ner.py
@@ -113,7 +113,7 @@ class TokenClassificationTask:
            for word, label in zip(example.words, example.labels):
                word_tokens = tokenizer.tokenize(word)

-                # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
+                # google-bert/bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
                if len(word_tokens) > 0:
                    tokens.extend(word_tokens)
                    # Use the real label id for the first token of the word, and padding ids for the remaining tokens