From 2c731fd129b55a6bb437d637a7187579a1c3b7ab Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 2 Nov 2018 01:38:22 +0100
Subject: [PATCH] small tweaks

---
 modeling_pytorch.py       | 32 ++++++++++++++++++++++----------
 run_classifier_pytorch.py | 25 ++++++++++---------------
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/modeling_pytorch.py b/modeling_pytorch.py
index 2ed222071e..c1f1c185d0 100644
--- a/modeling_pytorch.py
+++ b/modeling_pytorch.py
@@ -349,7 +349,6 @@ class BertModel(nn.Module):
     """BERT model ("Bidirectional Embedding Representations from a Transformer").
 
     Example usage:
-
     ```python
     # Already been converted into WordPiece token ids
     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
@@ -359,16 +358,10 @@ class BertModel(nn.Module):
     config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
         num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
 
-    model = modeling.BertModel(config=config, is_training=True,
-        input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
-
-    label_embeddings = tf.get_variable(...)
-    pooled_output = model.get_pooled_output()
-    logits = tf.matmul(pooled_output, label_embeddings)
-    ...
+    model = modeling.BertModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-
     def __init__(self, config: BertConfig):
         """Constructor for BertModel.
 
@@ -400,7 +393,26 @@ class BertModel(nn.Module):
         return all_encoder_layers, pooled_output
 
 class BertForSequenceClassification(nn.Module):
-    def __init__(self, config, num_labels):
+    """BERT model for classification.
+    This module is composed of the BERT model with a linear layer on top of
+    the pooled output.
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])
+
+    config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
+        num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
+
+    num_labels = 2
+    model = modeling.BertForSequenceClassification(config, num_labels)
+    logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, num_labels):
         super(BertForSequenceClassification, self).__init__()
         self.bert = BertModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
diff --git a/run_classifier_pytorch.py b/run_classifier_pytorch.py
index 3ad28726a3..0fde0938df 100644
--- a/run_classifier_pytorch.py
+++ b/run_classifier_pytorch.py
@@ -115,16 +115,10 @@ parser.add_argument("--save_checkpoints_steps",
                     default = 1000,
                     type = int,
                     help = "How often to save the model checkpoint.")                    
-parser.add_argument("--iterations_per_loop",
-                    default = 1000,
-                    type = int,
-                    help = "How many steps to make in each estimator call.")
-
 parser.add_argument("--no_cuda",
                     default = False,
                     type = bool,
                     help = "Whether not to use CUDA when available")
-
 parser.add_argument("--local_rank",
                     type=int,
                     default=-1,
@@ -518,16 +512,17 @@ def main():
 
         model.train()
         global_step = 0
-        for input_ids, input_mask, segment_ids, label_ids in train_dataloader:
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.float().to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
+        for epoch in range(int(args.num_train_epochs)):
+            for input_ids, input_mask, segment_ids, label_ids in train_dataloader:
+                input_ids = input_ids.to(device)
+                input_mask = input_mask.float().to(device)
+                segment_ids = segment_ids.to(device)
+                label_ids = label_ids.to(device)
 
-            loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
-            loss.backward()
-            optimizer.step()
-            global_step += 1
+                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
+                loss.backward()
+                optimizer.step()
+                global_step += 1
 
     if args.do_eval:
         eval_examples = processor.get_dev_examples(args.data_dir)