diff --git a/modeling_pytorch.py b/modeling_pytorch.py
index c1f1c185d0..be592eb140 100644
--- a/modeling_pytorch.py
+++ b/modeling_pytorch.py
@@ -412,7 +412,8 @@ class BertForSequenceClassification(nn.Module):
     model = modeling.BertModel(config, num_labels)
     logits = model(input_ids, token_type_ids, input_mask)
     ```
-    """    def __init__(self, config, num_labels):
+    """
+    def __init__(self, config, num_labels):
         super(BertForSequenceClassification, self).__init__()
         self.bert = BertModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
diff --git a/run_classifier_pytorch.py b/run_classifier_pytorch.py
index 0fde0938df..13622e3bf3 100644
--- a/run_classifier_pytorch.py
+++ b/run_classifier_pytorch.py
@@ -73,8 +73,8 @@ parser.add_argument("--init_checkpoint",
                     type = str,
                     help = "Initial checkpoint (usually from a pre-trained BERT model).")
 parser.add_argument("--do_lower_case",
-                    default = True,
-                    type = bool,
+                    default = False,
+                    action='store_true',
                     help = "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
 parser.add_argument("--max_seq_length",
                     default = 128,
@@ -84,11 +84,11 @@ parser.add_argument("--max_seq_length",
                         "than this will be padded.")
 parser.add_argument("--do_train",
                     default = False,
-                    type = bool,
+                    action='store_true',
                     help = "Whether to run training.")
 parser.add_argument("--do_eval",
                     default = False,
-                    type = bool,
+                    action='store_true',
                     help = "Whether to run eval on the dev set.")                                             
 parser.add_argument("--train_batch_size",
                     default = 32,
@@ -117,7 +117,7 @@ parser.add_argument("--save_checkpoints_steps",
                     help = "How often to save the model checkpoint.")                    
 parser.add_argument("--no_cuda",
                     default = False,
-                    type = bool,
+                    action='store_true',
                     help = "Whether not to use CUDA when available")
 parser.add_argument("--local_rank",
                     type=int,
@@ -490,6 +490,7 @@ def main():
                          warmup=args.warmup_proportion,
                          t_total=num_train_steps)
 
+    global_step = 0
     if args.do_train:
         train_features = convert_examples_to_features(
             train_examples, label_list, args.max_seq_length, tokenizer)
@@ -511,7 +512,6 @@ def main():
         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
 
         model.train()
-        global_step = 0
         for epoch in args.num_train_epochs:
             for input_ids, input_mask, segment_ids, label_ids in train_dataloader:
                 input_ids = input_ids.to(device)
@@ -552,9 +552,11 @@ def main():
             input_ids = input_ids.to(device)
             input_mask = input_mask.float().to(device)
             segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
 
             tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
+
+            logits = logits.detach().cpu().numpy()
+            label_ids = label_ids.to('cpu').numpy()
             tmp_eval_accuracy = accuracy(logits, label_ids)
 
             eval_loss += tmp_eval_loss.item()