Transformers cli clean command (#37657)
* transformers-cli -> transformers
* Chat command works with positional argument
* update doc references to transformers-cli
* doc headers
* deepspeed

---------

Co-authored-by: Joao Gante <joao@huggingface.co>
This commit is contained in:
@@ -81,10 +81,10 @@ print(f"The predicted token is: {predicted_token}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google-bert/bert-base-uncased --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -256,4 +256,4 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran
|
||||
|
||||
[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
|
||||
|
||||
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
|
||||
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
|
||||
|
||||
@@ -35,7 +35,7 @@ The example below demonstrates how to generate code with [`Pipeline`], or the [`
|
||||
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
@@ -76,7 +76,7 @@ prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
|
||||
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
|
||||
|
||||
output = model.generate(
|
||||
**input_ids,
|
||||
**input_ids,
|
||||
max_new_tokens=256,
|
||||
cache_implementation="static"
|
||||
)
|
||||
@@ -92,10 +92,10 @@ print(filled_text)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers-cli run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
|
||||
echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -146,7 +146,7 @@ visualizer("""def func(a, b):
|
||||
- Use the `<FILL_ME>` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself.
|
||||
```py
|
||||
from transformers import LlamaForCausalLM, CodeLlamaTokenizer
|
||||
|
||||
|
||||
tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
|
||||
model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
|
||||
PROMPT = '''def remove_non_ascii(s: str) -> str:
|
||||
@@ -155,7 +155,7 @@ visualizer("""def func(a, b):
|
||||
'''
|
||||
input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
|
||||
generated_ids = model.generate(input_ids, max_new_tokens=128)
|
||||
|
||||
|
||||
filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
|
||||
print(PROMPT.replace("<FILL_ME>", filling))
|
||||
```
|
||||
|
||||
@@ -49,9 +49,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
|
||||
messages = [{"role": "user", "content": "How do plants make energy?"}]
|
||||
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
|
||||
output = model.generate(
|
||||
input_ids,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
input_ids,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
temperature=0.3,
|
||||
cache_implementation="static",
|
||||
)
|
||||
@@ -59,11 +59,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
# pip install -U flash-attn --no-build-isolation
|
||||
transformers-cli chat --model_name_or_path CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
|
||||
transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -85,9 +85,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
|
||||
messages = [{"role": "user", "content": "How do plants make energy?"}]
|
||||
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
|
||||
output = model.generate(
|
||||
input_ids,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
input_ids,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
temperature=0.3,
|
||||
cache_implementation="static",
|
||||
)
|
||||
|
||||
@@ -83,10 +83,10 @@ print(f"Predicted label: {predicted_label}")
|
||||
|
||||
</hfoption>
|
||||
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
|
||||
echo -e "I love using Hugging Face Transformers!" | transformers run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -213,7 +213,3 @@ echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task
|
||||
|
||||
</jax>
|
||||
</frameworkcontent>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -45,9 +45,9 @@ import torch
|
||||
from transformers import pipeline
|
||||
|
||||
classifier = pipeline(
|
||||
task="text-classification",
|
||||
model="bhadresh-savani/electra-base-emotion",
|
||||
torch_dtype=torch.float16,
|
||||
task="text-classification",
|
||||
model="bhadresh-savani/electra-base-emotion",
|
||||
torch_dtype=torch.float16,
|
||||
device=0
|
||||
)
|
||||
classifier("This restaurant has amazing food!")
|
||||
@@ -64,7 +64,7 @@ tokenizer = AutoTokenizer.from_pretrained(
|
||||
"bhadresh-savani/electra-base-emotion",
|
||||
)
|
||||
model = AutoModelForSequenceClassification.from_pretrained(
|
||||
"bhadresh-savani/electra-base-emotion",
|
||||
"bhadresh-savani/electra-base-emotion",
|
||||
torch_dtype=torch.float16
|
||||
)
|
||||
inputs = tokenizer("ELECTRA is more efficient than BERT", return_tensors="pt")
|
||||
@@ -78,10 +78,10 @@ print(f"Predicted label: {predicted_label}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "This restaurant has amazing food." | transformers-cli run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
|
||||
echo -e "This restaurant has amazing food." | transformers run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -96,12 +96,12 @@ echo -e "This restaurant has amazing food." | transformers-cli run --task text-c
|
||||
|
||||
```py
|
||||
# Example of properly handling padding with attention masks
|
||||
inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
|
||||
padding=True,
|
||||
inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
|
||||
padding=True,
|
||||
return_tensors="pt")
|
||||
outputs = model(**inputs) # automatically uses the attention_mask
|
||||
```
|
||||
|
||||
|
||||
- When using the discriminator for a downstream task, you can load it into any of the ELECTRA model classes ([`ElectraForSequenceClassification`], [`ElectraForTokenClassification`], etc.).
|
||||
|
||||
## ElectraConfig
|
||||
|
||||
@@ -41,7 +41,7 @@ import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="text-generation",
|
||||
task="text-generation",
|
||||
model="tiiuae/falcon-7b-instruct",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device=0
|
||||
@@ -76,11 +76,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
# pip install -U flash-attn --no-build-isolation
|
||||
transformers-cli chat --model_name_or_path tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
|
||||
transformers chat tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -150,4 +150,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
## FalconForQuestionAnswering
|
||||
|
||||
[[autodoc]] FalconForQuestionAnswering
|
||||
- forward
|
||||
- forward
|
||||
|
||||
@@ -39,7 +39,7 @@ import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
"text-generation",
|
||||
"text-generation",
|
||||
model="tiiuae/falcon-mamba-7b-instruct",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device=0
|
||||
@@ -73,10 +73,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
transformers-cli chat --model_name_or_path tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
|
||||
transformers chat tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -80,10 +80,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "LLMs generate text through a process known as" | transformers-cli run --task text-generation --model google/gemma-2b --device 0
|
||||
echo -e "LLMs generate text through a process known as" | transformers run --task text-generation --model google/gemma-2b --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -114,8 +114,8 @@ model = AutoModelForCausalLM.from_pretrained(
|
||||
input_text = "LLMs generate text through a process known as."
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
outputs = model.generate(
|
||||
**input_ids,
|
||||
max_new_tokens=50,
|
||||
**input_ids,
|
||||
max_new_tokens=50,
|
||||
cache_implementation="static"
|
||||
)
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
@@ -127,7 +127,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
|
||||
from transformers.utils.attention_visualizer import AttentionMaskVisualizer
|
||||
|
||||
visualizer = AttentionMaskVisualizer("google/gemma-2b")
|
||||
visualizer("LLMs generate text through a process known as")
|
||||
visualizer("LLMs generate text through a process known as")
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
|
||||
@@ -58,7 +58,7 @@ pipe("Explain quantum computing simply. ", max_new_tokens=50)
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
@@ -80,16 +80,16 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Explain quantum computing simply." | transformers-cli run --task text-generation --model google/gemma-2-2b --device 0
|
||||
echo -e "Explain quantum computing simply." | transformers run --task text-generation --model google/gemma-2-2b --device 0
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
|
||||
|
||||
|
||||
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
|
||||
|
||||
```python
|
||||
@@ -118,7 +118,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
|
||||
```python
|
||||
from transformers.utils.attention_visualizer import AttentionMaskVisualizer
|
||||
visualizer = AttentionMaskVisualizer("google/gemma-2b")
|
||||
visualizer("You are an assistant. Make sure you print me")
|
||||
visualizer("You are an assistant. Make sure you print me")
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
@@ -137,7 +137,7 @@ visualizer("You are an assistant. Make sure you print me")
|
||||
|
||||
inputs = tokenizer(text="My name is Gemma", return_tensors="pt")
|
||||
max_generated_length = inputs.input_ids.shape[1] + 10
|
||||
past_key_values = HybridCache(config=model.config, max_batch_size=1,
|
||||
past_key_values = HybridCache(config=model.config, max_batch_size=1,
|
||||
max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
|
||||
outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
|
||||
```
|
||||
|
||||
@@ -99,10 +99,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model google/gemma-3-1b-pt --device 0
|
||||
echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3-1b-pt --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -64,10 +64,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Hello, I'm a language model" | transformers-cli run --task text-generation --model openai-community/gpt2 --device 0
|
||||
echo -e "Hello, I'm a language model" | transformers run --task text-generation --model openai-community/gpt2 --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -82,16 +82,16 @@ import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
|
||||
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype="float16",
|
||||
bnb_4bit_use_double_quant=True
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype="float16",
|
||||
bnb_4bit_use_double_quant=True
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"openai-community/gpt2-xl",
|
||||
quantization_config=quantization_config,
|
||||
device_map="auto"
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")
|
||||
|
||||
@@ -75,10 +75,10 @@ output = model.generate(**input_ids, cache_implementation="static")
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
|
||||
echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model huggyllama/llama-7b --device 0
|
||||
echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model huggyllama/llama-7b --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
transformers-cli chat --model_name_or_path meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
|
||||
transformers chat meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -175,4 +175,3 @@ visualizer("Plants create energy through a process known as")
|
||||
|
||||
[[autodoc]] LlamaForSequenceClassification
|
||||
- forward
|
||||
|
||||
|
||||
@@ -76,10 +76,10 @@ tokenizer.decode(predictions).split()
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers-cli run --task fill-mask --model allenai/longformer-base-4096 --device 0
|
||||
echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers run --task fill-mask --model allenai/longformer-base-4096 --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -147,42 +147,42 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t
|
||||
|
||||
## LongformerForMaskedLM
|
||||
|
||||
[[autodoc]] LongformerForMaskedLM
|
||||
[[autodoc]] LongformerForMaskedLM
|
||||
- forward
|
||||
|
||||
## LongformerForSequenceClassification
|
||||
|
||||
[[autodoc]] LongformerForSequenceClassification
|
||||
[[autodoc]] LongformerForSequenceClassification
|
||||
- forward
|
||||
|
||||
## LongformerForMultipleChoice
|
||||
|
||||
[[autodoc]] LongformerForMultipleChoice
|
||||
[[autodoc]] LongformerForMultipleChoice
|
||||
- forward
|
||||
|
||||
## LongformerForTokenClassification
|
||||
|
||||
[[autodoc]] LongformerForTokenClassification
|
||||
[[autodoc]] LongformerForTokenClassification
|
||||
- forward
|
||||
|
||||
## LongformerForQuestionAnswering
|
||||
|
||||
[[autodoc]] LongformerForQuestionAnswering
|
||||
[[autodoc]] LongformerForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## TFLongformerModel
|
||||
|
||||
[[autodoc]] TFLongformerModel
|
||||
[[autodoc]] TFLongformerModel
|
||||
- call
|
||||
|
||||
## TFLongformerForMaskedLM
|
||||
|
||||
[[autodoc]] TFLongformerForMaskedLM
|
||||
[[autodoc]] TFLongformerForMaskedLM
|
||||
- call
|
||||
|
||||
## TFLongformerForQuestionAnswering
|
||||
|
||||
[[autodoc]] TFLongformerForQuestionAnswering
|
||||
[[autodoc]] TFLongformerForQuestionAnswering
|
||||
- call
|
||||
|
||||
## TFLongformerForSequenceClassification
|
||||
@@ -192,10 +192,10 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t
|
||||
|
||||
## TFLongformerForTokenClassification
|
||||
|
||||
[[autodoc]] TFLongformerForTokenClassification
|
||||
[[autodoc]] TFLongformerForTokenClassification
|
||||
- call
|
||||
|
||||
## TFLongformerForMultipleChoice
|
||||
|
||||
[[autodoc]] TFLongformerForMultipleChoice
|
||||
[[autodoc]] TFLongformerForMultipleChoice
|
||||
- call
|
||||
|
||||
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Mistral
|
||||
|
||||
[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing
|
||||
[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing
|
||||
the scaling costs of large models with performance and efficient inference. This model uses sliding window attention (SWA) trained with an 8K context length and a fixed cache size to handle longer sequences more effectively. Grouped-query attention (GQA) speeds up inference and reduces memory requirements. Mistral also features a byte-fallback BPE tokenizer to improve token handling and efficiency by ensuring characters are never mapped to out-of-vocabulary tokens.
|
||||
|
||||
You can find all the original Mistral checkpoints under the [Mistral AI](https://huggingface.co/mistralai) organization.
|
||||
@@ -78,10 +78,10 @@ The example below demonstrates how to chat with [`Pipeline`] or the [`AutoModel`
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "My favorite condiment is" | transformers-cli chat --model_name_or_path mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
|
||||
echo -e "My favorite condiment is" | transformers chat mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -76,10 +76,10 @@ print(f"The predicted token is: {predicted_token}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "The capital of France is [MASK]." | transformers-cli run --task fill-mask --model google/mobilebert-uncased --device 0
|
||||
echo -e "The capital of France is [MASK]." | transformers run --task fill-mask --model google/mobilebert-uncased --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -79,10 +79,10 @@ print(f"The predicted token is: {predicted_token}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model answerdotai/ModernBERT-base --device 0
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model answerdotai/ModernBERT-base --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
@@ -70,10 +70,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "The future of AI is" | transformers-cli run --task text-generation --model openai-community/openai-gpt --device 0
|
||||
echo -e "The future of AI is" | transformers run --task text-generation --model openai-community/openai-gpt --device 0
|
||||
|
||||
```
|
||||
</hfoption>
|
||||
|
||||
@@ -65,10 +65,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers-cli run --task text-classification --model microsoft/phi-1.5 --device 0
|
||||
echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers run --task text-generation --model microsoft/phi-1.5 --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -102,7 +102,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"microsoft/phi-1",
|
||||
@@ -110,12 +110,12 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
device_map="auto",
|
||||
trust_remote_code=True,
|
||||
attn_implementation="sdpa")
|
||||
|
||||
|
||||
input_ids = tokenizer('''def print_prime(n):
|
||||
"""
|
||||
Print all primes between 1 and n
|
||||
"""''', return_tensors="pt").to("cuda")
|
||||
|
||||
|
||||
output = model.generate(**input_ids, cache_implementation="static")
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
@@ -64,7 +64,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"Qwen/Qwen2-1.5B-Instruct",
|
||||
torch_dtype=torch.bfloat16,
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
attn_implementation="sdpa"
|
||||
)
|
||||
@@ -86,10 +86,10 @@ generated_ids = model.generate(
|
||||
model_inputs.input_ids,
|
||||
cache_implementation="static",
|
||||
max_new_tokens=512,
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
top_k=50,
|
||||
top_p=0.95
|
||||
do_sample=True,
|
||||
temperature=0.7,
|
||||
top_k=50,
|
||||
top_p=0.95
|
||||
)
|
||||
generated_ids = [
|
||||
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
||||
@@ -100,11 +100,11 @@ print(response)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
# pip install -U flash-attn --no-build-isolation
|
||||
transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
|
||||
transformers chat Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -121,21 +121,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_compute_dtype=torch.bfloat16,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B")
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"Qwen/Qwen2-7B",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config,
|
||||
attn_implementation="flash_attention_2"
|
||||
attn_implementation="flash_attention_2"
|
||||
)
|
||||
|
||||
inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda")
|
||||
inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda")
|
||||
outputs = model.generate(**inputs, max_new_tokens=100)
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
@@ -75,10 +75,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
<hfoption id="transformers CLI">
|
||||
|
||||
```bash
|
||||
echo -e "translate English to French: The weather is nice today." | transformers-cli run --task text2text-generation --model google-t5/t5-base --device 0
|
||||
echo -e "translate English to French: The weather is nice today." | transformers run --task text2text-generation --model google-t5/t5-base --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
Reference in New Issue
Block a user