@@ -56,7 +56,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N
|
||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
>>> device = "cuda" # the device to load the model onto
|
||||
|
||||
>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto")
|
||||
>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")
|
||||
|
||||
>>> prompt = "Give me a short introduction to large language model."
|
||||
|
||||
@@ -64,18 +64,19 @@ Here's how to use the model for zero-shot object detection:
|
||||
|
||||
>>> results = processor.post_process_grounded_object_detection(
|
||||
... outputs,
|
||||
... threshold=0.4,
|
||||
... inputs.input_ids,
|
||||
... box_threshold=0.4,
|
||||
... text_threshold=0.3,
|
||||
... target_sizes=[(image.height, image.width)]
|
||||
... target_sizes=[image.size[::-1]]
|
||||
... )
|
||||
>>> # Retrieve the first image result
|
||||
|
||||
# Retrieve the first image result
|
||||
>>> result = results[0]
|
||||
>>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
|
||||
>>> for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
|
||||
... box = [round(x, 2) for x in box.tolist()]
|
||||
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||
Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
|
||||
Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
|
||||
Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
|
||||
... print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
|
||||
Detected a cat with confidence 0.468 at location [344.78, 22.9, 637.3, 373.62]
|
||||
Detected a cat with confidence 0.426 at location [11.74, 51.55, 316.51, 473.22]
|
||||
```
|
||||
|
||||
## Grounded SAM
|
||||
|
||||
@@ -81,7 +81,7 @@ text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=
|
||||
|
||||
# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
|
||||
print(text_prompt)
|
||||
>>> "<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>"
|
||||
'<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>'
|
||||
```
|
||||
|
||||
This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
|
||||
|
||||
@@ -110,9 +110,14 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran
|
||||
>>> from datasets import load_dataset, Audio
|
||||
>>> import torch, math
|
||||
>>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer
|
||||
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
|
||||
|
||||
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/moshiko-pytorch-bf16")
|
||||
>>> device = "cuda"
|
||||
>>> dtype = torch.bfloat16
|
||||
|
||||
>>> # prepare user input audio
|
||||
>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
|
||||
>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
|
||||
|
||||
@@ -57,10 +57,7 @@ Phi-3 has been integrated in the development version (4.40.0.dev) of `transforme
|
||||
>>> outputs = model.generate(inputs, max_new_tokens=32)
|
||||
>>> text = tokenizer.batch_decode(outputs)[0]
|
||||
>>> print(text)
|
||||
<s><|user|>
|
||||
Can you provide ways to eat combinations of bananas and dragonfruits?<|end|>
|
||||
<|assistant|>
|
||||
Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some ideas for eating combinations of bananas and
|
||||
<|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some creative ideas for incorporating both fruits
|
||||
```
|
||||
|
||||
## Phi3Config
|
||||
|
||||
@@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
|
||||
```python
|
||||
>>> # let's load an audio sample from an Arabic speech corpus
|
||||
>>> from datasets import load_dataset
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
|
||||
>>> audio_sample = next(iter(dataset))["audio"]
|
||||
|
||||
>>> # now, process it
|
||||
|
||||
@@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
|
||||
```python
|
||||
>>> # let's load an audio sample from an Arabic speech corpus
|
||||
>>> from datasets import load_dataset
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
|
||||
>>> audio_sample = next(iter(dataset))["audio"]
|
||||
|
||||
>>> # now, process it
|
||||
|
||||
@@ -86,7 +86,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
|
||||
>>> candidate_labels = ["2 cats", "2 dogs"]
|
||||
# follows the pipeline prompt template to get same results
|
||||
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
|
||||
>>> # important: we pass `padding=max_length` since the model was trained with this
|
||||
# important: we pass `padding=max_length` since the model was trained with this
|
||||
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
|
||||
|
||||
>>> with torch.no_grad():
|
||||
@@ -95,7 +95,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
|
||||
>>> logits_per_image = outputs.logits_per_image
|
||||
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
|
||||
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
|
||||
31.9% that image 0 is 'a photo of 2 cats'
|
||||
19.8% that image 0 is '2 cats'
|
||||
```
|
||||
|
||||
## Resources
|
||||
@@ -142,8 +142,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
|
||||
# follows the pipeline prompt template to get same results
|
||||
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
|
||||
# important: we pass `padding=max_length` since the model was trained with this
|
||||
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
|
||||
>>> inputs.to(device)
|
||||
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... with torch.autocast(device):
|
||||
@@ -152,7 +151,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
|
||||
>>> logits_per_image = outputs.logits_per_image
|
||||
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
|
||||
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
|
||||
51.3% that image 0 is 'This is a photo of 2 cats.'
|
||||
19.8% that image 0 is '2 cats'
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ Alternatively, one can also perform inference using the classes:
|
||||
>>> inputs = image_processor(images=image, return_tensors="pt")
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(pixel_values)
|
||||
... outputs = model(inputs)
|
||||
|
||||
>>> # interpolate to original size and visualize the prediction
|
||||
>>> ## ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument
|
||||
|
||||
Reference in New Issue
Block a user