Question about output_attentions #100
Thanks for the feedback; we are working on reproducing this issue.

We found that this is likely caused by numerical overflow; switching to torch.bfloat16 provides a preliminary fix.

```python
import argparse
import os
import time
import numpy as np
import torch
from PIL import Image
from decord import VideoReader, cpu
from vita.constants import (
DEFAULT_AUDIO_TOKEN,
DEFAULT_IMAGE_TOKEN,
DEFAULT_VIDEO_TOKEN,
IGNORE_INDEX,
IMAGE_TOKEN_INDEX,
MAX_IMAGE_LENGTH,
)
from vita.conversation import SeparatorStyle, conv_templates
from vita.model.builder import load_pretrained_model
from vita.util.mm_utils import (
KeywordsStoppingCriteria,
get_model_name_from_path,
tokenizer_image_audio_token,
tokenizer_image_token,
)
from vita.util.utils import disable_torch_init
def _get_rawvideo_dec(
    video_path,
    image_processor,
    max_frames=MAX_IMAGE_LENGTH,
    min_frames=4,
    image_resolution=384,
    video_framerate=1,
    s=None,
    e=None,
    image_aspect_ratio="pad",
):
    # speed up video decode via decord.
    if s is None:
        start_time, end_time = None, None
    else:
        start_time = int(s)
        end_time = int(e)
        start_time = start_time if start_time >= 0.0 else 0.0
        end_time = end_time if end_time >= 0.0 else 0.0
        if start_time > end_time:
            start_time, end_time = end_time, start_time
        elif start_time == end_time:
            end_time = start_time + 1

    if os.path.exists(video_path):
        vreader = VideoReader(video_path, ctx=cpu(0))
    else:
        print(video_path)
        raise FileNotFoundError

    fps = vreader.get_avg_fps()
    f_start = 0 if start_time is None else int(start_time * fps)
    f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
    num_frames = f_end - f_start + 1
    if num_frames > 0:
        # T x 3 x H x W
        sample_fps = int(video_framerate)
        t_stride = int(round(float(fps) / sample_fps))

        all_pos = list(range(f_start, f_end + 1, t_stride))
        if len(all_pos) > max_frames:
            sample_pos = [
                all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)
            ]
        elif len(all_pos) < min_frames:
            sample_pos = [
                all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=min_frames, dtype=int)
            ]
        else:
            sample_pos = all_pos

        patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]

        if image_aspect_ratio == "pad":

            def expand2square(pil_img, background_color):
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result

            patch_images = [
                expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean))
                for i in patch_images
            ]
            patch_images = [
                image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
                for i in patch_images
            ]
        else:
            patch_images = [
                image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
                for i in patch_images
            ]

        patch_images = torch.stack(patch_images)
        slice_len = patch_images.shape[0]

        return patch_images, slice_len
    else:
        print("video path: {} error.".format(video_path))
if __name__ == "__main__":
    # Initialize the parser
    parser = argparse.ArgumentParser(description="Process model and video paths.")

    # Add arguments
    parser.add_argument("--model_path", type=str, required=True, help="Path to the model directory")
    parser.add_argument("--model_base", type=str, default=None)
    parser.add_argument("--video_path", type=str, default=None)
    parser.add_argument("--image_path", type=str, default=None)
    parser.add_argument("--audio_path", type=str, default=None)
    parser.add_argument("--model_type", type=str, default="mixtral-8x7b")
    parser.add_argument("--conv_mode", type=str, default="mixtral_two")
    parser.add_argument("--question", type=str, default="")
    parser.add_argument("--frameCat", action="store_true")

    # Parse the arguments
    args = parser.parse_args()

    # Assign arguments to variables
    model_path = args.model_path
    model_base = args.model_base
    video_path = args.video_path
    image_path = args.image_path
    audio_path = args.audio_path
    qs = args.question
    assert (audio_path is None) != (qs == ""), "Exactly one of audio_path or qs must be non-None"
    conv_mode = args.conv_mode

    if args.frameCat:
        from vita.util.data_utils_video_audio_neg_frameCat import dynamic_preprocess
    else:
        from vita.util.data_utils_video_audio_neg_patch import dynamic_preprocess

    # The number of visual tokens varies with the length of the video. "max_frames" is the maximum number of frames.
    # When the video is long, we will uniformly downsample the video to meet the frames when equal to the "max_frames".
    max_frames = MAX_IMAGE_LENGTH  # 100

    # The number of frames retained per second in the video.
    video_framerate = 1

    # Sampling Parameter
    temperature = 0.01
    top_p = None
    num_beams = 1

    disable_torch_init()
    model_path = os.path.expanduser(model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path, model_base, model_name, args.model_type
    )
    # Cast the main model to bfloat16
    model = model.to(dtype=torch.bfloat16)

    model.resize_token_embeddings(len(tokenizer))

    vision_tower = model.get_vision_tower()
    if not vision_tower.is_loaded:
        vision_tower.load_model()
    # Cast the vision tower to bfloat16
    vision_tower.to(dtype=torch.bfloat16)
    image_processor = vision_tower.image_processor

    audio_encoder = model.get_audio_encoder()
    audio_encoder.to(dtype=torch.bfloat16)
    audio_processor = audio_encoder.audio_processor

    model.eval()

    if audio_path is not None:
        audio, audio_for_llm_lens = audio_processor.process(os.path.join(audio_path))
        audio_length = audio.shape[0]
        audio = torch.unsqueeze(audio, dim=0)
        audio_length = torch.unsqueeze(torch.tensor(audio_length), dim=0)
        audio_for_llm_lens = torch.unsqueeze(torch.tensor(audio_for_llm_lens), dim=0)
        audios = dict()
        audios["audios"] = audio.to(torch.bfloat16).cuda()
        audios["lengths"] = audio_length.to(torch.bfloat16).cuda()
        audios["lengths_for_llm"] = audio_for_llm_lens.cuda()
    else:
        audio = torch.zeros(400, 80)
        audio_length = audio.shape[0]
        audio_for_llm_lens = 60
        audio = torch.unsqueeze(audio, dim=0)
        audio_length = torch.unsqueeze(torch.tensor(audio_length), dim=0)
        audio_for_llm_lens = torch.unsqueeze(torch.tensor(audio_for_llm_lens), dim=0)
        audios = dict()
        audios["audios"] = audio.to(torch.bfloat16).cuda()
        audios["lengths"] = audio_length.to(torch.bfloat16).cuda()
        audios["lengths_for_llm"] = audio_for_llm_lens.cuda()
    # audios = None

    # Check if the video exists
    if video_path is not None:
        video_frames, slice_len = _get_rawvideo_dec(
            video_path,
            image_processor,
            max_frames=max_frames,
            video_framerate=video_framerate,
            image_aspect_ratio=getattr(model.config, "image_aspect_ratio", None),
        )
        image_tensor = video_frames.to(torch.bfloat16).cuda()
        if audio_path:
            qs = DEFAULT_IMAGE_TOKEN * slice_len + "\n" + qs + DEFAULT_AUDIO_TOKEN
        else:
            qs = DEFAULT_IMAGE_TOKEN * slice_len + "\n" + qs
        modality = "video"
    elif image_path is not None:
        image = Image.open(image_path).convert("RGB")
        if args.frameCat:
            image, p_num = dynamic_preprocess(image, min_num=2, max_num=12, image_size=448, use_thumbnail=True, img_mean=image_processor.image_mean)
        else:
            image, p_num = dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True)
        assert len(p_num) == 1
        image_tensor = model.process_images(image, model.config).to(
            dtype=model.dtype, device="cuda"
        )
        if audio_path:
            qs = DEFAULT_IMAGE_TOKEN * p_num[0] + "\n" + qs + DEFAULT_AUDIO_TOKEN
        else:
            qs = DEFAULT_IMAGE_TOKEN * p_num[0] + "\n" + qs
        modality = "image"
    else:
        image_tensor = torch.zeros((1, 3, 448, 448)).to(dtype=model.dtype, device="cuda")
        if audio_path:
            qs = qs + DEFAULT_AUDIO_TOKEN
        modality = "lang"

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt(modality)

    if audio_path:
        input_ids = (
            tokenizer_image_audio_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
            .unsqueeze(0)
            .cuda()
        )
    else:
        input_ids = (
            tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
            .unsqueeze(0)
            .cuda()
        )

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    start_time = time.time()
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            audios=audios,
            do_sample=False,
            temperature=temperature,
            top_p=top_p,
            num_beams=num_beams,
            output_scores=True,
            return_dict_in_generate=True,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
            shared_v_pid_stride=None,  # 2#16#8#4#1#None
        )
    infer_time = time.time() - start_time

    output_ids = output_ids.sequences
    input_token_len = input_ids.shape[1]
    if args.model_type == "mixtral-8x7b":
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids")
        output_ids = output_ids[:, input_token_len:]
    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=False)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[: -len(stop_str)]
    outputs = outputs.strip()
    print(outputs)
    print(f"Time consume: {infer_time}")
```

Output:

```
CUDA_VISIBLE_DEVICES=2 python video_audio_demo.py --model_path ./VITA-1.5 --image_path asset/vita_newlog.jpg --model_type qwen2p5_instruct --conv_mode qwen2p5_instruct --question "Describe this images."
/root/miniconda3/envs/vita/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:211: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
@torch.library.impl_abstract("xformers_flash::flash_fwd")
/root/miniconda3/envs/vita/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:344: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
@torch.library.impl_abstract("xformers_flash::flash_bwd")
/root/miniconda3/envs/vita/lib/python3.10/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
Please build and install Nvidia apex package with option '--cuda_ext' according to https://github.com/NVIDIA/apex#from-source .
Please install mamba_ssm to use MambaSSM component.
/root/miniconda3/envs/vita/lib/python3.10/site-packages/torch/_jit_internal.py:745: FutureWarning: ignore(True) has been deprecated. TorchScript will now drop the function call on compilation. Use torch.jit.unused now. {}
warnings.warn(
the number of whale encoder params: 341.3681640625M
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:02<00:00, 1.43it/s]
the number of vision encoder params: 289.9287109375M
☜Logo of a company named Vita.<|im_end|>
Time consume: 0.679114580154419
```
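Distilled from the script above, the essential change is casting the model and its vision/audio encoders to bfloat16 before generation, so attention scores stay within bfloat16's float32-like exponent range instead of overflowing in float16. A minimal sketch of that pattern, assuming `model_path`, `model_base`, `model_name`, and `model_type` are already defined as in the demo:

```python
import torch
from vita.model.builder import load_pretrained_model

# Load VITA as in the demo script above (model_path etc. assumed to be set).
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path, model_base, model_name, model_type
)

# Cast every half-precision sub-module to bfloat16: float16 overflows to inf
# above ~65504, while bfloat16 keeps float32's exponent range.
model = model.to(dtype=torch.bfloat16)
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(dtype=torch.bfloat16)
model.get_audio_encoder().to(dtype=torch.bfloat16)
model.eval()
```

The rest of the pipeline is unchanged; the image and audio tensors are likewise moved to `torch.bfloat16` before being passed to `generate`.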
Adding `output_attentions=True` to the `model.generate()` call in video_audio_demo.py produces output that differs from running without the parameter: the model emits many meaningless exclamation marks. Does anyone know what causes this?
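For reference, a minimal sketch of the modified call being described, based on the `generate` call in the demo script above; how VITA's attention implementation behaves once attention weights are requested (e.g. whether a flash-attention kernel falls back to an eager path) is an assumption to be verified, not something confirmed here:

```python
with torch.inference_mode():
    out = model.generate(
        input_ids,
        images=image_tensor,
        audios=audios,
        do_sample=False,
        num_beams=1,
        max_new_tokens=1024,
        use_cache=True,
        return_dict_in_generate=True,
        output_attentions=True,  # the only flag added relative to the original call
        stopping_criteria=[stopping_criteria],
    )

# out.sequences holds the generated ids exactly as before; out.attentions is a
# tuple with one entry per generated step, each a tuple of per-layer attention
# tensors. Decoding out.sequences is what reportedly degrades into "!!!...".
output_ids = out.sequences
```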