6 changes: 6 additions & 0 deletions .gitignore
@@ -51,3 +51,9 @@ uv.lock
workspace/*
.claude/*
remote_code/*

results/
lavis
cookies.txt
external/
*.ipynb
7 changes: 5 additions & 2 deletions examples/models/qwen25vl.sh
@@ -1,6 +1,6 @@
# Run and exactly reproduce qwen2vl results!
# mme as an example
export HF_HOME="~/.cache/huggingface"
export HF_HOME="~/flash/.cache/huggingface"
# pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
# pip3 install qwen_vl_utils
# Use `interleave_visuals=True` to control the visual token position. It currently applies only to mmmu_val and mmmu_pro (and potentially other interleaved image-text tasks); do not use it unless you are sure about the operation details.
@@ -11,8 +11,11 @@ export HF_HOME="~/.cache/huggingface"
# --tasks mmmu_pro \
# --batch_size 1

echo "Running Qwen2.5-VL-7B-Instruct"
accelerate launch --num_processes=8 --main_process_port=12346 -m lmms_eval \
--model qwen2_5_vl \
--model_args=pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_pixels=12845056,attn_implementation=flash_attention_2,interleave_visuals=False \
--tasks mme \
--batch_size 1
--batch_size 1

# uv run python -m lmms_eval --model qwen2_5_vl --model_args=pretrained=Qwen/Qwen2.5-VL-3B-Instruct,max_pixels=602112,interleave_visuals=False,attn_implementation=flash_attention_2,video_sampler=uniform --tasks egoschema --batch_size 1 --output_path results/test.jsonl
25 changes: 25 additions & 0 deletions lmms_eval/__main__.py
@@ -12,6 +12,10 @@
import torch
import yaml

from dotenv import load_dotenv

load_dotenv()

warnings.simplefilter("ignore", category=DeprecationWarning)

import hashlib
@@ -274,6 +278,8 @@ def parse_eval_args() -> argparse.Namespace:
)
parser.add_argument("--process_with_media", action="store_true", help="Whether you will process you dataset with audio, image. By default set to False" "In case some benchmarks need to be processed with media, set this flag to True.")
parser.add_argument("--force_simple", action="store_true", help="Force the evaluation to use the simple mode of the models")
parser.add_argument("--video_sampler", type=str, default=None, help="Video sampler to use")
parser.add_argument("--video_sampler_kwargs", default="", help="String arguments for video sampler, e.g. `max_num_frames=32,ratio=1,t1=0.8,t2=-100,all_depth=5`",)
args = parser.parse_args()
return args

@@ -481,9 +487,28 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests)
datetime_str = utils.get_datetime_str(timezone=args.timezone)

# Configure metrics logging destination for downstream log_metrics calls.
os.environ.pop("LMMS_EVAL_METRICS_PATH", None)
if args.output_path:
    from lmms_eval.loggers.evaluation_tracker import GeneralConfigTracker

    fallback_model_name = args.model if isinstance(args.model, str) else str(args.model)
    candidate_model_name = GeneralConfigTracker._get_model_name(args.model_args or "") or fallback_model_name
    sanitized_model_name = utils.sanitize_model_name(candidate_model_name) or utils.sanitize_model_name(fallback_model_name) or "model"

    is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized()
    if not is_distributed or (is_distributed and torch.distributed.get_rank() == 0):
        metrics_dir = Path(args.output_path).expanduser().resolve() / sanitized_model_name
        metrics_dir.mkdir(parents=True, exist_ok=True)
        date_id = datetime_str.replace(":", "-")
        metrics_path = metrics_dir / f"{date_id}_metrics.json"
        os.environ["LMMS_EVAL_METRICS_PATH"] = str(metrics_path)

results = evaluator.simple_evaluate(
    model=args.model,
    model_args=args.model_args,
    video_sampler=args.video_sampler,
    video_sampler_kwargs=args.video_sampler_kwargs,
    tasks=task_names,
    num_fewshot=args.num_fewshot,
    batch_size=args.batch_size,
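For orientation, here is a minimal sketch of how the two new CLI flags reach `simple_evaluate`; the sampler name and kwargs string below are illustrative, not taken from this diff.

```python
# Hedged sketch: rough Python equivalent of passing
# `--video_sampler uniform --video_sampler_kwargs max_num_frames=32` on the command line.
from lmms_eval import evaluator

results = evaluator.simple_evaluate(
    model="qwen2_5_vl",
    model_args="pretrained=Qwen/Qwen2.5-VL-7B-Instruct",
    tasks=["mme"],
    batch_size=1,
    video_sampler="uniform",                    # forwarded from args.video_sampler
    video_sampler_kwargs="max_num_frames=32",   # parsed later via create_from_arg_string
)
```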
4 changes: 3 additions & 1 deletion lmms_eval/api/instance.py
@@ -10,7 +10,9 @@ class Instance:
metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here
resps: list = field(default_factory=list)
filtered_resps: dict = field(default_factory=dict)

video_metadata: object = None
num_input_tokens: int = None
num_input_vision_tokens: int = None
# initialized after init
task_name: str = None
doc_id: str = None
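The three new fields are plain attributes that a model backend is expected to fill in per request; the diff does not show the producer side. A hedged sketch of how a backend might populate them (all names and values below are illustrative):

```python
# Inside a hypothetical model's generate_until loop, after tokenizing one request's inputs.
for request, inputs in zip(requests, batched_inputs):
    request.num_input_tokens = int(inputs["input_ids"].shape[-1])
    request.num_input_vision_tokens = int((inputs["input_ids"] == image_token_id).sum())
    request.video_metadata = {"num_sampled_frames": 32, "fps": 2.0}
```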
62 changes: 60 additions & 2 deletions lmms_eval/api/task.py
@@ -1,6 +1,7 @@
import abc
import ast
import copy
import importlib
import inspect
import itertools
import json
@@ -27,6 +28,10 @@
)

import datasets
try:
    import torch
except ImportError:
    torch = None
import numpy as np
from accelerate import Accelerator
from datasets import Audio, DownloadConfig, Image, Sequence
@@ -939,7 +944,47 @@ def _download_from_youtube(path):
cache_dir = dataset_kwargs["cache_dir"]
cache_dir = os.path.join(hf_home, cache_dir)
accelerator = Accelerator()
external_downloader_spec = dataset_kwargs.get("external_downloader") if dataset_kwargs is not None else None
if accelerator.is_main_process:
    if external_downloader_spec:

        def _resolve_external_downloader(spec):
            if callable(spec):
                return spec, {}
            if isinstance(spec, str):
                if "." not in spec:
                    raise ValueError("external_downloader string must be a fully qualified path, e.g. 'pkg.module.fn'")
                module_path, attr_name = spec.rsplit(".", 1)
                return getattr(importlib.import_module(module_path), attr_name), {}
            if isinstance(spec, dict):
                if "fn" not in spec:
                    raise ValueError("external_downloader dict must include a 'fn' key")
                fn, base_kwargs = _resolve_external_downloader(spec["fn"])
                kwargs = {**base_kwargs, **spec.get("kwargs", {})}
                return fn, kwargs
            raise TypeError(f"Unsupported external_downloader spec type: {type(spec)}")

        downloader_fn, downloader_kwargs = _resolve_external_downloader(external_downloader_spec)
        downloader_kwargs.setdefault("cache_dir", cache_dir)
        downloader_kwargs.setdefault("videos_dir", os.path.join(cache_dir, "videos"))
        download_result = downloader_fn(**downloader_kwargs)

        def _set_nested(target_dict, dotted_key, value):
            keys = dotted_key.split(".")
            curr = target_dict
            for key in keys[:-1]:
                if key not in curr or not isinstance(curr[key], dict):
                    curr[key] = {}
                curr = curr[key]
            curr[keys[-1]] = value

        if isinstance(external_downloader_spec, dict):
            result_target = external_downloader_spec.get("result_dataset_kwarg")
            if result_target and download_result is not None:
                if dataset_kwargs is None:
                    dataset_kwargs = {}
                _set_nested(dataset_kwargs, result_target, os.path.expanduser(str(download_result)))

    force_download = dataset_kwargs.get("force_download", False)
    force_unzip = dataset_kwargs.get("force_unzip", False)
    revision = dataset_kwargs.get("revision", "main")
@@ -1024,8 +1069,21 @@ def concat_tar_parts(tar_parts, output_tar):
eval_logger.info(f"Symbolic link created successfully: {cache_path} -> {cache_dir}")

accelerator.wait_for_everyone()
dataset_kwargs.pop("cache_dir")
dataset_kwargs.pop("video")
if dataset_kwargs is not None:
    if accelerator.num_processes > 1:
        if torch is not None and torch.distributed.is_available() and torch.distributed.is_initialized():
            shared_dataset_kwargs = [dataset_kwargs if accelerator.is_main_process else None]
            torch.distributed.broadcast_object_list(shared_dataset_kwargs, src=0)
            dataset_kwargs = shared_dataset_kwargs[0]
        elif accelerator.is_main_process:
            eval_logger.warning("Multiple processes detected but torch.distributed is not initialized. Secondary ranks may not receive updated dataset kwargs.")
    if "external_downloader" in dataset_kwargs:
        external_downloader = dataset_kwargs.pop("external_downloader", None)
        if 'data_files' in external_downloader:
            dataset_kwargs['data_files'] = external_downloader.pop('data_files')
            dataset_kwargs['split'] = external_downloader.pop('split', 'test')
    dataset_kwargs.pop("cache_dir", None)
    dataset_kwargs.pop("video", None)

if "builder_script" in dataset_kwargs:
builder_script = dataset_kwargs["builder_script"]
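To make the accepted `external_downloader` shapes concrete, a hedged sketch follows; the downloader module, function, and dotted result key are hypothetical placeholders, not part of this diff.

```python
# Any of these three spec shapes is accepted by _resolve_external_downloader above.
def fetch_my_videos(cache_dir, videos_dir, force=False):
    """Download videos into videos_dir and return a path or manifest for the dataset."""
    ...

spec_as_callable = fetch_my_videos                      # -> (fetch_my_videos, {})
spec_as_string = "my_pkg.downloaders.fetch_my_videos"   # resolved via importlib
spec_as_dict = {
    "fn": "my_pkg.downloaders.fetch_my_videos",
    "kwargs": {"force": True},                           # merged over kwargs resolved from "fn"
    "result_dataset_kwarg": "data_files.test",           # dotted key in dataset_kwargs that
                                                         # receives the downloader's return value
}
# cache_dir and videos_dir are injected via setdefault when the kwargs do not set them.
```

Only the dict form can request the write-back into `dataset_kwargs`, since `result_dataset_kwarg` is read from the spec itself.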
52 changes: 52 additions & 0 deletions lmms_eval/evaluator.py
@@ -46,6 +46,7 @@
run_task_tests,
simple_parse_args_string,
)
from lmms_eval.video_samplers import get_video_sampler_cls


@positional_deprecated
@@ -83,6 +84,8 @@ def simple_evaluate(
distributed_executor_backend: str = "accelerate",
cli_args=None,
force_simple: bool = False,
video_sampler: Optional[str] = None,
video_sampler_kwargs: Optional[str] = None,
):
"""Instantiate and evaluate a model on a list of tasks.

@@ -183,6 +186,18 @@
if task_manager is None:
task_manager = TaskManager(verbosity, model_name=model)

video_sampler_instance = None
if isinstance(video_sampler, str):
    if video_sampler_kwargs is None:
        video_sampler_kwargs = ""
    video_sampler_instance = get_video_sampler_cls(video_sampler).create_from_arg_string(
        video_sampler_kwargs,
        {
            "batch_size": batch_size,
            "device": device,
        },
    )

if isinstance(model, str):
if model_args is None:
model_args = ""
@@ -191,6 +206,7 @@
{
"batch_size": batch_size,
"max_batch_size": max_batch_size,
"video_sampler": video_sampler_instance,
"device": device,
},
)
@@ -255,6 +271,8 @@ def _adjust_config(task_dict):
evaluation_tracker.general_config_tracker.log_experiment_args(
model_source=model,
model_args=model_args,
video_sampler=video_sampler,
video_sampler_kwargs=video_sampler_kwargs,
system_instruction=system_instruction,
chat_template=lm.chat_template if apply_chat_template else None,
fewshot_as_multiturn=fewshot_as_multiturn,
@@ -411,6 +429,20 @@ def evaluate(
if distributed_executor_backend == "accelerate" and not hasattr(lm, "accelerator"):
lm.accelerator = Accelerator()

def _serialize_video_metadata(meta):
    if isinstance(meta, dict):
        return {k: _serialize_video_metadata(v) for k, v in meta.items()}
    if isinstance(meta, (list, tuple)):
        return [_serialize_video_metadata(v) for v in meta]
    if hasattr(meta, "tolist"):
        try:
            return meta.tolist()
        except TypeError:
            pass
    if isinstance(meta, (np.generic,)):
        return meta.item()
    return meta

for task_output in eval_tasks:
task: Task = task_output.task
task_name = task_output.task_name
@@ -578,6 +610,18 @@ def evaluate(
# else:
# filtered_arguments.append(_handle_non_serializable(value))

video_metadata_list = []
seen_instance_ids = set()
for req in requests:
    if id(req) in seen_instance_ids:
        continue
    seen_instance_ids.add(id(req))
    meta = getattr(req, "video_metadata", None)
    num_input_tokens = getattr(req, "num_input_tokens", None)
    num_input_vision_tokens = getattr(req, "num_input_vision_tokens", None)
    if meta is None:
        continue
    video_metadata_list.append(_serialize_video_metadata(meta))
example = {
"doc_id": doc_id,
"doc": saved_doc,
@@ -595,6 +639,14 @@ def evaluate(
),
# Removing prompt hash and target hash here
}
if video_metadata_list:
    example["video_metadata"] = (
        video_metadata_list if len(video_metadata_list) > 1 else video_metadata_list[0]
    )
if num_input_tokens is not None:
    example["num_input_tokens"] = num_input_tokens
if num_input_vision_tokens is not None:
    example["num_input_vision_tokens"] = num_input_vision_tokens
example.update(metrics)
task_output.logged_samples.append(example)
for metric, value in metrics.items():
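As a rough illustration of the serialization step, here is what `_serialize_video_metadata` would do to a nested metadata dict, treating the nested helper as if it were in scope; the field names and values are made up for the example.

```python
import numpy as np

meta = {
    "fps": np.float32(29.97),
    "sampled_frame_indices": np.arange(0, 64, 8),
    "clips": [{"start": 0.0, "end": np.float64(4.2)}],
}
serialized = _serialize_video_metadata(meta)
# numpy arrays become plain lists and numpy scalars become plain Python numbers,
# so the per-sample "video_metadata" entry stays JSON-serializable:
# {"fps": 29.97..., "sampled_frame_indices": [0, 8, 16, 24, 32, 40, 48, 56],
#  "clips": [{"start": 0.0, "end": 4.2}]}
```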
4 changes: 4 additions & 0 deletions lmms_eval/loggers/evaluation_tracker.py
@@ -78,6 +78,8 @@ def log_experiment_args(
self,
model_source: str,
model_args: str,
video_sampler: str,
video_sampler_kwargs: str,
system_instruction: str,
chat_template: str,
fewshot_as_multiturn: bool,
@@ -86,6 +88,8 @@
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = sanitize_model_name(self.model_name)
self.video_sampler = video_sampler
self.video_sampler_kwargs = video_sampler_kwargs
self.system_instruction = system_instruction
self.system_instruction_sha = hash_string(system_instruction) if system_instruction else None
self.chat_template = chat_template