diff --git a/torcheval/metrics/metrics.py b/torcheval/metrics/metrics.py
new file mode 100644
index 00000000..3027aa77
--- /dev/null
+++ b/torcheval/metrics/metrics.py
@@ -0,0 +1,227 @@
+# pylint: disable=E1101,W0622
+
+from __future__ import annotations
+
+from functools import partial
+from math import nan
+from typing import Any, Callable, Iterable
+
+try:
+    from functools import cached_property  # type: ignore
+except ImportError:
+    from functools import lru_cache
+
+    def cached_property(f):  # type: ignore
+        return property(lru_cache()(f))
+
+
+import torch
+from chanfig import FlatDict
+from torch import Tensor
+from torch import distributed as dist
+from .metric import Metric
+from . import functional as F
+
+
+class flist(list):  # pylint: disable=R0903
+    def __format__(self, *args, **kwargs):
+        return " ".join([x.__format__(*args, **kwargs) for x in self])
+
+
+class Metrics(Metric):
+    r"""
+    Metric class that wraps multiple metrics sharing the same states.
+
+    Typically, there are many metrics that we want to compute for a single task.
+    For example, we usually need to compute `accuracy`, `auroc`, and `auprc` for a classification task.
+    Computing them one by one is inefficient, especially when evaluating in a distributed environment.
+
+    To solve this problem, Metrics maintains a shared state for multiple metric functions.
+
+    Attributes:
+        metrics: A dictionary of metrics to be computed.
+        input: The input tensor of the latest batch.
+        target: The target tensor of the latest batch.
+        inputs: All input tensors.
+        targets: All target tensors.
+
+    Args:
+        *args: A single mapping of metrics.
+        **metrics: Metrics.
+    """
+
+    metrics: FlatDict[str, Callable]
+    _input: Tensor
+    _target: Tensor
+    _inputs: list[Tensor]
+    _targets: list[Tensor]
+    _input_buffer: list[Tensor]
+    _target_buffer: list[Tensor]
+    index: str
+    best_fn: Callable
+
+    def __init__(self, *args, **metrics: Callable):
+        super().__init__()
+        self._add_state("_input", torch.empty(0))
+        self._add_state("_target", torch.empty(0))
+        self._add_state("_inputs", [])
+        self._add_state("_targets", [])
+        self._add_state("_input_buffer", [])
+        self._add_state("_target_buffer", [])
+        self.metrics = FlatDict(*args, **metrics)
+
+    @torch.inference_mode()
+    def update(self, input: Any, target: Any) -> None:
+        if not isinstance(input, torch.Tensor):
+            input = torch.tensor(input)
+        if not isinstance(target, torch.Tensor):
+            target = torch.tensor(target)
+        input, target = input.to(self.device), target.to(self.device)
+        self._input, self._target = input, target
+        self._input_buffer.append(input)
+        self._target_buffer.append(target)
+
+    def compute(self) -> FlatDict[str, float]:
+        return self.comp
+
+    def value(self) -> FlatDict[str, float]:
+        return self.val
+
+    def average(self) -> FlatDict[str, float]:
+        return self.avg
+
+    @cached_property
+    def comp(self) -> FlatDict[str, float]:
+        return self._compute(self._input, self._target)
+
+    @cached_property
+    def val(self) -> FlatDict[str, float]:
+        return self._compute(self.input, self.target)
+
+    @cached_property
+    def avg(self) -> FlatDict[str, float]:
+        return self._compute(self.inputs, self.targets)
+
+    @torch.inference_mode()
+    def _compute(self, input: Tensor, target: Tensor) -> FlatDict[str, float | flist]:
+        if input.numel() == 0 == target.numel():
+            return FlatDict({name: nan for name in self.metrics.keys()})
+        ret = FlatDict()
+        for name, metric in self.metrics.items():
+            score = metric(input, target)
+            ret[name] = score.item() if score.numel() == 1 else flist(score.tolist())
+        return ret
+
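+    # Usage sketch (illustrative only; the values below are made up). Every metric
+    # function passed to the constructor reuses the single state populated by `update`:
+    #
+    #     metrics = Metrics(acc=F.binary_accuracy, auroc=F.binary_auroc)
+    #     metrics.update([0.1, 0.4, 0.7, 0.9], [0, 0, 1, 1])
+    #     metrics.value()    # scores of the latest batch
+    #     metrics.average()  # scores over all batches seen so far
+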
+    @torch.inference_mode()
+    def merge_state(self, metrics: Iterable):
+        raise NotImplementedError()
+
+    @cached_property
+    @torch.inference_mode()
+    def input(self):
+        if not dist.is_initialized() or dist.get_world_size() == 1:
+            return self._input
+        # all_gather assumes the latest batch has the same shape on every rank
+        synced_input = [torch.zeros_like(self._input) for _ in range(dist.get_world_size())]
+        dist.all_gather(synced_input, self._input)
+        return torch.cat([t.to(self.device) for t in synced_input], 0)
+
+    @cached_property
+    @torch.inference_mode()
+    def target(self):
+        if not dist.is_initialized() or dist.get_world_size() == 1:
+            return self._target
+        synced_target = [torch.zeros_like(self._target) for _ in range(dist.get_world_size())]
+        dist.all_gather(synced_target, self._target)
+        return torch.cat([t.to(self.device) for t in synced_target], 0)
+
+    @cached_property
+    @torch.inference_mode()
+    def inputs(self):
+        if not self._inputs and not self._input_buffer:
+            return torch.empty(0)
+        if self._input_buffer:
+            if dist.is_initialized() and dist.get_world_size() > 1:
+                synced_inputs = [None for _ in range(dist.get_world_size())]
+                dist.all_gather_object(synced_inputs, self._input_buffer)
+                # all_gather_object yields one buffer (a list of tensors) per rank; flatten it
+                self._inputs.extend(t for buffer in synced_inputs for t in buffer)
+            else:
+                self._inputs.extend(self._input_buffer)
+            self._input_buffer = []
+        return torch.cat(self._inputs, 0)
+
+    @cached_property
+    @torch.inference_mode()
+    def targets(self):
+        if not self._targets and not self._target_buffer:
+            return torch.empty(0)
+        if self._target_buffer:
+            if dist.is_initialized() and dist.get_world_size() > 1:
+                synced_targets = [None for _ in range(dist.get_world_size())]
+                dist.all_gather_object(synced_targets, self._target_buffer)
+                self._targets.extend(t for buffer in synced_targets for t in buffer)
+            else:
+                self._targets.extend(self._target_buffer)
+            self._target_buffer = []
+        return torch.cat(self._targets, 0)
+
+    def __repr__(self):
+        keys = tuple(i for i in self.metrics.keys())
+        return f"{self.__class__.__name__}{keys}"
+
+    def __format__(self, format_spec):
+        val, avg = self.compute(), self.average()
+        return "\n".join(
+            [f"{key}: {val[key].__format__(format_spec)} ({avg[key].__format__(format_spec)})" for key in self.metrics]
+        )
+
+
+class IndexMetrics(Metrics):
+    r"""
+    IndexMetrics is a subclass of Metrics that supports scoring.
+
+    A score is a single value that best represents the performance of the model.
+    It is the core metric that we use to compare different models.
+    For example, in classification, we usually use auroc as the score.
+
+    IndexMetrics requires two additional arguments: `index` and `best_fn`.
+    `index` is the name of the metric that we use to compute the score.
+    `best_fn` is a function that takes a list of values and returns the best value.
+    `best_fn` is not used by IndexMetrics itself; it is meant to be accessed by other classes.
+
+    Attributes:
+        index: The name of the metric that we use to compute the score.
+        best_fn: A function that takes a list of values and returns the best value.
+
+    Args:
+        *args: A single mapping of metrics.
+        index: The name of the metric that we use to compute the score. Defaults to the first metric.
+        best_fn: A function that takes a list of values and returns the best value. Defaults to `max`.
+        **metrics: Metrics.
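+
+    Examples:
+        Illustrative sketch only; the metric functions come from `functional` and the
+        numbers are made up:
+
+        >>> metrics = IndexMetrics(acc=F.binary_accuracy, auroc=F.binary_auroc, index="auroc")
+        >>> metrics.update([0.1, 0.4, 0.7, 0.9], [0, 0, 1, 1])
+        >>> batch = metrics.score("batch")      # index metric on the (synced) latest batch
+        >>> average = metrics.score("average")  # index metric over every batch seen so far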
+ """ + + index: str + best_fn: Callable + + def __init__( + self, *args, index: str | None = None, best_fn: Callable | None = max, **metrics: FlatDict[str, Callable] + ): + super().__init__(*args, **metrics) + self.index = index or next(iter(self.metrics.keys())) + self.metric = self.metrics[self.index] + self.best_fn = best_fn or max + + def score(self, scope: str) -> float | flist: + if scope == "batch": + return self.batch_score() + if scope == "average": + return self.average_score() + raise ValueError(f"Unknown scope: {scope}") + + def batch_score(self) -> float | flist: + return self.calculate(self.metric, self.input, self.target) + + def average_score(self) -> float | flist: + return self.calculate(self.metric, self.inputs, self.targets) + + +def binary_metrics(): + return Metrics(auroc=F.binary_auroc, auprc=F.binary_auprc, acc=F.binary_accuracy) + + +def multiclass_metrics(num_classes: int): + auroc = partial(F.multiclass_auroc, num_classes=num_classes) + auprc = partial(F.multiclass_auprc, num_classes=num_classes) + acc = partial(F.multiclass_accuracy, num_classes=num_classes) + return Metrics(auroc=auroc, auprc=auprc, acc=acc)