MacroF MicroF to version 2.0
thammegowda committed Sep 18, 2021
1 parent 078c440 commit 591ac9e
Showing 10 changed files with 762 additions and 6 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -3,3 +3,7 @@ data/
build
dist
__pycache__
**.pyc
.idea
.sacrebleu

8 changes: 8 additions & 0 deletions README.md
@@ -215,6 +215,14 @@ TER related arguments (The defaults replicate TERCOM's behavior):
--ter-asian-support Enables special treatment of Asian characters (Default: False)
--ter-no-punct Removes punctuation. (Default: False)
--ter-normalized Applies basic normalization and tokenization. (Default: False)
MacroF and MicroF related arguments (the defaults replicate Gowda et al., NAACL 2021):
--f-beta F_BETA Determines the importance of recall w.r.t. precision. (Default: 1)
--f-lowercase Enables case-insensitivity. (Default: False)
--f-tokenize {none,zh,13a,char,intl,ja-mecab}
Tokenization method to use for MacroF and MicroF, same as BLEU's --tokenize. (Default: None)
--f-smooth-value F_SMOOTH_VALUE
The smoothing value. Only the add-k smoothing method is supported for MacroF and MicroF. (Default: 1)
```
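For orientation, here is a minimal sketch of the equivalent Python API that this commit exports (the flag-to-keyword mapping is an assumption read off `compat.corpus_f`; the example strings are made up):

```python
from sacrebleu import corpus_macrof, corpus_microf

hyps = ['the cat sat on the mat']    # system outputs
refs = [['the cat sat on the mat']]  # one or more reference streams

# Assumed mapping: --f-beta -> beta, --f-lowercase -> lowercase,
# --f-smooth-value -> smooth_value (see compat.corpus_f in this commit).
macro = corpus_macrof(hyps, refs, beta=1, lowercase=False, smooth_value=1)
micro = corpus_microf(hyps, refs)
print(macro.score, micro.score)      # `.score` per Score.__slots__ in this commit
```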

### Version Signatures
1 change: 1 addition & 0 deletions sacrebleu/__init__.py
@@ -29,3 +29,4 @@
from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu # noqa: F401
from .compat import corpus_chrf, sentence_chrf # noqa: F401
from .compat import corpus_ter, sentence_ter # noqa: F401
from .compat import corpus_f, corpus_macrof, corpus_microf # noqa: F401
42 changes: 40 additions & 2 deletions sacrebleu/compat.py
@@ -1,6 +1,7 @@
-from typing import Sequence, Optional
+from typing import Sequence, Optional, Union
from functools import partial

-from .metrics import BLEU, CHRF, TER, BLEUScore, CHRFScore, TERScore
+from .metrics import METRICS, BLEU, CHRF, TER, BLEUScore, CHRFScore, TERScore, ClassifierEval, MultiClassMeasure


######################################################################
@@ -196,3 +197,40 @@ def sentence_ter(hypothesis: str,
                 asian_support=asian_support,
                 case_sensitive=case_sensitive)
    return metric.sentence_score(hypothesis, references)


def corpus_f(hypotheses: Union[str, Sequence[str]],
             references: Union[str, Sequence[Sequence[str]]],
             average: str,
             smooth_value=ClassifierEval.DEF_SMOOTH_VAL,
             beta=ClassifierEval.DEF_F_BETA,
             force=False,
             lowercase=False,
             tokenize=ClassifierEval.TOKENIZER_DEFAULT) -> MultiClassMeasure:
    """
    Computes an F-measure on a corpus.
    :param hypotheses: The system stream (a sequence of segments)
    :param references: A list of one or more reference streams (each a sequence of segments)
    :param average: The kind of averaging used to obtain a corpus-level score from types.
        Options: macro, micro
    :param smooth_value: The smoothing value for the `add-k` method. Set smooth_value=0 to
        disable. Does not influence the macro average.
    :param beta: The β value that weighs recall relative to precision in the F-measure
    :param force: Ignore data that looks already tokenized
    :param lowercase: Lowercase the data
    :param tokenize: The tokenizer to use
    :return: a `MultiClassMeasure` object
    """
    # Limited to these two cases because others are not tested
    assert average in ('macro', 'micro')
    args = dict(smooth_value=smooth_value, force=force,
                lowercase=lowercase, tokenize=tokenize, beta=beta)
    metric = METRICS[average.upper() + 'F'](**args)
    return metric.corpus_score(hypotheses, references)

"""Computes Macro-F measure on a corpus. Refer to `corpus_f()` for additional arguments."""
corpus_macrof = partial(corpus_f, average='macro')

"""Computes Micro-F measure on a corpus. Refer to `corpus_f()` for additional arguments."""
corpus_microf = partial(corpus_f, average='micro')
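For illustration, a quick sanity check of how the partials relate to `corpus_f` (the example strings are made up; `.score` is assumed to hold the final number, per the `Score.__slots__` shown later in this commit):

```python
# Hypothetical data; any hypothesis/reference pair works here.
hyps = ['the cat sat on the mat']
refs = [['the cat sat on the mat']]

# The partials simply pin the `average` argument, so these two calls
# build the same METRICS['MACROF'] metric and agree on the score.
assert corpus_f(hyps, refs, average='macro').score == corpus_macrof(hyps, refs).score
```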
6 changes: 6 additions & 0 deletions sacrebleu/metrics/__init__.py
@@ -1,11 +1,17 @@
"""The implementation of various metrics."""

from functools import partial
from .bleu import BLEU, BLEUScore # noqa: F401
from .chrf import CHRF, CHRFScore # noqa: F401
from .ter import TER, TERScore # noqa: F401
from .clseval import ClassifierEval, MultiClassMeasure  # noqa: F401


METRICS = {
    'BLEU': BLEU,
    'CHRF': CHRF,
    'TER': TER,
}

METRICS['MACROF'] = partial(ClassifierEval, average='macro', max_ngram_order=1)
METRICS['MICROF'] = partial(ClassifierEval, average='micro', max_ngram_order=1)
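Since the two new registry entries are `partial` objects over `ClassifierEval`, calling them builds a configured metric just like the other entries. A minimal sketch (the `lowercase` keyword is an assumption inferred from what `compat.corpus_f` passes through, not confirmed against `ClassifierEval` itself):

```python
from sacrebleu.metrics import METRICS

# Calling the partial instantiates ClassifierEval with average='macro'
# and max_ngram_order=1 already bound.
macrof = METRICS['MACROF'](lowercase=True)  # `lowercase` kwarg: assumption
score = macrof.corpus_score(['a b c'], [['a b c']])  # signature per compat.corpus_f
print(score)
```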
5 changes: 5 additions & 0 deletions sacrebleu/metrics/base.py
@@ -17,6 +17,11 @@


class Score:
    """A base score class to derive from.

    :param name: The name of the underlying metric.
    ...
    """

    # https://docs.python.org/3/reference/datamodel.html#slots
    # __slots__ minimizes memory usage when many objects are created,
    # e.g. clseval.ClassMeasure.
    __slots__ = ('name', 'score', '_mean', '_ci', '_verbose')
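As an aside on the `__slots__` rationale above, here is a self-contained sketch (independent of sacrebleu) of the per-instance memory difference that motivates it:

```python
import sys

class WithDict:                      # ordinary class: each instance carries a __dict__
    def __init__(self):
        self.name, self.score = 'f1', 0.5

class WithSlots:                     # slotted class: fixed attribute storage, no __dict__
    __slots__ = ('name', 'score')
    def __init__(self):
        self.name, self.score = 'f1', 0.5

d, s = WithDict(), WithSlots()
print(sys.getsizeof(d) + sys.getsizeof(d.__dict__))  # instance plus its dict
print(sys.getsizeof(s))                              # smaller slotted instance
```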
