MT evaluation as a multi-class classifier: Macro and Micro averaged F-measures #153

Closed: wants to merge 35 commits

Commits (35)
8429e22
reorganize: datasets and tokenizers are moved to separate .py modules
thammegowda Mar 20, 2020
8117680
invocation: swap ./sacrebleu.py with python -m sacrebleu
thammegowda Mar 20, 2020
a746099
Update README: remove `./sacrebleu.py` since it is no longer valid.
thammegowda Mar 20, 2020
ccd0549
ReBLEU: Revised BLEU with recall and micro/macro avg
thammegowda Mar 27, 2020
0a45115
remove unused code
thammegowda Mar 27, 2020
4c2a445
Change name of ReBLEU
thammegowda Mar 27, 2020
68d2f39
merged with master and resolved conflicts
thammegowda Mar 28, 2020
5832e2b
version 1.4.5
thammegowda Mar 28, 2020
d729c54
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Mar 30, 2020
6de1633
restructure corpus_rebleu()
thammegowda Apr 1, 2020
62f0344
rewrite rebleu: per class performance includes ngram performance
thammegowda Apr 1, 2020
8e93f2e
rebleu The one that works: 1gram f1 * precision of 2+ grams
thammegowda Apr 3, 2020
9338416
MacroBLEU format message
thammegowda Apr 7, 2020
a346dcd
Fix write_report of ReBLEU
thammegowda Apr 25, 2020
1336bab
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Aug 19, 2020
71c250b
resolve conflicts and merge
thammegowda Aug 19, 2020
ff4acce
integrate rebleu: macrobleu microbleu macrof1 microf1
thammegowda Aug 20, 2020
83690ea
hashbang: `env bash` instead of hardcoded /bin/bash
thammegowda Aug 20, 2020
ddef9a3
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Aug 26, 2020
b767626
Merge branch 'master' into rebleu
thammegowda Aug 26, 2020
dbaae8d
ReBLEU mem efficiency using __slots__ API
thammegowda Sep 1, 2020
a327125
add ReCHRF (macrochrf, microchrf), fix rebleu reporting, add __slots_…
thammegowda Sep 3, 2020
e9098a0
resolve conflicts and merge
thammegowda Sep 3, 2020
30a1e7c
fix log warning issue
thammegowda Sep 3, 2020
1040f32
integrate write_report to rechrf
thammegowda Sep 3, 2020
57bc647
rewrite rebleu with proper bucketing
thammegowda Sep 8, 2020
90f93f5
fix rebleu2 name in score formatting
thammegowda Sep 9, 2020
44f17a8
Merge pull request #2 from isi-nlp/rebleu
thammegowda Oct 16, 2020
85c3e71
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Apr 3, 2021
81344ff
resolve merge conflicts
thammegowda Apr 4, 2021
2d11364
MT evaluation as Multiclass classifier; Micro and Macro F-measure (#1)
thammegowda Apr 7, 2021
349e3bb
Update README: list down additional metrics supported
thammegowda Apr 7, 2021
ef434bc
MT evaluation as a multi-class classifier: macrof and microf metrics …
thammegowda Apr 7, 2021
df74a19
remove fstrings to support python 3.5
thammegowda Apr 8, 2021
dbbd619
merge and resolve conflicts
thammegowda Sep 8, 2021
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@ data/
 build
 dist
 __pycache__
+
+.sacrebleu
+.idea
14 changes: 14 additions & 0 deletions README.md
@@ -130,3 +130,17 @@ If you use SacreBLEU, please cite the following:
     pages = "186--191",
 }
 ```
+
+----
+# SacreBLEU Extended
+
+SacreBLEU, in its recent versions, has been extended to include additional evaluation metrics:
+
+* ChrF : https://www.aclweb.org/anthology/W16-2341/
+* TER : https://github.com/jhclark/tercom
+* MacroF and MicroF : (TODO: update link)
+
+
+Example:
+
+    sacrebleu REF.txt -m bleu chrf ter macrof microf < HYP.detok.txt
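For intuition about the new macrof/microf metrics, here is a minimal, self-contained sketch (not sacrebleu's implementation) of macro- vs. micro-averaged F-measure over unigram types: each vocabulary type acts as a class, matches are clipped per type, the macro average weighs every type equally, and the micro average pools counts before computing a single F-score. It omits the add-k smoothing and multi-reference handling that the real metrics support.

```python
from collections import Counter

def unigram_stats(hyp_tokens, ref_tokens):
    """Per-type (match, guess, ref) counts; matches are clipped per type."""
    hyp, ref = Counter(hyp_tokens), Counter(ref_tokens)
    return {t: (min(hyp[t], ref[t]), hyp[t], ref[t])
            for t in set(hyp) | set(ref)}

def f_measure(match, guess, ref, beta=1.0, eps=1e-9):
    precision = match / (guess + eps)
    recall = match / (ref + eps)
    return (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall + eps)

def macro_micro_f(hyp_tokens, ref_tokens):
    stats = unigram_stats(hyp_tokens, ref_tokens)
    # Macro: average per-class F-scores, each vocabulary type weighted equally.
    macro = sum(f_measure(*s) for s in stats.values()) / len(stats)
    # Micro: pool counts across all classes, then compute one F-score.
    match, guess, ref = (sum(col) for col in zip(*stats.values()))
    micro = f_measure(match, guess, ref)
    return macro, micro

macro, micro = macro_micro_f('the cat sat'.split(), 'the cat sat down'.split())
print(round(macro, 3), round(micro, 3))  # 0.75 0.857
```

The gap between the two numbers is the point: micro is dominated by frequent types, while macro surfaces performance on rare ones.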
4 changes: 2 additions & 2 deletions sacrebleu/__main__.py
@@ -21,7 +21,7 @@
 
 See the [README.md] file for more information.
 """
-from .sacrebleu import main
+from .sacrebleu import main
 
 if __name__ == '__main__':
-    main()
+    main()
43 changes: 43 additions & 0 deletions sacrebleu/compat.py
@@ -1,8 +1,10 @@
 from typing import Union, Iterable, List
 from argparse import Namespace
+from functools import partial
 
 from .tokenizers import DEFAULT_TOKENIZER
 from .metrics import BLEU, CHRF, TER, BLEUScore, CHRFScore, TERScore
+from .metrics import MultiClassMeasure, METRICS, AVG_TYPES
 
 
 ######################################################################
@@ -169,3 +171,44 @@ def sentence_ter(hypothesis: str,
                  asian_support=asian_support, case_sensitive=case_sensitive)
     metric = TER(args)
     return metric.sentence_score(hypothesis, references)
+
+
+def corpus_f(sys_stream: Union[str, Iterable[str]],
+             ref_streams: Union[str, List[Iterable[str]]],
+             average: str,
+             smooth_value=1,
+             f_beta=1,
+             force=False,
+             lowercase=False,
+             tokenize=DEFAULT_TOKENIZER) -> MultiClassMeasure:
+    """
+    Computes F-measure on a corpus.
+    :param average: which averaging to use to obtain corpus-level performance from types.
+        Options: macro, micro
+    :param sys_stream: The system stream (a sequence of segments)
+    :param ref_streams: A list of one or more reference streams (each a sequence of segments)
+    :param smooth_value: The smoothing value for the `add-k` method; set smooth_value=0 to disable.
+        Does not influence the macro average.
+    :param f_beta: the β value that weighs recall in the F-measure
+    :param force: Ignore data that looks already tokenized
+    :param lowercase: Lowercase the data
+    :param tokenize: The tokenizer to use
+    :return: a `MultiClassMeasure` object
+    """
+    # Limiting to these special cases because others are not tested.
+    assert average in AVG_TYPES
+    smooth_method = 'add-k'
+    max_order = 1  # Higher-order n-grams were tested too, but unigrams alone proved competitive.
+
+    args = dict(smooth_method=smooth_method, smooth_value=smooth_value, force=force,
+                short=False, lc=lowercase, tokenize=tokenize, f_beta=f_beta, max_order=max_order)
+
+    metric = METRICS[average + 'f'](args)
+    return metric.corpus_score(sys_stream, ref_streams)
+
+
+# Computes Macro-F measure on a corpus; see `corpus_f()` for the remaining arguments.
+corpus_macrof = partial(corpus_f, average='macro')
+
+# Computes Micro-F measure on a corpus; see `corpus_f()` for the remaining arguments.
+corpus_microf = partial(corpus_f, average='micro')
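A usage sketch for the new wrappers, assuming this PR branch is installed; like corpus_bleu, they take a system stream and a list of reference streams. The printed attribute relies on MultiClassMeasure inheriting .score from BaseScore (see base.py below), which is an assumption about this branch's internals.

```python
# Sketch only: assumes this PR branch of sacrebleu is installed.
from sacrebleu.compat import corpus_macrof, corpus_microf

hyps = ['The dog bit the man.', 'It was not unexpected.']
refs = [['The dog bit the man.', 'It was not surprising.']]  # one reference stream

macro = corpus_macrof(hyps, refs)            # average='macro' pre-bound via partial
micro = corpus_microf(hyps, refs, f_beta=1)  # f_beta weighs recall, as in corpus_f()

# .score is assumed to come from BaseScore (see sacrebleu/metrics/base.py below).
print(macro.score, micro.score)
```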
6 changes: 6 additions & 0 deletions sacrebleu/metrics/__init__.py
@@ -1,11 +1,17 @@
 # -*- coding: utf-8 -*-
 
+from functools import partial
+
 from .bleu import BLEU, BLEUScore
 from .chrf import CHRF, CHRFScore
+from .clseval import AVG_TYPES, DEF_F_BETA, DEF_AVERAGE, DEF_SMOOTH_VAL
+from .clseval import ClassifierEval, MultiClassMeasure
 from .ter import TER, TERScore
 
 METRICS = {
     'bleu': BLEU,
     'chrf': CHRF,
     'ter': TER,
+    'macrof': partial(ClassifierEval, average='macro', smooth_method='add-k', max_order=1),
+    'microf': partial(ClassifierEval, average='micro', smooth_method='add-k', max_order=1)
 }
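Registering macrof and microf as functools.partial objects keeps the METRICS lookup uniform: every value is a callable that takes the parsed-args mapping, whether it is a bare class or a class with options pre-bound. A toy, self-contained illustration of the pattern (these classes are not sacrebleu's):

```python
from functools import partial

class ToyMetric:
    """Stand-in for a metric whose constructor takes an args mapping."""
    def __init__(self, args, average='macro', max_order=1):
        self.args = args
        self.average = average
        self.max_order = max_order

# Bare classes and pre-configured partials can share one registry,
# because both are callables accepting the same positional `args`.
REGISTRY = {
    'toy': ToyMetric,
    'toy-micro': partial(ToyMetric, average='micro', max_order=1),
}

metric = REGISTRY['toy-micro']({'lowercase': True})
print(metric.average, metric.max_order)  # -> micro 1
```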
4 changes: 4 additions & 0 deletions sacrebleu/metrics/base.py
@@ -4,6 +4,9 @@
 
 class BaseScore:
     """A base score class to derive from."""
+
+    __slots__ = ('score',)
+
     def __init__(self, score):
         self.score = score
 
@@ -56,3 +59,4 @@ def __str__(self):
 
     def __repr__(self):
         return self.__str__()
+
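The commit history attributes the __slots__ additions to memory efficiency: a slotted class stores attributes in fixed slots rather than a per-instance __dict__, which adds up when a score object is created per sentence. A self-contained illustration (not sacrebleu code); exact sizes vary by Python version:

```python
import sys

class DictScore:
    def __init__(self, score):
        self.score = score

class SlottedScore:
    __slots__ = ('score',)
    def __init__(self, score):
        self.score = score

d, s = DictScore(1.0), SlottedScore(1.0)
print(sys.getsizeof(d) + sys.getsizeof(d.__dict__))  # instance plus its attribute dict
print(sys.getsizeof(s))                              # slotted instance only

print(hasattr(s, '__dict__'))  # False: no per-instance dict to grow
# s.extra = 1  would raise AttributeError on the slotted class
```

One caveat worth noting: every subclass must declare its own __slots__ (even an empty tuple), or instances quietly regain a __dict__; presumably that is why later commits in this PR add __slots__ across the other score classes as well.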
4 changes: 2 additions & 2 deletions sacrebleu/metrics/bleu.py
@@ -122,7 +122,7 @@ def extract_ngrams(line, min_order=1, max_order=NGRAM_ORDER) -> Counter:
         return ngrams
 
     @staticmethod
-    def reference_stats(refs, output_len):
+    def reference_stats(refs, output_len, max_order=NGRAM_ORDER):
         """Extracts reference statistics for a given segment.
 
         :param refs: A list of segment tokens.
@@ -145,7 +145,7 @@ def reference_stats(refs, output_len):
             if reflen < closest_len:
                 closest_len = reflen
 
-            ngrams_ref = BLEU.extract_ngrams(ref)
+            ngrams_ref = BLEU.extract_ngrams(ref, max_order=max_order)
             for ngram in ngrams_ref.keys():
                 ngrams[ngram] = max(ngrams[ngram], ngrams_ref[ngram])
 
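Threading max_order through reference_stats is what lets the classifier metrics run BLEU's machinery at unigram order. For intuition, a self-contained sketch of the two pieces above: n-gram extraction, and the max-over-references clipping that the loop performs (mirroring BLEU's clipped counts):

```python
from collections import Counter

def extract_ngrams_sketch(tokens, min_order=1, max_order=1):
    """Count n-grams of the requested orders (only unigrams when max_order=1)."""
    counts = Counter()
    for n in range(min_order, max_order + 1):
        for i in range(len(tokens) - n + 1):
            counts[' '.join(tokens[i:i + n])] += 1
    return counts

def reference_stats_sketch(refs, max_order=1):
    """Per n-gram, keep the maximum count over references, as in the loop above."""
    ngrams = Counter()
    for ref in refs:
        for ng, c in extract_ngrams_sketch(ref.split(), max_order=max_order).items():
            ngrams[ng] = max(ngrams[ng], c)
    return ngrams

print(reference_stats_sketch(['the cat the cat', 'the cat sat'], max_order=1))
# Counter({'the': 2, 'cat': 2, 'sat': 1})
```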