MT evaluation as a multi-class classifier: Macro and Micro averaged F-measures #153

Closed: wants to merge 35 commits

Commits (35)
8429e22
reorganize: datasets and tokenizers are moved to separate .py modules
thammegowda Mar 20, 2020
8117680
invocation: swap ./sacrebleu.py with python -m sacrebleu
thammegowda Mar 20, 2020
a746099
Update README: remove `./sacrebleu.py` since it is no longer valid.
thammegowda Mar 20, 2020
ccd0549
ReBLEU: Revised BLEU with recall and micro/macro avg
thammegowda Mar 27, 2020
0a45115
remove unused code
thammegowda Mar 27, 2020
4c2a445
Change name of ReBLEU
thammegowda Mar 27, 2020
68d2f39
merged with master and resolved conflicts
thammegowda Mar 28, 2020
5832e2b
version 1.4.5
thammegowda Mar 28, 2020
d729c54
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Mar 30, 2020
6de1633
restructure corpus_rebleu()
thammegowda Apr 1, 2020
62f0344
rewrite rebleu: per class performance includes ngram performance
thammegowda Apr 1, 2020
8e93f2e
rebleu The one that works: 1gram f1 * precision of 2+ grams
thammegowda Apr 3, 2020
9338416
MacroBLEU format message
thammegowda Apr 7, 2020
a346dcd
Fix write_report of ReBLEU
thammegowda Apr 25, 2020
1336bab
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Aug 19, 2020
71c250b
resolve conflicts and merge
thammegowda Aug 19, 2020
ff4acce
integrate rebleu: macrobleu microbleu macrof1 microf1
thammegowda Aug 20, 2020
83690ea
hashbang: `env bash` instead of hardcoded /bin/bash
thammegowda Aug 20, 2020
ddef9a3
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Aug 26, 2020
b767626
Merge branch 'master' into rebleu
thammegowda Aug 26, 2020
dbaae8d
ReBLEU mem efficiency using __slots__ API
thammegowda Sep 1, 2020
a327125
add ReCHRF (macrochrf, microchrf), fix rebleu reporting, add __slots_…
thammegowda Sep 3, 2020
e9098a0
resolve conflicts and merge
thammegowda Sep 3, 2020
30a1e7c
fix log warning issue
thammegowda Sep 3, 2020
1040f32
integrate write_report to rechrf
thammegowda Sep 3, 2020
57bc647
rewrite rebleu with proper bucketing
thammegowda Sep 8, 2020
90f93f5
fix rebleu2 name in score formatting
thammegowda Sep 9, 2020
44f17a8
Merge pull request #2 from isi-nlp/rebleu
thammegowda Oct 16, 2020
85c3e71
Merge branch 'master' of github.com:mjpost/sacrebleu
thammegowda Apr 3, 2021
81344ff
resolve merge conflicts
thammegowda Apr 4, 2021
2d11364
MT evaluation as Multiclass classifier; Micro and Macro F-measure (#1)
thammegowda Apr 7, 2021
349e3bb
Update README: list down additional metrics supported
thammegowda Apr 7, 2021
ef434bc
MT evaluation as a multi-class classifier: macrof and microf metrics …
thammegowda Apr 7, 2021
df74a19
remove fstrings to support python 3.5
thammegowda Apr 8, 2021
dbbd619
merge and resolve conflicts
thammegowda Sep 8, 2021
3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@ data/
 build
 dist
 __pycache__
+
+.sacrebleu
+.idea
14 changes: 14 additions & 0 deletions README.md
@@ -130,3 +130,17 @@ If you use SacreBLEU, please cite the following:
     pages = "186--191",
 }
 ```
+
+----
+# SacreBLEU Extended
+
+SacreBLEU, in its recent versions, has been extended to include additional evaluation metrics:
+
+* ChrF : https://www.aclweb.org/anthology/W16-2341/
+* TER : https://github.com/jhclark/tercom
+* MacroF and MicroF : (TODO: update link)
+
+
+Example:
+
+    sacrebleu REF.txt -m bleu chrf ter macrof microf < HYP.detok.txt
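For intuition about the new macrof/microf metrics, here is a minimal, self-contained sketch (not sacrebleu's implementation) of macro- vs. micro-averaged F-measure over unigram types: each vocabulary type acts as a class, matches are clipped per type, the macro average weighs every type equally, and the micro average pools counts before computing a single F-score. It omits the add-k smoothing and multi-reference handling that the real metrics support.

```python
from collections import Counter

def unigram_stats(hyp_tokens, ref_tokens):
    """Per-type (match, guess, ref) counts; matches are clipped per type."""
    hyp, ref = Counter(hyp_tokens), Counter(ref_tokens)
    return {t: (min(hyp[t], ref[t]), hyp[t], ref[t])
            for t in set(hyp) | set(ref)}

def f_measure(match, guess, ref, beta=1.0, eps=1e-9):
    precision = match / (guess + eps)
    recall = match / (ref + eps)
    return (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall + eps)

def macro_micro_f(hyp_tokens, ref_tokens):
    stats = unigram_stats(hyp_tokens, ref_tokens)
    # Macro: average per-class F-scores, each vocabulary type weighted equally.
    macro = sum(f_measure(*s) for s in stats.values()) / len(stats)
    # Micro: pool counts across all classes, then compute one F-score.
    match, guess, ref = (sum(col) for col in zip(*stats.values()))
    micro = f_measure(match, guess, ref)
    return macro, micro

macro, micro = macro_micro_f('the cat sat'.split(), 'the cat sat down'.split())
print(round(macro, 3), round(micro, 3))  # 0.75 0.857
```

The gap between the two numbers is the point: micro is dominated by frequent types, while macro surfaces performance on rare ones.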
4 changes: 2 additions & 2 deletions sacrebleu/__main__.py
@@ -21,7 +21,7 @@
 
 See the [README.md] file for more information.
 """
-from .sacrebleu import main
+from .sacrebleu import main
 
 if __name__ == '__main__':
-    main()
+    main()
43 changes: 43 additions & 0 deletions sacrebleu/compat.py
@@ -1,8 +1,10 @@
 from typing import Union, Iterable, List
 from argparse import Namespace
+from functools import partial
 
 from .tokenizers import DEFAULT_TOKENIZER
 from .metrics import BLEU, CHRF, TER, BLEUScore, CHRFScore, TERScore
+from .metrics import MultiClassMeasure, METRICS, AVG_TYPES
 
 
 ######################################################################
@@ -169,3 +171,44 @@ def sentence_ter(hypothesis: str,
                  asian_support=asian_support, case_sensitive=case_sensitive)
     metric = TER(args)
     return metric.sentence_score(hypothesis, references)
+
+
+def corpus_f(sys_stream: Union[str, Iterable[str]],
+             ref_streams: Union[str, List[Iterable[str]]],
+             average: str,
+             smooth_value=1,
+             f_beta=1,
+             force=False,
+             lowercase=False,
+             tokenize=DEFAULT_TOKENIZER) -> MultiClassMeasure:
+    """
+    Computes F-measure on a corpus.
+    :param average: which averaging to use to obtain corpus-level performance from types.
+        Options: macro, micro
+    :param sys_stream: The system stream (a sequence of segments)
+    :param ref_streams: A list of one or more reference streams (each a sequence of segments)
+    :param smooth_value: The smoothing value for the `add-k` method; set smooth_value=0 to disable.
+        Does not influence the macro average.
+    :param f_beta: the β value that weighs recall in the F-measure
+    :param force: Ignore data that looks already tokenized
+    :param lowercase: Lowercase the data
+    :param tokenize: The tokenizer to use
+    :return: a `MultiClassMeasure` object
+    """
+    # Limiting to these special cases because others are not tested.
+    assert average in AVG_TYPES
+    smooth_method = 'add-k'
+    max_order = 1  # Higher-order n-grams were tested too, but unigrams alone proved competitive.
+
+    args = dict(smooth_method=smooth_method, smooth_value=smooth_value, force=force,
+                short=False, lc=lowercase, tokenize=tokenize, f_beta=f_beta, max_order=max_order)
+
+    metric = METRICS[average + 'f'](args)
+    return metric.corpus_score(sys_stream, ref_streams)
+
+
+# Computes Macro-F measure on a corpus; see `corpus_f()` for the remaining arguments.
+corpus_macrof = partial(corpus_f, average='macro')
+
+# Computes Micro-F measure on a corpus; see `corpus_f()` for the remaining arguments.
+corpus_microf = partial(corpus_f, average='micro')
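A usage sketch for the new wrappers, assuming this PR branch is installed; like corpus_bleu, they take a system stream and a list of reference streams. The printed attribute relies on MultiClassMeasure inheriting .score from BaseScore (see base.py below), which is an assumption about this branch's internals.

```python
# Sketch only: assumes this PR branch of sacrebleu is installed.
from sacrebleu.compat import corpus_macrof, corpus_microf

hyps = ['The dog bit the man.', 'It was not unexpected.']
refs = [['The dog bit the man.', 'It was not surprising.']]  # one reference stream

macro = corpus_macrof(hyps, refs)            # average='macro' pre-bound via partial
micro = corpus_microf(hyps, refs, f_beta=1)  # f_beta weighs recall, as in corpus_f()

# .score is assumed to come from BaseScore (see sacrebleu/metrics/base.py below).
print(macro.score, micro.score)
```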
6 changes: 6 additions & 0 deletions sacrebleu/metrics/__init__.py
@@ -1,11 +1,17 @@
 # -*- coding: utf-8 -*-
 
+from functools import partial
+
 from .bleu import BLEU, BLEUScore
 from .chrf import CHRF, CHRFScore
+from .clseval import AVG_TYPES, DEF_F_BETA, DEF_AVERAGE, DEF_SMOOTH_VAL
+from .clseval import ClassifierEval, MultiClassMeasure
 from .ter import TER, TERScore
 
 METRICS = {
     'bleu': BLEU,
     'chrf': CHRF,
     'ter': TER,
+    'macrof': partial(ClassifierEval, average='macro', smooth_method='add-k', max_order=1),
+    'microf': partial(ClassifierEval, average='micro', smooth_method='add-k', max_order=1)
 }
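Registering macrof and microf as functools.partial objects keeps the METRICS lookup uniform: every value is a callable that takes the parsed-args mapping, whether it is a bare class or a class with options pre-bound. A toy, self-contained illustration of the pattern (these classes are not sacrebleu's):

```python
from functools import partial

class ToyMetric:
    """Stand-in for a metric whose constructor takes an args mapping."""
    def __init__(self, args, average='macro', max_order=1):
        self.args = args
        self.average = average
        self.max_order = max_order

# Bare classes and pre-configured partials can share one registry,
# because both are callables accepting the same positional `args`.
REGISTRY = {
    'toy': ToyMetric,
    'toy-micro': partial(ToyMetric, average='micro', max_order=1),
}

metric = REGISTRY['toy-micro']({'lowercase': True})
print(metric.average, metric.max_order)  # -> micro 1
```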
4 changes: 4 additions & 0 deletions sacrebleu/metrics/base.py
@@ -4,6 +4,9 @@
 
 class BaseScore:
     """A base score class to derive from."""
+
+    __slots__ = ('score',)
+
     def __init__(self, score):
         self.score = score
 
@@ -56,3 +59,4 @@ def __str__(self):
 
     def __repr__(self):
         return self.__str__()
+
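The commit history attributes the __slots__ additions to memory efficiency: a slotted class stores attributes in fixed slots rather than a per-instance __dict__, which adds up when a score object is created per sentence. A self-contained illustration (not sacrebleu code); exact sizes vary by Python version:

```python
import sys

class DictScore:
    def __init__(self, score):
        self.score = score

class SlottedScore:
    __slots__ = ('score',)
    def __init__(self, score):
        self.score = score

d, s = DictScore(1.0), SlottedScore(1.0)
print(sys.getsizeof(d) + sys.getsizeof(d.__dict__))  # instance plus its attribute dict
print(sys.getsizeof(s))                              # slotted instance only

print(hasattr(s, '__dict__'))  # False: no per-instance dict to grow
# s.extra = 1  would raise AttributeError on the slotted class
```

One caveat worth noting: every subclass must declare its own __slots__ (even an empty tuple), or instances quietly regain a __dict__; presumably that is why later commits in this PR add __slots__ across the other score classes as well.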
4 changes: 2 additions & 2 deletions sacrebleu/metrics/bleu.py
@@ -122,7 +122,7 @@ def extract_ngrams(line, min_order=1, max_order=NGRAM_ORDER) -> Counter:
         return ngrams
 
     @staticmethod
-    def reference_stats(refs, output_len):
+    def reference_stats(refs, output_len, max_order=NGRAM_ORDER):
         """Extracts reference statistics for a given segment.
 
         :param refs: A list of segment tokens.
@@ -145,7 +145,7 @@ def reference_stats(refs, output_len):
             if reflen < closest_len:
                 closest_len = reflen
 
-            ngrams_ref = BLEU.extract_ngrams(ref)
+            ngrams_ref = BLEU.extract_ngrams(ref, max_order=max_order)
             for ngram in ngrams_ref.keys():
                 ngrams[ngram] = max(ngrams[ngram], ngrams_ref[ngram])
 
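Threading max_order through reference_stats is what lets the classifier metrics run BLEU's machinery at unigram order. For intuition, a self-contained sketch of the two pieces above: n-gram extraction, and the max-over-references clipping that the loop performs (mirroring BLEU's clipped counts):

```python
from collections import Counter

def extract_ngrams_sketch(tokens, min_order=1, max_order=1):
    """Count n-grams of the requested orders (only unigrams when max_order=1)."""
    counts = Counter()
    for n in range(min_order, max_order + 1):
        for i in range(len(tokens) - n + 1):
            counts[' '.join(tokens[i:i + n])] += 1
    return counts

def reference_stats_sketch(refs, max_order=1):
    """Per n-gram, keep the maximum count over references, as in the loop above."""
    ngrams = Counter()
    for ref in refs:
        for ng, c in extract_ngrams_sketch(ref.split(), max_order=max_order).items():
            ngrams[ng] = max(ngrams[ng], c)
    return ngrams

print(reference_stats_sketch(['the cat the cat', 'the cat sat'], max_order=1))
# Counter({'the': 2, 'cat': 2, 'sat': 1})
```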