
Commit 4c5e5f5

rename script to nereval.py and add metric definition

1 parent: eb08175

File tree

- Makefile
- README.md
- input.json
- muceval.py → nereval.py
- setup.py
- test_muceval.py → test_nereval.py

6 files changed, 61 insertions(+), 37 deletions(-)

Makefile (+2 -2)

```diff
@@ -9,7 +9,7 @@ test:
 	pytest
 
 test-coverage:
-	pytest --cov=muceval --cov-report term
+	pytest --cov=nereval --cov-report term
 
 lint:
-	pylint muceval.py || exit 0
+	pylint nereval.py || exit 0
```

README.md (+39 -21)

````diff
@@ -1,39 +1,50 @@
-# muceval
-MUC-like evaluation script for named entity recognition systems as used in the advanced research project for NLP.
+# nereval
+Evaluation script for named entity recognition (NER) systems based on entity-level F1 score.
+
+## Definition
+The metric as implemented here has been described by Nadeau and Sekine (2007) and was widely used as part of the Message Understanding Conferences (Grishman and Sundheim, 1996). It evaluates an NER system according to two axes: whether it is able to assign the right type to an entity, and whether it finds the exact entity boundaries. For both axes, the number of correct predictions (COR), the number of actual predictions (ACT) and the number of possible predictions (POS) are computed. From these statistics, precision and recall can be derived:
+
+```
+precision = COR/ACT
+recall = COR/POS
+```
+
+The final score is the micro-averaged F1 measure of precision and recall of both type and boundary axes.
 
 ## Installation
 ```sh
-git clone https://github.com/jantrienes/twente-arp-nlp-evaluation.git
-cd twente-arp-nlp-evaluation
-# either install this as module via pip
+git clone https://github.com/jantrienes/nereval.git
+cd nereval
 pip install .
-
-# or copy main python file into local project
-cp muceval.py ~/theproject
 ```
 
 ## Usage
 The script can either be used from within Python or from the command line when classification results have been written to a JSON file.
 
 ### Usage from Command Line
-Assume we have the following classification results in `input.json`:
+Assume we have the following classification results in `examples/input.json`:
 
 ```json
 [
   {
-    "text": "a b",
+    "text": "CILINDRISCHE PLUG",
     "true": [
       {
-        "text": "a",
-        "type": "NAME",
+        "text": "CILINDRISCHE PLUG",
+        "type": "Productname",
         "start": 0
       }
     ],
     "predicted": [
      {
-        "text": "a",
-        "type": "LOCATION",
+        "text": "CILINDRISCHE",
+        "type": "Productname",
         "start": 0
+      },
+      {
+        "text": "PLUG",
+        "type": "Productname",
+        "start": 13
       }
     ]
   }
@@ -43,16 +54,16 @@ Assume we have the following classification results in `input.json`:
 Then the script can be executed as follows:
 
 ```sh
-python muceval.py input.json
-F1-score: 0.50
+python nereval.py examples/input.json
+F1-score: 0.33
 ```
 
 ### Usage from Python
 Alternatively, the evaluation metric can be directly invoked from within python. Example:
 
 ```py
-import muceval
-from muceval import Entity
+import nereval
+from nereval import Entity
 
 # Ground-truth:
 # CILINDRISCHE PLUG
@@ -71,13 +82,13 @@ y_pred = [
     Entity('PLUG', 'Productname', 13)
 ]
 
-score = muceval.evaluate([y_true], [y_pred])
+score = nereval.evaluate([y_true], [y_pred])
 print('F1-score: %.2f' % score)
 F1-score: 0.33
 ```
 
-## Important Note on Symmetry
-The metric itself is not symmetric due to the inherent problem of word overlaps in NER. So `evaluate(y_true, y_pred) != evaluate(y_pred, y_true)`. This comes apparent if we consider the following example (tagger uses an IOB scheme):
+## Note on Symmetry
+The metric itself is not symmetric due to the inherent problem of word overlaps in NER. So `evaluate(y_true, y_pred) != evaluate(y_pred, y_true)`. This becomes apparent if we consider the following example (the tagger uses a BIO scheme):
 
 ```
 # Example 1:
@@ -96,3 +107,10 @@ Predicted: B_PROD I_PROD B_PROD B_DIM O
 Correct Text: 2
 Correct Type: 3
 ```
+
+## Notes and References
+Used in a student research project on natural language processing at the [University of Twente, Netherlands](https://www.utwente.nl).
+
+**References**
+* Grishman, R., & Sundheim, B. (1996). [Message Understanding Conference-6: A brief history](http://www.aclweb.org/anthology/C96-1079). In *COLING 1996 Volume 1: The 16th International Conference on Computational Linguistics* (Vol. 1).
+* Nadeau, D., & Sekine, S. (2007). [A survey of named entity recognition and classification](http://www.jbe-platform.com/content/journals/10.1075/li.30.1.03nad). *Lingvisticae Investigationes*, 30(1), 3-26.
````
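To make the Definition added to the README above concrete, here is a minimal sketch of the micro-averaged F1 computation from per-axis counts. It is illustrative only and is not the code in nereval.py; the helper name `micro_f1`, the dictionary layout, and the per-axis counts used for the example are assumptions chosen to be consistent with the README's formulas and its reported score of 0.33.

```py
# Illustrative sketch of the metric in the README's Definition section:
# micro-averaged F1 over the type and boundary axes.
# Hypothetical helper, not the nereval.py implementation.

def micro_f1(counts):
    """counts maps axis name -> (COR, ACT, POS) for that axis."""
    cor = sum(c for c, _, _ in counts.values())
    act = sum(a for _, a, _ in counts.values())
    pos = sum(p for _, _, p in counts.values())
    precision = cor / act if act else 0.0  # precision = COR/ACT
    recall = cor / pos if pos else 0.0     # recall = COR/POS
    if precision + recall == 0.0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

# Counts assumed for the examples/input.json example: two predictions and
# one true entity on each axis, one type match, no exact boundary match.
print('F1-score: %.2f' % micro_f1({
    'type': (1, 2, 1),      # (COR, ACT, POS)
    'boundary': (0, 2, 1),
}))
# F1-score: 0.33
```

With these assumed counts, precision = 1/4 and recall = 1/2, and their harmonic mean comes out to 0.33, matching the command-line example in the README.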

input.json (+11 -6)

```diff
@@ -1,19 +1,24 @@
 [
   {
-    "text": "a b",
+    "text": "CILINDRISCHE PLUG",
     "true": [
       {
-        "text": "a",
-        "type": "NAME",
+        "text": "CILINDRISCHE PLUG",
+        "type": "Productname",
         "start": 0
       }
     ],
     "predicted": [
       {
-        "text": "a",
-        "type": "LOCATION",
+        "text": "CILINDRISCHE",
+        "type": "Productname",
         "start": 0
+      },
+      {
+        "text": "PLUG",
+        "type": "Productname",
+        "start": 13
       }
     ]
   }
-]
+]
```

muceval.py → nereval.py (+1 -1)

```diff
@@ -105,7 +105,7 @@ def evaluate(y_true, y_pred):
 
     Example
     -------
-    >>> from muceval import Entity, evaluate
+    >>> from nereval import Entity, evaluate
     >>> y_true = [
     ... [Entity('a', 'b', 0), Entity('b', 'b', 2)]
     ... ]
```

setup.py (+3 -3)

```diff
@@ -1,11 +1,11 @@
 from distutils.core import setup
 
 setup(
-    name='muceval',
+    name='nereval',
     version='0.2.2',
-    description='MUC-like evaluation script for named entity recognition systems.',
+    description='Evaluation script for named entity recognition systems based on F1 score.',
     license='MIT',
-    py_modules=['muceval'],
+    py_modules=['nereval'],
     tests_require=[
         'pytest',
         'pytest-cov',
```

test_muceval.py → test_nereval.py (+5 -4)

```diff
@@ -1,6 +1,6 @@
 import os
 import pytest
-from muceval import (
+from nereval import (
     correct_text, correct_type, count_correct, has_overlap, Entity, precision, recall, evaluate,
     _parse_json, evaluate_json, sign_test
 )
@@ -173,9 +173,10 @@ def test_parse_json():
     predictions = _parse_json(file_name)
     assert len(predictions) == 1
     instance = predictions[0]
-    assert instance['text'] == 'a b'
-    assert instance['true'][0] == Entity('a', 'NAME', 0)
-    assert instance['predicted'][0] == Entity('a', 'LOCATION', 0)
+    assert instance['text'] == 'CILINDRISCHE PLUG'
+    assert instance['true'][0] == Entity('CILINDRISCHE PLUG', 'Productname', 0)
+    assert instance['predicted'][0] == Entity('CILINDRISCHE', 'Productname', 0)
+    assert instance['predicted'][1] == Entity('PLUG', 'Productname', 13)
 
 def test_evaluate_json():
     file_name = os.path.join(os.path.dirname(__file__), 'input.json')
```
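Beyond updating the fixture assertions above, a regression test pinning the documented score for the renamed example would be a natural follow-up. The sketch below is hypothetical and not part of this commit; it assumes, as the README's Python usage shows, that `evaluate` returns the micro-averaged F1 as a float.

```py
# Hypothetical regression test mirroring the README example; not part of
# this commit. Assumes evaluate() returns the micro-averaged F1 as a float.
import pytest
from nereval import Entity, evaluate

def test_readme_example_score():
    y_true = [[Entity('CILINDRISCHE PLUG', 'Productname', 0)]]
    y_pred = [[
        Entity('CILINDRISCHE', 'Productname', 0),
        Entity('PLUG', 'Productname', 13),
    ]]
    # The README and command-line example both report F1-score: 0.33.
    assert evaluate(y_true, y_pred) == pytest.approx(0.33, abs=0.01)
```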
