Skip to content

Commit 889cb59

Browse files
authored
Merge pull request #69 from inspirehep/doi_test
RefExtract: update doi extraction
2 parents 9915eb1 + c8c151a commit 889cb59

File tree

2 files changed

+60
-1
lines changed

2 files changed

+60
-1
lines changed

refextract/references/regexs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,8 @@ def compute_months():
659659
((\(?[Dd][Oo][Ii](\s)*\)?:?(\s)*) # 'doi:' or 'doi' or '(doi)' (upper or lower case)
660660
|(https?://(dx\.)?doi\.org\/))? # or 'http://(dx.)doi.org/' (neither has to be present)
661661
(?P<doi>10\. # 10. (mandatory for DOI's)
662-
\d{4} # [0-9] x4
662+
\d{3,7} # [0-9] x 3-7
663+
(\.\w+)* # subdivisions separated by . (doesn't have to be present)
663664
(/|%2f) # / (possibly urlencoded)
664665
[\w\-_:;\(\)/\.<>]+ # any character
665666
[\w\-_:;\(\)/<>]) # any character excluding a full stop

tests/test_engine.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,64 @@ def test_d0_conf_note_report_number():
208208
assert references[0]['linemarker'] == [u'4']
209209

210210

211+
def test_doi_4_digit():
    """Check a reference line ending in a DOI with a four-digit prefix.

    The line carries a JINST journal citation followed by
    ``doi:10.1088/...`` and a trailing full stop; the expected DOI value
    does not include that final full stop.
    """
    raw_line = u'[32] E. Armengaud, et al., JINST 10(05), P05007 (2015). doi:10.1088/1748-0221/10/05/P05007.'

    # The first element of the result is taken as the list of references.
    parsed = get_references(raw_line)[0]

    wanted_reference = {
        'linemarker': [u'32'],
        'raw_ref': [raw_line],
        'author': [u'E. Armengaud, et al.'],
        'journal_title': [u'JINST'],
        'journal_volume': [u'10'],
        'journal_year': [u'2015'],
        'journal_page': [u'P05007'],
        'journal_reference': [u'JINST 10 (2015) P05007'],
        'year': [u'2015'],
        'doi': [u'doi:10.1088/1748-0221/10/05/P05007'],
    }

    # Comparing against a one-element list also pins the reference count.
    assert parsed == [wanted_reference]
230+
231+
232+
def test_doi_5_digit_multi():
    """Check a reference line that contains two DOIs with five-digit prefixes.

    The line cites a journal article (``DOI: 10.1140/...``) and then a
    HepData record (``HepData DOI: 10.17182/...``). Two reference entries
    are expected, both sharing the same line marker and raw line.
    """
    raw_line = u'38 R. Aaij et al. (LHCb Collaboration), "Measurement of charged particle multiplicities in pp collisions at ps = 7 TeV in the forward region", Eur. Phys. J. C (2012) 72: 1947. DOI: 10.1140/epjc/s10052-012-1947-8. HepData DOI: 10.17182/hepdata.65435.'

    # The first element of the result is taken as the list of references.
    parsed = get_references(raw_line)[0]

    # Entry for the journal article itself.
    article_entry = {
        'linemarker': [u'38'],
        'raw_ref': [raw_line],
        'author': [u'R. Aaij et al.'],
        'misc': [u'(LHCb Collaboration)'],
        'title': [u'Measurement of charged particle multiplicities in pp collisions at ps = 7 TeV in the forward region'],
        'journal_title': [u'Eur. Phys. J. C'],
        'journal_volume': [u'72'],
        'journal_year': [u'2012'],
        'journal_page': [u'1947'],
        'journal_reference': [u'Eur. Phys. J. C 72 (2012) 1947'],
        'year': [u'2012'],
        'doi': [u'doi:10.1140/epjc/s10052-012-1947-8'],
    }

    # Entry for the trailing HepData DOI.
    hepdata_entry = {
        'linemarker': [u'38'],
        'raw_ref': [raw_line],
        'misc': [u'HepData'],
        'doi': [u'doi:10.17182/hepdata.65435'],
    }

    assert parsed == [article_entry, hepdata_entry]
259+
260+
261+
def test_doi_subdivisions():
    """Check a DOI whose prefix has a dot-separated subdivision.

    The DOI in the line is ``10.978.819252/12214``; only the extracted
    DOI and the line marker of the first reference are verified.
    """
    raw_line = u'[10] A. Smith et al., "Introduction to Particle Physics", 2017, Springer Publishing, ISBN: 97881925212214, DOI: 10.978.819252/12214.'

    # The first element of the result is taken as the list of references.
    parsed = get_references(raw_line)[0]

    assert parsed[0]['doi'] == [u'doi:10.978.819252/12214']
    assert parsed[0]['linemarker'] == [u'10']
267+
268+
211269
def test_get_plaintext_document_body(tmpdir):
212270
input = [u"Some text\n", u"on multiple lines\n"]
213271
f = tmpdir.join("plain.txt")

0 commit comments

Comments
 (0)