Skip to content

Commit 227f3d5

Browse files
committed
Merge branch 'release/v1.2.2'
2 parents 9112cd1 + f09ee94 commit 227f3d5

File tree

6 files changed

+192
-8
lines changed

.travis.yml

+6-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
language: java
22
jdk:
33
- oraclejdk7
4-
install: mvn clean test package
4+
5+
cache:
6+
directories:
7+
- $HOME/.m2
8+
59
notifications:
10+
email: false
611
hipchat:
712
rooms:
813
secure: HAIO6qjP1Os4yCduLwRfNrXP9K5v3hOpbXs/HOcoPzh0WACTNZUBJx8GPUWlcUJlzaxkUG05QC91dTn2kTyNteFE9xayKw347i7xywcYDttJdEL8M0agIkZowyrOfmiG+wFv/vCayZpf0T/MPYE1gDvFeP4yuP7CU0pdy1j0SRUkHvcpoXcx1OjbW7kMbiO1WedhzGZwrLWabw8UmNybvoyVSZmtBd4acRuOfzOfbynWoWL/9HD9jUoeCIbQeXWFxXGnQb22QpNWRYu7ZMH1ppkE/lI5zJtqZS5BjfEfK3tql4WaL72r4xO4uHkqi0xjUlP0iyQi4WW9G1PGFz6GmZSCrr04eyRQovhEaQcoces4+Q+uT/3pHplT12kE8y5wGTwYJlCfIRjGYN/uqfT6EEdl+8E4ZCA/o3t7rEHrRqxD98Pt+Q/y+GJxtYmGr4n+HAGCfa3BE4uij3N7EisO6mJ7NtMp5g8UGEvnqFV/eV46Urhoih4ILPhTkMKvX7PRxnSvrqE7toY9PXC5Ufkmt5TU7RmTNRT1cnH28MqlpeywWKNkNr8I6chfUCDNeEHT6Ckj00/l+CbQfNtQWQt7XApcCui0cxvTuZSrAKVzNTrWKcN/tMP5XyJ6t/bxBZAFDV4YHzK8fwzNHjz71sLGYukdvls4yLsEW74JqiQZ6Oo=
914
template:
1015
- 'Build #%{build_number} (%{commit}) of %{repository_name}/%{branch} %{result} (%{duration}) %{build_url}'
11-
email: false

VERSION

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
v1.2.2

src/main/java/it/cnr/isti/hpc/wikipedia/parser/ArticleParser.java

+9-5
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ public void parse(Article article, String mediawiki) {
8181
} else {
8282

8383
for(String disambiguationKeyword:locale.getDisambigutionIdentifiers()){
84-
if(StringUtils.containsIgnoreCase(mediawiki, ("{{" + disambiguationKeyword)))
85-
article.setType(Type.DISAMBIGUATION);
84+
if(StringUtils.containsIgnoreCase(mediawiki, ("{{" + disambiguationKeyword + "|")) || StringUtils.containsIgnoreCase(mediawiki, ("{{" + disambiguationKeyword + "}}"))) {
85+
logger.info(article.getTitle() + ": Setting disambiguation because it contains " + disambiguationKeyword);
86+
article.setType(Type.DISAMBIGUATION);
87+
}
8688
}
8789

8890
String cleanedMediawiki = removeTemplates(mediawiki);
@@ -157,7 +159,7 @@ private void setWikiTitle(Article article) {
157159
*/
158160
private void setIsList(Article article) {
159161
for (String list : locale.getListIdentifiers()) {
160-
if (StringUtils.startsWithIgnoreCase(article.getTitle(), list)) {
162+
if (StringUtils.startsWithIgnoreCase(article.getTitle(), list + " ")) {
161163
article.setType(Type.LIST);
162164
}
163165
}
@@ -519,15 +521,17 @@ private void setLists(Article article, ParsedPage page) {
519521
private void setDisambiguation(Article a) {
520522

521523
for (String disambiguation : locale.getDisambigutionIdentifiers()) {
522-
if (StringUtils.containsIgnoreCase(a.getTitle(), disambiguation)) {
524+
if (StringUtils.containsIgnoreCase(a.getTitle(), "(" + disambiguation + ")")) {
525+
logger.info(a.getTitle() + ": Disambiguation was set because " + disambiguation + " is in the title");
523526
a.setType(Type.DISAMBIGUATION);
524527
return;
525528
}
526529
for (Template t : a.getTemplates()) {
530+
527531
if (StringUtils.equalsIgnoreCase(t.getName(), disambiguation)) {
532+
logger.info(a.getTitle() + ": Disambiguation was set because " + disambiguation + " template is present");
528533
a.setType(Type.DISAMBIGUATION);
529534
return;
530-
531535
}
532536
}
533537

src/main/resources/lang/locale-en.properties

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
language=English
2-
disambiguation=disambiguation,hndis
2+
disambiguation=disambiguation,hndis,Airport disambiguation,Biology disambiguation,Call sign disambiguation,Caselaw disambiguation,Chinese title disambiguation,Genus disambiguation,Geodis,Hndis,Hndis-cleanup,Hospital disambiguation,Letter disambiguation,Letter-NumberCombDisambig,Mathematical disambiguation,Mil-unit-dis,Numberdis,Phonetics disambiguation,Road disambiguation,School disambiguation,Species Latin name disambiguation,Disambig,dab,Disamb
33

44
list=list
55

src/test/java/it/cnr/isti/hpc/wikipedia/reader/WikipediaArticleReaderTest.java

+24
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import java.io.IOException;
2626
import java.io.UnsupportedEncodingException;
2727
import java.net.URL;
28+
import java.util.HashMap;
29+
import java.util.Map;
2830

2931
import org.junit.Test;
3032
import org.xml.sax.SAXException;
@@ -46,6 +48,28 @@ public void testDisambiguation() throws UnsupportedEncodingException, FileNotFou
4648
String json = IOUtils.getFileAsUTF8String("/tmp/disambiguation.json.gz");
4749
Article a = Article.fromJson(json);
4850
assert(a.getType().equals(Article.Type.DISAMBIGUATION));
51+
52+
URL url = this.getClass().getResource("/en/disambiguation.xml");
53+
WikipediaArticleReader reader = new WikipediaArticleReader(url.getFile(),"/tmp/en-disambiguation.json.gz", Language.EN);
54+
reader.start();
55+
56+
Map<String, Article> articles = new HashMap<String, Article>();
57+
String[] lines = IOUtils.getFileAsUTF8String("/tmp/en-disambiguation.json.gz").split("\n");
58+
59+
for(String l: lines) {
60+
Article article = Article.fromJson(l);
61+
articles.put(article.getTitle(), article);
62+
}
63+
64+
65+
assert(articles.get("Listed building").getType().equals(Article.Type.ARTICLE));
66+
assert(articles.get("Athens").getType().equals(Article.Type.ARTICLE));
67+
assert(articles.get("Test dab").getType().equals(Article.Type.DISAMBIGUATION));
68+
assert(articles.get("Test hndis").getType().equals(Article.Type.DISAMBIGUATION));
69+
assert(articles.get("Test disambiguation in title").getType().equals(Article.Type.ARTICLE));
70+
assert(articles.get("Test (disambiguation)").getType().equals(Article.Type.DISAMBIGUATION));
71+
72+
4973
}
5074

5175
@Test
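(file path not captured in this page dump; presumably src/test/resources/en/disambiguation.xml, the fixture loaded by the test via getResource("/en/disambiguation.xml"))

+151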
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="it">
2+
<siteinfo>
3+
<sitename>Wikipedia</sitename>
4+
<base>http://it.wikipedia.org/wiki/Pagina_principale</base>
5+
<generator>MediaWiki 1.17wmf1</generator>
6+
<case>first-letter</case>
7+
<namespaces>
8+
<namespace key="-2" case="first-letter">Media</namespace>
9+
<namespace key="-1" case="first-letter">Speciale</namespace>
10+
<namespace key="0" case="first-letter" />
11+
<namespace key="1" case="first-letter">Discussione</namespace>
12+
<namespace key="2" case="first-letter">Utente</namespace>
13+
<namespace key="3" case="first-letter">Discussioni utente</namespace>
14+
<namespace key="4" case="first-letter">Wikipedia</namespace>
15+
<namespace key="5" case="first-letter">Discussioni Wikipedia</namespace>
16+
<namespace key="6" case="first-letter">File</namespace>
17+
<namespace key="7" case="first-letter">Discussioni file</namespace>
18+
<namespace key="8" case="first-letter">MediaWiki</namespace>
19+
<namespace key="9" case="first-letter">Discussioni MediaWiki</namespace>
20+
<namespace key="10" case="first-letter">Template</namespace>
21+
<namespace key="11" case="first-letter">Discussioni template</namespace>
22+
<namespace key="12" case="first-letter">Aiuto</namespace>
23+
<namespace key="13" case="first-letter">Discussioni aiuto</namespace>
24+
<namespace key="14" case="first-letter">Categoria</namespace>
25+
<namespace key="15" case="first-letter">Discussioni categoria</namespace>
26+
<namespace key="100" case="first-letter">Portale</namespace>
27+
<namespace key="101" case="first-letter">Discussioni portale</namespace>
28+
<namespace key="102" case="first-letter">Progetto</namespace>
29+
<namespace key="103" case="first-letter">Discussioni progetto</namespace>
30+
</namespaces>
31+
</siteinfo>
32+
<page>
33+
<title>Listed building</title>
34+
<ns>0</ns>
35+
<id>202009</id>
36+
<revision>
37+
<id>661976633</id>
38+
<parentid>661009702</parentid>
39+
<timestamp>2015-05-12T10:09:36Z</timestamp>
40+
<contributor>
41+
<username>Wdeed</username>
42+
<id>19559649</id>
43+
</contributor>
44+
<minor />
45+
<comment>/* External links */ updating links to old website, which will soon become broken</comment>
46+
<model>wikitext</model>
47+
<format>text/x-wiki</format>
48+
<text xml:space="preserve">bla</text>
49+
<sha1>1xxgu4p6c5c09siiwov1kaqscvrr94e</sha1>
50+
</revision>
51+
</page>
52+
<page>
53+
<title>Athens</title>
54+
<ns>0</ns>
55+
<id>1216</id>
56+
<revision>
57+
<id>664843779</id>
58+
<parentid>664790778</parentid>
59+
<timestamp>2015-05-31T11:13:10Z</timestamp>
60+
<contributor>
61+
<username>Marcocapelle</username>
62+
<id>14965160</id>
63+
</contributor>
64+
<comment>inhabited for 7000 years -&gt; established in 5th millennium BC</comment>
65+
<model>wikitext</model>
66+
<format>text/x-wiki</format>
67+
<text xml:space="preserve">{{about|the capital city of Greece|other uses|Athens (disambiguation)}}
68+
Previously, there had been other etymologies by scholars of the 19th century. [[Christian Lobeck|Lobeck]] proposed as the root of the name the word {{lang|grc|ἄθος}} (''athos'') or {{lang|grc|ἄνθος}} (''anthos'') meaning flower, to denote Athens as the ''flowering'' city. On the other hand, [[Ludwig Döderlein|Döderlein]]{{disambiguation needed|date=May 2015}} proposed the stem of the verb {{lang|grc|θάω}}, stem θη- (''thaō'', stem ''thē-'', "to suck") to denote Athens as having fertile soil.&lt;ref&gt;''[[Great Greek Encyclopedia]]'', vol. II, page 30, Athens, 1927&lt;/ref&gt;
69+
</text>
70+
<sha1>1xxgu4p6c5c09siiwov1kaqscvrr94e</sha1>
71+
</revision>
72+
</page>
73+
<page>
74+
<title>Test dab</title>
75+
<ns>0</ns>
76+
<id>1216</id>
77+
<revision>
78+
<id>664843779</id>
79+
<parentid>664790778</parentid>
80+
<timestamp>2015-05-31T11:13:10Z</timestamp>
81+
<contributor>
82+
<username>Marcocapelle</username>
83+
<id>14965160</id>
84+
</contributor>
85+
<comment>inhabited for 7000 years -&gt; established in 5th millennium BC</comment>
86+
<model>wikitext</model>
87+
<format>text/x-wiki</format>
88+
<text xml:space="preserve">{{dab}}
89+
</text>
90+
<sha1>1xxgu4p6c5c09siiwov1kaqscvrr94e</sha1>
91+
</revision>
92+
</page>
93+
<page>
94+
<title>Test hndis</title>
95+
<ns>0</ns>
96+
<id>1216</id>
97+
<revision>
98+
<id>664843779</id>
99+
<parentid>664790778</parentid>
100+
<timestamp>2015-05-31T11:13:10Z</timestamp>
101+
<contributor>
102+
<username>Marcocapelle</username>
103+
<id>14965160</id>
104+
</contributor>
105+
<comment>inhabited for 7000 years -&gt; established in 5th millennium BC</comment>
106+
<model>wikitext</model>
107+
<format>text/x-wiki</format>
108+
<text xml:space="preserve">{{hndis}}
109+
</text>
110+
<sha1>1xxgu4p6c5c09siiwov1kaqscvrr94e</sha1>
111+
</revision>
112+
</page>
113+
<page>
114+
<title>Test disambiguation in title</title>
115+
<ns>0</ns>
116+
<id>1216</id>
117+
<revision>
118+
<id>664843779</id>
119+
<parentid>664790778</parentid>
120+
<timestamp>2015-05-31T11:13:10Z</timestamp>
121+
<contributor>
122+
<username>Marcocapelle</username>
123+
<id>14965160</id>
124+
</contributor>
125+
<comment>inhabited for 7000 years -&gt; established in 5th millennium BC</comment>
126+
<model>wikitext</model>
127+
<format>text/x-wiki</format>
128+
<text xml:space="preserve">hello</text>
129+
<sha1>1xxgu4p6c5c09siiwov1kaqscvrr94e</sha1>
130+
</revision>
131+
</page>
132+
<page>
133+
<title>Test (disambiguation)</title>
134+
<ns>0</ns>
135+
<id>1216</id>
136+
<revision>
137+
<id>664843779</id>
138+
<parentid>664790778</parentid>
139+
<timestamp>2015-05-31T11:13:10Z</timestamp>
140+
<contributor>
141+
<username>Marcocapelle</username>
142+
<id>14965160</id>
143+
</contributor>
144+
<comment>inhabited for 7000 years -&gt; established in 5th millennium BC</comment>
145+
<model>wikitext</model>
146+
<format>text/x-wiki</format>
147+
<text xml:space="preserve">hello</text>
148+
<sha1>1xxgu4p6c5c09siiwov1kaqscvrr94e</sha1>
149+
</revision>
150+
</page>
151+
</mediawiki>

0 commit comments

Comments
 (0)