diff --git a/.gitignore b/.gitignore
index f1a9de7..8837ff5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
/wikipedia-miner-core/target/
-/wikipedia-miner-extract/target/
\ No newline at end of file
+/wikipedia-miner-extract/target/
+/wikipedia-miner-web/target/
+/wikipedia-miner-web/nb-configuration.xml
+/wikipedia-miner-web/src/main/webapp/WEB-INF/web.xml
+/target/
+/wikipedia-miner-core/nbactions.xml
+/wikipedia-miner-core/nb-configuration.xml
\ No newline at end of file
diff --git a/README.md b/README.md
index 053f5c0..8aa61ed 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,53 @@
wikipediaminer
==============
-An open source toolkit for mining Wikipedia
+An open source toolkit for mining Wikipedia, forked from https://github.com/dnmilne/wikipediaminer
+
+Contains some improvements to the web services and many bug fixes to Milne's sources.
+
+Documentation is available at: https://github.com/dnmilne/wikipediaminer/wiki
+
+
+TODO:
+```list
+Add support for live snapshots of Wikipedia (the DBpedia approach) to stay up to date
+Implement other disambiguation approaches like http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6354382
+Support binary data in the web services (Thrift, for example) to avoid problems with UTF-8 characters.
+```
+
+
+Add this repository to your pom.xml.
+
+```xml
+<repositories>
+    <repository>
+        <id>galan-maven-repo</id>
+        <name>galan-maven-repo-releases</name>
+        <url>http://galan.ehu.es/artifactory/ext-release-local</url>
+    </repository>
+</repositories>
+```
+
+Then...
+
+```xml
+
+
+ galan-maven-repo
+ galan-maven-repo-releases
+ http://galan.ehu.es/artifactory/ext-release-local
+
+
+```
+
+Then add the required subproject, for example:
+```xml
+<dependency>
+    <groupId>org.wikipedia-miner</groupId>
+    <artifactId>wikipedia-miner-core</artifactId>
+    <version>1.2.4</version>
+</dependency>
+```
+
+
+
diff --git a/configs/hub-template.xml b/configs/hub-template.xml
index 92a219d..88dc480 100644
--- a/configs/hub-template.xml
+++ b/configs/hub-template.xml
@@ -8,7 +8,14 @@
+
path/to/conf/file
diff --git a/configs/languages.xml b/configs/languages.xml
index f4d26df..7d73ac5 100644
--- a/configs/languages.xml
+++ b/configs/languages.xml
@@ -55,5 +55,25 @@
WEITERLEITUNG
+
+
+
+ Artículos
+
+ Wikipedia:Desambiguación
+
+ desambiguación
+ des
+ desambiguacion
+ disambig
+ REDIRECT
+ des
+ otros usos
+ redirige aquí
+ ico-des
+ REDIRECCIÓN
+ REDIRECCION
+
+
diff --git a/models/compare/artCompare_es_In.model b/models/compare/artCompare_es_ln.model
similarity index 99%
rename from models/compare/artCompare_es_In.model
rename to models/compare/artCompare_es_ln.model
index 7592053..c316895 100644
Binary files a/models/compare/artCompare_es_In.model and b/models/compare/artCompare_es_ln.model differ
diff --git a/models/compare/labelCompare_es_In.model b/models/compare/labelCompare_es_ln.model
similarity index 99%
rename from models/compare/labelCompare_es_In.model
rename to models/compare/labelCompare_es_ln.model
index ba49c95..660ba92 100644
Binary files a/models/compare/labelCompare_es_In.model and b/models/compare/labelCompare_es_ln.model differ
diff --git a/models/compare/labelDisambig_es_In.model b/models/compare/labelDisambig_es_In.model
index 6d687c8..f0651b8 100644
Binary files a/models/compare/labelDisambig_es_In.model and b/models/compare/labelDisambig_es_In.model differ
diff --git a/pom.xml b/pom.xml
index dc8ae58..5581c84 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,44 +1,51 @@
- 4.0.0
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ 4.0.0
+ org.wikipedia-miner
+ wikipedia-miner
+ 1.2.4
+ pom
+ wikipedia-miner
+ http://maven.apache.org
+
+ UTF-8
+
+
+
+
+ false
+
+ central
+ bintray-plugins
+ http://jcenter.bintray.com
+
+
+
+
+ junit
+ junit
+ 3.8.1
+ test
+
+
+
+ wikipedia-miner-core
+ wikipedia-miner-extract
+
- org.wikipedia-miner
- wikipedia-miner
- 0.0.1-SNAPSHOT
- pom
-
- wikipedia-miner
- http://maven.apache.org
-
-
- UTF-8
-
-
-
-
- junit
- junit
- 3.8.1
- test
-
-
-
- wikipedia-miner-core
- wikipedia-miner-extract
-
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.0
-
- 1.6
- 1.6
-
-
-
-
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.0
+
+ 1.6
+ 1.6
+
+
+
+
+
\ No newline at end of file
diff --git a/wikipedia-miner-core/pom.xml b/wikipedia-miner-core/pom.xml
index 2b31c50..9053b1f 100644
--- a/wikipedia-miner-core/pom.xml
+++ b/wikipedia-miner-core/pom.xml
@@ -1,104 +1,114 @@
- 4.0.0
-
- org.wikipedia-miner
- wikipedia-miner
- 0.0.1-SNAPSHOT
-
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+ xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ 4.0.0
+
+ org.wikipedia-miner
+ wikipedia-miner
+ 1.2.4
+
- wikipedia-miner-core
+ wikipedia-miner-core
- wikipedia-miner-core
- http://maven.apache.org
-
- UTF-8
-
-
+ wikipedia-miner-core
+ 1.2.4
-
- com.sleepycat
- je
- 5.0.73
-
+ http://maven.apache.org
+
+ UTF-8
+
+
+
-
- net.sf.trove4j
- trove4j
- 3.0.3
-
+
+ com.sleepycat
+ je
+ 5.0.73
+
+
+
+ net.sf.trove4j
+ trove4j
+ 3.0.3
+
-
- junit
- junit
- 3.8.1
- test
-
-
-
- log4j
- log4j
- 1.2.17
-
+
+ junit
+ junit
+ 3.8.1
+ test
+
-
- org.apache.hadoop
- hadoop-core
- 1.2.1
-
+
+ log4j
+ log4j
+ 1.2.17
+
-
- org.apache.opennlp
- opennlp-tools
- 1.5.3
-
+
+ org.apache.hadoop
+ hadoop-core
+ 1.2.1
+
-
- org.dmilne
- weka-wrapper
- 0.0.1
-
+
+ org.apache.opennlp
+ opennlp-tools
+ 1.5.3
+
-
- org.apache.commons
- commons-math
- 2.2
-
+
+ org.dmilne
+ weka-wrapper
+ 0.0.1
+
-
+
+ org.apache.commons
+ commons-math
+ 2.2
+
+
+ org.apache.commons
+ commons-compress
+ 1.8.1
+ jar
+
+
-
-
-
- maven-assembly-plugin
-
-
-
- org.wikipedia.miner.util.EnvironmentBuilder
-
-
-
- jar-with-dependencies
-
-
-
+
+
+
+ maven-assembly-plugin
+
+
+
+ org.wikipedia.miner.util.EnvironmentBuilder
+
+
+
+ jar-with-dependencies
+
+
+
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.0
-
- 1.6
- 1.6
-
-
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.0
+
+ 1.6
+ 1.6
+
+
+
-
-
+
+
+
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java
index d7e3e2e..2125de7 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java
@@ -43,7 +43,6 @@
import weka.classifiers.*;
import weka.classifiers.meta.Bagging;
import weka.core.* ;
-import weka.filters.supervised.instance.Resample ;
/**
* A machine-learned disambiguator. Given a term and a sense, it can identify how valid that sense is.
@@ -62,7 +61,7 @@ public class Disambiguator {
private NGrammer nGrammer ;
private double minSenseProbability ;
- private int maxLabelLength = 20 ;
+ private final int maxLabelLength = 20 ;
private double minLinkProbability ;
private int maxContextSize ;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java
index b4fca02..0b00c76 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java
@@ -23,7 +23,6 @@
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
-import java.util.Vector;
import java.util.regex.*;
import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument.RegionTag;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java
index c16df3f..9c76729 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java
@@ -89,7 +89,7 @@ public String tag(PreprocessedDocument doc, Collection topics, RepeatMode
ArrayList references = resolveCollisions(topics) ;
String originalText = doc.getOriginalText() ;
- StringBuffer wikifiedText = new StringBuffer() ;
+ StringBuilder wikifiedText = new StringBuilder() ;
int lastIndex = 0 ;
HashSet doneIds = new HashSet() ;
@@ -149,7 +149,7 @@ private ArrayList resolveCollisions(Collection topics) {
double outerWeight = topicWeights.get(outerRef.getTopicId());
//identify references overlapped by this one, and their total weight
- Vector innerReferences = new Vector() ;
+ List innerReferences = new ArrayList() ;
double maxInnerWeight = 0 ;
for (int j=i+1 ; j decider ;
+ private final Decider decider ;
private Dataset dataset ;
int linksConsidered = 0 ;
@@ -123,6 +123,7 @@ public int getLinksConsidered() {
* @return an ArrayList of the same topics, where the weight of each topic is the probability that it is a link.
* @throws Exception if the link detector has not yet been trained
*/
+ @Override
public HashMap getTopicWeights(Collection topics) throws Exception {
if (!decider.isReady())
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java
index 85c9a9c..fd6df9a 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java
@@ -20,9 +20,7 @@
package org.wikipedia.miner.annotation.weighting;
-import java.util.ArrayList;
import java.util.Collection;
-import java.util.Collections;
import java.util.HashMap;
import org.wikipedia.miner.annotation.Topic;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java
index 9bac5cf..39648b8 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java
@@ -54,7 +54,7 @@ private enum LinkDirection{In, Out} ;
int wikipediaArticleCount ;
Double m ;
- private long articlesCompared = 0 ;
+ private final long articlesCompared = 0 ;
enum Attributes {
inLinkGoogleMeasure,
@@ -255,7 +255,6 @@ public void buildClassifier(Classifier classifier) throws Exception {
/**
*
*
- * @param classifier
* @throws Exception
*/
public void buildDefaultClassifier() throws Exception {
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java
index 4f3e7df..bca9890 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java
@@ -4,8 +4,8 @@
public class ArticleComparison {
- private Article articleA ;
- private Article articleB ;
+ private final Article articleA ;
+ private final Article articleB ;
private boolean inLinkFeaturesSet= false ;
private Double inLinkGoogleMeasure ;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java
index 6cf3727..0cab671 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java
@@ -183,7 +183,7 @@ public double getRelatedness() {
@Override
public String toString() {
- StringBuffer sb = new StringBuffer() ;
+ StringBuilder sb = new StringBuilder() ;
sb.append(termA) ;
sb.append(",") ;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java
index 676983b..efa060a 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java
@@ -9,21 +9,21 @@
public class ConnectionSnippet implements Comparable {
private String _markup ;
- private String _plainText ;
+ private final String _plainText ;
- private Article _source ;
- private Article _topic1 ;
- private Article _topic2 ;
+ private final Article _source ;
+ private final Article _topic1 ;
+ private final Article _topic2 ;
- private int _sentenceIndex ;
+ private final int _sentenceIndex ;
private boolean _followsHeading = false ;
private boolean _isListItem = false ;
private Double _weight ;
- private static Pattern _headingPattern = Pattern.compile("\\s*={2,}(.*?)={2,}(.*)") ;
- private static Pattern _listPattern = Pattern.compile("\\s*[*#]+(.*)") ;
- private static MarkupStripper _stripper = new MarkupStripper() ;
+ private static final Pattern _headingPattern = Pattern.compile("\\s*={2,}(.*?)={2,}(.*)") ;
+ private static final Pattern _listPattern = Pattern.compile("\\s*[*#]+(.*)") ;
+ private static final MarkupStripper _stripper = new MarkupStripper() ;
public ConnectionSnippet(int sentenceIndex, Article source, Article topic1, Article topic2) {
_sentenceIndex = sentenceIndex ;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java
index 2af4a5a..59a2139 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java
@@ -41,10 +41,10 @@ enum Attributes {
}
- private Wikipedia wikipedia ;
- private ArticleComparer cmp ;
+ private final Wikipedia wikipedia ;
+ private final ArticleComparer cmp ;
- private Decider snippetWeighter ;
+ private final Decider snippetWeighter ;
private Dataset trainingDataset ;
@@ -88,7 +88,7 @@ public double getWeight(ConnectionSnippet snippet) throws Exception {
}
}
- public void train(Vector weightedSnippets) throws Exception {
+ public void train(List weightedSnippets) throws Exception {
trainingDataset = snippetWeighter.createNewDataset() ;
@@ -104,7 +104,7 @@ public void train(Vector weightedSnippets) throws Exception {
}
}
- public double test(Vector weightedSnippets) throws Exception {
+ public double test(List weightedSnippets) throws Exception {
List manualWeights = new ArrayList() ;
List autoWeights = new ArrayList() ;
@@ -183,7 +183,6 @@ public void buildClassifier(Classifier classifier) throws Exception {
/**
*
*
- * @param classifier
* @throws Exception
*/
public void buildDefaultClassifier() throws Exception {
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java
index 15d2ec8..3c7f993 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java
@@ -5,9 +5,6 @@
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.TreeSet;
-
-import org.wikipedia.miner.annotation.Disambiguator;
import org.wikipedia.miner.model.Label;
import org.wikipedia.miner.model.Wikipedia;
import org.wikipedia.miner.util.CorrelationCalculator;
@@ -17,9 +14,7 @@
import weka.classifiers.Classifier;
import weka.classifiers.functions.GaussianProcesses;
-import weka.classifiers.functions.SMO;
import weka.classifiers.meta.Bagging;
-import weka.classifiers.meta.FilteredClassifier;
import weka.core.Instance;
import weka.core.Utils;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java
index d924222..ffd5b7a 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java
@@ -20,6 +20,14 @@
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
+import java.io.BufferedInputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.tools.bzip2.* ;
@@ -69,9 +77,10 @@ public void loadFromCsvFile(File dataFile, boolean overwrite, ProgressTracker tr
* @param tracker an optional progress tracker (may be null)
* @throws IOException if there is a problem reading or deserialising the given data file.
* @throws XMLStreamException if the XML within the data file cannot be parsed.
+ * @throws org.apache.commons.compress.compressors.CompressorException
*/
- public void loadFromXmlFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException, XMLStreamException {
-
+ public void loadFromXmlFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException, XMLStreamException, CompressorException {
+// overwrite=true;
if (exists() && !overwrite)
return ;
@@ -85,15 +94,24 @@ public void loadFromXmlFile(File dataFile, boolean overwrite, ProgressTracker tr
StringBuffer characters = new StringBuffer() ;
InputStream reader ;
-
- if (dataFile.getName().endsWith(".bz2"))
- reader = new CBZip2InputStream(new FileInputStream(dataFile)) ;
- else
+ CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+
+ if (dataFile.getName().endsWith(".bz2")){
+
+ FileInputStream fin=new FileInputStream(dataFile);
+ BufferedInputStream bis=new BufferedInputStream(fin);
+ CompressorInputStream input=new CompressorStreamFactory().createCompressorInputStream(bis);
+ reader = input;
+ } else{
reader = new FileInputStream(dataFile) ;
-
+ }
XMLInputFactory xmlStreamFactory = XMLInputFactory.newInstance() ;
CountingInputStream countingReader = new CountingInputStream(reader) ;
- XMLStreamReader xmlStreamReader = xmlStreamFactory.createXMLStreamReader(countingReader, "UTF-8") ;
+ // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader.
+ XMLStreamReader xmlStreamReader = xmlStreamFactory.createXMLStreamReader(new InputStreamReader(countingReader,decoder)) ;
+ System.out.println("Parser class: " + xmlStreamReader.getClass().toString());
int pageTotal = 0 ;
long charTotal = 0 ;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java
index a7fb2af..b81139c 100755
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java
@@ -11,6 +11,7 @@
import javax.xml.stream.XMLStreamException;
import com.sleepycat.je.*;
+import org.apache.commons.compress.compressors.CompressorException;
import org.apache.log4j.Logger;
import org.wikipedia.miner.db.WDatabase.DatabaseType;
@@ -334,7 +335,6 @@ public WEnvironment(WikipediaConfiguration conf, boolean threaded) throws Enviro
envConf.setAllowCreate(false) ;
envConf.setReadOnly(true) ;
envConf.setCachePercent(10) ;
-
env = new Environment(conf.getDatabaseDirectory(), envConf) ;
initDatabases() ;
@@ -672,7 +672,7 @@ public void finalize() {
* @throws IOException if any of the required files cannot be read
* @throws XMLStreamException if the XML dump of wikipedia cannot be parsed
*/
- public static void buildEnvironment(WikipediaConfiguration conf, File dataDirectory, boolean overwrite) throws IOException, XMLStreamException {
+ public static void buildEnvironment(WikipediaConfiguration conf, File dataDirectory, boolean overwrite) throws IOException, XMLStreamException, CompressorException {
//check all files exist and are readable before doing anything
@@ -739,14 +739,14 @@ public static void buildEnvironment(WikipediaConfiguration conf, File dataDirect
env.close();
- TextProcessor tp = conf.getDefaultTextProcessor() ;
- if (tp != null) {
- File tmpDir = new File(conf.getDataDirectory() + File.separator + "tmp" + tp.getName()) ;
- tmpDir.mkdir() ;
- tmpDir.deleteOnExit() ;
-
- prepareTextProcessor(tp, conf, tmpDir, overwrite, 5) ;
- }
+// TextProcessor tp = conf.getDefaultTextProcessor() ;
+// if (tp != null) {
+// File tmpDir = new File(conf.getDataDirectory() + File.separator + "tmp" + tp.getName()) ;
+// tmpDir.mkdir() ;
+// tmpDir.deleteOnExit() ;
+//
+// prepareTextProcessor(tp, conf, tmpDir, overwrite, 5) ;
+// }
}
/**
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java
index 436b99c..98b2b1b 100755
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java
@@ -336,6 +336,7 @@ public Integer[] getSentenceIndexesMentioning(Article art) {
DbLinkLocation key = new DbLinkLocation(id, null) ;
int index = Collections.binarySearch(tmpLinks.getLinkLocations(), key, new Comparator(){
+ @Override
public int compare(DbLinkLocation a, DbLinkLocation b) {
return new Integer(a.getLinkId()).compareTo(b.getLinkId()) ;
}
@@ -401,14 +402,14 @@ public Integer[] getSentenceIndexesMentioning(ArrayList arts) {
*/
public class Label {
- private String text ;
+ private final String text ;
- private long linkDocCount ;
- private long linkOccCount ;
+ private final long linkDocCount ;
+ private final long linkOccCount ;
- private boolean fromTitle ;
- private boolean fromRedirect ;
- private boolean isPrimary ;
+ private final boolean fromTitle ;
+ private final boolean fromRedirect ;
+ private final boolean isPrimary ;
protected Label(DbLabelForPage l) {
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java
index 891f628..dbaba8e 100755
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java
@@ -19,8 +19,8 @@ public class Label {
//properties =============================================================
- private String text ;
- private TextProcessor textProcessor ;
+ private final String text ;
+ private final TextProcessor textProcessor ;
private long linkDocCount = 0 ;
private long linkOccCount = 0 ;
@@ -150,11 +150,11 @@ public Sense[] getSenses() {
public class Sense extends Article {
- private long sLinkDocCount ;
- private long sLinkOccCount ;
+ private final long sLinkDocCount ;
+ private final long sLinkOccCount ;
- private boolean fromTitle ;
- private boolean fromRedirect ;
+ private final boolean fromTitle ;
+ private final boolean fromRedirect ;
//constructor =============================================================
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java
index aee6f8d..208e676 100755
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java
@@ -141,6 +141,7 @@ public boolean equals(Page p) {
* @param p the Page to be compared
* @return see above.
*/
+ @Override
public int compareTo(Page p) {
if (p.id == id)
@@ -163,6 +164,7 @@ public int compareTo(Page p) {
*
* @return a string representation of the page
*/
+ @Override
public String toString() {
String s = getId() + ": " + getTitle() ;
return s ;
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java
index 8890653..0f1f659 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java
@@ -390,7 +390,15 @@ public void close() {
@Override
public void finalize() {
- if (this.env != null)
- Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ;
+ try {
+ if (this.env != null)
+ Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ;
+ } finally {
+ try {
+ super.finalize();
+ } catch (Throwable ex) {
+ Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ;
+ }
+ }
}
}
diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java
index d75b62b..06b17ca 100644
--- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java
+++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java
@@ -20,9 +20,6 @@
package org.wikipedia.miner.util;
import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -35,7 +32,7 @@ public class EmphasisResolver {
public String resolveEmphasis(String text) {
- StringBuffer sb = new StringBuffer() ;
+ StringBuilder sb = new StringBuilder() ;
for (String line:text.split("\n")) {
sb.append(resolveLine(line)) ;
@@ -150,9 +147,7 @@ private String resolveLine(String line) {
}
- // Now let's actually convert our apostrophic mush to HTML!
-
- StringBuffer output = new StringBuffer() ;
+ StringBuilder output = new StringBuilder() ;
StringBuffer buffer = new StringBuffer() ;
String state = "" ;
int i = 0 ;
@@ -284,7 +279,7 @@ private String[] getSplits(String text) {
private String getFilledString(int length) {
- StringBuffer sb = new StringBuffer() ;
+ StringBuilder sb = new StringBuilder() ;
for (int i=0 ; i{
+public class LabelIterator implements Iterator