Skip to content

Commit a6398d0

Browse files
committed
first-cut algorithm and database
1 parent 61393cd commit a6398d0

File tree

11 files changed

+812378
-98
lines changed

11 files changed

+812378
-98
lines changed

service/src/main/java/org/folg/places/service/StandardizeService.java

+5-7
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
package org.folg.places.service;
1818

19-
import org.folg.places.standardize.StandardizeResult;
19+
import org.folg.places.standardize.Place;
2020
import org.folg.places.standardize.Standardizer;
2121

2222
import javax.ws.rs.GET;
@@ -33,18 +33,16 @@ public class StandardizeService {
3333
@GET
3434
@Produces({MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON})
3535
@Path("{text}")
36-
public StandardizeResult get(@PathParam("text") String text) {
37-
StandardizeResult result = Standardizer.getInstance().standardize(text);
38-
result.setName(text);
36+
public Place get(@PathParam("text") String text) {
37+
Place result = Standardizer.getInstance().standardize(text);
3938
return result;
4039
}
4140

4241
@GET
4342
@Produces({MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON})
4443
@Path("{text}/{defaultCountry}")
45-
public StandardizeResult get(@PathParam("text") String text, @PathParam("defaultCountry") String defaultCountry) {
46-
StandardizeResult result = Standardizer.getInstance().standardize(text, defaultCountry);
47-
result.setName(text);
44+
public Place get(@PathParam("text") String text, @PathParam("defaultCountry") String defaultCountry) {
45+
Place result = Standardizer.getInstance().standardize(text, defaultCountry);
4846
return result;
4947
}
5048
}

standardize/pom.xml

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
<descriptorRefs>
2121
<descriptorRef>jar-with-dependencies</descriptorRef>
2222
</descriptorRefs>
23+
<finalName>places-standardize</finalName>
2324
</configuration>
2425
<executions>
2526
<execution>

standardize/src/main/java/org/folg/places/standardize/Normalizer.java

+49-12
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ private Normalizer() {
4747
}
4848

4949
/**
50-
* Tokenize name by removing diacritics, lowercasing, splitting on delimiter, and removing non a-z characters
50+
* Tokenize name by removing diacritics, lowercasing, and splitting on non alphanumeric characters
5151
*
5252
* @param text string to tokenize
5353
* @return tokenized place levels
@@ -61,9 +61,12 @@ public NormalizeResults tokenize(String text) {
6161

6262
StringBuilder numBuf = new StringBuilder();
6363

64-
for (int j = text.length() - 1; j >= 0; j--) {
65-
char c = text.charAt(j);
64+
// find the last letter
65+
int lastPos = text.length()-1;
66+
while (lastPos >= 0) {
67+
char c = text.charAt(lastPos);
6668

69+
// not sure why we're keeping ending numbers?
6770
if (c >= '0' && c <= '9') {
6871
numBuf.append(c);
6972
} else if (c == ',') {
@@ -75,23 +78,18 @@ public NormalizeResults tokenize(String text) {
7578
//if we hit letters before we hit a comma then the numbers contains a word and we want to keep it
7679
break;
7780
}
78-
81+
lastPos--;
7982
}
8083

8184
List<String> levelWords = new ArrayList<String>();
8285
StringBuilder buf = new StringBuilder();
8386

84-
for (int i = 0; i < text.length(); i++) {
87+
// parse up to and including the last letter; anything after that is junk
88+
for (int i = 0; i <= lastPos; i++) {
8589
char c = text.charAt(i);
8690
String replacement;
8791

88-
if (c == ' ') {
89-
if (buf.length() > 0) {
90-
levelWords.add(buf.toString());
91-
buf.setLength(0);
92-
}
93-
}
94-
else if (c == ',') {
92+
if (c == ',') {
9593
if (buf.length() > 0) {
9694
levelWords.add(buf.toString());
9795
buf.setLength(0);
@@ -114,6 +112,13 @@ else if (c == ',') {
114112
logger.warning("Untokenized letter:" + c + " (" + (int) c + ") in " + text);
115113
}
116114
}
115+
// tokenize words on non-alphanumeric
116+
else {
117+
if (buf.length() > 0) {
118+
levelWords.add(buf.toString());
119+
buf.setLength(0);
120+
}
121+
}
117122
}
118123
if (buf.length() > 0) {
119124
levelWords.add(buf.toString());
@@ -127,4 +132,36 @@ else if (c == ',') {
127132

128133
return normalizeResults;
129134
}
135+
136+
/**
137+
* Remove diacritics, lowercase, and remove non alphanumeric characters
138+
*
139+
* @param text string to tokenize
140+
* @return tokenized place levels
141+
*/
142+
public String normalize(String text) {
143+
StringBuilder buf = new StringBuilder();
144+
145+
for (int i = 0; i < text.length(); i++) {
146+
char c = text.charAt(i);
147+
String replacement;
148+
149+
if ((replacement = characterReplacements.get(c)) != null) {
150+
buf.append(replacement.toLowerCase());
151+
} else if (c >= 'A' && c <= 'Z') {
152+
buf.append(Character.toLowerCase(c));
153+
} else if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
154+
buf.append(c);
155+
} else if (Character.isLetter(c)) {
156+
// ignore letters > U+0250; they're generally from scripts that don't map well to roman letters
157+
// ignore 186,170: superscript o and a used in spanish numbers: 1^a and 2^o
158+
// ignore 440,439: Ezh and reverse-Ezh; the only times they appear in the data is in what appears to be noise
159+
if (c < 592 && c != 186 && c != 170 && c != 439 && c != 440) {
160+
logger.warning("Untokenized letter:" + c + " (" + (int) c + ") in " + text);
161+
}
162+
}
163+
}
164+
165+
return buf.toString();
166+
}
130167
}

standardize/src/main/java/org/folg/places/standardize/Place.java

+115
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,124 @@
1616

1717
package org.folg.places.standardize;
1818

19+
import javax.xml.bind.annotation.XmlElement;
20+
import javax.xml.bind.annotation.XmlRootElement;
21+
1922
/**
2023
* User: dallan
2124
* Date: 1/10/12
2225
*/
26+
@XmlRootElement
2327
public class Place {
28+
private int id = 0;
29+
private String name = null;
30+
private String[] altNames = null;
31+
private String[] types = null;
32+
private int locatedInId = 0;
33+
private int[] alsoLocatedInIds = null;
34+
private int level = 0;
35+
private int country = 0;
36+
private double latitude = 0.0;
37+
private double longitude = 0.0;
38+
private Standardizer standardizer = null;
39+
40+
public int getId() {
41+
return id;
42+
}
43+
44+
public void setId(int id) {
45+
this.id = id;
46+
}
47+
48+
public String getName() {
49+
return name;
50+
}
51+
52+
public void setName(String name) {
53+
this.name = name;
54+
}
55+
56+
public String[] getAltNames() {
57+
return altNames;
58+
}
59+
60+
public void setAltNames(String[] altNames) {
61+
this.altNames = altNames;
62+
}
63+
64+
public String[] getTypes() {
65+
return types;
66+
}
67+
68+
public void setTypes(String[] types) {
69+
this.types = types;
70+
}
71+
72+
public int getLocatedInId() {
73+
return locatedInId;
74+
}
75+
76+
public void setLocatedInId(int locatedInId) {
77+
this.locatedInId = locatedInId;
78+
}
79+
80+
public int[] getAlsoLocatedInIds() {
81+
return alsoLocatedInIds;
82+
}
83+
84+
public void setAlsoLocatedInIds(int[] alsoLocatedInIds) {
85+
this.alsoLocatedInIds = alsoLocatedInIds;
86+
}
87+
88+
public int getLevel() {
89+
return level;
90+
}
91+
92+
public void setLevel(int level) {
93+
this.level = level;
94+
}
95+
96+
public int getCountry() {
97+
return country;
98+
}
99+
100+
public void setCountry(int country) {
101+
this.country = country;
102+
}
103+
104+
public double getLatitude() {
105+
return latitude;
106+
}
107+
108+
public void setLatitude(double latitude) {
109+
this.latitude = latitude;
110+
}
111+
112+
public double getLongitude() {
113+
return longitude;
114+
}
115+
116+
public void setLongitude(double longitude) {
117+
this.longitude = longitude;
118+
}
119+
120+
void setStandardizer(Standardizer standardizer) {
121+
this.standardizer = standardizer;
122+
}
123+
124+
@XmlElement
125+
public String getFullName() {
126+
StringBuilder buf = new StringBuilder();
127+
if (standardizer != null) {
128+
buf.append(getName());
129+
int locatedIn = getLocatedInId();
130+
while (locatedIn > 0) {
131+
Place p = standardizer.lookupPlace(locatedIn);
132+
buf.append(", ");
133+
buf.append(p.getName());
134+
locatedIn = p.getLocatedInId();
135+
}
136+
}
137+
return buf.toString();
138+
}
24139
}

standardize/src/main/java/org/folg/places/standardize/StandardizeResult.java

-59
This file was deleted.

0 commit comments

Comments
 (0)