Skip to content

Commit 00f50b5

Browse files
committed
semi-ngrams category generation
1 parent bda2e8e commit 00f50b5

File tree

1 file changed

+176
-42
lines changed

1 file changed

+176
-42
lines changed

src/main/java/com/dooji/craftsense/manager/CategoryGenerator.java

+176-42
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@
1717
import java.lang.reflect.Type;
1818
import java.nio.file.Files;
1919
import java.nio.file.Path;
20-
import java.util.*;
20+
import java.util.ArrayList;
21+
import java.util.Arrays;
22+
import java.util.HashMap;
23+
import java.util.List;
24+
import java.util.Map;
2125
import java.util.regex.Pattern;
2226
import java.util.stream.Collectors;
2327

@@ -26,77 +30,157 @@ public class CategoryGenerator {
2630
// Logger obtained from LoggerFactory for MOD_ID.
public static final Logger LOGGER = LoggerFactory.getLogger(MOD_ID);
2731
// Pretty-printing Gson instance; used in this file to read and write the categories JSON.
private static final Gson GSON = new GsonBuilder().setPrettyPrinting().create();
2832
// Relative path of the JSON file the categories are loaded from and saved to.
private static final Path CATEGORIES_PATH = Path.of("config/CraftSense/categories.json");
29-
// Pre-commit version (diff '-' line): matches a single underscore or whitespace character.
private static final Pattern SPLIT_PATTERN = Pattern.compile("[_\\s]");
33+
// Post-commit version (diff '+' line): matches a run of one or more underscores/whitespace characters.
private static final Pattern SPLIT_PATTERN = Pattern.compile("[_\\s]+");
34+
// An n-gram whose average normalized position across all items is below this value
// only scores half of its own position (see generateCategories).
private static final double POSITION_THRESHOLD = 0.5;
35+
// Upper-case words that formatCategoryName leaves unpluralized when they are the last word.
private static final List<String> UNCOUNTABLE = Arrays.asList("WOOL", "DIRT", "SAND", "WATER", "MILK", "LAVA", "FLESH", "ICE");
36+
// Score bonus for an n-gram whose formatted label matches a category name already present in the file.
private static final double DYNAMIC_BONUS = 0.1;
3037

3138
public static void generateCategories() {
32-
Map<String, List<String>> categorizedItems = loadExistingCategories();
33-
34-
Map<String, List<String>> specificCategories = new HashMap<>();
35-
specificCategories.put("TOOL", Arrays.asList("AXE", "PICKAXE", "SWORD", "SHOVEL"));
36-
specificCategories.put("ARMOR", Arrays.asList("HELMET", "CHESTPLATE", "LEGGINGS", "BOOTS"));
37-
specificCategories.put("SIGN", Collections.singletonList("SIGN"));
38-
specificCategories.put("STAIR", Collections.singletonList("STAIR"));
39-
specificCategories.put("DOOR", Collections.singletonList("DOOR"));
40-
specificCategories.put("FENCE", Arrays.asList("FENCE", "FENCE_GATE"));
41-
specificCategories.put("BUTTON", Collections.singletonList("BUTTON"));
42-
specificCategories.put("PRESSURE_PLATE", Collections.singletonList("PRESSURE_PLATE"));
43-
specificCategories.put("SLAB", Collections.singletonList("SLAB"));
44-
specificCategories.put("TRAPDOOR", Collections.singletonList("TRAPDOOR"));
45-
specificCategories.put("BOAT", Collections.singletonList("BOAT"));
39+
List<ItemData> items = new ArrayList<>();
4640

4741
for (Item item : Registries.ITEM) {
4842
Identifier itemId = Registries.ITEM.getId(item);
4943
String itemName = itemId.getPath().toUpperCase();
5044

51-
if (isItemCategorized(categorizedItems, itemName)) {
52-
continue;
45+
if (itemName.equals("AIR")) continue;
46+
47+
List<String> tokens;
48+
49+
if (itemName.startsWith("MUSIC_DISC") || itemName.startsWith("DISC_")) {
50+
tokens = new ArrayList<>();
51+
tokens.add("MUSIC_DISC");
52+
} else {
53+
tokens = Arrays.stream(SPLIT_PATTERN.split(itemName))
54+
.filter(s -> s.length() > 2)
55+
.collect(Collectors.toList());
56+
}
57+
58+
items.add(new ItemData(itemName, tokens));
59+
}
60+
61+
Map<String, GlobalStat> globalStats = new HashMap<>();
62+
for (ItemData item : items) {
63+
item.generateNGrams();
64+
65+
for (NGramCandidate ng : item.ngrams) {
66+
globalStats.computeIfAbsent(ng.phrase, k -> new GlobalStat()).add(ng.normalizedPosition);
5367
}
68+
}
5469

55-
List<String> keywords = Arrays.stream(SPLIT_PATTERN.split(itemName))
56-
.filter(word -> word.length() > 2)
57-
.collect(Collectors.toList());
70+
double maxCount = globalStats.values().stream().mapToDouble(gs -> gs.count).max().orElse(1);
5871

59-
boolean isInSpecificCategory = false;
72+
Map<String, String> itemCategoryMap = new HashMap<>();
73+
Map<String, List<String>> existingCategories = loadExistingCategories();
74+
List<String> knownCategories = existingCategories.keySet().stream().map(String::toUpperCase).collect(Collectors.toList());
6075

61-
for (Map.Entry<String, List<String>> entry : specificCategories.entrySet()) {
62-
String category = entry.getKey();
63-
List<String> categoryKeywords = entry.getValue();
76+
for (ItemData item : items) {
77+
NGramCandidate bestCandidate = null;
78+
double bestScore = -1;
6479

65-
if (keywords.stream().anyMatch(categoryKeywords::contains)) {
66-
categorizedItems.computeIfAbsent(category, k -> new ArrayList<>()).add(itemName);
67-
isInSpecificCategory = true;
80+
for (NGramCandidate ng : item.ngrams) {
81+
GlobalStat stat = globalStats.getOrDefault(ng.phrase, new GlobalStat());
82+
double globalAvg = stat.getAverage();
83+
double bonus = 0;
84+
String formattedCandidate = formatCategoryName(ng.phrase);
85+
86+
if (knownCategories.contains(formattedCandidate.toUpperCase())) {
87+
bonus += DYNAMIC_BONUS;
6888
}
69-
}
7089

71-
if (!isInSpecificCategory) {
72-
for (String keyword : keywords) {
73-
categorizedItems.computeIfAbsent(keyword, k -> new ArrayList<>()).add(itemName);
90+
double frequencyFactor = (stat.count / maxCount) * 0.2;
91+
double candidateScore = globalAvg >= POSITION_THRESHOLD ? ng.normalizedPosition + bonus + frequencyFactor : ng.normalizedPosition * 0.5;
92+
93+
if (candidateScore > bestScore) {
94+
bestScore = candidateScore;
95+
bestCandidate = ng;
7496
}
7597
}
98+
99+
if (bestCandidate == null && !item.ngrams.isEmpty()) {
100+
bestCandidate = item.ngrams.get(item.ngrams.size() - 1);
101+
}
102+
103+
if (bestCandidate != null) {
104+
itemCategoryMap.put(item.name, bestCandidate.phrase);
105+
}
106+
}
107+
108+
Map<String, List<String>> categorizedItems = loadExistingCategories();
109+
for (Map.Entry<String, String> entry : itemCategoryMap.entrySet()) {
110+
String itemName = entry.getKey();
111+
String label = formatCategoryName(entry.getValue());
112+
categorizedItems.computeIfAbsent(label, k -> new ArrayList<>()).add(itemName);
76113
}
77114

78115
saveCategoriesToFile(categorizedItems);
79116
}
80117

118+
private static String formatCategoryName(String candidate) {
119+
String[] words = candidate.toLowerCase().split("[_\\s]+");
120+
if (words.length == 0) return "";
121+
122+
for (int i = 0; i < words.length - 1; i++) {
123+
words[i] = capitalize(words[i]);
124+
}
125+
126+
String last = words[words.length - 1];
127+
if (!UNCOUNTABLE.contains(last.toUpperCase())) {
128+
last = pluralize(last);
129+
}
130+
131+
words[words.length - 1] = capitalize(last);
132+
return String.join(" ", words);
133+
}
134+
135+
private static String capitalize(String word) {
136+
if (word.isEmpty()) return word;
137+
return word.substring(0, 1).toUpperCase() + word.substring(1).toLowerCase();
138+
}
139+
140+
private static String pluralize(String word) {
141+
if (word.endsWith("s")) return word;
142+
if (word.endsWith("oo")) return word + "s";
143+
if (word.endsWith("ch") || word.endsWith("sh") || word.endsWith("x") || word.endsWith("z")) return word + "es";
144+
if (word.endsWith("o")) {
145+
char before = word.charAt(word.length() - 2);
146+
if (!isVowel(before)) return word + "es";
147+
return word + "s";
148+
}
149+
150+
if (word.endsWith("y") && word.length() > 1 && !isVowel(word.charAt(word.length() - 2))) return word.substring(0, word.length() - 1) + "ies";
151+
return word + "s";
152+
}
153+
154+
private static boolean isVowel(char c) {
155+
return "aeiou".indexOf(Character.toLowerCase(c)) != -1;
156+
}
157+
81158
private static Map<String, List<String>> loadExistingCategories() {
82-
if (Files.exists(CATEGORIES_PATH)) {
83-
try (FileReader reader = new FileReader(CATEGORIES_PATH.toFile())) {
84-
Type type = new TypeToken<Map<String, List<String>>>() {}.getType();
85-
return GSON.fromJson(reader, type);
86-
} catch (IOException e) {
87-
LOGGER.error("Failed to load existing categories", e);
159+
Map<String, List<String>> map = new HashMap<>();
160+
161+
try {
162+
Files.createDirectories(CATEGORIES_PATH.getParent());
163+
if (Files.exists(CATEGORIES_PATH)) {
164+
try (FileReader reader = new FileReader(CATEGORIES_PATH.toFile())) {
165+
Type type = new TypeToken<Map<String, List<String>>>() {}.getType();
166+
map = GSON.fromJson(reader, type);
167+
168+
if (map == null) {
169+
map = new HashMap<>();
170+
}
171+
}
88172
}
173+
} catch (IOException e) {
174+
LOGGER.error("Failed to load existing categories", e);
89175
}
90-
return new HashMap<>();
91-
}
92176

93-
private static boolean isItemCategorized(Map<String, List<String>> categorizedItems, String itemName) {
94-
return categorizedItems.values().stream().anyMatch(list -> list.contains(itemName));
177+
return map;
95178
}
96179

97180
private static void saveCategoriesToFile(Map<String, List<String>> categorizedItems) {
98181
try {
99182
Files.createDirectories(CATEGORIES_PATH.getParent());
183+
100184
try (FileWriter writer = new FileWriter(CATEGORIES_PATH.toFile())) {
101185
GSON.toJson(categorizedItems, writer);
102186
LOGGER.info("Categories successfully saved to {}", CATEGORIES_PATH);
@@ -105,4 +189,54 @@ private static void saveCategoriesToFile(Map<String, List<String>> categorizedIt
105189
LOGGER.error("Failed to save categories to file", e);
106190
}
107191
}
192+
193+
private static class ItemData {
194+
String name;
195+
List<String> tokens;
196+
List<NGramCandidate> ngrams;
197+
198+
ItemData(String name, List<String> tokens) {
199+
this.name = name;
200+
this.tokens = tokens;
201+
this.ngrams = new ArrayList<>();
202+
}
203+
204+
void generateNGrams() {
205+
int len = tokens.size();
206+
for (int n = 1; n <= 3; n++) {
207+
if (n > len) break;
208+
209+
for (int i = 0; i <= len - n; i++) {
210+
double norm = (i + ((n + 1) / 2.0)) / (double) len;
211+
String phrase = String.join("_", tokens.subList(i, i + n));
212+
213+
ngrams.add(new NGramCandidate(phrase, norm));
214+
}
215+
}
216+
}
217+
}
218+
219+
private static class NGramCandidate {
220+
String phrase;
221+
double normalizedPosition;
222+
223+
NGramCandidate(String phrase, double normalizedPosition) {
224+
this.phrase = phrase;
225+
this.normalizedPosition = normalizedPosition;
226+
}
227+
}
228+
229+
private static class GlobalStat {
230+
double sum = 0;
231+
int count = 0;
232+
233+
void add(double value) {
234+
sum += value;
235+
count++;
236+
}
237+
238+
double getAverage() {
239+
return count == 0 ? 0 : sum / count;
240+
}
241+
}
108242
}

0 commit comments

Comments
 (0)