-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCorrelator.java
More file actions
106 lines (97 loc) · 3.84 KB
/
Correlator.java
File metadata and controls
106 lines (97 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package shake_n_bacon;
import java.io.IOException;
import providedCode.*;
/**
* @author Scott Kinder
* @UWNetID kinders
* @studentID 1235149
* @email kinders@uw.edu
*
* Correlates two text files by comparing how often each word occurs in them.
* The result is a single number indicating how similar (or different) the
* two files are: for every word considered, the difference between its
* frequency in each file is squared, and the squares are summed. This makes
* the result analogous to the squared Euclidean distance between two
* frequency vectors (no square root is taken). Not every word contributes:
* only words that occur in both files, with a frequency between 0.01% and
* 1% in each of them, are taken into account.
*
*/
public class Correlator {
    /** Lowest per-file frequency (0.01%) a word may have and still be compared. */
    private static final double MIN_FREQUENCY = 0.0001;

    /** Highest per-file frequency (1%) a word may have and still be compared. */
    private static final double MAX_FREQUENCY = 0.01;

    /** Printed to standard error when the command line is not understood. */
    private static final String USAGE =
            "Usage: Correlator [-s | -o] <filename1> <filename2>";

    /**
     * Counts the words of two text files and prints the sum of squared
     * frequency differences between them to standard output.
     *
     * @param args exactly three arguments: a flag selecting the counter
     *             implementation ({@code -s} for {@code HashTable_SC},
     *             {@code -o} for {@code HashTable_OA}, case-insensitive),
     *             followed by the names of the two files to compare
     */
    public static void main(String[] args) {
        if (args.length != 3) {
            // Still a plain return (as before), but no longer a silent one.
            System.err.println(USAGE);
            return;
        }
        DataCounter counter1 = newCounter(args[0]);
        DataCounter counter2 = newCounter(args[0]);
        if (counter1 == null || counter2 == null) {
            System.err.println(USAGE);
            return;
        }
        countWords(args[1], counter1);
        countWords(args[2], counter2);
        System.out.println(calculateFreq(counter1, counter2));
    }

    /**
     * Creates an empty counter of the implementation selected by a flag.
     *
     * @param flag the command-line flag, compared case-insensitively
     * @return a new {@code HashTable_SC} for {@code -s}, a new
     *         {@code HashTable_OA} for {@code -o}, or {@code null} when the
     *         flag is neither
     */
    private static DataCounter newCounter(String flag) {
        if ("-s".equalsIgnoreCase(flag)) {
            return new HashTable_SC(new StringComparator(), new StringHasher());
        }
        if ("-o".equalsIgnoreCase(flag)) {
            return new HashTable_OA(new StringComparator(), new StringHasher());
        }
        return null;
    }

    /**
     * Sums the squared differences between the frequencies of the words held
     * by two counters.
     *
     * Every word produced by the first counter's iterator is looked up in both
     * counters. Words rejected by {@link #checkWorthy} are skipped; for the
     * rest, the difference between the two frequencies is squared and added to
     * the running total.
     *
     * NOTE(review): a frequency is computed here as getCount(word) / getSize().
     * Confirm that getSize() reports the total number of words counted rather
     * than the number of distinct words; otherwise these are not true
     * frequencies.
     *
     * @param counter1 counter holding the words of the first file and their counts
     * @param counter2 counter holding the words of the second file and their counts
     * @return the accumulated sum of squared frequency differences
     */
    private static double calculateFreq(DataCounter counter1, DataCounter counter2) {
        // The counters are only read below, so their sizes are fetched once.
        final double size1 = counter1.getSize();
        final double size2 = counter2.getSize();
        double sumOfSquares = 0.0;
        SimpleIterator itr = counter1.getIterator();
        while (itr.hasNext()) {
            DataCount next = itr.next();
            // Look each count up once and reuse it for the filter and the frequency.
            double count1 = counter1.getCount(next.data);
            double count2 = counter2.getCount(next.data);
            if (checkWorthy(count1, count2, size1, size2)) {
                double difference = count1 / size1 - count2 / size2;
                sumOfSquares += difference * difference;
            }
        }
        return sumOfSquares;
    }

    /**
     * Decides whether a word should take part in the comparison.
     *
     * A word qualifies only when its frequency in each file lies between
     * 0.01% and 1% (both bounds inclusive). Words used more rarely than 0.01%
     * of the time, or more often than 1% of the time, in either file are
     * excluded because they would skew the result. A word that is missing
     * from one file has a frequency of zero there and is therefore excluded.
     *
     * @param first      count of the word in the first file
     * @param second     count of the word in the second file
     * @param firstSize  size of the first file, used as the frequency denominator
     * @param secondSize size of the second file, used as the frequency denominator
     * @return {@code true} when both frequencies are within the accepted range
     */
    private static boolean checkWorthy(double first, double second, double firstSize,
            double secondSize) {
        // A zero size yields NaN or infinity here, which fails the range test.
        return isRelevant(first / firstSize) && isRelevant(second / secondSize);
    }

    /**
     * Tests a single frequency against the accepted range.
     *
     * @param frequency the frequency to test
     * @return {@code true} when the frequency is at least {@code MIN_FREQUENCY}
     *         and at most {@code MAX_FREQUENCY}
     */
    private static boolean isRelevant(double frequency) {
        return frequency >= MIN_FREQUENCY && frequency <= MAX_FREQUENCY;
    }

    /**
     * Reads a file word by word and records every word in a counter.
     *
     * If the file cannot be processed, an error message is written to standard
     * error and the program exits with status 1.
     *
     * NOTE(review): the reader is never explicitly closed. If FileWordReader
     * offers a way to release the underlying file, it should be used here.
     *
     * @param file    name of the file to read
     * @param counter counter that receives one increment per word read
     */
    private static void countWords(String file, DataCounter counter) {
        try {
            FileWordReader reader = new FileWordReader(file);
            // nextWord() returning null marks the end of the input.
            for (String word = reader.nextWord(); word != null; word = reader.nextWord()) {
                counter.incCount(word);
            }
        } catch (IOException e) {
            System.err.println("Error processing " + file + " " + e);
            System.exit(1);
        }
    }
}