-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from breskos/feature/autoencoders
Feature/autoencoders
- Loading branch information
Showing
39 changed files
with
1,056 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package analysis | ||
|
||
import ( | ||
"math" | ||
"sort" | ||
) | ||
|
||
// Spearman returns the rank correlation coefficient between data1 and data2, and the associated p-value | ||
func Spearman(data1, data2 []float64) (rs float64, p float64) { | ||
n := len(data1) | ||
wksp1, wksp2 := make([]float64, n), make([]float64, n) | ||
copy(wksp1, data1) | ||
copy(wksp2, data2) | ||
|
||
sort.Sort(sorter{wksp1, wksp2}) | ||
sf := overwrite(wksp1) | ||
sort.Sort(sorter{wksp2, wksp1}) | ||
sg := overwrite(wksp2) | ||
d := 0.0 | ||
for j := 0; j < n; j++ { | ||
sq := wksp1[j] - wksp2[j] | ||
d += (sq * sq) | ||
} | ||
|
||
en := float64(n) | ||
en3n := en*en*en - en | ||
|
||
fac := (1.0 - sf/en3n) * (1.0 - sg/en3n) | ||
rs = (1.0 - (6.0/en3n)*(d+(sf+sg)/12.0)) / math.Sqrt(fac) | ||
|
||
if fac = (rs + 1.0) * (1.0 - rs); fac > 0 { | ||
t := rs * math.Sqrt((en-2.0)/fac) | ||
df := en - 2.0 | ||
p = betaIncomplete(df/(df+t*t), 0.5*df, 0.5) | ||
} | ||
|
||
return rs, p | ||
} | ||
|
||
// overwrite replaces the values of w in place by their 1-based ranks and
// returns the sum of t*t*t - t over every run of t adjacent equal values.
//
// Only neighbouring elements are compared, so w is expected to be sorted.
// All members of a run receive the mean of the ranks the run occupies.
func overwrite(w []float64) float64 {
	var tieSum float64
	total := len(w)
	for first := 0; first < total; {
		// Find the end (exclusive) of the run of values equal to w[first].
		next := first + 1
		for next < total && w[next] == w[first] {
			next++
		}

		// The run occupies ranks first+1 .. next; hand out their mean.
		shared := 0.5 * float64(first+1+next)
		for k := first; k < next; k++ {
			w[k] = shared
		}

		// A run of length one contributes 1 - 1 = 0.
		length := float64(next - first)
		tieSum += length*length*length - length

		first = next
	}
	return tieSum
}
|
||
// betaIncomplete | ||
func betaIncomplete(x, a, b float64) float64 { | ||
if x < 0 || x > 1 { | ||
return math.NaN() | ||
} | ||
bt := 0.0 | ||
if 0 < x && x < 1 { | ||
bt = math.Exp(lgamma(a+b) - lgamma(a) - lgamma(b) + | ||
a*math.Log(x) + b*math.Log(1-x)) | ||
} | ||
if x < (a+1)/(a+b+2) { | ||
return bt * betaContinuedFractionComponent(x, a, b) / a | ||
} else { | ||
return 1 - bt*betaContinuedFractionComponent(1-x, b, a)/b | ||
} | ||
} | ||
|
||
// betaContinuedFractionComponent evaluates the continued fraction used by
// betaIncomplete for the arguments x, a and b.
//
// Each round applies an even and an odd step of the recurrence and stops as
// soon as the factor contributed by the odd step is within epsilon of 1. It
// panics when that has not happened after maxIterations rounds.
func betaContinuedFractionComponent(x, a, b float64) float64 {
	const (
		maxIterations = 200
		epsilon       = 3e-14
	)

	// raiseZero swaps a vanishing term for the smallest positive float64 so
	// that it can still be used as a divisor.
	raiseZero := func(v float64) float64 {
		if math.Abs(v) < math.SmallestNonzeroFloat64 {
			return math.SmallestNonzeroFloat64
		}
		return v
	}

	upper := 1.0
	lower := 1 / raiseZero(1-(a+b)*x/(a+1))

	// advance folds one partial numerator into both running terms and
	// returns the factor by which the result changes.
	advance := func(coef float64) float64 {
		lower = 1 / raiseZero(1+coef*lower)
		upper = raiseZero(1 + coef/upper)
		return lower * upper
	}

	result := lower
	for m := 1; m <= maxIterations; m++ {
		mf := float64(m)

		// Even step.
		even := mf * (b - mf) * x / ((a + 2*mf - 1) * (a + 2*mf))
		factor := advance(even)
		result *= factor

		// Odd step.
		odd := -(a + mf) * (a + b + mf) * x / ((a + 2*mf) * (a + 2*mf + 1))
		factor = advance(odd)
		result *= factor

		if math.Abs(factor-1) < epsilon {
			return result
		}
	}
	panic("betainc: a or b too big; failed to converge")
}
|
||
// lgamma returns the natural logarithm of the absolute value of Gamma(x).
// The sign that math.Lgamma reports alongside it is discarded.
func lgamma(x float64) (logAbs float64) {
	logAbs, _ = math.Lgamma(x)
	return logAbs
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package analysis | ||
|
||
// sorter orders the values in x ascending and applies every swap to y as
// well, so that elements sharing an index stay paired. Its methods have the
// shape required by sort.Interface.
type sorter struct {
	x []float64
	y []float64
}

// Len reports the number of elements in x.
func (p sorter) Len() int {
	return len(p.x)
}

// Less reports whether x[i] is smaller than x[j]; y plays no part in the order.
func (p sorter) Less(i, j int) bool {
	return p.x[i] < p.x[j]
}

// Swap exchanges positions i and j in both slices.
func (p sorter) Swap(i, j int) {
	xi, yi := p.x[i], p.y[i]
	p.x[i], p.y[i] = p.x[j], p.y[j]
	p.x[j], p.y[j] = xi, yi
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Analysis | ||
|
||
*State: In concept* | ||
|
||
This module acts as a helper for the encoders and for the evaluation of a data set. | ||
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# Encoders (experimental) | ||
|
||
## Overview | ||
|
||
**Attention:** Encoders are currently not able to serialize | ||
|
||
Using encoders means that the incoming data is automatically put into a data vector (processable by the neural net). | ||
Encoding your data into processable feature vectors is very important, because raw strings are not suitable as feature vectors. | ||
This module helps to find a representation of your data set without your intervention. | ||
The encoders can nevertheless be controlled through the parameters in EncoderConfig. | ||
|
||
## EncoderConfig | ||
|
||
The encoder performs decisions during its runtime. | ||
For that reason a DefaultConfig is applied. | ||
It is possible to get the DefaultConfig, overwrite specific parameters and apply it again to the encoder. | ||
|
||
```go | ||
e := encoders.NewEncoder("test encoder") | ||
cfg := encoders.DefaultConfig() | ||
cfg.DictionaryMaxEntries = 300 | ||
e.Config = cfg | ||
``` | ||
You can find all possible options for editing the encoder config in encoders/config.go. | ||
|
||
## Example | ||
Below you can find an example for the encoder. | ||
|
||
```go | ||
// generating the encoder, the encoder can hold different input types and dimensions | ||
e := encoders.NewEncoder("test encoder") | ||
cfg := encoders.DefaultConfig() | ||
cfg.DictionaryMaxEntries = 300 | ||
e.Config = cfg | ||
inputName := "language-classification" | ||
set := encoders.NewInput(inputName, encoders.String) | ||
for _, v := range data { | ||
// add your strings here | ||
set.AddString(someStringSample) | ||
} | ||
// scan takes the set and decides (if it is: encoders.Automatic) which encoding to apply | ||
e.Scan(inputName, set, encoders.Automatic) | ||
// transform brings the input into the chosen encoding | ||
e.Transform(inputName, set) | ||
// explain can be used to see what the encoder has done | ||
e.Explain() | ||
// using Encode() and a Unified (can be a string or a float slice) you get the corresponding vector | ||
vector := e.Encode(inputName, encoders.Unified{String: "Hello whats up with you?", Type: encoders.String}) | ||
``` | ||
|
||
|
||
## Workflow | ||
This is the workflow. An Encoder can contain different models for encoding. In the workflow below, (namespace) means that you perform an action on a namespace within the encoder. For example, if you have a mixed input vector with strings and floats, you can put all floats together in one namespace and do the same with the strings. | ||
|
||
1. Collect data - The encoder needs the samples from the test or a similar set of data points to optimally fit and decide. | ||
2. Create Encoder - create an encoder with the config (the Encoder itself can encode different inputs) | ||
3. Scanner - (namespace) decides which Encoder to select if you choose encoders.Automatic, if not, the given Encoder will be applied | ||
4. Transform - (namespace) After scanning the set and deciding, the data is transformed into the new vector space | ||
5. After transformation is done, you are ready to go with your new vector representation | ||
6. Using - use the Encode() (namespace) method of the encoder to encode your input. | ||
|
||
## Encoders | ||
If you have no specific idea which encoder to use you can also run using encoders.Automatic. | ||
Using this the encoder will figure out by itself which encoding is applicable. | ||
|
||
The encoders work for different data types: | ||
|
||
1. N-Grams (strings), encoders.StringNGrams | ||
2. Split Dictionary (strings), encoders.StringSplitDictionary | ||
3. Dictionary (strings), encoders.StringDictionary | ||
4. FloatExact (numbers), encoders.FloatExact | ||
5. FloatReducer (numbers), encoders.FloatReducer | ||
6. Topic Modelling coming soon (strings) - not implemented yet | ||
|
||
|
||
## Representation (experimental) | ||
|
||
Out of the encoder activity the network generates a representation of the input space. | ||
This representation can be persisted and loaded to continue working on the network. | ||
This representation looks like this: | ||
|
||
1. Number of feature vectors | ||
2. Mapping of value to neuron values |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Recos | ||
|
||
*State: In concept* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Textos Extractor | ||
|
||
*State: In concept* | ||
|
||
|
||
This extractor can be used to extract strings and topics. | ||
In this document the nature of Textos is described. | ||
|
||
## Analysis | ||
- Occurrence of tokens | ||
|
||
## Layers | ||
|
||
### Structural learner | ||
The structural learner uses the data from the corpus to decide which part of the text is structural and which one is topic related. | ||
|
||
|
||
### Topic Modelling |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package encoders | ||
|
||
type EncoderConfig struct { | ||
DelimiterToken string | ||
DimToSamplesRatio float64 | ||
// Decision heuristics | ||
FloatReducerThreshold int | ||
TopicModelMinDelimiters int | ||
NGramsMaxTokens int | ||
DictionaryMaxEntries int | ||
DictionaryMaxDelimiters int | ||
SplitDictionaryMaxEntries int | ||
// Application settings | ||
FloatReducerSpearman float64 | ||
FloatReducerSkewness float64 | ||
FloatReducerZeroValues bool | ||
NGramMaxGrams int | ||
NGramMaxCapacity int | ||
NGramCropRatio float64 | ||
DefaultStringEncoder EncoderType | ||
} | ||
|
||
func DefaultConfig() *EncoderConfig { | ||
return &EncoderConfig{ | ||
DelimiterToken: " ", | ||
DimToSamplesRatio: 0.8, | ||
FloatReducerThreshold: 40, | ||
TopicModelMinDelimiters: 5, | ||
NGramsMaxTokens: 20, | ||
DictionaryMaxEntries: 50, | ||
DictionaryMaxDelimiters: 5, | ||
SplitDictionaryMaxEntries: 100, | ||
FloatReducerSpearman: 0.90, | ||
FloatReducerSkewness: 0.90, | ||
FloatReducerZeroValues: true, | ||
NGramMaxGrams: 3, | ||
NGramMaxCapacity: 100, | ||
NGramCropRatio: 0.05, | ||
DefaultStringEncoder: StringNGrams, | ||
} | ||
} |
Oops, something went wrong.