Skip to content

Commit

Permalink
Merge pull request #14 from breskos/feature/autoencoders
Browse files Browse the repository at this point in the history
Feature/autoencoders
  • Loading branch information
breskos authored Jun 6, 2021
2 parents fe30ed6 + 2c3177f commit 64bac5d
Show file tree
Hide file tree
Showing 39 changed files with 1,056 additions and 61 deletions.
20 changes: 18 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,26 @@
# Change Log
All notable changes to this project will be documented in this file.

## [0.2.5] 2021-06-06

With this version we introduce encoders (automatic encoders) to gopher-learn.
You now can reduce large float slice inputs or encode your string input right away.

### Added
- Encoders for float slices and string input.
- With encoders large float input can be reduced using Spearman.
- Also with encoders, strings can be encoded as n-grams or via a dictionary (topic modelling to come soon)

### Changed
- Relocated the neural net from neural package into an own package called net

### Fixed
- Nothing here


## [0.2] - 2021-05-09

Here we write upgrading notes for brands. It's a team effort to make them as
straightforward as possible.
Introducing online learning.

### Added
- Config for online learner to control learning behavior - easily inject your own config
Expand Down
116 changes: 116 additions & 0 deletions analysis/correlation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package analysis

import (
"math"
"sort"
)

// Spearman returns the rank correlation coefficient rs between data1 and
// data2 and the associated p-value p (Numerical Recipes' "spear").
//
// Both inputs are rank-transformed (ties receive their midrank) and rs is
// computed from the sum of squared rank differences with the standard tie
// correction. The p-value is derived from Student's t with n-2 degrees of
// freedom via the regularized incomplete beta function; it stays 0 when rs
// is exactly +/-1, where the t statistic degenerates.
//
// Slices of unequal length, or with fewer than two points, carry no
// meaningful correlation: NaN is returned for both results in that case.
func Spearman(data1, data2 []float64) (rs float64, p float64) {
	n := len(data1)
	// Guard degenerate input: the formula divides by n^3-n, and a length
	// mismatch would previously correlate against silent zero padding.
	if n < 2 || len(data2) != n {
		return math.NaN(), math.NaN()
	}

	// Work on copies so the callers' slices are not reordered.
	wksp1, wksp2 := make([]float64, n), make([]float64, n)
	copy(wksp1, data1)
	copy(wksp2, data2)

	// Sort both slices by data1 and replace wksp1 with ranks (sf is the
	// tie correction), then sort by data2 and rank wksp2 likewise (sg).
	sort.Sort(sorter{wksp1, wksp2})
	sf := overwrite(wksp1)
	sort.Sort(sorter{wksp2, wksp1})
	sg := overwrite(wksp2)

	// d accumulates the sum of squared rank differences.
	d := 0.0
	for j := 0; j < n; j++ {
		sq := wksp1[j] - wksp2[j]
		d += (sq * sq)
	}

	en := float64(n)
	en3n := en*en*en - en

	// Tie-corrected Spearman formula.
	fac := (1.0 - sf/en3n) * (1.0 - sg/en3n)
	rs = (1.0 - (6.0/en3n)*(d+(sf+sg)/12.0)) / math.Sqrt(fac)

	// Significance of rs via the t-distribution; fac <= 0 means |rs| = 1.
	if fac = (rs + 1.0) * (1.0 - rs); fac > 0 {
		t := rs * math.Sqrt((en-2.0)/fac)
		df := en - 2.0
		p = betaIncomplete(df/(df+t*t), 0.5*df, 0.5)
	}

	return rs, p
}

// overwrite replaces the ascending-sorted values in w with their 1-based
// ranks, assigning each run of equal values its midrank. It returns the
// accumulated tie correction sum(t^3 - t) over all tie groups, as needed by
// the Spearman formula (Numerical Recipes' "crank").
func overwrite(w []float64) float64 {
	n := len(w)
	var ties float64
	for i := 0; i < n; {
		// Find the half-open run [i, k) of values equal to w[i]; the
		// comparison only looks at positions not yet overwritten.
		k := i + 1
		for k < n && w[k] == w[i] {
			k++
		}
		if run := k - i; run == 1 {
			// Untied value: its rank is simply its 1-based position.
			w[i] = float64(i + 1)
		} else {
			// Tie group: every member gets the mean of ranks i+1..k.
			mid := 0.5 * float64(i+k+1)
			for j := i; j < k; j++ {
				w[j] = mid
			}
			t := float64(run)
			ties += t*t*t - t
		}
		i = k
	}
	return ties
}

// betaIncomplete computes the regularized incomplete beta function I_x(a, b)
// for x in [0, 1]; any x outside that interval yields NaN.
func betaIncomplete(x, a, b float64) float64 {
	if x < 0 || x > 1 {
		return math.NaN()
	}

	// Prefactor x^a (1-x)^b / B(a,b), evaluated in log space; it is zero
	// at the endpoints, where the log terms would otherwise blow up.
	bt := 0.0
	if x > 0 && x < 1 {
		bt = math.Exp(lgamma(a+b) - lgamma(a) - lgamma(b) +
			a*math.Log(x) + b*math.Log(1-x))
	}

	// For large x the continued fraction converges slowly, so apply the
	// symmetry I_x(a,b) = 1 - I_{1-x}(b,a) there.
	if x >= (a+1)/(a+b+2) {
		return 1 - bt*betaContinuedFractionComponent(1-x, b, a)/b
	}
	return bt * betaContinuedFractionComponent(x, a, b) / a
}

// betaContinuedFractionComponent evaluates the continued-fraction expansion
// of the regularized incomplete beta function using the modified Lentz
// method, iterating until the per-step multiplicative factor converges to 1
// within epsilon. It panics if convergence is not reached in maxIterations.
func betaContinuedFractionComponent(x, a, b float64) float64 {
	const maxIterations = 200
	const epsilon = 3e-14
	// raiseZero nudges values away from exact zero so the 1/d and 1/c
	// recurrences of Lentz's method never divide by zero.
	raiseZero := func(z float64) float64 {
		if math.Abs(z) < math.SmallestNonzeroFloat64 {
			return math.SmallestNonzeroFloat64
		}
		return z
	}
	c := 1.0
	d := 1 / raiseZero(1-(a+b)*x/(a+1))
	h := d
	for m := 1; m <= maxIterations; m++ {
		mf := float64(m)
		// Even-numbered term of the continued fraction.
		numer := mf * (b - mf) * x / ((a + 2*mf - 1) * (a + 2*mf))
		d = 1 / raiseZero(1+numer*d)
		c = raiseZero(1 + numer/c)
		h *= d * c
		// Odd-numbered term.
		numer = -(a + mf) * (a + b + mf) * x / ((a + 2*mf) * (a + 2*mf + 1))
		d = 1 / raiseZero(1+numer*d)
		c = raiseZero(1 + numer/c)
		hfac := d * c
		h *= hfac

		// Converged once the latest correction factor is ~1.
		if math.Abs(hfac-1) < epsilon {
			return h
		}
	}
	panic("betainc: a or b too big; failed to converge")
}

func lgamma(x float64) float64 {
y, _ := math.Lgamma(x)
return y
}
13 changes: 13 additions & 0 deletions analysis/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package analysis

type sorter struct {
x []float64
y []float64
}

func (s sorter) Len() int { return len(s.x) }
func (s sorter) Less(i, j int) bool { return s.x[i] < s.x[j] }
func (s sorter) Swap(i, j int) {
s.x[i], s.x[j] = s.x[j], s.x[i]
s.y[i], s.y[j] = s.y[j], s.y[i]
}
6 changes: 6 additions & 0 deletions docs/analysis.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Analysis

*State: In concept*

This module acts as a helper for the encoders and for the evaluation of a data set.

25 changes: 0 additions & 25 deletions docs/encoder.md

This file was deleted.

83 changes: 83 additions & 0 deletions docs/encoders.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Encoders (experimental)

## Overview

**Attention:** Encoders are currently not able to serialize

Encoders automatically put incoming data into a data vector that the neural net can process.
Encoding of your data into processable feature vectors is very important, because strings are initially not suitable as feature vector.
This module helps to find a representation of your data set without your intervention.
The encoders can, however, be controlled via the config parameters in EncoderConfig.

## EncoderConfig

The encoder performs decisions during its runtime.
For that reason a DefaultConfig is applied.
It is possible to get the DefaultConfig, overwrite specific parameters and apply it again to the encoder.

```go
e := encoders.NewEncoder("test encoder")
cfg := encoders.DefaultConfig()
cfg.DictionaryMaxEntries = 300
e.Config = cfg
```
You can find all possible options for editing the encoder config in encoders/config.go.

## Example
Below you can find an example for the encoder.

```go
// generating the encoder, the encoder can hold different input types and dimensions
e := encoders.NewEncoder("test encoder")
cfg := encoders.DefaultConfig()
cfg.DictionaryMaxEntries = 300
e.Config = cfg
inputName := "language-classification"
set := encoders.NewInput(inputName, encoders.String)
for _, v := range data {
// add your strings here
set.AddString(someStringSample)
}
// scan takes the set and decides (if it is: encoders.Automatic) which encoding to apply
e.Scan(inputName, set, encoders.Automatic)
// transform brings the input into the chosen encoding
e.Transform(inputName, set)
// explain can be used to see what the encoder has done
e.Explain()
// using encode() and an Unified (can be string or float slice) you get the corresponding vector
vector := e.Encode(inputName, encoders.Unified{String: "Hello whats up with you?", Type: encoders.String})
```


## Workflow
This is the workflow. An Encoder can contain different models for encoding. In the workflow below, (namespace) means that you perform an action on a namespace within the encoder. For example, if you have a mixed input vector with strings and floats, you can put all the floats together in one namespace as well as the strings.

1. Collect data - The encoder needs the samples from the test set or a similar set of data points to optimally fit and decide.
2. Create encoder - create an encoder with the config (the Encoder itself can encode different inputs).
3. Scan - (namespace) decides which encoder to select if you choose encoders.Automatic; otherwise, the given encoder is applied.
4. Transform - (namespace) after scanning the set and deciding, the data is transformed into the new vector space.
5. After the transformation is done, you are ready to go with your new vector representation.
6. Encode - (namespace) use the Encode() method of the encoder to encode your input.

## Encoders
If you have no specific idea which encoder to use you can also run using encoders.Automatic.
Using this the encoder will figure out by itself which encoding is applicable.

The encoders work for different data types:

1. N-Grams (strings), encoders.StringNGrams
2. Splitted Dictionary (string), encoders.StringSplitDictionary
3. Dictionary (strings), encoders.StringDictionary
4. FloatExact (numbers), encoders.FloatExact
5. FloatReducer (numbers), encoders.FloatReducer
6. Topic Modelling coming soon (strings) - not implemented yet


## Representation (experimental)

Out of the encoder activity the network generates a representation of the input space.
This representation can be persisted and loaded to continue working on the network.
This representation looks like this:

1. Number of feature vectors
2. Mapping of value to neuron values
3 changes: 3 additions & 0 deletions docs/recos.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Recos

*State: In concept*
18 changes: 18 additions & 0 deletions docs/textos.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Textos Extractor

*State: In concept*


This extractor can be used to extract strings and topics.
In this document the nature of Textos is described.

## Analysis
- Occurrence of tokens

## Layers

### Structural learner
The structural learner uses the data from the corpus to decide which part of the text is structural and which one is topic-related.


### Topic Modelling
41 changes: 41 additions & 0 deletions encoders/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package encoders

// EncoderConfig holds all tunable parameters of an encoder. Obtain a
// baseline with DefaultConfig(), overwrite individual fields, and assign
// the result to the encoder's Config field.
type EncoderConfig struct {
	// DelimiterToken is the token used to split string input (default " ").
	DelimiterToken string
	// DimToSamplesRatio relates vector dimensionality to sample count;
	// presumably used by the scanner's decision heuristics — TODO confirm.
	DimToSamplesRatio float64
	// Decision heuristics
	FloatReducerThreshold int
	TopicModelMinDelimiters int
	NGramsMaxTokens int
	DictionaryMaxEntries int
	DictionaryMaxDelimiters int
	SplitDictionaryMaxEntries int
	// Application settings
	// FloatReducerSpearman is a Spearman-correlation cut-off used when
	// reducing float inputs (see analysis.Spearman).
	FloatReducerSpearman float64
	FloatReducerSkewness float64
	FloatReducerZeroValues bool
	NGramMaxGrams int
	NGramMaxCapacity int
	NGramCropRatio float64
	// DefaultStringEncoder is the encoder applied to string input when no
	// explicit choice is made.
	DefaultStringEncoder EncoderType
}

// DefaultConfig returns the encoder configuration used as a baseline;
// callers may overwrite individual fields before applying it.
func DefaultConfig() *EncoderConfig {
	cfg := new(EncoderConfig)
	cfg.DelimiterToken = " "
	cfg.DimToSamplesRatio = 0.8
	cfg.FloatReducerThreshold = 40
	cfg.TopicModelMinDelimiters = 5
	cfg.NGramsMaxTokens = 20
	cfg.DictionaryMaxEntries = 50
	cfg.DictionaryMaxDelimiters = 5
	cfg.SplitDictionaryMaxEntries = 100
	cfg.FloatReducerSpearman = 0.90
	cfg.FloatReducerSkewness = 0.90
	cfg.FloatReducerZeroValues = true
	cfg.NGramMaxGrams = 3
	cfg.NGramMaxCapacity = 100
	cfg.NGramCropRatio = 0.05
	cfg.DefaultStringEncoder = StringNGrams
	return cfg
}
Loading

0 comments on commit 64bac5d

Please sign in to comment.