Skip to content

Commit

Permalink
Merge pull request #14 from breskos/feature/autoencoders
Browse files Browse the repository at this point in the history
Feature/autoencoders
  • Loading branch information
breskos authored Jun 6, 2021
2 parents fe30ed6 + 2c3177f commit 64bac5d
Show file tree
Hide file tree
Showing 39 changed files with 1,056 additions and 61 deletions.
20 changes: 18 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,26 @@
# Change Log
All notable changes to this project will be documented in this file.

## [0.2.5] 2021-06-06

With this version we introduce encoders (automatic encoders) to gopher-learn.
You now can reduce large float slice inputs or encode your string input right away.

### Added
- Encoders for float slices and string input.
- With encoders large float input can be reduced using Spearman.
- Also with encoders, strings can be encoded as n-grams or via a dictionary (topic modelling to come soon)

### Changed
- Relocated the neural net from neural package into an own package called net

### Fixed
- Nothing here


## [0.2] - 2021-05-09

Here we write upgrading notes for brands. It's a team effort to make them as
straightforward as possible.
Introducing online learning.

### Added
- Config for online learner to control learning behavior - easily inject your own config
Expand Down
116 changes: 116 additions & 0 deletions analysis/correlation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package analysis

import (
"math"
"sort"
)

// Spearman returns the rank correlation coefficient rs between data1 and
// data2 and the associated p-value p (Numerical Recipes' "spear").
//
// Both inputs are rank-transformed (ties receive their midrank) and rs is
// computed from the sum of squared rank differences with the standard tie
// correction. The p-value is derived from Student's t with n-2 degrees of
// freedom via the regularized incomplete beta function; it stays 0 when rs
// is exactly +/-1, where the t statistic degenerates.
//
// Slices of unequal length, or with fewer than two points, carry no
// meaningful correlation: NaN is returned for both results in that case.
func Spearman(data1, data2 []float64) (rs float64, p float64) {
	n := len(data1)
	// Guard degenerate input: the formula divides by n^3-n, and a length
	// mismatch would previously correlate against silent zero padding.
	if n < 2 || len(data2) != n {
		return math.NaN(), math.NaN()
	}

	// Work on copies so the callers' slices are not reordered.
	wksp1, wksp2 := make([]float64, n), make([]float64, n)
	copy(wksp1, data1)
	copy(wksp2, data2)

	// Sort both slices by data1 and replace wksp1 with ranks (sf is the
	// tie correction), then sort by data2 and rank wksp2 likewise (sg).
	sort.Sort(sorter{wksp1, wksp2})
	sf := overwrite(wksp1)
	sort.Sort(sorter{wksp2, wksp1})
	sg := overwrite(wksp2)

	// d accumulates the sum of squared rank differences.
	d := 0.0
	for j := 0; j < n; j++ {
		sq := wksp1[j] - wksp2[j]
		d += (sq * sq)
	}

	en := float64(n)
	en3n := en*en*en - en

	// Tie-corrected Spearman formula.
	fac := (1.0 - sf/en3n) * (1.0 - sg/en3n)
	rs = (1.0 - (6.0/en3n)*(d+(sf+sg)/12.0)) / math.Sqrt(fac)

	// Significance of rs via the t-distribution; fac <= 0 means |rs| = 1.
	if fac = (rs + 1.0) * (1.0 - rs); fac > 0 {
		t := rs * math.Sqrt((en-2.0)/fac)
		df := en - 2.0
		p = betaIncomplete(df/(df+t*t), 0.5*df, 0.5)
	}

	return rs, p
}

// overwrite replaces the ascending-sorted values in w with their 1-based
// ranks, assigning each run of equal values its midrank. It returns the
// accumulated tie correction sum(t^3 - t) over all tie groups, as needed by
// the Spearman formula (Numerical Recipes' "crank").
func overwrite(w []float64) float64 {
	n := len(w)
	var ties float64
	for i := 0; i < n; {
		// Find the half-open run [i, k) of values equal to w[i]; the
		// comparison only looks at positions not yet overwritten.
		k := i + 1
		for k < n && w[k] == w[i] {
			k++
		}
		if run := k - i; run == 1 {
			// Untied value: its rank is simply its 1-based position.
			w[i] = float64(i + 1)
		} else {
			// Tie group: every member gets the mean of ranks i+1..k.
			mid := 0.5 * float64(i+k+1)
			for j := i; j < k; j++ {
				w[j] = mid
			}
			t := float64(run)
			ties += t*t*t - t
		}
		i = k
	}
	return ties
}

// betaIncomplete computes the regularized incomplete beta function I_x(a, b)
// for x in [0, 1]; any x outside that interval yields NaN.
func betaIncomplete(x, a, b float64) float64 {
	if x < 0 || x > 1 {
		return math.NaN()
	}

	// Prefactor x^a (1-x)^b / B(a,b), evaluated in log space; it is zero
	// at the endpoints, where the log terms would otherwise blow up.
	bt := 0.0
	if x > 0 && x < 1 {
		bt = math.Exp(lgamma(a+b) - lgamma(a) - lgamma(b) +
			a*math.Log(x) + b*math.Log(1-x))
	}

	// For large x the continued fraction converges slowly, so apply the
	// symmetry I_x(a,b) = 1 - I_{1-x}(b,a) there.
	if x >= (a+1)/(a+b+2) {
		return 1 - bt*betaContinuedFractionComponent(1-x, b, a)/b
	}
	return bt * betaContinuedFractionComponent(x, a, b) / a
}

// betaContinuedFractionComponent evaluates the continued-fraction expansion
// of the regularized incomplete beta function using the modified Lentz
// method, iterating until the per-step multiplicative factor converges to 1
// within epsilon. It panics if convergence is not reached in maxIterations.
func betaContinuedFractionComponent(x, a, b float64) float64 {
	const maxIterations = 200
	const epsilon = 3e-14
	// raiseZero nudges values away from exact zero so the 1/d and 1/c
	// recurrences of Lentz's method never divide by zero.
	raiseZero := func(z float64) float64 {
		if math.Abs(z) < math.SmallestNonzeroFloat64 {
			return math.SmallestNonzeroFloat64
		}
		return z
	}
	c := 1.0
	d := 1 / raiseZero(1-(a+b)*x/(a+1))
	h := d
	for m := 1; m <= maxIterations; m++ {
		mf := float64(m)
		// Even-numbered term of the continued fraction.
		numer := mf * (b - mf) * x / ((a + 2*mf - 1) * (a + 2*mf))
		d = 1 / raiseZero(1+numer*d)
		c = raiseZero(1 + numer/c)
		h *= d * c
		// Odd-numbered term.
		numer = -(a + mf) * (a + b + mf) * x / ((a + 2*mf) * (a + 2*mf + 1))
		d = 1 / raiseZero(1+numer*d)
		c = raiseZero(1 + numer/c)
		hfac := d * c
		h *= hfac

		// Converged once the latest correction factor is ~1.
		if math.Abs(hfac-1) < epsilon {
			return h
		}
	}
	panic("betainc: a or b too big; failed to converge")
}

func lgamma(x float64) float64 {
y, _ := math.Lgamma(x)
return y
}
13 changes: 13 additions & 0 deletions analysis/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package analysis

type sorter struct {
x []float64
y []float64
}

func (s sorter) Len() int { return len(s.x) }
func (s sorter) Less(i, j int) bool { return s.x[i] < s.x[j] }
func (s sorter) Swap(i, j int) {
s.x[i], s.x[j] = s.x[j], s.x[i]
s.y[i], s.y[j] = s.y[j], s.y[i]
}
6 changes: 6 additions & 0 deletions docs/analysis.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Analysis

*State: In concept*

This module acts as a helper for the encoders and for the evaluation of a data set.

25 changes: 0 additions & 25 deletions docs/encoder.md

This file was deleted.

83 changes: 83 additions & 0 deletions docs/encoders.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Encoders (experimental)

## Overview

**Attention:** Encoders are currently not able to serialize

Encoders automatically put incoming data into a data vector that the neural net can process.
Encoding of your data into processable feature vectors is very important, because strings are initially not suitable as feature vector.
This module helps to find a representation of your data set without your intervention.
The encoders can, however, be controlled via the config parameters in EncoderConfig.

## EncoderConfig

The encoder performs decisions during its runtime.
For that reason a DefaultConfig is applied.
It is possible to get the DefaultConfig, overwrite specific parameters and apply it again to the encoder.

```go
e := encoders.NewEncoder("test encoder")
cfg := encoders.DefaultConfig()
cfg.DictionaryMaxEntries = 300
e.Config = cfg
```
You can find all possible options for editing the encoder config in encoders/config.go.

## Example
Below you can find an example for the encoder.

```go
// generating the encoder, the encoder can hold different input types and dimensions
e := encoders.NewEncoder("test encoder")
cfg := encoders.DefaultConfig()
cfg.DictionaryMaxEntries = 300
e.Config = cfg
inputName := "language-classification"
set := encoders.NewInput(inputName, encoders.String)
for _, v := range data {
// add your strings here
set.AddString(someStringSample)
}
// scan takes the set and decides (if it is: encoders.Automatic) which encoding to apply
e.Scan(inputName, set, encoders.Automatic)
// transform brings the input into the chosen encoding
e.Transform(inputName, set)
// explain can be used to see what the encoder has done
e.Explain()
// using encode() and an Unified (can be string or float slice) you get the corresponding vector
vector := e.Encode(inputName, encoders.Unified{String: "Hello whats up with you?", Type: encoders.String})
```


## Workflow
This is the workflow. An Encoder can contain different models for encoding. In the workflow below, (namespace) means that you perform an action on a namespace within the encoder. For example, if you have a mixed input vector with strings and floats, you can put all the floats together in one namespace as well as the strings.

1. Collect data - The encoder needs the samples from the test set or a similar set of data points to optimally fit and decide.
2. Create encoder - create an encoder with the config (the Encoder itself can encode different inputs).
3. Scan - (namespace) decides which encoder to select if you choose encoders.Automatic; otherwise, the given encoder is applied.
4. Transform - (namespace) after scanning the set and deciding, the data is transformed into the new vector space.
5. After the transformation is done, you are ready to go with your new vector representation.
6. Encode - (namespace) use the Encode() method of the encoder to encode your input.

## Encoders
If you have no specific idea which encoder to use you can also run using encoders.Automatic.
Using this the encoder will figure out by itself which encoding is applicable.

The encoders work for different data types:

1. N-Grams (strings), encoders.StringNGrams
2. Splitted Dictionary (string), encoders.StringSplitDictionary
3. Dictionary (strings), encoders.StringDictionary
4. FloatExact (numbers), encoders.FloatExact
5. FloatReducer (numbers), encoders.FloatReducer
6. Topic Modelling coming soon (strings) - not implemented yet


## Representation (experimental)

Out of the encoder activity the network generates a representation of the input space.
This representation can be persisted and loaded to continue working on the network.
This representation looks like this:

1. Number of feature vectors
2. Mapping of value to neuron values
3 changes: 3 additions & 0 deletions docs/recos.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Recos

*State: In concept*
18 changes: 18 additions & 0 deletions docs/textos.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Textos Extractor

*State: In concept*


This extractor can be used to extract strings and topics.
In this document the nature of Textos is described.

## Analysis
- Occurrence of tokens

## Layers

### Structural learner
The structural learner uses the data from the corpus to decide which part of the text is structural and which one is topic-related.


### Topic Modelling
41 changes: 41 additions & 0 deletions encoders/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package encoders

// EncoderConfig holds all tunable parameters of an encoder. Obtain a
// baseline with DefaultConfig(), overwrite individual fields, and assign
// the result to the encoder's Config field.
type EncoderConfig struct {
	// DelimiterToken is the token used to split string input (default " ").
	DelimiterToken string
	// DimToSamplesRatio relates vector dimensionality to sample count;
	// presumably used by the scanner's decision heuristics — TODO confirm.
	DimToSamplesRatio float64
	// Decision heuristics
	FloatReducerThreshold int
	TopicModelMinDelimiters int
	NGramsMaxTokens int
	DictionaryMaxEntries int
	DictionaryMaxDelimiters int
	SplitDictionaryMaxEntries int
	// Application settings
	// FloatReducerSpearman is a Spearman-correlation cut-off used when
	// reducing float inputs (see analysis.Spearman).
	FloatReducerSpearman float64
	FloatReducerSkewness float64
	FloatReducerZeroValues bool
	NGramMaxGrams int
	NGramMaxCapacity int
	NGramCropRatio float64
	// DefaultStringEncoder is the encoder applied to string input when no
	// explicit choice is made.
	DefaultStringEncoder EncoderType
}

// DefaultConfig returns the encoder configuration used as a baseline;
// callers may overwrite individual fields before applying it.
func DefaultConfig() *EncoderConfig {
	cfg := new(EncoderConfig)
	cfg.DelimiterToken = " "
	cfg.DimToSamplesRatio = 0.8
	cfg.FloatReducerThreshold = 40
	cfg.TopicModelMinDelimiters = 5
	cfg.NGramsMaxTokens = 20
	cfg.DictionaryMaxEntries = 50
	cfg.DictionaryMaxDelimiters = 5
	cfg.SplitDictionaryMaxEntries = 100
	cfg.FloatReducerSpearman = 0.90
	cfg.FloatReducerSkewness = 0.90
	cfg.FloatReducerZeroValues = true
	cfg.NGramMaxGrams = 3
	cfg.NGramMaxCapacity = 100
	cfg.NGramCropRatio = 0.05
	cfg.DefaultStringEncoder = StringNGrams
	return cfg
}
Loading

0 comments on commit 64bac5d

Please sign in to comment.