Merge pull request #15 from breskos/feature/autoencoders
Feature/autoencoders
breskos authored Jun 7, 2021
2 parents 64bac5d + 947cef7 commit 3ecb6dd
Showing 30 changed files with 592 additions and 7,018 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -16,3 +16,5 @@

# specific
todo
*.phrase
*.json
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,21 @@
# Change Log
All notable changes to this project will be documented in this file.

## [0.3.0] 2021-06-07

This version introduces a persistence layer for encoders.

### Added
- Serialization for encoders

### Changed
- Interface of encoders
- Minor things in the library

### Fixed
- Some issues with serialization in the online and network modules


## [0.2.5] 2021-06-06

With this version we introduce encoders (automatic encoders) to gopher-learn.
10 changes: 8 additions & 2 deletions README.md
@@ -12,13 +12,19 @@
- Rich measurement mechanisms to control the training
- Examples for fast understanding
- Can also be used for iterative online learning (using online module) for autonomous agents
- Encoders can be used to encode string data or large float slices

## Install

```
go get github.com/breskos/gopher-learn/...
```

## Examples

Find the examples in the examples folder.
All the data to run the examples [can be found here](https://github.com/breskos/gopher-learn-data).

## The gopher-learn engine

The engine helps you with optimizing the learning process.
@@ -188,15 +194,15 @@ func main() {

```go

import "github.com/breskos/gopher-learn"
import "github.com/breskos/gopher-learn/net"
// Network has 9 inputs and 3 layers
// (9 neurons, 9 neurons and 2 neurons).
// Last layer is network output (2 neurons).
// For these last neurons we need labels (like: spam, nospam, positive, negative)
labels := make(map[int]string)
labels[0] = "positive"
labels[1] = "negative"
n := neural.NewNetwork(9, []int{9,9,2}, map[int])
n := net.NewNetwork(9, []int{9,9,2}, labels)
// Randomize synapse weights
n.RandomizeSynapses()

6 changes: 6 additions & 0 deletions docs/analysis.md
@@ -4,3 +4,9 @@

This module acts as a helper for the encoders and for the evaluation of a data set.

## Functionalities (planned)

- Correlation and similarities between vectors, frames, and matrices (a Spearman sketch follows below)
- Statistical measures on vectors and matrices
- Information gain, entropy
- Bucketization of dimensions to determine how well a feature helps with classification
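
As a reference point for the correlation item above, here is a minimal sketch of Spearman rank correlation in Go. It matches the two-value call shape of `analysis.Spearman` used later in this change set, but tie handling and error cases of the real implementation are not shown in this diff, so treat the details as assumptions.

```go
package analysis

import (
	"errors"
	"sort"
)

// Spearman computes the rank correlation of two equally long vectors.
// Simplified sketch: tied values receive arbitrary consecutive ranks.
func Spearman(x, y []float64) (float64, error) {
	n := len(x)
	if n != len(y) || n < 2 {
		return 0, errors.New("vectors must have equal length of at least 2")
	}
	rx, ry := ranks(x), ranks(y)
	var d2 float64
	for i := 0; i < n; i++ {
		d := rx[i] - ry[i]
		d2 += d * d
	}
	// rho = 1 - 6*sum(d^2) / (n*(n^2-1))
	return 1 - 6*d2/float64(n*(n*n-1)), nil
}

// ranks maps each value to its 1-based rank in ascending order.
func ranks(v []float64) []float64 {
	idx := make([]int, len(v))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return v[idx[a]] < v[idx[b]] })
	r := make([]float64, len(v))
	for rank, i := range idx {
		r[i] = float64(rank + 1)
	}
	return r
}
```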
20 changes: 11 additions & 9 deletions encoders/encoder.go
@@ -37,12 +37,14 @@ func (e EncoderType) String() string {
}

type EncoderModel interface {
Fit(*Input)
Fit(*Input, *EncoderConfig)
CalculateString(string) []float64
CalculateFloats([]float64) []float64
GetDimensions() int
GetQuality() float64
Name() string
ToDump() ([]byte, error)
FromDump([]byte) error
}

type Encoder struct {
@@ -132,21 +134,21 @@ func (e *Encoder) Transform(name string, set *Input) {
switch model.Type {
case StringDictionary:
model.Model = NewDictionaryModel()
model.Model.Fit(set)
model.Model.Fit(set, e.Config)
case StringSplitDictionary:
model.Model = NewSplitDictionaryModel(e.Config)
model.Model.Fit(set)
model.Model = NewSplitDictionaryModel()
model.Model.Fit(set, e.Config)
case StringNGrams:
model.Model = NewNGramModel(e.Config)
model.Model.Fit(set)
model.Model = NewNGramModel()
model.Model.Fit(set, e.Config)
case StringTopics:
log.Fatal("not implemented")
case FloatReducer:
model.Model = NewFloatReducerModel(e.Config)
model.Model.Fit(set)
model.Model = NewFloatReducerModel()
model.Model.Fit(set, e.Config)
case FloatExact:
model.Model = NewFloatExactModel()
model.Model.Fit(set)
model.Model.Fit(set, e.Config)
}
}

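
The new `ToDump`/`FromDump` methods are what make the persistence layer work: any `EncoderModel` can now be flattened to bytes and restored. A file-level save/load helper is not shown in this diff, so the following is only a sketch of how one could be built on the interface; `SaveModel` and `LoadModel` are illustrative names, not shipped gopher-learn API.

```go
package encoders

import "os"

// SaveModel writes an encoder model's JSON dump to disk.
// Illustrative helper built on the new EncoderModel interface.
func SaveModel(path string, m EncoderModel) error {
	dump, err := m.ToDump()
	if err != nil {
		return err
	}
	return os.WriteFile(path, dump, 0o644)
}

// LoadModel restores a dump into an empty concrete model,
// e.g. LoadModel("dict.json", &DictionaryModel{}).
func LoadModel(path string, into EncoderModel) error {
	dump, err := os.ReadFile(path)
	if err != nil {
		return err
	}
	return into.FromDump(dump)
}
```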
12 changes: 11 additions & 1 deletion encoders/float_exact.go
@@ -1,5 +1,7 @@
package encoders

import "encoding/json"

type FloatExactModel struct {
Dimensions int
Quality float64
@@ -9,7 +11,7 @@ func NewFloatExactModel() *FloatExactModel {
return &FloatExactModel{}
}

func (m *FloatExactModel) Fit(set *Input) {
func (m *FloatExactModel) Fit(set *Input, config *EncoderConfig) {
m.Dimensions = len(set.Values[0].Float)
}

@@ -25,6 +27,14 @@ func (m *FloatExactModel) CalculateFloats(value []float64) []float64 {
return value
}

func (m *FloatExactModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *FloatExactModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func (m *FloatExactModel) Name() string {
return "float_exact"
}
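
Since `FloatExactModel` only carries exported fields, the JSON round trip is straightforward. A small usage sketch (the field value and the `log` import are illustrative):

```go
m := NewFloatExactModel()
m.Dimensions = 4
dump, err := m.ToDump() // e.g. {"Dimensions":4,"Quality":0}
if err != nil {
	log.Fatal(err)
}
restored := &FloatExactModel{}
if err := restored.FromDump(dump); err != nil {
	log.Fatal(err)
}
// restored.Dimensions == 4
```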
21 changes: 14 additions & 7 deletions encoders/float_reducer.go
@@ -1,6 +1,7 @@
package encoders

import (
"encoding/json"
"log"
"math"

@@ -20,17 +21,15 @@ type FloatReducerModel struct {
Model map[int]bool
Dimensions int
Quality float64
Config *EncoderConfig
}

func NewFloatReducerModel(config *EncoderConfig) *FloatReducerModel {
func NewFloatReducerModel() *FloatReducerModel {
return &FloatReducerModel{
Model: make(map[int]bool),
Config: config,
Model: make(map[int]bool),
}
}

func (m *FloatReducerModel) Fit(set *Input) {
func (m *FloatReducerModel) Fit(set *Input, config *EncoderConfig) {
if len(set.Values) < 1 {
log.Fatalf("no values delivered for fit")
}
@@ -48,7 +47,7 @@
if i != j {
rs, _ := analysis.Spearman(dimensions[i], dimensions[j])
spearman[i][j] = rs
if math.Abs(rs) > math.Abs(m.Config.FloatReducerSpearman) {
if math.Abs(rs) > math.Abs(config.FloatReducerSpearman) {
m.Model[i] = false
}
}
@@ -60,7 +59,7 @@
}
for i := range spearman {
for j := range spearman[i] {
if spearman[i][j] >= m.Config.FloatReducerSpearman && m.Model[i] && m.Model[j] {
if spearman[i][j] >= config.FloatReducerSpearman && m.Model[i] && m.Model[j] {
m.Model[i] = false
}
}
@@ -99,6 +98,14 @@ func (m *FloatReducerModel) GetQuality() float64 {
return m.Quality
}

func (m *FloatReducerModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *FloatReducerModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func similarValues(values []float64) bool {
len := len(values)
for i, v := range values {
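
Stripped of the library types, the pruning rule in `Fit` reduces to: keep a boolean mask over feature columns and, whenever two surviving columns correlate beyond the configured Spearman threshold (by absolute value), drop one of them. A self-contained sketch of that rule, reusing the `Spearman` function sketched under docs/analysis.md above (needs `import "math"`):

```go
// reduceColumns returns a keep-mask over feature columns, dropping one
// column of every pair whose rank correlation exceeds the threshold,
// approximately mirroring FloatReducerModel.Fit above.
func reduceColumns(columns [][]float64, threshold float64) map[int]bool {
	keep := make(map[int]bool, len(columns))
	for i := range columns {
		keep[i] = true
	}
	for i := range columns {
		for j := range columns {
			if i == j || !keep[i] || !keep[j] {
				continue
			}
			rs, err := Spearman(columns[i], columns[j])
			if err != nil {
				continue
			}
			if math.Abs(rs) > math.Abs(threshold) {
				keep[i] = false
			}
		}
	}
	return keep
}
```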
15 changes: 13 additions & 2 deletions encoders/string_dictionary.go
@@ -1,6 +1,9 @@
package encoders

import "fmt"
import (
"encoding/json"
"fmt"
)

type DictionaryModel struct {
Dimensions int
@@ -12,7 +15,7 @@ func NewDictionaryModel() *DictionaryModel {
return &DictionaryModel{}
}

func (m *DictionaryModel) Fit(set *Input) {
func (m *DictionaryModel) Fit(set *Input, config *EncoderConfig) {
for _, sample := range set.Values {
value := normalizeString(sample.String)
fmt.Printf("%s", value)
@@ -49,6 +52,14 @@ func (m *DictionaryModel) GetQuality() float64 {
return m.Quality
}

func (m *DictionaryModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *DictionaryModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func getIndex(s []string, value string) int {
for k, v := range s {
if v == value {
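
With the interface methods in place, a fitted dictionary model can be persisted with the sketch helpers proposed under encoders/encoder.go above. Again illustrative, not shipped API; `trainingSet` and `config` stand in for an `*Input` and `*EncoderConfig` from your pipeline:

```go
dict := NewDictionaryModel()
dict.Fit(trainingSet, config) // trainingSet *Input, config *EncoderConfig (placeholders)
if err := SaveModel("dictionary.json", dict); err != nil {
	log.Fatal(err)
}
restored := &DictionaryModel{}
if err := LoadModel("dictionary.json", restored); err != nil {
	log.Fatal(err)
}
```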
33 changes: 18 additions & 15 deletions encoders/string_ngrams.go
@@ -1,6 +1,7 @@
package encoders

import (
"encoding/json"
"sort"
)

@@ -13,25 +14,19 @@ type NGramModel struct {
// Grams to index in vector
GramsLookup map[string]int
// Grams to number of appearances
Grams map[string]int
Samples int
MaxGrams int
MaxCapacity int
CropRatio float64
Quality float64
Grams map[string]int
Samples int
Quality float64
}

func NewNGramModel(config *EncoderConfig) *NGramModel {
func NewNGramModel() *NGramModel {
return &NGramModel{
Grams: make(map[string]int, 0),
GramsLookup: make(map[string]int),
MaxGrams: config.NGramMaxGrams,
MaxCapacity: config.NGramMaxCapacity,
CropRatio: config.NGramCropRatio,
}
}

func (m *NGramModel) Fit(set *Input) {
func (m *NGramModel) Fit(set *Input, config *EncoderConfig) {
modelIndex := 0
for _, sample := range set.Values {
m.Samples++
@@ -51,7 +46,7 @@
}
}
m.Dimensions = len(m.Grams)
m.optimize()
m.optimize(config.NGramMaxCapacity, config.NGramCropRatio)
}

func (m *NGramModel) CalculateString(s string) []float64 {
@@ -74,6 +69,14 @@ func (m *NGramModel) CalculateFloats([]float64) []float64 {
return []float64{}
}

func (m *NGramModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *NGramModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func (m *NGramModel) Name() string {
return "ngrams"
}
@@ -82,13 +85,13 @@ func (m *NGramModel) GetQuality() float64 {
return m.Quality
}

func (m *NGramModel) optimize() {
if m.MaxCapacity >= m.Dimensions {
func (m *NGramModel) optimize(maxCapacity int, cropRatio float64) {
if maxCapacity >= m.Dimensions {
return
}

for gram, appearance := range m.Grams {
if float64(appearance)/float64(m.Samples) < m.CropRatio {
if float64(appearance)/float64(m.Samples) < cropRatio {
delete(m.Grams, gram)
}
}
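
The collapsed parts of `Fit` do the actual gram extraction, which this diff does not show. For orientation, character n-grams of a string can be produced like this; a sketch only, the real model's gram sizes come from `NGramMaxGrams` in the config:

```go
// charNGrams returns all character n-grams of length n in s.
func charNGrams(s string, n int) []string {
	runes := []rune(s) // rune-safe for non-ASCII input
	if n <= 0 || len(runes) < n {
		return nil
	}
	grams := make([]string, 0, len(runes)-n+1)
	for i := 0; i+n <= len(runes); i++ {
		grams = append(grams, string(runes[i:i+n]))
	}
	return grams
}
```

The `optimize` step shown above then drops grams whose relative frequency falls below `cropRatio` once the vocabulary exceeds `maxCapacity`.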
25 changes: 18 additions & 7 deletions encoders/string_split_dictionary.go
@@ -1,28 +1,31 @@
package encoders

import (
"encoding/json"
"fmt"
"strings"
)

const (
splitDictionaryDelimiter = " "
)

type SplitDictionaryModel struct {
Dimensions int
Delimiter string
Dictionary []string
Quality float64
}

func NewSplitDictionaryModel(config *EncoderConfig) *SplitDictionaryModel {
return &SplitDictionaryModel{
Delimiter: config.DelimiterToken,
}
func NewSplitDictionaryModel() *SplitDictionaryModel {
return &SplitDictionaryModel{}
}

func (m *SplitDictionaryModel) Fit(set *Input) {
func (m *SplitDictionaryModel) Fit(set *Input, config *EncoderConfig) {
delimiter := config.DelimiterToken
for _, sample := range set.Values {
value := normalizeString(sample.String)
fmt.Printf("%s", value)
values := strings.Split(value, m.Delimiter)
values := strings.Split(value, delimiter)
for _, v := range values {
if !contains(m.Dictionary, v) {
m.Dictionary = append(m.Dictionary, v)
@@ -50,6 +53,14 @@ func (m *SplitDictionaryModel) CalculateFloats([]float64) []float64 {
return []float64{}
}

func (m *SplitDictionaryModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *SplitDictionaryModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func (m *SplitDictionaryModel) Name() string {
return "splitted_dictionary"
}
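
`CalculateString` is collapsed in this diff; from `Fit` one can infer that it likely encodes a string as a one-hot-style vector over the fitted token dictionary. A sketch under that assumption (needs `import "strings"`):

```go
// encodeTokens marks, for each dictionary entry, whether it occurs
// among the delimiter-separated tokens of s.
func encodeTokens(dictionary []string, s, delimiter string) []float64 {
	vec := make([]float64, len(dictionary))
	for _, token := range strings.Split(s, delimiter) {
		for i, entry := range dictionary {
			if entry == token {
				vec[i] = 1
			}
		}
	}
	return vec
}
```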
1 change: 1 addition & 0 deletions engine/train.go
@@ -91,6 +91,7 @@ func compare(usage neural.NetworkType, criterion neural.Criterion, current *eval
}

// Copies a neural network from another
// This function is very costly.
func copy(from *neural.Network) *neural.Network {
return persist.FromDump(persist.ToDump(from))
}
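
The new comment is worth taking seriously: `copy` deep-clones a network by serializing it with the `persist` package and parsing the dump back, so every call pays the full marshal/unmarshal cost. The pattern in isolation looks like the generic sketch below, here using `encoding/gob` and Go generics for brevity; gopher-learn's `persist` package uses its own dump format.

```go
package engine

import (
	"bytes"
	"encoding/gob"
)

// deepCopy clones a value by encoding and decoding it. Simple and safe
// for exported fields, but allocation-heavy, which is why the train
// loop flags copy() as costly.
func deepCopy[T any](v T) (T, error) {
	var out T
	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(v); err != nil {
		return out, err
	}
	err := gob.NewDecoder(&buf).Decode(&out)
	return out, err
}
```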