Merge pull request #15 from breskos/feature/autoencoders
Feature/autoencoders
breskos authored Jun 7, 2021
2 parents 64bac5d + 947cef7 commit 3ecb6dd
Showing 30 changed files with 592 additions and 7,018 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -16,3 +16,5 @@

# specific
todo
*.phrase
*.json
15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,21 @@
# Change Log
All notable changes to this project will be documented in this file.

## [0.3.0] 2021-06-07

This version introduces a persistence layer for encoders.

### Added
- Serialization for encoders

### Changed
- Interface of encoders
- Minor things in the library

### Fixed
- Some issues with serialization in the online and network modules


## [0.2.5] 2021-06-06

With this version we introduce encoders (automatic encoders) to gopher-learn.
10 changes: 8 additions & 2 deletions README.md
@@ -12,13 +12,19 @@
- Rich measurement mechanisms to control the training
- Examples for fast understanding
- Can also be used for iterative online learning (using online module) for autonomous agents
- Encoders can be used to encode string data or large float slices

## Install

```
go get github.com/breskos/gopher-learn/...
```

## Examples

Find the examples in the examples folder.
All the data to run the examples [can be found here](https://github.com/breskos/gopher-learn-data).

## The gopher-learn engine

The engine helps you with optimizing the learning process.
@@ -188,15 +194,15 @@ func main() {

```go

import "github.com/breskos/gopher-learn"
import "github.com/breskos/gopher-learn/net"
// Network has 9 inputs and 3 layers
// (9 neurons, 9 neurons and 2 neurons).
// Last layer is network output (2 neurons).
// For these last neurons we need labels (like: spam, nospam, positive, negative)
labels := make(map[int]string)
labels[0] = "positive"
labels[1] = "negative"
n := neural.NewNetwork(9, []int{9,9,2}, map[int])
n := net.NewNetwork(9, []int{9,9,2}, labels)
// Randomize synapse weights
n.RandomizeSynapses()

6 changes: 6 additions & 0 deletions docs/analysis.md
@@ -4,3 +4,9 @@

This module acts as a helper for the encoders and for the evaluation of a data set.

## Functionalities (planned)

- Correlation and similarities between vectors, frames, and matrices (a Spearman sketch follows below)
- Statistical measures on vectors and matrices
- Information gain, entropy
- Bucketization of dimensions to determine how well a feature helps with classification
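
As a reference point for the correlation item above, here is a minimal sketch of Spearman rank correlation in Go. It matches the two-value call shape of `analysis.Spearman` used later in this change set, but tie handling and error cases of the real implementation are not shown in this diff, so treat the details as assumptions.

```go
package analysis

import (
	"errors"
	"sort"
)

// Spearman computes the rank correlation of two equally long vectors.
// Simplified sketch: tied values receive arbitrary consecutive ranks.
func Spearman(x, y []float64) (float64, error) {
	n := len(x)
	if n != len(y) || n < 2 {
		return 0, errors.New("vectors must have equal length of at least 2")
	}
	rx, ry := ranks(x), ranks(y)
	var d2 float64
	for i := 0; i < n; i++ {
		d := rx[i] - ry[i]
		d2 += d * d
	}
	// rho = 1 - 6*sum(d^2) / (n*(n^2-1))
	return 1 - 6*d2/float64(n*(n*n-1)), nil
}

// ranks maps each value to its 1-based rank in ascending order.
func ranks(v []float64) []float64 {
	idx := make([]int, len(v))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return v[idx[a]] < v[idx[b]] })
	r := make([]float64, len(v))
	for rank, i := range idx {
		r[i] = float64(rank + 1)
	}
	return r
}
```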
20 changes: 11 additions & 9 deletions encoders/encoder.go
@@ -37,12 +37,14 @@ func (e EncoderType) String() string {
}

type EncoderModel interface {
Fit(*Input)
Fit(*Input, *EncoderConfig)
CalculateString(string) []float64
CalculateFloats([]float64) []float64
GetDimensions() int
GetQuality() float64
Name() string
ToDump() ([]byte, error)
FromDump([]byte) error
}

type Encoder struct {
@@ -132,21 +134,21 @@ func (e *Encoder) Transform(name string, set *Input) {
switch model.Type {
case StringDictionary:
model.Model = NewDictionaryModel()
model.Model.Fit(set)
model.Model.Fit(set, e.Config)
case StringSplitDictionary:
model.Model = NewSplitDictionaryModel(e.Config)
model.Model.Fit(set)
model.Model = NewSplitDictionaryModel()
model.Model.Fit(set, e.Config)
case StringNGrams:
model.Model = NewNGramModel(e.Config)
model.Model.Fit(set)
model.Model = NewNGramModel()
model.Model.Fit(set, e.Config)
case StringTopics:
log.Fatal("not implemented")
case FloatReducer:
model.Model = NewFloatReducerModel(e.Config)
model.Model.Fit(set)
model.Model = NewFloatReducerModel()
model.Model.Fit(set, e.Config)
case FloatExact:
model.Model = NewFloatExactModel()
model.Model.Fit(set)
model.Model.Fit(set, e.Config)
}
}

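
The new `ToDump`/`FromDump` methods are what make the persistence layer work: any `EncoderModel` can now be flattened to bytes and restored. A file-level save/load helper is not shown in this diff, so the following is only a sketch of how one could be built on the interface; `SaveModel` and `LoadModel` are illustrative names, not shipped gopher-learn API.

```go
package encoders

import "os"

// SaveModel writes an encoder model's JSON dump to disk.
// Illustrative helper built on the new EncoderModel interface.
func SaveModel(path string, m EncoderModel) error {
	dump, err := m.ToDump()
	if err != nil {
		return err
	}
	return os.WriteFile(path, dump, 0o644)
}

// LoadModel restores a dump into an empty concrete model,
// e.g. LoadModel("dict.json", &DictionaryModel{}).
func LoadModel(path string, into EncoderModel) error {
	dump, err := os.ReadFile(path)
	if err != nil {
		return err
	}
	return into.FromDump(dump)
}
```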
12 changes: 11 additions & 1 deletion encoders/float_exact.go
@@ -1,5 +1,7 @@
package encoders

import "encoding/json"

type FloatExactModel struct {
Dimensions int
Quality float64
@@ -9,7 +11,7 @@ func NewFloatExactModel() *FloatExactModel {
return &FloatExactModel{}
}

func (m *FloatExactModel) Fit(set *Input) {
func (m *FloatExactModel) Fit(set *Input, config *EncoderConfig) {
m.Dimensions = len(set.Values[0].Float)
}

@@ -25,6 +27,14 @@ func (m *FloatExactModel) CalculateFloats(value []float64) []float64 {
return value
}

func (m *FloatExactModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *FloatExactModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func (m *FloatExactModel) Name() string {
return "float_exact"
}
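
Since `FloatExactModel` only carries exported fields, the JSON round trip is straightforward. A small usage sketch (the field value and the `log` import are illustrative):

```go
m := NewFloatExactModel()
m.Dimensions = 4
dump, err := m.ToDump() // e.g. {"Dimensions":4,"Quality":0}
if err != nil {
	log.Fatal(err)
}
restored := &FloatExactModel{}
if err := restored.FromDump(dump); err != nil {
	log.Fatal(err)
}
// restored.Dimensions == 4
```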
21 changes: 14 additions & 7 deletions encoders/float_reducer.go
@@ -1,6 +1,7 @@
package encoders

import (
"encoding/json"
"log"
"math"

@@ -20,17 +21,15 @@ type FloatReducerModel struct {
Model map[int]bool
Dimensions int
Quality float64
Config *EncoderConfig
}

func NewFloatReducerModel(config *EncoderConfig) *FloatReducerModel {
func NewFloatReducerModel() *FloatReducerModel {
return &FloatReducerModel{
Model: make(map[int]bool),
Config: config,
Model: make(map[int]bool),
}
}

func (m *FloatReducerModel) Fit(set *Input) {
func (m *FloatReducerModel) Fit(set *Input, config *EncoderConfig) {
if len(set.Values) < 1 {
log.Fatalf("no values delivered for fit")
}
@@ -48,7 +47,7 @@
if i != j {
rs, _ := analysis.Spearman(dimensions[i], dimensions[j])
spearman[i][j] = rs
if math.Abs(rs) > math.Abs(m.Config.FloatReducerSpearman) {
if math.Abs(rs) > math.Abs(config.FloatReducerSpearman) {
m.Model[i] = false
}
}
@@ -60,7 +59,7 @@
}
for i := range spearman {
for j := range spearman[i] {
if spearman[i][j] >= m.Config.FloatReducerSpearman && m.Model[i] && m.Model[j] {
if spearman[i][j] >= config.FloatReducerSpearman && m.Model[i] && m.Model[j] {
m.Model[i] = false
}
}
@@ -99,6 +98,14 @@ func (m *FloatReducerModel) GetQuality() float64 {
return m.Quality
}

func (m *FloatReducerModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *FloatReducerModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func similarValues(values []float64) bool {
len := len(values)
for i, v := range values {
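
Stripped of the library types, the pruning rule in `Fit` reduces to: keep a boolean mask over feature columns and, whenever two surviving columns correlate beyond the configured Spearman threshold (by absolute value), drop one of them. A self-contained sketch of that rule, reusing the `Spearman` function sketched under docs/analysis.md above (needs `import "math"`):

```go
// reduceColumns returns a keep-mask over feature columns, dropping one
// column of every pair whose rank correlation exceeds the threshold,
// approximately mirroring FloatReducerModel.Fit above.
func reduceColumns(columns [][]float64, threshold float64) map[int]bool {
	keep := make(map[int]bool, len(columns))
	for i := range columns {
		keep[i] = true
	}
	for i := range columns {
		for j := range columns {
			if i == j || !keep[i] || !keep[j] {
				continue
			}
			rs, err := Spearman(columns[i], columns[j])
			if err != nil {
				continue
			}
			if math.Abs(rs) > math.Abs(threshold) {
				keep[i] = false
			}
		}
	}
	return keep
}
```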
15 changes: 13 additions & 2 deletions encoders/string_dictionary.go
@@ -1,6 +1,9 @@
package encoders

import "fmt"
import (
"encoding/json"
"fmt"
)

type DictionaryModel struct {
Dimensions int
@@ -12,7 +15,7 @@ func NewDictionaryModel() *DictionaryModel {
return &DictionaryModel{}
}

func (m *DictionaryModel) Fit(set *Input) {
func (m *DictionaryModel) Fit(set *Input, config *EncoderConfig) {
for _, sample := range set.Values {
value := normalizeString(sample.String)
fmt.Printf("%s", value)
@@ -49,6 +52,14 @@ func (m *DictionaryModel) GetQuality() float64 {
return m.Quality
}

func (m *DictionaryModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *DictionaryModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func getIndex(s []string, value string) int {
for k, v := range s {
if v == value {
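
With the interface methods in place, a fitted dictionary model can be persisted with the sketch helpers proposed under encoders/encoder.go above. Again illustrative, not shipped API; `trainingSet` and `config` stand in for an `*Input` and `*EncoderConfig` from your pipeline:

```go
dict := NewDictionaryModel()
dict.Fit(trainingSet, config) // trainingSet *Input, config *EncoderConfig (placeholders)
if err := SaveModel("dictionary.json", dict); err != nil {
	log.Fatal(err)
}
restored := &DictionaryModel{}
if err := LoadModel("dictionary.json", restored); err != nil {
	log.Fatal(err)
}
```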
33 changes: 18 additions & 15 deletions encoders/string_ngrams.go
@@ -1,6 +1,7 @@
package encoders

import (
"encoding/json"
"sort"
)

@@ -13,25 +14,19 @@ type NGramModel struct {
// Grams to index in vector
GramsLookup map[string]int
// Grams to number of appearances
Grams map[string]int
Samples int
MaxGrams int
MaxCapacity int
CropRatio float64
Quality float64
Grams map[string]int
Samples int
Quality float64
}

func NewNGramModel(config *EncoderConfig) *NGramModel {
func NewNGramModel() *NGramModel {
return &NGramModel{
Grams: make(map[string]int, 0),
GramsLookup: make(map[string]int),
MaxGrams: config.NGramMaxGrams,
MaxCapacity: config.NGramMaxCapacity,
CropRatio: config.NGramCropRatio,
}
}

func (m *NGramModel) Fit(set *Input) {
func (m *NGramModel) Fit(set *Input, config *EncoderConfig) {
modelIndex := 0
for _, sample := range set.Values {
m.Samples++
@@ -51,7 +46,7 @@
}
}
m.Dimensions = len(m.Grams)
m.optimize()
m.optimize(config.NGramMaxCapacity, config.NGramCropRatio)
}

func (m *NGramModel) CalculateString(s string) []float64 {
@@ -74,6 +69,14 @@ func (m *NGramModel) CalculateFloats([]float64) []float64 {
return []float64{}
}

func (m *NGramModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *NGramModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func (m *NGramModel) Name() string {
return "ngrams"
}
@@ -82,13 +85,13 @@ func (m *NGramModel) GetQuality() float64 {
return m.Quality
}

func (m *NGramModel) optimize() {
if m.MaxCapacity >= m.Dimensions {
func (m *NGramModel) optimize(maxCapacity int, cropRatio float64) {
if maxCapacity >= m.Dimensions {
return
}

for gram, appearance := range m.Grams {
if float64(appearance)/float64(m.Samples) < m.CropRatio {
if float64(appearance)/float64(m.Samples) < cropRatio {
delete(m.Grams, gram)
}
}
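
The collapsed parts of `Fit` do the actual gram extraction, which this diff does not show. For orientation, character n-grams of a string can be produced like this; a sketch only, the real model's gram sizes come from `NGramMaxGrams` in the config:

```go
// charNGrams returns all character n-grams of length n in s.
func charNGrams(s string, n int) []string {
	runes := []rune(s) // rune-safe for non-ASCII input
	if n <= 0 || len(runes) < n {
		return nil
	}
	grams := make([]string, 0, len(runes)-n+1)
	for i := 0; i+n <= len(runes); i++ {
		grams = append(grams, string(runes[i:i+n]))
	}
	return grams
}
```

The `optimize` step shown above then drops grams whose relative frequency falls below `cropRatio` once the vocabulary exceeds `maxCapacity`.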
25 changes: 18 additions & 7 deletions encoders/string_split_dictionary.go
@@ -1,28 +1,31 @@
package encoders

import (
"encoding/json"
"fmt"
"strings"
)

const (
splitDictionaryDelimiter = " "
)

type SplitDictionaryModel struct {
Dimensions int
Delimiter string
Dictionary []string
Quality float64
}

func NewSplitDictionaryModel(config *EncoderConfig) *SplitDictionaryModel {
return &SplitDictionaryModel{
Delimiter: config.DelimiterToken,
}
func NewSplitDictionaryModel() *SplitDictionaryModel {
return &SplitDictionaryModel{}
}

func (m *SplitDictionaryModel) Fit(set *Input) {
func (m *SplitDictionaryModel) Fit(set *Input, config *EncoderConfig) {
delimiter := config.DelimiterToken
for _, sample := range set.Values {
value := normalizeString(sample.String)
fmt.Printf("%s", value)
values := strings.Split(value, m.Delimiter)
values := strings.Split(value, delimiter)
for _, v := range values {
if !contains(m.Dictionary, v) {
m.Dictionary = append(m.Dictionary, v)
@@ -50,6 +53,14 @@ func (m *SplitDictionaryModel) CalculateFloats([]float64) []float64 {
return []float64{}
}

func (m *SplitDictionaryModel) ToDump() ([]byte, error) {
return json.Marshal(m)
}

func (m *SplitDictionaryModel) FromDump(dump []byte) error {
return json.Unmarshal(dump, m)
}

func (m *SplitDictionaryModel) Name() string {
return "splitted_dictionary"
}
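
`CalculateString` is collapsed in this diff; from `Fit` one can infer that it likely encodes a string as a one-hot-style vector over the fitted token dictionary. A sketch under that assumption (needs `import "strings"`):

```go
// encodeTokens marks, for each dictionary entry, whether it occurs
// among the delimiter-separated tokens of s.
func encodeTokens(dictionary []string, s, delimiter string) []float64 {
	vec := make([]float64, len(dictionary))
	for _, token := range strings.Split(s, delimiter) {
		for i, entry := range dictionary {
			if entry == token {
				vec[i] = 1
			}
		}
	}
	return vec
}
```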
1 change: 1 addition & 0 deletions engine/train.go
@@ -91,6 +91,7 @@ func compare(usage neural.NetworkType, criterion neural.Criterion, current *eval
}

// Copies a neural network from another
// This function is very costly.
func copy(from *neural.Network) *neural.Network {
return persist.FromDump(persist.ToDump(from))
}
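
The new comment is worth taking seriously: `copy` deep-clones a network by serializing it with the `persist` package and parsing the dump back, so every call pays the full marshal/unmarshal cost. The pattern in isolation looks like the generic sketch below, here using `encoding/gob` and Go generics for brevity; gopher-learn's `persist` package uses its own dump format.

```go
package engine

import (
	"bytes"
	"encoding/gob"
)

// deepCopy clones a value by encoding and decoding it. Simple and safe
// for exported fields, but allocation-heavy, which is why the train
// loop flags copy() as costly.
func deepCopy[T any](v T) (T, error) {
	var out T
	var buf bytes.Buffer
	if err := gob.NewEncoder(&buf).Encode(v); err != nil {
		return out, err
	}
	err := gob.NewDecoder(&buf).Decode(&out)
	return out, err
}
```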