go-ml/export.go at main · dAppCore/go-ml · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package ml

import (
	"bufio"
	"encoding/json"
	"fmt"
	"math/rand"
	"os"
	"strings"

	coreerr "dappco.re/go/core/log"
)

// ChatMessage is a single message in the chat training format.
// It is encoded as a JSON object with "role" and "content" keys.
type ChatMessage struct {
	// Role identifies the speaker of the message; WriteTrainingJSONL in this
	// file emits "user" and "assistant".
	Role    string `json:"role"`
	// Content is the text of the message.
	Content string `json:"content"`
}

// TrainingExample is a single training example in chat JSONL format.
// It is encoded as one JSON object whose "messages" key holds the
// conversation.
type TrainingExample struct {
	// Messages is the ordered list of chat messages making up the example.
	Messages []ChatMessage `json:"messages"`
}

// ValidatePercentages reports whether the three split percentages are usable.
// It returns an error when any percentage is negative or when train, valid,
// and test do not add up to exactly 100; otherwise it returns nil.
func ValidatePercentages(trainPct, validPct, testPct int) error {
	const op = "ml.ValidatePercentages"

	total := trainPct + validPct + testPct

	switch {
	case trainPct < 0, validPct < 0, testPct < 0:
		// Negativity is reported first, even if the sum also happens to be wrong.
		msg := fmt.Sprintf("percentages must be non-negative: train=%d, valid=%d, test=%d", trainPct, validPct, testPct)
		return coreerr.E(op, msg, nil)
	case total != 100:
		msg := fmt.Sprintf("percentages must sum to 100, got %d (train=%d + valid=%d + test=%d)", total, trainPct, validPct, testPct)
		return coreerr.E(op, msg, nil)
	}
	return nil
}

// FilterResponses returns the responses that are usable as training data.
// A response is dropped when its text is empty, starts with "ERROR:", or is
// shorter than 50 bytes. The relative order of the kept responses is
// preserved, and the result is nil when nothing is kept.
func FilterResponses(responses []Response) []Response {
	// minLen is the minimum accepted response length, measured in bytes.
	const minLen = 50

	var kept []Response
	for i := range responses {
		text := responses[i].Response
		if text == "" || len(text) < minLen || strings.HasPrefix(text, "ERROR:") {
			continue
		}
		kept = append(kept, responses[i])
	}
	return kept
}

// SplitData shuffles a copy of responses with a deterministic seed and splits
// it into train, valid, and test sets.
//
// The train and valid sizes are len(responses)*pct/100, rounded down. The test
// set receives every remaining element, so testPct is accepted for symmetry
// with ValidatePercentages but does not influence the result. Callers should
// validate the percentages beforehand; values that would place a cut point
// outside the data are clamped rather than causing a slice-bounds panic.
//
// The input slice is not modified. The same seed and input always produce the
// same split. The returned slices share one backing array but are
// capacity-limited, so appending to one never overwrites elements of another.
func SplitData(responses []Response, trainPct, validPct, testPct int, seed int64) (train, valid, test []Response) {
	shuffled := make([]Response, len(responses))
	copy(shuffled, responses)

	rng := rand.New(rand.NewSource(seed))
	rng.Shuffle(len(shuffled), func(i, j int) {
		shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
	})

	// clamp restricts v to the inclusive range [lo, hi].
	clamp := func(v, lo, hi int) int {
		if v < lo {
			return lo
		}
		if v > hi {
			return hi
		}
		return v
	}

	n := len(shuffled)
	trainEnd := clamp(n*trainPct/100, 0, n)
	validEnd := clamp(trainEnd+n*validPct/100, trainEnd, n)

	// Full slice expressions cap each partition at its own end so that a
	// caller's append reallocates instead of writing into the next partition.
	train = shuffled[:trainEnd:trainEnd]
	valid = shuffled[trainEnd:validEnd:validEnd]
	test = shuffled[validEnd:]

	return train, valid, test
}

// WriteTrainingJSONL writes responses in chat JSONL format suitable for
// MLX LoRA fine-tuning.
//
// The file at path is created, or truncated if it already exists. Each
// response becomes one line: a JSON object with a "messages" array holding a
// "user" message (the prompt) followed by an "assistant" message (the
// response).
//
// An error is returned if the file cannot be created, an example cannot be
// marshaled, or any write, the final flush, or the close fails. On error the
// file may be left partially written.
func WriteTrainingJSONL(path string, responses []Response) (err error) {
	const op = "ml.WriteTrainingJSONL"

	f, err := os.Create(path)
	if err != nil {
		return coreerr.E(op, fmt.Sprintf("create %s", path), err)
	}
	// Close can report deferred write failures; surface it unless an earlier
	// error is already being returned.
	defer func() {
		if closeErr := f.Close(); closeErr != nil && err == nil {
			err = coreerr.E(op, fmt.Sprintf("close %s", path), closeErr)
		}
	}()

	w := bufio.NewWriter(f)

	for _, r := range responses {
		example := TrainingExample{
			Messages: []ChatMessage{
				{Role: "user", Content: r.Prompt},
				{Role: "assistant", Content: r.Response},
			},
		}

		data, marshalErr := json.Marshal(example)
		if marshalErr != nil {
			return coreerr.E(op, "marshal example", marshalErr)
		}

		if _, writeErr := w.Write(data); writeErr != nil {
			return coreerr.E(op, "write line", writeErr)
		}
		if writeErr := w.WriteByte('\n'); writeErr != nil {
			return coreerr.E(op, "write newline", writeErr)
		}
	}

	// Flush explicitly so that buffered data which fails to reach the file is
	// reported instead of being silently dropped by a deferred call.
	if flushErr := w.Flush(); flushErr != nil {
		return coreerr.E(op, fmt.Sprintf("flush %s", path), flushErr)
	}
	return nil
}