// Source: go-rag/chunk.go at main · dAppCore/go-rag · GitHub

package rag

import (
	"crypto/md5"
	"fmt"
	"iter"
	"path/filepath"
	"slices"
	"strings"
)

// ChunkConfig holds chunking configuration.
//
// ChunkMarkdownSeq sanitizes the values before use: a Size <= 0 is replaced
// by 500, and an Overlap that is negative or >= Size is replaced by 0.
type ChunkConfig struct {
	Size    int // Target maximum chunk length; compared with len(), i.e. counted in bytes
	Overlap int // Trailing runes of the previous chunk repeated at the start of the next
}

// DefaultChunkConfig returns the stock chunking settings: a Size of 500 and
// an Overlap of 50.
func DefaultChunkConfig() ChunkConfig {
	var cfg ChunkConfig
	cfg.Size = 500
	cfg.Overlap = 50
	return cfg
}

// Chunk represents a text chunk with metadata.
//
// The field comments describe the values as ChunkMarkdownSeq fills them in.
type Chunk struct {
	Text    string // Chunk content with surrounding whitespace trimmed
	Section string // Title taken from the section's leading "#" line; empty if there is none
	Index   int    // Zero-based position of the chunk in the order it was yielded
}

// ChunkMarkdown is the eager, slice-returning form of ChunkMarkdownSeq: it
// drains the iterator and returns every chunk in document order.
//
// The text is split into sections and paragraphs, overlap from the previous
// chunk is carried forward as configured, and paragraphs larger than the
// configured Size are split at sentence boundaries. The result is nil when
// the text produces no chunks.
func ChunkMarkdown(text string, cfg ChunkConfig) []Chunk {
	seq := ChunkMarkdownSeq(text, cfg)
	var chunks []Chunk
	return slices.AppendSeq(chunks, seq)
}

// ChunkMarkdownSeq returns an iterator that lazily yields chunks of the given
// markdown text.
//
// Configuration is sanitized first: a Size <= 0 becomes 500 and an Overlap
// that is negative or >= Size becomes 0. Lengths are compared with len(), so
// Size is effectively a byte budget.
//
// The text is cut into "## " sections. A section that fits within Size is
// yielded whole. Larger sections are packed paragraph by paragraph (and
// sentence by sentence for paragraphs that are themselves larger than Size)
// into chunks joined by blank lines. When a piece does not fit, the pending
// chunk is yielded and the next one starts with the overlap taken from it.
// Chunk.Index counts yielded chunks across the whole document, starting at 0.
func ChunkMarkdownSeq(text string, cfg ChunkConfig) iter.Seq[Chunk] {
	if cfg.Size <= 0 {
		cfg.Size = 500
	}
	if cfg.Overlap < 0 || cfg.Overlap >= cfg.Size {
		cfg.Overlap = 0
	}

	return func(yield func(Chunk) bool) {
		next := 0

		// emit hands one chunk to the consumer and advances the running
		// index; it reports false once the consumer wants to stop.
		emit := func(body, heading string) bool {
			if !yield(Chunk{Text: body, Section: heading, Index: next}) {
				return false
			}
			next++
			return true
		}

		for raw := range splitBySectionsSeq(text) {
			sec := strings.TrimSpace(raw)
			if sec == "" {
				continue
			}

			// The heading is the first line, but only when it is a header line.
			heading := ""
			if first, _, _ := strings.Cut(sec, "\n"); strings.HasPrefix(first, "#") {
				heading = strings.TrimSpace(strings.TrimLeft(first, "#"))
			}

			// Sections that fit are passed through untouched.
			if len(sec) <= cfg.Size {
				if !emit(sec, heading) {
					return
				}
				continue
			}

			// Oversized section: pack pieces into chunks.
			pending := ""
			for para := range splitByParagraphsSeq(sec) {
				para = strings.TrimSpace(para)
				if para == "" {
					continue
				}

				for piece := range yieldSubParas(para, cfg.Size) {
					piece = strings.TrimSpace(piece)
					if piece == "" {
						continue
					}

					switch {
					case len(pending)+len(piece)+2 > cfg.Size:
						// No room (the +2 accounts for the blank-line joiner):
						// flush what we have, then seed the next chunk with
						// word-aligned overlap from the flushed one.
						if pending != "" && !emit(strings.TrimSpace(pending), heading) {
							return
						}
						pending = overlapPrefix(pending, cfg.Overlap, piece)
					case pending == "":
						pending = piece
					default:
						pending += "\n\n" + piece
					}
				}
			}

			// Flush whatever is left of the section.
			if tail := strings.TrimSpace(pending); tail != "" {
				if !emit(tail, heading) {
					return
				}
			}
		}
	}
}

// yieldSubParas returns the pieces a paragraph contributes to chunking: the
// paragraph itself when len(para) is within size, otherwise its sentences as
// produced by splitBySentencesSeq.
func yieldSubParas(para string, size int) iter.Seq[string] {
	if len(para) > size {
		return splitBySentencesSeq(para)
	}
	return func(yield func(string) bool) {
		yield(para)
	}
}

// overlapPrefix builds the start of a new chunk by taking word-boundary-aligned
// overlap text from the end of the previous chunk and prepending it, separated
// by a blank line, to the new paragraph.
//
// overlap is measured in runes. newPara is returned unchanged when overlap is
// not positive, when prevChunk has no more than overlap runes, or when nothing
// remains of the overlap after alignment.
//
// Alignment drops everything up to and including the first word separator in
// the overlap window, plus any separators that follow it, so the carried text
// starts on a whole word. Spaces, tabs and line breaks all count as
// separators; chunks are assembled from paragraphs joined by newlines, so
// looking for spaces alone would leave a partial word whenever the window
// crosses a line break. If the window contains no separator at all it is kept
// as-is.
func overlapPrefix(prevChunk string, overlap int, newPara string) string {
	if overlap <= 0 {
		return newPara
	}

	runes := []rune(prevChunk)
	if len(runes) <= overlap {
		return newPara
	}

	// All separators are single-byte, so idx+1 below is always a rune boundary.
	const wordBreaks = " \t\r\n"

	// Take the last `overlap` runes of the previous chunk.
	overlapText := string(runes[len(runes)-overlap:])

	// Skip the (possibly partial) leading word and the separators after it.
	if idx := strings.IndexAny(overlapText, wordBreaks); idx >= 0 {
		overlapText = strings.TrimLeft(overlapText[idx+1:], wordBreaks)
	}

	if overlapText == "" {
		return newPara
	}

	return overlapText + "\n\n" + newPara
}

// splitBySentences is the slice-returning form of splitBySentencesSeq: it
// splits text at sentence boundaries (". ", "? ", "! ") and returns the
// trimmed sentences in order. Text without any boundary comes back as a
// single trimmed element; text that is empty or only whitespace yields nil.
func splitBySentences(text string) []string {
	seq := splitBySentencesSeq(text)
	var sentences []string
	return slices.AppendSeq(sentences, seq)
}

// splitBySentencesSeq returns an iterator that yields sentences split at
// boundaries (". ", "? ", "! ").
func splitBySentencesSeq(text string) iter.Seq[string] {
	return func(yield func(string) bool) {
		remaining := text

		for len(remaining) > 0 {
			// Find the earliest sentence boundary
			bestIdx := -1
			var bestSep string
			for _, sep := range []string{". ", "? ", "! "} {
				idx := strings.Index(remaining, sep)
				if idx >= 0 && (bestIdx < 0 || idx < bestIdx) {
					bestIdx = idx
					bestSep = sep
				}
			}

			if bestIdx < 0 {
				// No more boundaries — yield remainder if not empty
				if s := strings.TrimSpace(remaining); s != "" {
					if !yield(s) {
						return
					}
				}
				break
			}

			// Include the punctuation mark in the sentence, but not the trailing space
			sentence := remaining[:bestIdx+len(bestSep)-1]
			if s := strings.TrimSpace(sentence); s != "" {
				if !yield(s) {
					return
				}
			}
			remaining = remaining[bestIdx+len(bestSep):]
		}
	}
}

// splitBySections is the slice-returning form of splitBySectionsSeq: it cuts
// text at "## " header lines and returns the sections in order, each header
// staying attached to the content that follows it.
func splitBySections(text string) []string {
	seq := splitBySectionsSeq(text)
	var sections []string
	return slices.AppendSeq(sections, seq)
}

// splitBySectionsSeq returns an iterator that yields text sections split by ## headers.
func splitBySectionsSeq(text string) iter.Seq[string] {
	return func(yield func(string) bool) {
		var currentSection strings.Builder
		for line := range strings.SplitSeq(text, "\n") {
			// Check if this line is a ## header
			if strings.HasPrefix(line, "## ") {
				// Yield previous section if exists
				if currentSection.Len() > 0 {
					if !yield(currentSection.String()) {
						return
					}
					currentSection.Reset()
				}
			}
			currentSection.WriteString(line)
			currentSection.WriteString("\n")
		}

		// Don't forget the last section
		if currentSection.Len() > 0 {
			yield(currentSection.String())
		}
	}
}

// splitByParagraphs is the slice-returning form of splitByParagraphsSeq: it
// returns the pieces of text separated by blank lines (two or more
// consecutive newlines), in order.
func splitByParagraphs(text string) []string {
	seq := splitByParagraphsSeq(text)
	var paragraphs []string
	return slices.AppendSeq(paragraphs, seq)
}

// splitByParagraphsSeq returns an iterator that yields paragraphs split by double newlines.
func splitByParagraphsSeq(text string) iter.Seq[string] {
	return func(yield func(string) bool) {
		// Replace multiple newlines with a marker, then split
		normalized := text
		for strings.Contains(normalized, "\n\n\n") {
			normalized = strings.ReplaceAll(normalized, "\n\n\n", "\n\n")
		}
		for s := range strings.SplitSeq(normalized, "\n\n") {
			if !yield(s) {
				return
			}
		}
	}
}

// Category classifies a document by looking for well-known markers in its
// path. Matching is case-insensitive and substring-based; the rules are
// checked in order and the first rule with a matching marker wins. Paths
// that match no rule are categorised as "documentation".
func Category(path string) string {
	// Order matters: earlier rules take precedence over later ones.
	rules := []struct {
		category string
		markers  []string
	}{
		{category: "ui-component", markers: []string{"flux", "ui/component"}},
		{category: "brand", markers: []string{"brand", "mascot"}},
		{category: "product-brief", markers: []string{"brief"}},
		{category: "help-doc", markers: []string{"help", "draft"}},
		{category: "task", markers: []string{"task", "plan"}},
		{category: "architecture", markers: []string{"architecture", "migration"}},
	}

	needle := strings.ToLower(path)
	for _, rule := range rules {
		for _, marker := range rule.markers {
			if strings.Contains(needle, marker) {
				return rule.category
			}
		}
	}
	return "documentation"
}

// ChunkID derives an identifier for a chunk from its file path, its index and
// the leading part of its text. The three are joined as "path:index:text" and
// hashed with MD5; the result is the digest in lowercase hexadecimal.
//
// Only the first 100 runes of text take part in the hash. The cut is made on
// runes rather than bytes so a multi-byte UTF-8 character is never split.
func ChunkID(path string, index int, text string) string {
	const prefixRunes = 100

	head := []rune(text)
	if len(head) > prefixRunes {
		head = head[:prefixRunes]
	}

	digest := md5.Sum(fmt.Appendf(nil, "%s:%d:%s", path, index, string(head)))
	return fmt.Sprintf("%x", digest)
}

// FileExtensions lists the lowercase, dot-prefixed file extensions that are
// eligible for processing. Every call returns a fresh slice, so callers may
// modify the result freely.
func FileExtensions() []string {
	exts := [...]string{".md", ".markdown", ".txt"}
	return exts[:]
}

// ShouldProcess reports whether the file at path has one of the extensions
// listed by FileExtensions. The extension is lowercased before the lookup, so
// the check is case-insensitive.
func ShouldProcess(path string) bool {
	suffix := filepath.Ext(path)
	suffix = strings.ToLower(suffix)
	return slices.Index(FileExtensions(), suffix) >= 0
}