
perf: improve memory consumption in 2ms file walk #287

Open · wants to merge 15 commits into master
12 changes: 11 additions & 1 deletion .2ms.yml
@@ -69,4 +69,14 @@ ignore-result:
- 7b7c1a0b1c5760490d843e0b9bfe540665d20b28 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- 92b1996f9815a2fbd9299a1997ce0bc2c153624f # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- bf2e01278453a987f05b69e6c536358cab343322 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- c9ae034a5a03a540d50a2686f74fcbb5117f181c # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- c9ae034a5a03a540d50a2686f74fcbb5117f181c # value used for testing, found at https://github.com/Checkmarx/2ms/commit/07aab5bb214c03fd9e75e46cebe2b407c88d4f73/reporting/report_test.go#diff-31d71ec2c2ba169dce79b1c2de097e30b43f1695ce364054ee7d6b33896c7040R10
- e3b354d102fe73cd4f4016e1ee17e468256d2ae8 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-e905b1effebb0b95aa63f317b0890c08ec18e554bf3a543cf3badf0faafe832a
- 5c2e640a480ca64c809133e1b157fd97960356bf # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-e905b1effebb0b95aa63f317b0890c08ec18e554bf3a543cf3badf0faafe832a
- bdd20706ea03aa38c8c9f3f87200cf6ab9010a53 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ae36c3c74dde4f291e7d6caac2f9bbd9a63f2c29b7e92c4b4425acf6f0cc802
- 99f9ffb901cb72a0282ce32cf7dc050e5225cd81 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-723d6b3fef6018f60691402e443924399f9c1b3cafb76dcf28eb5e7fc21dba3e
- 0f80a32cc85ea5c04b65dbf7d6db6ddb8c2e4d29 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ea972bed8e88d77f19bfb442934dc9eedf64c0a3f9c08f19e4d03530935dcaf
- 29a593e19a06c138d63468b8a028696ccdfc7eb2 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ea972bed8e88d77f19bfb442934dc9eedf64c0a3f9c08f19e4d03530935dcaf
- 8149f62cd847f3c4ba5ffc502bdcb8d66e800c7f # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-0d972d38be57cb9913139901c0ef0c425a8adfe46731cf6f10cec1eb10ed4338
- 1bd84965941175ee61639964adbff6170bea7703 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-2ae36c3c74dde4f291e7d6caac2f9bbd9a63f2c29b7e92c4b4425acf6f0cc802
- f86543794ab8c77a54adc91581dcf72bfef6bf78 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-0d972d38be57cb9913139901c0ef0c425a8adfe46731cf6f10cec1eb10ed4338
- ae0f7e65c291d7f0ea998dfa77485bfc632e5d62 # value used for testing, found at https://github.com/Checkmarx/2ms/commit/9cce7dbf98e7954afd59e0c994f15d22bfc8c471#diff-0d972d38be57cb9913139901c0ef0c425a8adfe46731cf6f10cec1eb10ed4338
10 changes: 5 additions & 5 deletions cmd/main.go
@@ -131,17 +131,17 @@ func preRun(pluginName string, cmd *cobra.Command, args []string) error {
return err
}

engine, err := engine.Init(engineConfigVar)
engineInstance, err := engine.Init(engineConfigVar)
if err != nil {
return err
}

if err := engine.AddRegexRules(customRegexRuleVar); err != nil {
if err := engineInstance.AddRegexRules(customRegexRuleVar); err != nil {
return err
}

Channels.WaitGroup.Add(1)
go ProcessItems(engine, pluginName)
go ProcessItems(engineInstance, pluginName)

Channels.WaitGroup.Add(1)
go ProcessSecrets()
@@ -151,10 +151,10 @@ func preRun(pluginName string, cmd *cobra.Command, args []string) error {

if validateVar {
Channels.WaitGroup.Add(1)
go ProcessValidationAndScoreWithValidation(engine)
go ProcessValidationAndScoreWithValidation(engineInstance)
} else {
Channels.WaitGroup.Add(1)
go ProcessScoreWithoutValidation(engine)
go ProcessScoreWithoutValidation(engineInstance)
}

return nil
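Reviewer note: the workers in cmd/workers.go (next file) now receive engine.IEngine instead of the concrete *engine.Engine, which is what makes the engine mockable in tests. The interface declaration itself is not part of this excerpt; the sketch below is a hypothetical reconstruction inferred only from the call sites in preRun and ProcessItems, with placeholder item and secret types, and may differ from the real definition in the engine package.

package enginesketch

import "context"

// SourceItem and Secret are placeholders for the real 2ms item and secret types.
type SourceItem interface{}
type Secret struct{}

// IEngine is a hypothetical reconstruction of the interface the workers depend on.
type IEngine interface {
	// AddRegexRules registers user-supplied custom regex rules (called in preRun).
	AddRegexRules(rules []string) error
	// DetectFile scans a filesystem item and streams findings into secretsChan,
	// honouring ctx cancellation (the "filesystem" branch of ProcessItems).
	DetectFile(ctx context.Context, item SourceItem, secretsChan chan *Secret) error
	// DetectFragment scans an in-memory fragment produced by the other plugins.
	DetectFragment(item SourceItem, secretsChan chan *Secret, pluginName string) error
	// Validate runs secret validation once detection has finished.
	Validate()
}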
30 changes: 23 additions & 7 deletions cmd/workers.go
@@ -1,21 +1,37 @@
package cmd

import (
"context"
"github.com/checkmarx/2ms/engine"
"github.com/checkmarx/2ms/engine/extra"
"github.com/checkmarx/2ms/lib/secrets"
"golang.org/x/sync/errgroup"
"sync"
)

func ProcessItems(engine *engine.Engine, pluginName string) {
func ProcessItems(engineInstance engine.IEngine, pluginName string) {
defer Channels.WaitGroup.Done()
wgItems := &sync.WaitGroup{}

g, ctx := errgroup.WithContext(context.Background())
for item := range Channels.Items {
Report.TotalItemsScanned++
wgItems.Add(1)
go engine.Detect(item, SecretsChan, wgItems, pluginName, Channels.Errors)
item := item

switch pluginName {
case "filesystem":
g.Go(func() error {
return engineInstance.DetectFile(ctx, item, SecretsChan)
})
default:
g.Go(func() error {
return engineInstance.DetectFragment(item, SecretsChan, pluginName)
})
}
}

if err := g.Wait(); err != nil {
Channels.Errors <- err
}
wgItems.Wait()
close(SecretsChan)
}

@@ -48,7 +64,7 @@ func ProcessSecretsExtras() {
wgExtras.Wait()
}

func ProcessValidationAndScoreWithValidation(engine *engine.Engine) {
func ProcessValidationAndScoreWithValidation(engine engine.IEngine) {
defer Channels.WaitGroup.Done()

wgValidation := &sync.WaitGroup{}
@@ -64,7 +80,7 @@ func ProcessValidationAndScoreWithValidation(engine *engine.Engine) {
engine.Validate()
}

func ProcessScoreWithoutValidation(engine *engine.Engine) {
func ProcessScoreWithoutValidation(engine engine.IEngine) {
defer Channels.WaitGroup.Done()

wgScore := &sync.WaitGroup{}
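For reviewers unfamiliar with errgroup: the rewritten ProcessItems above replaces the per-item WaitGroup fan-out with golang.org/x/sync/errgroup, so the first detection error cancels the shared context and is reported exactly once by g.Wait(). A minimal standalone sketch of that pattern follows; the worker function and item names are placeholders, not 2ms code.

package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/errgroup"
)

// scanItem stands in for engineInstance.DetectFile; it is a placeholder worker.
func scanItem(ctx context.Context, item string) error {
	select {
	case <-ctx.Done():
		// A sibling goroutine already failed, so the group context is cancelled.
		return ctx.Err()
	default:
		fmt.Println("scanned", item)
		return nil
	}
}

func main() {
	g, ctx := errgroup.WithContext(context.Background())
	for _, item := range []string{"a.txt", "b.txt", "c.txt"} {
		item := item // capture the loop variable (needed before Go 1.22)
		g.Go(func() error {
			return scanItem(ctx, item)
		})
	}
	// Wait blocks until every goroutine returns and yields the first non-nil error.
	if err := g.Wait(); err != nil {
		fmt.Println("scan failed:", err)
	}
}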
196 changes: 196 additions & 0 deletions engine/chunk/chunk.go
@@ -0,0 +1,196 @@
package chunk

//go:generate mockgen -source=$GOFILE -destination=${GOPACKAGE}_mock.go -package=${GOPACKAGE}

import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"sync"
"unicode"

"github.com/h2non/filetype"
)

const (
defaultSize = 100 * 1024 // 100 KiB
defaultMaxPeekSize = 25 * 1024 // 25 KiB
defaultFileThreshold = 1 * 1024 * 1024 // 1 MiB
)

var ErrUnsupportedFileType = errors.New("unsupported file type")

type Chunk struct {
BufPool *sync.Pool
PeekBufPool *sync.Pool
Size int
MaxPeekSize int
SmallFileThreshold int64
}

type IChunk interface {
GetBuf() (*[]byte, bool)
PutBuf(buf *[]byte)
GetPeekBuf(buf []byte) (*bytes.Buffer, bool)
PutPeekBuf(buf *bytes.Buffer)
GetSize() int
GetMaxPeekSize() int
GetFileThreshold() int64
ReadChunk(reader *bufio.Reader, totalLines int) (string, error)
}

func NewChunk() *Chunk {
return NewChunkWithSize(defaultSize, defaultMaxPeekSize, defaultFileThreshold)
}

func NewChunkWithSize(size, maxPeekSize, smallFileThreshold int) *Chunk {
return &Chunk{
BufPool: &sync.Pool{
New: func() interface{} {
b := make([]byte, size)
return &b
},
},
PeekBufPool: &sync.Pool{
New: func() interface{} {
// pre-allocate enough capacity for initial chunk + peek
return bytes.NewBuffer(make([]byte, 0, size+maxPeekSize))
},
},
Size: size,
MaxPeekSize: maxPeekSize,
SmallFileThreshold: int64(smallFileThreshold),
}
}

func (c *Chunk) GetBuf() (*[]byte, bool) {
buf, ok := c.BufPool.Get().(*[]byte)
return buf, ok
}

func (c *Chunk) PutBuf(buf *[]byte) {
c.BufPool.Put(buf)
}

func (c *Chunk) GetPeekBuf(buf []byte) (*bytes.Buffer, bool) {
peekBuf, ok := c.PeekBufPool.Get().(*bytes.Buffer)
peekBuf.Reset()
peekBuf.Write(buf) // seed with buf
return peekBuf, ok
}

func (c *Chunk) PutPeekBuf(buf *bytes.Buffer) {
c.PeekBufPool.Put(buf)
}

func (c *Chunk) GetSize() int {
return c.Size
}

func (c *Chunk) GetMaxPeekSize() int {
return c.MaxPeekSize
}

func (c *Chunk) GetFileThreshold() int64 {
return c.SmallFileThreshold
}

// ReadChunk reads the next chunk of data from the file
func (c *Chunk) ReadChunk(reader *bufio.Reader, totalLines int) (string, error) {
chunk, ok := c.GetBuf()
if !ok {
return "", fmt.Errorf("expected *[]byte, got %T", chunk)
}
defer c.PutBuf(chunk)

n, err := reader.Read(*chunk)
var chunkStr string
// "Callers should always process the n > 0 bytes returned before considering the error err."
// https://pkg.go.dev/io#Reader
if n > 0 {
// only check the filetype at the start of file
if totalLines == 0 && ShouldSkipFile((*chunk)[:n]) {
return "", fmt.Errorf("skipping file: %w", ErrUnsupportedFileType)
}

chunkStr, err = c.processChunk(reader, (*chunk)[:n])
if err != nil {
return "", err
}
}
if err != nil {
return "", err
}
return chunkStr, nil
}

// processChunk processes the chunk, reading until a safe boundary
func (c *Chunk) processChunk(reader *bufio.Reader, chunk []byte) (string, error) {
peekBuf, ok := c.GetPeekBuf(chunk)
if !ok {
return "", fmt.Errorf("expected *bytes.Buffer, got %T", peekBuf)
}
defer c.PutPeekBuf(peekBuf)

if readErr := c.readUntilSafeBoundary(reader, len(chunk), peekBuf); readErr != nil {
return "", fmt.Errorf("failed to read until safe boundary for file: %w", readErr)
}

return peekBuf.String(), nil
}

// readUntilSafeBoundary keeps reading so that a chunk (hopefully) does not split a secret across a boundary (https://github.com/gitleaks/gitleaks/issues/1651)
func (c *Chunk) readUntilSafeBoundary(r *bufio.Reader, n int, peekBuf *bytes.Buffer) error {
if peekBuf.Len() == 0 {
return nil
}

// keep reading until we see our "\n…\n" boundary or hit the peek limit
for peekBuf.Len()-n < c.MaxPeekSize {
if endsWithTwoNewlines(peekBuf.Bytes()) {
return nil
}

b, err := r.ReadByte()
if err != nil {
if err == io.EOF {
return nil
}
return fmt.Errorf("failed to read byte: %w", err)
}
peekBuf.WriteByte(b)
}

return nil
}

// endsWithTwoNewlines returns true if b ends in at least two '\n's (ignoring any number of ' ', '\r', or '\t' between them)
func endsWithTwoNewlines(b []byte) bool {
count := 0
for i := len(b) - 1; i >= 0; i-- {
if b[i] == '\n' {
count++
if count >= 2 {
return true
}
} else if unicode.IsSpace(rune(b[i])) {
// the presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count
continue
} else {
return false
}
}
return false
}

// ShouldSkipFile checks if the file should be skipped based on its content type
func ShouldSkipFile(data []byte) bool {
// TODO: could other optimizations be introduced here?
mimetype, err := filetype.Match(data)
if err != nil {
return true // could not determine file type
}
return mimetype.MIME.Type == "application" // skip binary files
}
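The engine-side caller that drives this chunker (presumably the new DetectFile path) is not part of this excerpt. Purely as an illustration of the API above, a usage loop consistent with it might look like the following; the file path and line counting are assumptions, and the import path assumes the package lives at engine/chunk as the diff indicates.

package main

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"log"
	"os"
	"strings"

	"github.com/checkmarx/2ms/engine/chunk"
)

func main() {
	f, err := os.Open("testdata/sample.txt") // illustrative path
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	c := chunk.NewChunk()
	reader := bufio.NewReaderSize(f, c.GetSize())

	totalLines := 0
	for {
		// ReadChunk returns up to Size bytes, extended to a blank-line boundary
		// (at most MaxPeekSize extra) so secrets are less likely to be split.
		chunkStr, err := c.ReadChunk(reader, totalLines)
		if errors.Is(err, io.EOF) {
			break // whole file consumed
		}
		if errors.Is(err, chunk.ErrUnsupportedFileType) {
			log.Println("binary file, skipping")
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		totalLines += strings.Count(chunkStr, "\n")
		fmt.Printf("scanned %d bytes (lines so far: %d)\n", len(chunkStr), totalLines)
	}
}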