-
Notifications
You must be signed in to change notification settings - Fork 21
perf: improve memory consumption in 2ms file walk #287
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
perf: improve memory consumption in 2ms file walk #287
Conversation
Policy Management Violations (1)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should have a package under engine, something like engine/semaphore. We can have all the logic there that handles everything related to the weights and memory (including the one in cmd/memory).
In cmd
we can then handle the DI of the semaphore by injecting it on the engine instance. It is in the end, the semaphore that knows which memory budget to allocate and how to handle the weights, and the engine, that should operate and depend on the semaphore.
WDYT?
cmd/memory.go
Outdated
@@ -0,0 +1,67 @@ | |||
package cmd |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
will this work properly on container settings? If it does, let's add tests to them.
engine/engine.go
Outdated
) | ||
|
||
var ( | ||
// Hols the buffer for reading chunks |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Hols the buffer for reading chunks | |
// Holds the buffer for reading chunks |
engine/utils/utils.go
Outdated
func BuildSecret(item plugins.ISourceItem, idx int, values []report.Finding, value report.Finding, | ||
pluginName string) (*secrets.Secret, error) { | ||
gitInfo := item.GetGitInfo() | ||
itemId := getFindingId(item, value) | ||
startLine, endLine, err := getStartAndEndLines(pluginName, gitInfo, value) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get start and end lines for source %s: %w", item.GetSource(), err) | ||
} | ||
|
||
if idx == len(values)-1 && strings.HasSuffix(value.Line, CxFileEndMarker) { | ||
value.Line = value.Line[:len(value.Line)-len(CxFileEndMarker)] | ||
value.EndColumn-- | ||
} | ||
|
||
lineContent, err := linecontent.GetLineContent(value.Line, value.Secret) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get line content for source %s: %w", item.GetSource(), err) | ||
} | ||
|
||
secret := &secrets.Secret{ | ||
ID: itemId, | ||
Source: item.GetSource(), | ||
RuleID: value.RuleID, | ||
StartLine: startLine, | ||
StartColumn: value.StartColumn, | ||
EndLine: endLine, | ||
EndColumn: value.EndColumn, | ||
Value: value.Secret, | ||
LineContent: lineContent, | ||
RuleDescription: value.Description, | ||
} | ||
return secret, nil | ||
} | ||
|
||
func getFindingId(item plugins.ISourceItem, finding report.Finding) string { | ||
idParts := []string{item.GetID(), finding.RuleID, finding.Secret} | ||
sha := sha1.Sum([]byte(strings.Join(idParts, "-"))) | ||
return fmt.Sprintf("%x", sha) | ||
} | ||
|
||
func getStartAndEndLines(pluginName string, gitInfo *plugins.GitInfo, value report.Finding) (int, int, error) { | ||
var startLine, endLine int | ||
var err error | ||
|
||
if pluginName == "filesystem" { | ||
startLine = value.StartLine + 1 | ||
endLine = value.EndLine + 1 | ||
} else if pluginName == "git" { | ||
startLine, endLine, err = plugins.GetGitStartAndEndLine(gitInfo, value.StartLine, value.EndLine) | ||
if err != nil { | ||
return 0, 0, err | ||
} | ||
} else { | ||
startLine = value.StartLine | ||
endLine = value.EndLine | ||
} | ||
|
||
return startLine, endLine, nil | ||
} | ||
|
||
func IsSecretIgnored(secret *secrets.Secret, ignoredIds, allowedValues *[]string) bool { | ||
for _, allowedValue := range *allowedValues { | ||
if secret.Value == allowedValue { | ||
return true | ||
} | ||
} | ||
for _, ignoredId := range *ignoredIds { | ||
if secret.ID == ignoredId { | ||
return true | ||
} | ||
} | ||
return false | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think all of these belong to the engine, they operate on structs around our business logic of the scan and items.
engine/utils/utils.go
Outdated
// ReadUntilSafeBoundary This hopefully avoids splitting. (https://github.com/gitleaks/gitleaks/issues/1651) | ||
func ReadUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error { | ||
if peekBuf.Len() == 0 { | ||
return nil | ||
} | ||
|
||
// Does the buffer end in consecutive newlines? | ||
var ( | ||
data = peekBuf.Bytes() | ||
lastChar = data[len(data)-1] | ||
newlineCount = 0 // Tracks consecutive newlines | ||
) | ||
if isWhitespace(lastChar) { | ||
for i := len(data) - 1; i >= 0; i-- { | ||
lastChar = data[i] | ||
if lastChar == '\n' { | ||
newlineCount++ | ||
|
||
// Stop if two consecutive newlines are found | ||
if newlineCount >= 2 { | ||
return nil | ||
} | ||
} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' { | ||
// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count. | ||
// (Intentionally do nothing.) | ||
} else { | ||
break | ||
} | ||
} | ||
} | ||
|
||
// If not, read ahead until we (hopefully) find some. | ||
newlineCount = 0 | ||
for { | ||
data = peekBuf.Bytes() | ||
// Check if the last character is a newline. | ||
lastChar = data[len(data)-1] | ||
if lastChar == '\n' { | ||
newlineCount++ | ||
|
||
// Stop if two consecutive newlines are found | ||
if newlineCount >= 2 { | ||
break | ||
} | ||
} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' { | ||
// The presence of other whitespace characters (`\r`, ` `, `\t`) shouldn't reset the count. | ||
// (Intentionally do nothing.) | ||
} else { | ||
newlineCount = 0 // Reset if a non-newline character is found | ||
} | ||
|
||
// Stop growing the buffer if it reaches maxSize | ||
if (peekBuf.Len() - n) >= maxPeekSize { | ||
break | ||
} | ||
|
||
// Read additional data into a temporary buffer | ||
b, err := r.ReadByte() | ||
if err != nil { | ||
if err == io.EOF { | ||
break | ||
} | ||
return fmt.Errorf("failed to read byte: %w", err) | ||
} | ||
peekBuf.WriteByte(b) | ||
} | ||
return nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the part that should belong to the semaphore logic
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This logic is specifically related to chunking/peeking logic, why do you think it should belong to the semaphore logic?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, you're right. Semaphore does not make sense here but this is not an util either. Maybe a chunk
package is what we are missing here because even in our new logic, these chunks seem to be the central piece but all we see are buffers on the code.
I am still getting familiar with the structure but for now it feels like the Engine
is a bit of a god struct where it handles all the details that doesn't need to be aware of. A new Chunk
structure could know how to process its data and the Engine
should just call it.
engine/utils/utils.go
Outdated
// AcquireMemoryWeight acquires a semaphore with a specified weight | ||
func AcquireMemoryWeight(ctx context.Context, weight, memoryBudget int64, sem *semaphore.Weighted) error { | ||
if weight > memoryBudget { | ||
return fmt.Errorf("buffer size %d exceeds memory budget %d", weight, memoryBudget) | ||
} | ||
if err := sem.Acquire(ctx, weight); err != nil { | ||
return fmt.Errorf("failed to acquire semaphore: %w", err) | ||
} | ||
return nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Semaphore logic
cmd/memory.go
Outdated
func getCgroupMemoryLimit() uint64 { | ||
// Try cgroup v2: unified hierarchy | ||
if data, err := os.ReadFile("/sys/fs/cgroup/memory.max"); err == nil { | ||
s := strings.TrimSpace(string(data)) | ||
if s != "max" { | ||
if v, err := strconv.ParseUint(s, 10, 64); err == nil { | ||
return v | ||
} | ||
} | ||
} | ||
// Fallback cgroup v1 | ||
if data, err := os.ReadFile("/sys/fs/cgroup/memory/memory.limit_in_bytes"); err == nil { | ||
if v, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64); err == nil { | ||
return v | ||
} | ||
} | ||
// No limit detected | ||
return ^uint64(0) // max uint64 | ||
} | ||
|
||
// getTotalMemory returns the total physical RAM in bytes | ||
func getTotalMemory() uint64 { | ||
if vm, err := mem.VirtualMemory(); err == nil { | ||
return vm.Total | ||
} | ||
return ^uint64(0) // max uint64 | ||
} | ||
|
||
// chooseMemoryBudget picks 50% of total RAM (but at least 256 MiB) | ||
func chooseMemoryBudget() int64 { | ||
// Physical RAM | ||
totalHost := getTotalMemory() | ||
|
||
// Cgroup limit | ||
cgroupLimit := getCgroupMemoryLimit() | ||
|
||
// Effective total = min(host, cgroup) | ||
var effectiveTotal uint64 | ||
if totalHost < cgroupLimit { | ||
effectiveTotal = totalHost | ||
} else { | ||
effectiveTotal = cgroupLimit | ||
} | ||
|
||
// use 50% but cap to [256 MiB -> total − safety margin] | ||
safetyMargin := uint64(200 * 1024 * 1024) // reserve 200 MiB for OS/other processes | ||
avail := effectiveTotal | ||
if effectiveTotal > safetyMargin { | ||
avail = effectiveTotal - safetyMargin | ||
} | ||
budget := int64(avail / 2) // use half of what remains | ||
if budget < 256*1024*1024 { | ||
budget = 256 * 1024 * 1024 | ||
} | ||
return budget | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be under semaphore logic, part of the constructor/initialization logic.
It makes perfect sense, it is a cleaner and more modular approach. |
Proposed Changes
Checklist