Skip to content

Commit e08b606

Browse files
authored
Add flag to ignore comments (#6)
* Add mvp for ignoring code comments * Add multiline regex * Update README.md with new command line options
1 parent b876a93 commit e08b606

File tree

7 files changed

+80
-20
lines changed

7 files changed

+80
-20
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ go.work
2929
*.zip
3030
text.txt
3131
text.json
32+
git2gpt

README.md

+7-8
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22

33
git2gpt is a command-line utility that converts a Git repository to text for loading into ChatGPT and other NLP models. The output text file represents the Git repository in a structured format. You can also add a `.gptignore` file to your repos to have git2gpt ignore certain files. The text is prefixed with a preamble that explains to the AI what the text is:
44

5-
65
> The following text is a Git repository with code. The structure of the text is a series of sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable number of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encountered. Any further text beyond --END-- is meant to be interpreted as instructions using the aforementioned Git repository as context.
76
8-
97
## Installation
108

119
First, make sure you have the Go programming language installed on your system. You can download it from [the official Go website](https://golang.org/dl/).
@@ -32,12 +30,13 @@ By default, your `.git` directory and your `.gitignore` files are ignored. Any f
3230

3331
### Flags
3432

35-
* `-p`, `--preamble`: Path to a text file containing a preamble to include at the beginning of the output file.
36-
* `-o`, `--output`: Path to the output file. If not specified, will print to standard output.
37-
* `-e`, `--estimate`: Estimate the tokens of the output file. If not specified, does not estimate.
38-
* `-j`, `--json`: Output to JSON rather than plain text. Use with `-o` to specify the output file.
39-
* `-i`, `--ignore`: Path to the `.gptignore` file. If not specified, will look for a `.gptignore` file in the same directory as the `.gitignore` file.
40-
* `-g`, `--ignore-gitignore`: Ignore the `.gitignore` file.
33+
* `-p`, `--preamble`: Path to a text file containing a preamble to include at the beginning of the output file.
34+
* `-o`, `--output`: Path to the output file. If not specified, will print to standard output.
35+
* `-e`, `--estimate`: Estimate the tokens of the output file. If not specified, does not estimate.
36+
* `-j`, `--json`: Output to JSON rather than plain text. Use with `-o` to specify the output file.
37+
* `-i`, `--ignore`: Path to the `.gptignore` file. If not specified, will look for a `.gptignore` file in the same directory as the `.gitignore` file.
38+
* `-g`, `--ignore-gitignore`: Ignore the `.gitignore` file.
39+
* `-s`, `--scrub-comments`: Remove comments from the output file to save tokens.
4140

4241
## Contributing
4342

cmd/root.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ var ignoreFilePath string
1616
var ignoreGitignore bool
1717
var outputJSON bool
1818
var debug bool
19+
var scrubComments bool
1920

2021
var rootCmd = &cobra.Command{
2122
Use: "git2gpt [flags] /path/to/git/repository",
@@ -30,7 +31,7 @@ var rootCmd = &cobra.Command{
3031
os.Exit(1)
3132
}
3233
if outputJSON {
33-
output, err := prompt.MarshalRepo(repo)
34+
output, err := prompt.MarshalRepo(repo, scrubComments)
3435
if err != nil {
3536
fmt.Printf("Error: %s\n", err)
3637
os.Exit(1)
@@ -53,7 +54,7 @@ var rootCmd = &cobra.Command{
5354
}
5455
return
5556
}
56-
output, err := prompt.OutputGitRepo(repo, preambleFile)
57+
output, err := prompt.OutputGitRepo(repo, preambleFile, scrubComments)
5758
if err != nil {
5859
fmt.Printf("Error: %s\n", err)
5960
os.Exit(1)
@@ -94,6 +95,8 @@ func init() {
9495
rootCmd.Flags().BoolVarP(&outputJSON, "json", "j", false, "output JSON")
9596
// debug. Should be a bool
9697
rootCmd.Flags().BoolVarP(&debug, "debug", "d", false, "debug mode. Do not output to standard output")
98+
// scrub comments. Should be a bool
99+
rootCmd.Flags().BoolVarP(&scrubComments, "scrub-comments", "s", false, "scrub comments from the output. Decreases token count")
97100
}
98101

99102
func Execute() {

go.mod

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ require (
88
)
99

1010
require (
11+
github.com/dlclark/regexp2 v1.10.0 // indirect
12+
github.com/google/uuid v1.3.0 // indirect
1113
github.com/inconshreveable/mousetrap v1.0.1 // indirect
14+
github.com/pkoukk/tiktoken-go v0.1.6 // indirect
1215
github.com/spf13/pflag v1.0.5 // indirect
1316
)

go.sum

+6
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
2+
github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
3+
github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
24
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
35
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
6+
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
7+
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
48
github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc=
59
github.com/inconshreveable/mousetrap v1.0.1/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
10+
github.com/pkoukk/tiktoken-go v0.1.6 h1:JF0TlJzhTbrI30wCvFuiw6FzP2+/bR+FIxUdgEAcUsw=
11+
github.com/pkoukk/tiktoken-go v0.1.6/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg=
612
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
713
github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA=
814
github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY=

prompt/prompt.go

+17-10
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@ import (
44
"bufio"
55
"encoding/json"
66
"fmt"
7-
"math"
87
"os"
98
"path/filepath"
109
"strings"
1110
"unicode/utf8"
1211

12+
"github.com/chand1012/git2gpt/utils"
1313
"github.com/gobwas/glob"
14+
"github.com/pkoukk/tiktoken-go"
1415
)
1516

1617
// GitFile is a file in a Git repository
@@ -89,7 +90,7 @@ func GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []st
8990
// .gptignore file exists
9091
ignoreList, _ = getIgnoreList(ignoreFilePath)
9192
}
92-
ignoreList = append(ignoreList, ".git/**", ".gitignore")
93+
ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore")
9394

9495
if useGitignore {
9596
gitignorePath := filepath.Join(repoPath, ".gitignore")
@@ -131,7 +132,7 @@ func ProcessGitRepo(repoPath string, ignoreList []string) (*GitRepo, error) {
131132
}
132133

133134
// OutputGitRepo outputs a Git repository to a text file
134-
func OutputGitRepo(repo *GitRepo, preambleFile string) (string, error) {
135+
func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (string, error) {
135136
var repoBuilder strings.Builder
136137

137138
if preambleFile != "" {
@@ -148,6 +149,9 @@ func OutputGitRepo(repo *GitRepo, preambleFile string) (string, error) {
148149
for _, file := range repo.Files {
149150
repoBuilder.WriteString("----\n")
150151
repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Path))
152+
if scrubComments {
153+
file.Contents = utils.RemoveCodeComments(file.Contents)
154+
}
151155
repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Contents))
152156
}
153157

@@ -160,9 +164,9 @@ func OutputGitRepo(repo *GitRepo, preambleFile string) (string, error) {
160164
return output, nil
161165
}
162166

163-
func MarshalRepo(repo *GitRepo) ([]byte, error) {
167+
func MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) {
164168
// run the output function to get the total tokens
165-
_, err := OutputGitRepo(repo, "")
169+
_, err := OutputGitRepo(repo, "", scrubComments)
166170
if err != nil {
167171
return nil, fmt.Errorf("error marshalling repo: %w", err)
168172
}
@@ -208,9 +212,12 @@ func processRepository(repoPath string, ignoreList []string, repo *GitRepo) erro
208212

209213
// EstimateTokens estimates the number of tokens in a string
210214
func EstimateTokens(output string) int64 {
211-
tokenCount := float64(len(output))
212-
// divide by 3.5 to account for the fact that GPT-4 uses (roughly) 3.5 tokens per character
213-
tokenCount = tokenCount / 3.5
214-
// round up to the nearest integer
215-
return int64(math.Ceil(tokenCount))
215+
tke, err := tiktoken.GetEncoding("cl100k_base")
216+
if err != nil {
217+
fmt.Println("Error getting encoding:", err)
218+
return 0
219+
}
220+
221+
tokens := tke.Encode(output, nil, nil)
222+
return int64(len(tokens))
216223
}

utils/strings.go

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package utils
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"regexp"
7+
"strings"
8+
)
9+
10+
// Comment patterns used by RemoveCodeComments. They are compiled once at
// package initialisation rather than on every call, because the function is
// invoked once per file in the repository.
var (
	// lineCommentPattern matches a line that, after optional leading
	// whitespace, starts with a single-line comment marker
	// (//, #, --, <!--, %, ; or "REM "). The whole line is matched.
	lineCommentPattern = regexp.MustCompile(`^\s*(//|#|--|<!--|%|;|REM\s).*$`)

	// blockCommentPattern matches /* ... */ and <!-- ... --> comments. The
	// (?s) flag lets "." cross newlines and the lazy quantifier stops at the
	// first terminator.
	blockCommentPattern = regexp.MustCompile(`(?s)/\*.*?\*/|<!--.*?-->`)
)

// RemoveCodeComments removes single-line and multiline comments from the
// provided code string.
//
// Block comments (/* ... */ and <!-- ... -->) are removed from the whole text
// first, so that comments spanning several lines are deleted together with
// their bodies. Afterwards every line that consists solely of a single-line
// comment is dropped. Lines that end up empty are omitted from the result and
// trailing newlines are trimmed. Trailing comments that follow code on the same
// line are left untouched.
//
// The stripping is heuristic and language-agnostic: comment markers that
// appear inside string literals are treated like real comments.
func RemoveCodeComments(code string) string {
	// Block comments must go before the per-line pass. Otherwise a line such
	// as "<!-- start" or "// end */" would be deleted on its own, leaving the
	// rest of the comment behind or its terminator missing.
	withoutBlocks := blockCommentPattern.ReplaceAllString(code, "")

	var result strings.Builder
	result.Grow(len(withoutBlocks))

	scanner := bufio.NewScanner(strings.NewReader(withoutBlocks))
	// The default Scanner limit is 64 KiB per line; minified or generated
	// files easily exceed it. Allow a single line as long as the whole input
	// (+1 so the buffer is never completely full before EOF is seen).
	scanner.Buffer(make([]byte, 0, 4096), len(withoutBlocks)+1)

	for scanner.Scan() {
		cleanLine := lineCommentPattern.ReplaceAllString(scanner.Text(), "")
		if cleanLine != "" {
			fmt.Fprintln(&result, cleanLine)
		}
	}

	if err := scanner.Err(); err != nil {
		// Should be unreachable for an in-memory reader with the buffer sized
		// above. If it ever happens, return the input unchanged rather than
		// emitting truncated content or an error message as file contents.
		return code
	}

	return strings.TrimRight(result.String(), "\n")
}

0 commit comments

Comments
 (0)