Skip to content

Commit 39cfe69

Browse files
committed
parse.go: Print process progress.
1 parent 2728c6d commit 39cfe69

File tree

3 files changed

+126
-63
lines changed

3 files changed

+126
-63
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,11 @@ Remember to run `M-x org-roam-db-sync` to sync the org-roam database.
131131
- [ ] One-click export from Kindle app.
132132
- [ ] Change parser/exporter type from string to safe type.
133133
- [x] `My Clippings.txt` parser.
134-
- [ ] Diff the previous processed `My Clippings.txt`
134+
- [ ] Diff against the previously processed `My Clippings.txt` so the whole file doesn't have to be parsed again.
135135
- [ ] Support multiple authors.
136-
- [ ] Add progress indicator
136+
- [x] Add progress indicator.
137+
- [ ] Difference between createdAt and when notes are added to the database.
138+
- [ ] Figure out the difference in length between the JSON and the loaded MongoDB JSON.
137139

138140
### Server Backend
139141
- [x] Database storage.

pkg/parser/kindlemyclippings/parser.go

Lines changed: 86 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -29,39 +29,71 @@ func (p *KindleMyClippingsParser) LoadConfigs(cmd *cobra.Command) {
2929
cmd.Flags().Float64Var(&p.minSimilarity, "min-similarity", 0.8, "Minimum similarity percentage (0-1) to consider a highlight as duplicate")
3030
}
3131

32-
// Parse processes the MyClippings.txt file and returns a list of Books with deduplicated Marks.
3332
func (p *KindleMyClippingsParser) Parse(inputPath string) ([]*model.Book, error) {
33+
// Open the file
3434
file, err := os.Open(inputPath)
3535
defer file.Close()
3636
if err != nil {
37-
return nil, errors.Wrap(err, "")
37+
return nil, errors.Wrap(err, "failed to open input file")
38+
}
39+
40+
// Count total lines in the file to calculate parsing progress
41+
totalLines, err := countLines(file)
42+
if err != nil {
43+
return nil, errors.Wrap(err, "failed to count lines in input file")
44+
}
45+
// Reset the file cursor to the beginning after counting lines
46+
if _, err := file.Seek(0, 0); err != nil {
47+
return nil, errors.Wrap(err, "failed to reset file cursor")
3848
}
3949

4050
var books []*model.Book
4151
markListMap := make(map[string][]*model.Mark) // Maps book title -> list of marks
4252

4353
scanner := bufio.NewScanner(file)
54+
lineCount := 0 // Total number of lines processed
55+
entryCount := 0 // Total number of marks (entries)
56+
57+
// Function to print parsing progress
58+
printParsingProgress := func() {
59+
if lineCount%500 == 0 || lineCount == totalLines {
60+
progress := (float64(lineCount) / float64(totalLines)) * 100
61+
fmt.Fprintf(os.Stderr, "\rParsing Progress: %.2f%% (%d/%d lines processed, %d entries parsed)", progress, lineCount, totalLines, entryCount)
62+
}
63+
}
64+
65+
// Parsing phase
4466
for scanner.Scan() {
67+
lineCount++
68+
printParsingProgress()
69+
4570
// Extract book details
4671
title, author, err := extractTitleAndAuthor(stripLeadingBOM(scanner.Text()))
4772
if err != nil {
48-
return nil, errors.Wrap(err, "")
73+
return nil, errors.Wrap(err, fmt.Sprintf("error parsing title and author at line %d", lineCount))
4974
}
5075

5176
// Parse metadata
5277
if !scanner.Scan() {
53-
return nil, fmt.Errorf("unexpected EOF, expecting metadata")
78+
return nil, fmt.Errorf("unexpected EOF at line %d, expecting metadata", lineCount)
5479
}
80+
lineCount++
81+
printParsingProgress()
5582
meta := scanner.Text()
5683
markType, location, createdAt, err := extractMeta(meta)
5784
if err != nil {
58-
return nil, errors.Wrap(err, "")
85+
return nil, errors.Wrap(err, fmt.Sprintf("error parsing metadata at line %d", lineCount))
5986
}
6087

6188
scanner.Scan() // Skip the empty line before the actual text.
89+
lineCount++
90+
printParsingProgress()
6291

6392
var text []string
6493
for scanner.Scan() {
94+
lineCount++
95+
printParsingProgress()
96+
6597
line := scanner.Text()
6698
if line == "==========" {
6799
break
@@ -71,7 +103,7 @@ func (p *KindleMyClippingsParser) Parse(inputPath string) ([]*model.Book, error)
71103

72104
// Parse data or note
73105
if text == nil { // Empty notes, skip
74-
return nil, nil
106+
continue
75107
}
76108

77109
// Create the mark
@@ -99,42 +131,42 @@ func (p *KindleMyClippingsParser) Parse(inputPath string) ([]*model.Book, error)
99131
markListMap[title] = []*model.Mark{}
100132
}
101133
markListMap[title] = append(markListMap[title], mark)
134+
135+
entryCount++
102136
}
103137

104-
// Deduplicate marks for each book
138+
// Deduplication phase
139+
totalBooks := len(markListMap)
140+
processedBooks := 0
141+
105142
for title, marks := range markListMap {
106-
deduplicatedMarks := deduplicateMarks(marks, p.minSimilarity) // Use the configured threshold
143+
processedBooks++
144+
fmt.Fprintf(os.Stderr, "\nStarting deduplication for book: %s (%d marks)\n", title, len(marks))
145+
146+
deduplicatedMarks := deduplicateMarksWithProgress(marks, p.minSimilarity) // Deduplication with progress inside
107147
book := &model.Book{
108148
Title: title,
109149
Author: marks[0].Author,
110150
Marks: deduplicatedMarks,
111151
}
112152
books = append(books, book)
113-
}
114153

115-
if err := scanner.Err(); err != nil {
116-
return nil, errors.Wrap(err, "")
154+
// Print progress across books
155+
bookProgress := (float64(processedBooks) / float64(totalBooks)) * 100
156+
fmt.Fprintf(os.Stderr, "\rTotal Deduplication Progress: %.2f%% (%d/%d books processed)", bookProgress, processedBooks, totalBooks)
117157
}
118158

119-
model.SortBooksByTitle(books)
120-
159+
// Final summary
160+
fmt.Fprintln(os.Stderr) // Add a newline after the progress output
161+
fmt.Fprintf(os.Stderr, "Finished processing: %d lines parsed, %d books deduplicated, %d marks processed\n", lineCount, totalBooks, entryCount)
121162
return books, nil
122163
}
123164

124-
func stripLeadingBOM(s string) string {
125-
runes := []rune(s)
126-
start := 0
127-
for start < len(runes) && runes[start] == '\uFEFF' {
128-
start++
129-
}
130-
return string(runes[start:])
131-
}
132-
133-
// Deduplicate marks by comparing similarity of content using a minimum common words threshold
134-
func deduplicateMarks(marks []*model.Mark, minSimilarity float64) []*model.Mark {
165+
func deduplicateMarksWithProgress(marks []*model.Mark, minSimilarity float64) []*model.Mark {
135166
var deduplicated []*model.Mark
167+
totalMarks := len(marks)
136168

137-
for _, mark := range marks {
169+
for index, mark := range marks {
138170
duplicateFound := false
139171
for i, dedupMark := range deduplicated {
140172
if hasMinCommonWords(dedupMark.Data, mark.Data, minSimilarity) {
@@ -149,11 +181,40 @@ func deduplicateMarks(marks []*model.Mark, minSimilarity float64) []*model.Mark
149181
if !duplicateFound {
150182
deduplicated = append(deduplicated, mark)
151183
}
184+
185+
// Print deduplication progress for current book every 100 marks
186+
if (index+1)%100 == 0 || index+1 == totalMarks {
187+
progress := (float64(index+1) / float64(totalMarks)) * 100
188+
fmt.Fprintf(os.Stderr, "\rDeduplication Progress for current book: %.2f%% (%d/%d marks processed)", progress, index+1, totalMarks)
189+
}
152190
}
153191

192+
// Print a new line after finishing the current book
193+
fmt.Fprintln(os.Stderr)
154194
return deduplicated
155195
}
156196

197+
func countLines(file *os.File) (int, error) {
198+
scanner := bufio.NewScanner(file)
199+
lineCount := 0
200+
for scanner.Scan() {
201+
lineCount++
202+
}
203+
if err := scanner.Err(); err != nil {
204+
return 0, err
205+
}
206+
return lineCount, nil
207+
}
208+
209+
func stripLeadingBOM(s string) string {
210+
runes := []rune(s)
211+
start := 0
212+
for start < len(runes) && runes[start] == '\uFEFF' {
213+
start++
214+
}
215+
return string(runes[start:])
216+
}
217+
157218
// Check if two strings share a longest common substring whose percentage of similarity meets the threshold
158219
func hasMinCommonWords(a, b string, minSimilarity float64) bool {
159220
// Preprocess both strings to handle Chinese text

tests/my_clippings_output.json

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,42 @@
5555
}
5656
]
5757
},
58+
{
59+
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
60+
"author": "曹雪芹",
61+
"marks": [
62+
{
63+
"type": "HIGHLIGHT",
64+
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
65+
"author": "曹雪芹",
66+
"location": {
67+
"location": 15868
68+
},
69+
"data": "都是尤二姐素习所穿的,不禁又伤心哭了起来。自己用个包袱一齐包了,也不命小厮丫鬟来拿,便自己提着来烧。",
70+
"createdAt": 1642265828
71+
},
72+
{
73+
"type": "HIGHLIGHT",
74+
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
75+
"author": "曹雪芹",
76+
"location": {
77+
"location": 15869
78+
},
79+
"data": "平儿又是伤心,又是好笑,忙将二百两一包的碎银子偷了出来,",
80+
"createdAt": 1642265863
81+
},
82+
{
83+
"type": "HIGHLIGHT",
84+
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
85+
"author": "曹雪芹",
86+
"location": {
87+
"location": 15871
88+
},
89+
"data": "又将一条裙子递与平儿,说:“这是他家常穿的,你好生替我收着,作个念心儿。”",
90+
"createdAt": 1642265884
91+
}
92+
]
93+
},
5894
{
5995
"title": "小行星掉在下午(首届宝珀理想国文学奖决选作者沈大成全新小说集,给沈大成6分钟,她给你一场颅内反乌托邦式冒险 理想国出品)",
6096
"author": "沈大成",
@@ -93,41 +129,5 @@
93129
"createdAt": 1642641563
94130
}
95131
]
96-
},
97-
{
98-
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
99-
"author": "曹雪芹",
100-
"marks": [
101-
{
102-
"type": "HIGHLIGHT",
103-
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
104-
"author": "曹雪芹",
105-
"location": {
106-
"location": 15868
107-
},
108-
"data": "都是尤二姐素习所穿的,不禁又伤心哭了起来。自己用个包袱一齐包了,也不命小厮丫鬟来拿,便自己提着来烧。",
109-
"createdAt": 1642265828
110-
},
111-
{
112-
"type": "HIGHLIGHT",
113-
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
114-
"author": "曹雪芹",
115-
"location": {
116-
"location": 15869
117-
},
118-
"data": "平儿又是伤心,又是好笑,忙将二百两一包的碎银子偷了出来,",
119-
"createdAt": 1642265863
120-
},
121-
{
122-
"type": "HIGHLIGHT",
123-
"title": "红楼梦(人文社权威定本彩皮版;国务院文化组批准,红研所校注;豆瓣读书TOP250首位,评论上万条;出版四十年,三次修订)",
124-
"author": "曹雪芹",
125-
"location": {
126-
"location": 15871
127-
},
128-
"data": "又将一条裙子递与平儿,说:“这是他家常穿的,你好生替我收着,作个念心儿。”",
129-
"createdAt": 1642265884
130-
}
131-
]
132132
}
133133
]

0 commit comments

Comments
 (0)