@@ -29,39 +29,71 @@ func (p *KindleMyClippingsParser) LoadConfigs(cmd *cobra.Command) {
29
29
cmd .Flags ().Float64Var (& p .minSimilarity , "min-similarity" , 0.8 , "Minimum similarity percentage (0-1) to consider a highlight as duplicate" )
30
30
}
31
31
32
- // Parse processes the MyClippings.txt file and returns a list of Books with deduplicated Marks.
33
32
func (p * KindleMyClippingsParser ) Parse (inputPath string ) ([]* model.Book , error ) {
33
+ // Open the file
34
34
file , err := os .Open (inputPath )
35
35
defer file .Close ()
36
36
if err != nil {
37
- return nil , errors .Wrap (err , "" )
37
+ return nil , errors .Wrap (err , "failed to open input file" )
38
+ }
39
+
40
+ // Count total lines in the file to calculate parsing progress
41
+ totalLines , err := countLines (file )
42
+ if err != nil {
43
+ return nil , errors .Wrap (err , "failed to count lines in input file" )
44
+ }
45
+ // Reset the file cursor to the beginning after counting lines
46
+ if _ , err := file .Seek (0 , 0 ); err != nil {
47
+ return nil , errors .Wrap (err , "failed to reset file cursor" )
38
48
}
39
49
40
50
var books []* model.Book
41
51
markListMap := make (map [string ][]* model.Mark ) // Maps book title -> list of marks
42
52
43
53
scanner := bufio .NewScanner (file )
54
+ lineCount := 0 // Total number of lines processed
55
+ entryCount := 0 // Total number of marks (entries)
56
+
57
+ // Function to print parsing progress
58
+ printParsingProgress := func () {
59
+ if lineCount % 500 == 0 || lineCount == totalLines {
60
+ progress := (float64 (lineCount ) / float64 (totalLines )) * 100
61
+ fmt .Fprintf (os .Stderr , "\r Parsing Progress: %.2f%% (%d/%d lines processed, %d entries parsed)" , progress , lineCount , totalLines , entryCount )
62
+ }
63
+ }
64
+
65
+ // Parsing phase
44
66
for scanner .Scan () {
67
+ lineCount ++
68
+ printParsingProgress ()
69
+
45
70
// Extract book details
46
71
title , author , err := extractTitleAndAuthor (stripLeadingBOM (scanner .Text ()))
47
72
if err != nil {
48
- return nil , errors .Wrap (err , "" )
73
+ return nil , errors .Wrap (err , fmt . Sprintf ( "error parsing title and author at line %d" , lineCount ) )
49
74
}
50
75
51
76
// Parse metadata
52
77
if ! scanner .Scan () {
53
- return nil , fmt .Errorf ("unexpected EOF, expecting metadata" )
78
+ return nil , fmt .Errorf ("unexpected EOF at line %d , expecting metadata" , lineCount )
54
79
}
80
+ lineCount ++
81
+ printParsingProgress ()
55
82
meta := scanner .Text ()
56
83
markType , location , createdAt , err := extractMeta (meta )
57
84
if err != nil {
58
- return nil , errors .Wrap (err , "" )
85
+ return nil , errors .Wrap (err , fmt . Sprintf ( "error parsing metadata at line %d" , lineCount ) )
59
86
}
60
87
61
88
scanner .Scan () // Skip the empty line before the actual text.
89
+ lineCount ++
90
+ printParsingProgress ()
62
91
63
92
var text []string
64
93
for scanner .Scan () {
94
+ lineCount ++
95
+ printParsingProgress ()
96
+
65
97
line := scanner .Text ()
66
98
if line == "==========" {
67
99
break
@@ -71,7 +103,7 @@ func (p *KindleMyClippingsParser) Parse(inputPath string) ([]*model.Book, error)
71
103
72
104
// Parse data or note
73
105
if text == nil { // Empty notes, skip
74
- return nil , nil
106
+ continue
75
107
}
76
108
77
109
// Create the mark
@@ -99,42 +131,42 @@ func (p *KindleMyClippingsParser) Parse(inputPath string) ([]*model.Book, error)
99
131
markListMap [title ] = []* model.Mark {}
100
132
}
101
133
markListMap [title ] = append (markListMap [title ], mark )
134
+
135
+ entryCount ++
102
136
}
103
137
104
- // Deduplicate marks for each book
138
+ // Deduplication phase
139
+ totalBooks := len (markListMap )
140
+ processedBooks := 0
141
+
105
142
for title , marks := range markListMap {
106
- deduplicatedMarks := deduplicateMarks (marks , p .minSimilarity ) // Use the configured threshold
143
+ processedBooks ++
144
+ fmt .Fprintf (os .Stderr , "\n Starting deduplication for book: %s (%d marks)\n " , title , len (marks ))
145
+
146
+ deduplicatedMarks := deduplicateMarksWithProgress (marks , p .minSimilarity ) // Deduplication with progress inside
107
147
book := & model.Book {
108
148
Title : title ,
109
149
Author : marks [0 ].Author ,
110
150
Marks : deduplicatedMarks ,
111
151
}
112
152
books = append (books , book )
113
- }
114
153
115
- if err := scanner .Err (); err != nil {
116
- return nil , errors .Wrap (err , "" )
154
+ // Print progress across books
155
+ bookProgress := (float64 (processedBooks ) / float64 (totalBooks )) * 100
156
+ fmt .Fprintf (os .Stderr , "\r Total Deduplication Progress: %.2f%% (%d/%d books processed)" , bookProgress , processedBooks , totalBooks )
117
157
}
118
158
119
- model .SortBooksByTitle (books )
120
-
159
+ // Final summary
160
+ fmt .Fprintln (os .Stderr ) // Add a newline after the progress output
161
+ fmt .Fprintf (os .Stderr , "Finished processing: %d lines parsed, %d books deduplicated, %d marks processed\n " , lineCount , totalBooks , entryCount )
121
162
return books , nil
122
163
}
123
164
124
- func stripLeadingBOM (s string ) string {
125
- runes := []rune (s )
126
- start := 0
127
- for start < len (runes ) && runes [start ] == '\uFEFF' {
128
- start ++
129
- }
130
- return string (runes [start :])
131
- }
132
-
133
- // Deduplicate marks by comparing similarity of content using a minimum common words threshold
134
- func deduplicateMarks (marks []* model.Mark , minSimilarity float64 ) []* model.Mark {
165
+ func deduplicateMarksWithProgress (marks []* model.Mark , minSimilarity float64 ) []* model.Mark {
135
166
var deduplicated []* model.Mark
167
+ totalMarks := len (marks )
136
168
137
- for _ , mark := range marks {
169
+ for index , mark := range marks {
138
170
duplicateFound := false
139
171
for i , dedupMark := range deduplicated {
140
172
if hasMinCommonWords (dedupMark .Data , mark .Data , minSimilarity ) {
@@ -149,11 +181,40 @@ func deduplicateMarks(marks []*model.Mark, minSimilarity float64) []*model.Mark
149
181
if ! duplicateFound {
150
182
deduplicated = append (deduplicated , mark )
151
183
}
184
+
185
+ // Print deduplication progress for current book every 100 marks
186
+ if (index + 1 )% 100 == 0 || index + 1 == totalMarks {
187
+ progress := (float64 (index + 1 ) / float64 (totalMarks )) * 100
188
+ fmt .Fprintf (os .Stderr , "\r Deduplication Progress for current book: %.2f%% (%d/%d marks processed)" , progress , index + 1 , totalMarks )
189
+ }
152
190
}
153
191
192
+ // Print a new line after finishing the current book
193
+ fmt .Fprintln (os .Stderr )
154
194
return deduplicated
155
195
}
156
196
197
// countLines reports how many lines remain in file, reading from its current
// offset until the scanner stops. The file offset is advanced by the read, so
// a caller that wants to read the content again has to seek back first.
//
// If the scanner stops because of an error, the count is discarded and
// (0, err) is returned.
func countLines(file *os.File) (int, error) {
	total := 0
	sc := bufio.NewScanner(file)

	// Each successful Scan consumes exactly one line; the loop body is empty
	// because counting happens in the post statement.
	for ; sc.Scan(); total++ {
	}

	if scanErr := sc.Err(); scanErr != nil {
		return 0, scanErr
	}
	return total, nil
}
208
+
209
// stripLeadingBOM returns s with every leading U+FEFF byte-order mark
// removed. Marks that appear after the first non-BOM content are kept.
//
// The trim works on the UTF-8 encoding of the mark directly rather than
// converting the whole string to runes and back: that avoids allocating a
// copy of every line, and it leaves the rest of the string byte-for-byte
// intact (a rune round-trip would rewrite any invalid UTF-8 byte as U+FFFD).
func stripLeadingBOM(s string) string {
	const bom = "\uFEFF" // three bytes in UTF-8: EF BB BF

	// Loop instead of trimming once: several marks can be stacked at the
	// start of the input.
	for len(s) >= len(bom) && s[:len(bom)] == bom {
		s = s[len(bom):]
	}
	return s
}
217
+
157
218
// Check if two strings share a longest common substring whose percentage of similarity meets the threshold
158
219
func hasMinCommonWords (a , b string , minSimilarity float64 ) bool {
159
220
// Preprocess both strings to handle Chinese text
0 commit comments