Skip to content

Commit f39c70d

Browse files
vikash390 and hash-data authored
feat: Fallback to bucketAuto Strategy in MongoDB (#158)
Co-authored-by: hash-data <[email protected]> Co-authored-by: Ankit Sharma <[email protected]>
1 parent 69cb03c commit f39c70d

File tree

2 files changed

+82
-22
lines changed

2 files changed

+82
-22
lines changed

drivers/mongodb/go.mod

+2-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ require (
1212
github.com/apache/thrift v0.16.0 // indirect
1313
github.com/aws/aws-sdk-go v1.43.31 // indirect
1414
github.com/felixge/fgprof v0.9.5 // indirect
15-
github.com/fraugster/parquet-go v0.12.0 // indirect
1615
github.com/fsnotify/fsnotify v1.5.1 // indirect
1716
github.com/golang/snappy v0.0.4 // indirect
1817
github.com/google/pprof v0.0.0-20240227163752-401108e1b7e7 // indirect
@@ -64,14 +63,14 @@ require (
6463
github.com/jackc/pgio v1.0.0 // indirect
6564
github.com/jackc/pgpassfile v1.0.0 // indirect
6665
github.com/jackc/pgproto3/v2 v2.3.3 // indirect
67-
github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect
66+
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
6867
github.com/jackc/pgtype v1.14.0 // indirect
6968
github.com/leodido/go-urn v1.4.0 // indirect
7069
github.com/mitchellh/hashstructure v1.1.0 // indirect
7170
github.com/rogpeppe/go-internal v1.13.1 // indirect
7271
github.com/spf13/cobra v1.8.0 // indirect
7372
github.com/spf13/pflag v1.0.5 // indirect
74-
go.mongodb.org/mongo-driver v1.17.1
73+
go.mongodb.org/mongo-driver v1.17.3
7574
golang.org/x/crypto v0.31.0 // indirect
7675
golang.org/x/net v0.33.0 // indirect
7776
golang.org/x/sys v0.28.0 // indirect

drivers/mongodb/internal/backfill.go

+80-19
Original file line numberDiff line numberDiff line change
@@ -124,25 +124,24 @@ func (m *Mongo) backfill(stream protocol.Stream, pool *protocol.WriterPool) erro
124124

125125
func (m *Mongo) splitChunks(ctx context.Context, collection *mongo.Collection, stream protocol.Stream) ([]types.Chunk, error) {
126126
splitVectorStrategy := func() ([]types.Chunk, error) {
127-
getChunkBoundaries := func() ([]*primitive.ObjectID, error) {
128-
getID := func(order int) (primitive.ObjectID, error) {
129-
var doc bson.M
130-
err := collection.FindOne(ctx, bson.D{}, options.FindOne().SetSort(bson.D{{Key: "_id", Value: order}})).Decode(&doc)
131-
if err == mongo.ErrNoDocuments {
132-
return primitive.NilObjectID, nil
133-
}
134-
return doc["_id"].(primitive.ObjectID), err
135-
}
136-
137-
minID, err := getID(1)
138-
if err != nil || minID == primitive.NilObjectID {
139-
return nil, err
140-
}
141-
maxID, err := getID(-1)
142-
if err != nil {
143-
return nil, err
127+
getID := func(order int) (primitive.ObjectID, error) {
128+
var doc bson.M
129+
err := collection.FindOne(ctx, bson.D{}, options.FindOne().SetSort(bson.D{{Key: "_id", Value: order}})).Decode(&doc)
130+
if err == mongo.ErrNoDocuments {
131+
return primitive.NilObjectID, nil
144132
}
133+
return doc["_id"].(primitive.ObjectID), err
134+
}
145135

136+
minID, err := getID(1)
137+
if err != nil || minID == primitive.NilObjectID {
138+
return nil, err
139+
}
140+
maxID, err := getID(-1)
141+
if err != nil {
142+
return nil, err
143+
}
144+
getChunkBoundaries := func() ([]*primitive.ObjectID, error) {
146145
var result bson.M
147146
cmd := bson.D{
148147
{Key: "splitVector", Value: fmt.Sprintf("%s.%s", collection.Database().Name(), collection.Name())},
@@ -173,6 +172,57 @@ func (m *Mongo) splitChunks(ctx context.Context, collection *mongo.Collection, s
173172
Max: &boundaries[i+1],
174173
})
175174
}
175+
if len(boundaries) > 0 {
176+
chunks = append(chunks, types.Chunk{
177+
Min: &boundaries[len(boundaries)-1],
178+
Max: nil,
179+
})
180+
}
181+
return chunks, nil
182+
}
183+
bucketAutoStrategy := func() ([]types.Chunk, error) {
184+
logger.Info("using bucket auto strategy for stream: %s", stream.ID())
185+
// Use $bucketAuto for chunking
186+
pipeline := mongo.Pipeline{
187+
{{Key: "$sort", Value: bson.D{{Key: "_id", Value: 1}}}},
188+
{{Key: "$bucketAuto", Value: bson.D{
189+
{Key: "groupBy", Value: "$_id"},
190+
{Key: "buckets", Value: m.config.MaxThreads * 4},
191+
}}},
192+
}
193+
194+
cursor, err := collection.Aggregate(ctx, pipeline)
195+
if err != nil {
196+
return nil, fmt.Errorf("failed to execute bucketAuto aggregation: %s", err)
197+
}
198+
defer cursor.Close(ctx)
199+
200+
var buckets []struct {
201+
ID struct {
202+
Min primitive.ObjectID `bson:"min"`
203+
Max primitive.ObjectID `bson:"max"`
204+
} `bson:"_id"`
205+
Count int `bson:"count"`
206+
}
207+
208+
if err := cursor.All(ctx, &buckets); err != nil {
209+
return nil, fmt.Errorf("failed to decode bucketAuto results: %s", err)
210+
}
211+
212+
var chunks []types.Chunk
213+
for _, bucket := range buckets {
214+
chunks = append(chunks, types.Chunk{
215+
Min: &bucket.ID.Min,
216+
Max: &bucket.ID.Max,
217+
})
218+
}
219+
if len(buckets) > 0 {
220+
chunks = append(chunks, types.Chunk{
221+
Min: &buckets[len(buckets)-1].ID.Max,
222+
Max: nil,
223+
})
224+
}
225+
176226
return chunks, nil
177227
}
178228

@@ -205,14 +255,25 @@ func (m *Mongo) splitChunks(ctx context.Context, collection *mongo.Collection, s
205255
Max: maxObjectID,
206256
})
207257
}
258+
chunks = append(chunks, types.Chunk{
259+
Min: generateMinObjectID(last),
260+
Max: nil,
261+
})
262+
208263
return chunks, nil
209264
}
210265

211266
switch m.config.PartitionStrategy {
212267
case "timestamp":
213268
return timestampStrategy()
214269
default:
215-
return splitVectorStrategy()
270+
chunks, err := splitVectorStrategy()
271+
// fall back to the bucketAuto strategy when splitVector fails with an authorization error
272+
if err != nil && strings.Contains(err.Error(), "not authorized") {
273+
logger.Warnf("failed to get chunks via split vector strategy: %s", err)
274+
return bucketAutoStrategy()
275+
}
276+
return chunks, err
216277
}
217278
}
218279
func (m *Mongo) totalCountInCollection(ctx context.Context, collection *mongo.Collection) (int64, error) {
@@ -293,7 +354,7 @@ func generatePipeline(start, end any) mongo.Pipeline {
293354
andOperation = append(andOperation, bson.D{{
294355
Key: "_id",
295356
Value: bson.D{{
296-
Key: "$lte",
357+
Key: "$lt",
297358
Value: end,
298359
}},
299360
}})

0 commit comments

Comments
 (0)