4
4
package pusher
5
5
6
6
import (
7
+ "fmt"
7
8
"sync"
9
+ "time"
8
10
9
11
"github.com/aws/aws-sdk-go/aws"
10
12
"github.com/aws/aws-sdk-go/aws/awserr"
@@ -13,6 +15,15 @@ import (
13
15
"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
14
16
)
15
17
18
const (
	// retentionChannelSize is the buffer capacity used for the dlg and prp
	// channels created in NewTargetManager.
	retentionChannelSize = 100

	// Retry backoff parameters consumed by calculateBackoff. The un-jittered
	// delay for an attempt is baseRetryDelay * 2^attempt, capped at
	// maxRetryDelayTarget, and the jittered delay falls in [delay/2, delay).
	// For attempts 0..4 the caps are 1, 2, 4, 8 and 10 sec, so the total time
	// spent waiting on a single target is at most 1 + 2 + 4 + 8 + 10 = 25 sec.
	baseRetryDelay      = 1 * time.Second
	maxRetryDelayTarget = 10 * time.Second
	numBackoffRetries   = 5
)
26
+
16
27
type Target struct {
17
28
Group , Stream , Class string
18
29
Retention int
@@ -29,69 +40,100 @@ type targetManager struct {
29
40
// cache of initialized targets
30
41
cache map [Target ]struct {}
31
42
mu sync.Mutex
43
+ dlg chan Target
44
+ prp chan Target
32
45
}
33
46
34
47
func NewTargetManager (logger telegraf.Logger , service cloudWatchLogsService ) TargetManager {
35
- return & targetManager {
48
+ tm := & targetManager {
36
49
logger : logger ,
37
50
service : service ,
38
51
cache : make (map [Target ]struct {}),
52
+ dlg : make (chan Target , retentionChannelSize ),
53
+ prp : make (chan Target , retentionChannelSize ),
39
54
}
55
+
56
+ go tm .processDescribeLogGroup ()
57
+ go tm .processPutRetentionPolicy ()
58
+ return tm
40
59
}
41
60
42
61
// InitTarget initializes a Target if it hasn't been initialized before.
43
62
func (m * targetManager ) InitTarget (target Target ) error {
44
63
m .mu .Lock ()
45
64
defer m .mu .Unlock ()
46
65
if _ , ok := m .cache [target ]; ! ok {
47
- err := m .createLogGroupAndStream (target )
66
+ newGroup , err := m .createLogGroupAndStream (target )
48
67
if err != nil {
49
68
return err
50
69
}
51
- m .PutRetentionPolicy (target )
70
+ if target .Retention > 0 {
71
+ if newGroup {
72
+ m .logger .Debugf ("sending new log group %v to prp channel" , target .Group )
73
+ m .prp <- target
74
+ } else {
75
+ m .logger .Debugf ("sending existing log group %v to dlg channel" , target .Group )
76
+ m .dlg <- target
77
+ }
78
+ }
52
79
m .cache [target ] = struct {}{}
53
80
}
54
81
return nil
55
82
}
56
83
57
- func (m * targetManager ) createLogGroupAndStream (t Target ) error {
84
+ func (m * targetManager ) PutRetentionPolicy (target Target ) {
85
+ // new pusher will call this so start with dlg
86
+ if target .Retention > 0 {
87
+ m .logger .Debugf ("sending log group %v to dlg channel by pusher" , target .Group )
88
+ m .dlg <- target
89
+ }
90
+ }
91
+
92
+ func (m * targetManager ) createLogGroupAndStream (t Target ) (bool , error ) {
58
93
err := m .createLogStream (t )
59
94
if err == nil {
60
- return nil
95
+ return false , nil
61
96
}
62
97
63
98
m .logger .Debugf ("creating stream fail due to : %v" , err )
99
+ newGroup := false
64
100
if awsErr , ok := err .(awserr.Error ); ok && awsErr .Code () == cloudwatchlogs .ErrCodeResourceNotFoundException {
65
101
err = m .createLogGroup (t )
102
+ newGroup = true
66
103
67
104
// attempt to create stream again if group created successfully.
68
105
if err == nil {
69
- m .logger .Debugf ("successfully created log group %v. Retrying log stream %v" , t . Group , t .Stream )
106
+ m .logger .Debugf ("retrying log stream %v" , t .Stream )
70
107
err = m .createLogStream (t )
71
108
} else {
72
109
m .logger .Debugf ("creating group fail due to : %v" , err )
73
110
}
74
111
}
75
112
76
113
if awsErr , ok := err .(awserr.Error ); ok && awsErr .Code () == cloudwatchlogs .ErrCodeResourceAlreadyExistsException {
77
- m .logger .Debugf ("Resource was already created. %v\n " , err )
78
- return nil // if the log group or log stream already exist, this is not worth returning an error for
114
+ m .logger .Debugf ("resource was already created. %v\n " , err )
115
+ return false , nil
79
116
}
80
117
81
- return err
118
+ return newGroup , err
82
119
}
83
120
84
121
func (m * targetManager ) createLogGroup (t Target ) error {
85
- var err error
122
+ var input * cloudwatchlogs. CreateLogGroupInput
86
123
if t .Class != "" {
87
- _ , err = m . service . CreateLogGroup ( & cloudwatchlogs.CreateLogGroupInput {
124
+ input = & cloudwatchlogs.CreateLogGroupInput {
88
125
LogGroupName : & t .Group ,
89
126
LogGroupClass : & t .Class ,
90
- })
127
+ }
91
128
} else {
92
- _ , err = m . service . CreateLogGroup ( & cloudwatchlogs.CreateLogGroupInput {
129
+ input = & cloudwatchlogs.CreateLogGroupInput {
93
130
LogGroupName : & t .Group ,
94
- })
131
+ }
132
+ }
133
+ _ , err := m .service .CreateLogGroup (input )
134
+ if err == nil {
135
+ m .logger .Debugf ("successfully created log group %v" , t .Group )
136
+ return nil
95
137
}
96
138
return err
97
139
}
@@ -109,26 +151,88 @@ func (m *targetManager) createLogStream(t Target) error {
109
151
return err
110
152
}
111
153
112
- // PutRetentionPolicy tries to set the retention policy for a log group. Does not retry on failure.
113
- func (m * targetManager ) PutRetentionPolicy (t Target ) {
114
- if t .Retention > 0 {
115
- i := aws .Int64 (int64 (t .Retention ))
116
- putRetentionInput := & cloudwatchlogs.PutRetentionPolicyInput {
117
- LogGroupName : & t .Group ,
118
- RetentionInDays : i ,
154
+ func (m * targetManager ) processDescribeLogGroup () {
155
+ for target := range m .dlg {
156
+ for attempt := 0 ; attempt < numBackoffRetries ; attempt ++ {
157
+ currentRetention , err := m .getRetention (target )
158
+ if err != nil {
159
+ m .logger .Errorf ("failed to describe log group retention for target %v: %v" , target , err )
160
+ time .Sleep (m .calculateBackoff (attempt ))
161
+ continue
162
+ }
163
+
164
+ if currentRetention != target .Retention && target .Retention > 0 {
165
+ m .logger .Debugf ("queueing log group %v to update retention policy" , target .Group )
166
+ m .prp <- target
167
+ }
168
+ break // no change in retention
119
169
}
120
- _ , err := m .service .PutRetentionPolicy (putRetentionInput )
121
- if err != nil {
122
- // since this gets called both before we start pushing logs, and after we first attempt
123
- // to push a log to a non-existent log group, we don't want to dirty the log with an error
124
- // if the error is that the log group doesn't exist (yet).
125
- if awsErr , ok := err .(awserr.Error ); ok && awsErr .Code () == cloudwatchlogs .ErrCodeResourceNotFoundException {
126
- m .logger .Debugf ("Log group %v not created yet: %v" , t .Group , err )
127
- } else {
128
- m .logger .Errorf ("Unable to put retention policy for log group %v: %v " , t .Group , err )
170
+ }
171
+ }
172
+
173
+ func (m * targetManager ) getRetention (target Target ) (int , error ) {
174
+ input := & cloudwatchlogs.DescribeLogGroupsInput {
175
+ LogGroupNamePrefix : aws .String (target .Group ),
176
+ }
177
+
178
+ output , err := m .service .DescribeLogGroups (input )
179
+ if err != nil {
180
+ return 0 , fmt .Errorf ("describe log groups failed: %w" , err )
181
+ }
182
+
183
+ for _ , group := range output .LogGroups {
184
+ if * group .LogGroupName == target .Group {
185
+ if group .RetentionInDays == nil {
186
+ return 0 , nil
129
187
}
130
- } else {
131
- m .logger .Debugf ("successfully updated log retention policy for log group %v" , t .Group )
188
+ return int (* group .RetentionInDays ), nil
189
+ }
190
+ }
191
+
192
+ return 0 , fmt .Errorf ("log group %v not found" , target .Group )
193
+ }
194
+
195
+ func (m * targetManager ) processPutRetentionPolicy () {
196
+ for target := range m .prp {
197
+ var updated bool
198
+ for attempt := 0 ; attempt < numBackoffRetries ; attempt ++ {
199
+ err := m .updateRetentionPolicy (target )
200
+ if err == nil {
201
+ updated = true
202
+ break
203
+ }
204
+
205
+ m .logger .Debugf ("retrying to update retention policy for target (%v) %v: %v" , attempt , target , err )
206
+ time .Sleep (m .calculateBackoff (attempt ))
132
207
}
208
+
209
+ if ! updated {
210
+ m .logger .Errorf ("failed to update retention policy for target %v after %d attempts" , target , numBackoffRetries )
211
+ }
212
+ }
213
+ }
214
+
215
+ func (m * targetManager ) updateRetentionPolicy (target Target ) error {
216
+ input := & cloudwatchlogs.PutRetentionPolicyInput {
217
+ LogGroupName : aws .String (target .Group ),
218
+ RetentionInDays : aws .Int64 (int64 (target .Retention )),
219
+ }
220
+
221
+ _ , err := m .service .PutRetentionPolicy (input )
222
+ if err != nil {
223
+ return fmt .Errorf ("put retention policy failed: %w" , err )
224
+ }
225
+ m .logger .Debugf ("successfully updated retention policy for log group %v" , target .Group )
226
+ return nil
227
+ }
228
+
229
+ func (m * targetManager ) calculateBackoff (retryCount int ) time.Duration {
230
+ delay := baseRetryDelay
231
+ if retryCount < numBackoffRetries {
232
+ delay = baseRetryDelay * time .Duration (1 << int64 (retryCount ))
233
+ }
234
+ if delay > maxRetryDelayTarget {
235
+ delay = maxRetryDelayTarget
133
236
}
237
+ return time .Duration (seededRand .Int63n (int64 (delay / 2 )) + int64 (delay / 2 ))
134
238
}
0 commit comments