-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrestore.diff
More file actions
3888 lines (3888 loc) · 357 KB
/
restore.diff
File metadata and controls
3888 lines (3888 loc) · 357 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
diff --git a/src/pages/AWSExam.jsx b/src/pages/AWSExam.jsx
index cfb7b86..eb9e548 100644
--- a/src/pages/AWSExam.jsx
+++ b/src/pages/AWSExam.jsx
@@ -1,1884 +1,1998 @@
-import { useState, useEffect } from 'react'
-import { useParams, useLocation, useNavigate } from 'react-router-dom'
-import { useAuth } from '../contexts/AuthContext'
-import {
- doc,
- getDoc,
- setDoc,
- updateDoc,
- collection,
- addDoc,
- serverTimestamp,
- increment,
- query,
- where,
- getDocs
-} from 'firebase/firestore'
-import { db } from '../config/firebase'
-import {
- Clock, Flag, ChevronLeft, ChevronRight, Check, X,
- AlertCircle, CheckCircle, ArrowLeft, Cloud
-} from 'lucide-react'
-import { updateStreak } from '../services/learningService'
-import { DEA_C01_PRACTICE_QUESTIONS } from '../data/deaC01PracticeQuestions'
-import { DEA_C01_PAPER_2_QUESTIONS } from '../data/deaC01Paper2Questions'
-import { DEA_C01_PAPER_3_QUESTIONS } from '../data/deaC01Paper3Questions'
-import { DEA_C01_PAPER_6_QUESTIONS } from '../data/deaC01Paper6Questions'
-import { DEA_C01_PAPER_4_QUESTIONS } from '../data/deaC01Paper4Questions'
-import { DEA_C01_PAPER_5_QUESTIONS } from '../data/deaC01Paper5Questions'
-import { DEA_C01_PAPER_8_QUESTIONS } from '../data/deaC01Paper8Questions'
-
-// AWS DEA-C01 Question Bank - 69 Scenario-Based Questions (matches real exam)
-const AWS_QUESTIONS = {
- // Domain 1: Data Ingestion and Transformation (34% - 24 questions)
- 1: [
- {
- id: 'd1_1',
- question: "A company needs to ingest streaming data from IoT sensors and store it in Amazon S3 with near real-time delivery. The data should be automatically converted to Parquet format. Which solution meets these requirements with the LEAST operational overhead?",
- options: [
- "Use Amazon Kinesis Data Streams with AWS Lambda to convert and write to S3",
- "Use Amazon Kinesis Data Firehose with record format conversion enabled",
- "Use Amazon MSK (Managed Streaming for Kafka) with Kafka Connect to S3",
- "Use AWS Glue Streaming ETL job to process and write to S3"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Amazon Kinesis Data Firehose provides the LEAST operational overhead as it's a fully managed service that can automatically convert data formats (including to Parquet) and deliver to S3. It requires no code and handles scaling automatically.",
- domain: 1,
- topic: "Streaming Ingestion",
- difficulty: "medium"
- },
- {
- id: 'd1_2',
- question: "A data engineer needs to extract data from an on-premises Oracle database and load it into Amazon Redshift. The initial load is 10 TB and ongoing changes must be captured continuously. Which approach is MOST efficient?",
- options: [
- "Use AWS Database Migration Service (DMS) with full load and CDC",
- "Export data to CSV files, upload to S3, and use COPY command",
- "Use AWS Glue JDBC connection for initial and incremental loads",
- "Use Amazon AppFlow to connect Oracle and Redshift"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "AWS DMS with full load and Change Data Capture (CDC) is the most efficient solution. It handles the initial bulk load and then captures ongoing changes from the Oracle redo logs, providing continuous replication to Redshift.",
- domain: 1,
- topic: "Database Migration",
- difficulty: "medium"
- },
- {
- id: 'd1_3',
- question: "A company uses AWS Glue for ETL jobs. The jobs are failing because of schema changes in the source data. What should the data engineer implement to handle schema evolution? (Select TWO)",
- options: [
- "Enable the AWS Glue Schema Registry",
- "Use DynamicFrame instead of DataFrame in the Glue job",
- "Increase the memory allocation for the Glue job",
- "Configure the Glue crawler to update table schema on each run",
- "Use Amazon Athena instead of AWS Glue"
- ],
- correctAnswer: null,
- correctAnswers: [1, 3],
- type: "multiple",
- selectCount: 2,
- explanation: "DynamicFrame in AWS Glue can handle schema inconsistencies through features like 'choice' type handling. Configuring the Glue crawler to update the schema ensures the Data Catalog reflects the latest source schema.",
- domain: 1,
- topic: "Schema Evolution",
- difficulty: "hard"
- },
- {
- id: 'd1_4',
- question: "An AWS Glue ETL job processes 500 GB of JSON files daily. The job is taking 4 hours to complete. Which optimization will MOST likely reduce the job duration?",
- options: [
- "Increase the number of DPUs (Data Processing Units)",
- "Convert source files to Parquet format before processing",
- "Use AWS Glue bookmarks to process only new data",
- "Switch from Python to Scala for the ETL script"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Converting JSON to Parquet before processing will significantly improve performance. Parquet is columnar, compressed, and supports predicate pushdown. Reading 500 GB of JSON is much slower than reading optimized Parquet files.",
- domain: 1,
- topic: "Performance Optimization",
- difficulty: "medium"
- },
- {
- id: 'd1_5',
- question: "A real-time analytics application requires sub-second latency for data ingestion. The data comes from mobile applications at a rate of 100,000 events per second during peak hours. Which architecture should be used?",
- options: [
- "Amazon Kinesis Data Firehose  Amazon S3  Amazon Athena",
- "Amazon Kinesis Data Streams  AWS Lambda  Amazon DynamoDB",
- "Amazon SQS  AWS Lambda  Amazon RDS",
- "Amazon MSK  Amazon EMR  Amazon Redshift"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "For sub-second latency with 100K events/second, Kinesis Data Streams to Lambda to DynamoDB provides real-time processing. DynamoDB offers single-digit millisecond latency. Firehose has 60-second minimum buffering.",
- domain: 1,
- topic: "Real-time Processing",
- difficulty: "hard"
- },
- {
- id: 'd1_6',
- question: "A data pipeline uses AWS Step Functions to orchestrate multiple AWS Glue jobs. One job frequently fails due to source system unavailability. How should the data engineer implement resilient error handling?",
- options: [
- "Add a Catch block with exponential backoff retry in Step Functions",
- "Increase the timeout for the Glue job",
- "Use Amazon EventBridge to schedule retries",
- "Implement error handling in the Glue job code only"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "Step Functions supports native Retry with exponential backoff and Catch blocks for error handling. This provides resilient orchestration at the workflow level without modifying individual job code.",
- domain: 1,
- topic: "Pipeline Orchestration",
- difficulty: "medium"
- },
- {
- id: 'd1_7',
- question: "A company needs to transform semi-structured JSON data with nested arrays into a flattened relational format for analysis. Which AWS Glue feature should be used?",
- options: [
- "Glue Data Quality",
- "Relationalize transform",
- "Glue Elastic Views",
- "ResolveChoice transform"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "The Relationalize transform in AWS Glue converts nested JSON/semi-structured data into a set of flat tables that can be easily queried with standard SQL.",
- domain: 1,
- topic: "Data Transformation",
- difficulty: "medium"
- },
- {
- id: 'd1_8',
- question: "A media company receives video metadata files in various formats (JSON, XML, CSV) from different content providers. The data must be normalized and stored in a consistent schema. Which approach provides the MOST flexibility?",
- options: [
- "Create separate Glue crawlers for each format",
- "Use AWS Glue classifiers with custom patterns",
- "Build custom Lambda functions for each format",
- "Use Amazon Athena with different SerDe configurations"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "AWS Glue classifiers with custom patterns provide the most flexibility for handling various file formats. They can automatically detect and parse different formats, then normalize the data to a consistent schema.",
- domain: 1,
- topic: "Data Normalization",
- difficulty: "medium"
- },
- {
- id: 'd1_9',
- question: "A financial services company needs to process credit card transactions in real-time to detect fraud. The system must handle 50,000 transactions per second with less than 100ms latency. Which architecture is MOST appropriate?",
- options: [
- "Kinesis Data Streams with enhanced fan-out consumers",
- "Kinesis Data Firehose with Lambda transformation",
- "Amazon SQS FIFO with Lambda",
- "Amazon MSK with consumer groups"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "Kinesis Data Streams with enhanced fan-out provides dedicated 2 MB/s throughput per consumer with ~70ms propagation delay. This is ideal for real-time fraud detection requiring low latency at high throughput.",
- domain: 1,
- topic: "Real-time Processing",
- difficulty: "hard"
- },
- {
- id: 'd1_10',
- question: "A data engineer needs to migrate data from Amazon RDS MySQL to Amazon Aurora PostgreSQL. The migration must minimize downtime. Which approach should be used?",
- options: [
- "AWS DMS with Schema Conversion Tool (SCT)",
- "AWS Glue with JDBC connections",
- "Export/Import using mysqldump and pg_restore",
- "AWS DataSync between the databases"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "AWS DMS with Schema Conversion Tool (SCT) is designed for heterogeneous database migrations. SCT converts the schema while DMS handles continuous data replication with minimal downtime.",
- domain: 1,
- topic: "Database Migration",
- difficulty: "medium"
- },
- {
- id: 'd1_11',
- question: "A retail company wants to sync data from Salesforce to Amazon S3 on a scheduled basis. The solution should require minimal coding. Which service should be used?",
- options: [
- "AWS Glue with custom connectors",
- "Amazon AppFlow",
- "AWS Lambda with Salesforce SDK",
- "Amazon EventBridge with API destinations"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Amazon AppFlow is a fully managed integration service that enables secure data transfer between SaaS applications like Salesforce and AWS services like S3, with no coding required.",
- domain: 1,
- topic: "SaaS Integration",
- difficulty: "easy"
- },
- {
- id: 'd1_12',
- question: "An e-commerce platform generates clickstream data that needs to be enriched with product catalog information before storage. The enrichment should happen in near real-time. Which solution is MOST efficient?",
- options: [
- "Kinesis Data Streams with Lambda enrichment using DynamoDB lookups",
- "Kinesis Data Firehose with Glue transformation",
- "S3 event triggers with Step Functions",
- "EMR Spark Streaming with broadcast variables"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "Kinesis Data Streams with Lambda enrichment using DynamoDB lookups provides near real-time enrichment with single-digit millisecond DynamoDB access times.",
- domain: 1,
- topic: "Data Enrichment",
- difficulty: "hard"
- },
- {
- id: 'd1_13',
- question: "A healthcare company needs to ingest HL7 FHIR formatted patient data from multiple hospital systems. The data arrives via REST APIs and must be transformed to a standard schema. Which approach minimizes development effort?",
- options: [
- "AWS HealthLake with integrated data transformation",
- "API Gateway with Lambda and custom transformation",
- "AWS Glue with custom ETL scripts",
- "Amazon Kinesis with Lambda consumers"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "AWS HealthLake is specifically designed for healthcare data in FHIR format. It provides automatic data transformation, storage, and analytics capabilities with minimal development effort.",
- domain: 1,
- topic: "Healthcare Data Ingestion",
- difficulty: "medium"
- },
- {
- id: 'd1_14',
- question: "A gaming company processes 1 TB of player event data daily. The data arrives continuously and must be available for analysis within 15 minutes. Which ingestion pattern is MOST cost-effective?",
- options: [
- "Kinesis Data Streams with Lambda to S3",
- "Kinesis Data Firehose with 60-second buffering",
- "Direct S3 PutObject from game servers",
- "Amazon MSK with S3 Sink Connector"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Kinesis Data Firehose with 60-second buffering is the most cost-effective for this use case. It's fully managed, automatically scales, and the 60-second buffer easily meets the 15-minute availability requirement.",
- domain: 1,
- topic: "Cost-Effective Ingestion",
- difficulty: "medium"
- },
- {
- id: 'd1_15',
- question: "A data engineer needs to deduplicate streaming records before storing them. Records have a unique transaction_id and duplicates can arrive within a 5-minute window. Which approach is MOST efficient?",
- options: [
- "Use Kinesis Data Analytics with a 5-minute tumbling window and DISTINCT",
- "Lambda with DynamoDB for deduplication tracking",
- "Kinesis Data Streams with Kinesis Client Library (KCL) checkpointing",
- "Store all records in S3 and deduplicate with Athena"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "Kinesis Data Analytics with SQL can efficiently deduplicate records within a time window using GROUP BY and aggregation. This provides real-time deduplication without managing additional infrastructure.",
- domain: 1,
- topic: "Stream Deduplication",
- difficulty: "hard"
- },
- {
- id: 'd1_16',
- question: "A company is migrating a legacy ETL system to AWS Glue. The existing jobs use complex stored procedures in Oracle. How should the data engineer approach this migration?",
- options: [
- "Rewrite all logic in PySpark for AWS Glue",
- "Use AWS Glue with Oracle JDBC and call stored procedures",
- "Convert procedures using AWS SCT, then implement in Glue",
- "Keep stored procedures and use AWS DMS for data movement"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "AWS Schema Conversion Tool (SCT) can convert Oracle stored procedures to equivalent AWS Glue ETL code (PySpark/Scala), reducing manual rewriting effort while ensuring logic is properly migrated.",
- domain: 1,
- topic: "ETL Migration",
- difficulty: "hard"
- },
- {
- id: 'd1_17',
- question: "A logistics company needs to process GPS coordinates from 10,000 delivery vehicles in real-time. Each vehicle sends location updates every 5 seconds. The data must be stored for route optimization analysis. Which architecture handles this efficiently?",
- options: [
- "IoT Core  IoT Rules  Kinesis Data Firehose  S3",
- "API Gateway  Lambda  DynamoDB",
- "IoT Core  IoT Analytics  QuickSight",
- "Direct MQTT to EC2 instances with custom processing"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "IoT Core with IoT Rules to Kinesis Data Firehose to S3 efficiently handles high-volume IoT data ingestion. IoT Core manages device connections while Firehose handles reliable delivery to S3.",
- domain: 1,
- topic: "IoT Data Ingestion",
- difficulty: "medium"
- },
- {
- id: 'd1_18',
- question: "A data pipeline must process files that arrive in S3 in any order but must be processed in sequence based on a timestamp in the filename. Which approach ensures correct ordering?",
- options: [
- "Use S3 event notifications to trigger Lambda immediately",
- "Use AWS Glue workflow with job bookmarks",
- "Implement a Step Functions workflow with file ordering logic",
- "Use S3 Batch Operations for sequential processing"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "Step Functions can implement custom ordering logic to sort files by timestamp and process them sequentially. This provides explicit control over processing order regardless of arrival time.",
- domain: 1,
- topic: "Ordered Processing",
- difficulty: "hard"
- },
- {
- id: 'd1_19',
- question: "A data engineer needs to incrementally load data from a source system that doesn't support change data capture. The source table has 100 million rows with a last_modified timestamp column. Which approach is MOST efficient?",
- options: [
- "Full table extraction daily",
- "AWS Glue job with bookmark on last_modified column",
- "AWS DMS with full load only",
- "Athena CTAS with date filter"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "AWS Glue job bookmarks can track the last processed value of the last_modified column, enabling efficient incremental loads without full table scans on each run.",
- domain: 1,
- topic: "Incremental Loading",
- difficulty: "medium"
- },
- {
- id: 'd1_20',
- question: "A company uses multiple AWS accounts for different departments. Data from all accounts must be consolidated in a central data lake. Which approach simplifies cross-account data ingestion?",
- options: [
- "Set up VPC peering between all accounts",
- "Use S3 replication with cross-account bucket policies",
- "Configure AWS RAM to share S3 buckets",
- "Build custom Lambda functions in each account"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "S3 replication with cross-account bucket policies is the most straightforward approach. S3 replication rules can automatically copy objects to a central bucket with proper IAM permissions.",
- domain: 1,
- topic: "Cross-Account Ingestion",
- difficulty: "medium"
- },
- {
- id: 'd1_21',
- question: "An AWS Glue job reads from a JDBC source and writes to S3 in Parquet format. The job is slow due to the JDBC connection being a bottleneck. How can the data engineer improve performance? (Select TWO)",
- options: [
- "Enable parallel reads using hashfield or hashpartitions",
- "Increase the number of Glue DPUs",
- "Use pushdown predicates to filter at the source",
- "Switch to Glue version 1.0",
- "Enable Glue job metrics"
- ],
- correctAnswer: null,
- correctAnswers: [0, 2],
- type: "multiple",
- selectCount: 2,
- explanation: "Enabling parallel reads (hashfield/hashpartitions) and pushdown predicates reduces JDBC bottleneck. Parallel reads split the table across multiple connections, and pushdown predicates minimize data transferred from source.",
- domain: 1,
- topic: "JDBC Optimization",
- difficulty: "hard"
- },
- {
- id: 'd1_22',
- question: "A streaming application needs exactly-once semantics when writing to Amazon S3 from Kinesis Data Streams. Which approach guarantees this?",
- options: [
- "Use Kinesis Data Firehose with S3 destination",
- "Implement idempotent writes in Lambda with DynamoDB tracking",
- "Use Kinesis Data Analytics for Flink with exactly-once checkpointing",
- "Configure Kinesis enhanced fan-out consumers"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "Kinesis Data Analytics for Apache Flink provides exactly-once processing semantics through its checkpointing mechanism, ensuring no data loss or duplication when writing to sinks like S3.",
- domain: 1,
- topic: "Exactly-Once Processing",
- difficulty: "hard"
- },
- {
- id: 'd1_23',
- question: "A data engineer needs to validate incoming data quality before loading to the data warehouse. Records failing validation should be sent to a dead-letter queue for investigation. Which service combination achieves this?",
- options: [
- "AWS Glue with Data Quality rules and SQS DLQ",
- "Lambda with custom validation and SNS notifications",
- "Kinesis Data Analytics with error output stream",
- "Step Functions with Choice states and SQS"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "AWS Glue Data Quality allows defining validation rules (DQDL) and can route failed records to an SQS dead-letter queue, providing an integrated solution for data quality validation.",
- domain: 1,
- topic: "Data Quality",
- difficulty: "medium"
- },
- {
- id: 'd1_24',
- question: "A company receives data files encrypted with customer-provided keys. The data must be re-encrypted with AWS-managed keys after ingestion. Which approach is MOST secure?",
- options: [
- "Decrypt in Lambda, then upload to S3 with SSE-KMS",
- "Use S3 Batch Operations with decryption and re-encryption",
- "Transfer via AWS Transfer Family with post-processing Lambda",
- "Use AWS Glue with temporary decryption in memory"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "AWS Transfer Family provides secure file transfer with automatic integration to S3. Post-processing Lambda can re-encrypt files with KMS, keeping plaintext data only in Lambda's secure execution environment.",
- domain: 1,
- topic: "Encryption Handling",
- difficulty: "hard"
- }
- ],
-
- // Domain 2: Data Store Management (26% - 18 questions)
- 2: [
- {
- id: 'd2_1',
- question: "A data lake on Amazon S3 contains 50 TB of data. Users query this data using Amazon Athena but report slow performance. The data is stored in CSV format with no partitioning. Which optimization will provide the GREATEST performance improvement?",
- options: [
- "Enable S3 Transfer Acceleration",
- "Convert data to Parquet format and implement partitioning",
- "Increase Athena query timeout settings",
- "Move data to Amazon EFS for faster access"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Converting to Parquet (columnar, compressed) and implementing partitioning will dramatically improve Athena query performance. Parquet enables column pruning and compression, while partitioning allows Athena to scan only relevant data partitions.",
- domain: 2,
- topic: "Query Optimization",
- difficulty: "medium"
- },
- {
- id: 'd2_2',
- question: "A company stores transaction data in Amazon Redshift. Query performance has degraded over time. Analysis shows that the distribution style is causing data skew. The main table has a transaction_id column and a customer_id column. Queries frequently join on customer_id. What should be done?",
- options: [
- "Change the distribution style to EVEN",
- "Change the distribution key to customer_id",
- "Add a sort key on transaction_id",
- "Increase the number of nodes in the cluster"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Changing the distribution key to customer_id will collocate data for the frequently used join column, reducing data movement during queries. This directly addresses the data skew issue.",
- domain: 2,
- topic: "Redshift Optimization",
- difficulty: "hard"
- },
- {
- id: 'd2_3',
- question: "A data engineer needs to design a DynamoDB table for a gaming application. The table will store player sessions with access patterns: 1) Get all sessions for a player, 2) Get sessions by game type for a player, 3) Get sessions within a date range for a player. What is the optimal key design?",
- options: [
- "Partition key: session_id",
- "Partition key: player_id, Sort key: game_type#session_date",
- "Partition key: player_id, Sort key: session_date, GSI on game_type",
- "Partition key: game_type, Sort key: player_id"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Using player_id as partition key with a composite sort key (game_type#session_date) supports all three access patterns. Query on player_id gets all sessions, begins_with on sort key filters by game_type.",
- domain: 2,
- topic: "DynamoDB Design",
- difficulty: "hard"
- },
- {
- id: 'd2_4',
- question: "An organization needs to store 100 TB of log data in S3. The data is queried infrequently (once per month) but must be available within 12 hours when needed. Which S3 storage class is MOST cost-effective?",
- options: [
- "S3 Standard",
- "S3 Intelligent-Tiering",
- "S3 Glacier Flexible Retrieval",
- "S3 Glacier Deep Archive"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "S3 Glacier Flexible Retrieval is most cost-effective for infrequently accessed data needing retrieval within 3-12 hours. It's significantly cheaper than Standard for this access pattern.",
- domain: 2,
- topic: "S3 Storage Classes",
- difficulty: "medium"
- },
- {
- id: 'd2_5',
- question: "A company wants to build a data lake with: centralized data catalog, fine-grained access control at column level, and automatic schema discovery. Which AWS services should be used? (Select TWO)",
- options: [
- "AWS Lake Formation",
- "Amazon Macie",
- "AWS Glue Data Catalog",
- "Amazon Redshift Spectrum",
- "AWS Config"
- ],
- correctAnswer: null,
- correctAnswers: [0, 2],
- type: "multiple",
- selectCount: 2,
- explanation: "AWS Lake Formation provides fine-grained access control (column-level) and integrates with the Glue Data Catalog for centralized metadata management. Glue Crawlers provide automatic schema discovery.",
- domain: 2,
- topic: "Data Lake Architecture",
- difficulty: "medium"
- },
- {
- id: 'd2_6',
- question: "A Redshift cluster uses RA3 nodes. The data warehouse team wants to share data with another AWS account without copying the data. What feature should they use?",
- options: [
- "Redshift Spectrum",
- "Redshift Data Sharing",
- "AWS Data Exchange",
- "S3 Cross-Account Access"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Redshift Data Sharing allows sharing live data between Redshift clusters across AWS accounts without physically copying the data. This is a native Redshift feature for RA3 nodes.",
- domain: 2,
- topic: "Data Sharing",
- difficulty: "medium"
- },
- {
- id: 'd2_7',
- question: "A data warehouse query that previously took 10 seconds now takes 2 minutes. The table hasn't changed in size, but maintenance hasn't been run recently. Which Redshift maintenance task should be performed first?",
- options: [
- "VACUUM to reclaim space and resort rows",
- "ANALYZE to update statistics",
- "Resize the cluster",
- "Modify the distribution key"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "ANALYZE updates table statistics that the query planner uses. Stale statistics cause poor query plans. This is the most likely cause when table size hasn't changed but performance degraded.",
- domain: 2,
- topic: "Redshift Maintenance",
- difficulty: "medium"
- },
- {
- id: 'd2_8',
- question: "A company stores time-series data in DynamoDB for IoT sensor readings. Each sensor writes 1 KB records every second. After 30 days, data should be moved to cold storage. Which approach is MOST cost-effective?",
- options: [
- "Use DynamoDB TTL with S3 export",
- "Use DynamoDB Streams with Lambda to archive to S3 Glacier",
- "Enable DynamoDB Global Tables for replication",
- "Use DynamoDB on-demand capacity mode"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "DynamoDB Streams with Lambda provides automatic archival workflow. When TTL deletes items, Streams captures the deletion, Lambda can write the data to S3 Glacier before it's removed.",
- domain: 2,
- topic: "Data Lifecycle",
- difficulty: "hard"
- },
- {
- id: 'd2_9',
- question: "A data lake uses S3 with millions of small files (1-10 KB each). Athena queries are slow despite using Parquet format. What optimization should be implemented?",
- options: [
- "Enable S3 Transfer Acceleration",
- "Compact small files into larger files (128MB-1GB)",
- "Change to ORC format",
- "Use S3 Intelligent-Tiering"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Small files create overhead as Athena must open each file separately. Compacting into larger files (128MB-1GB) reduces S3 API calls and improves query performance significantly.",
- domain: 2,
- topic: "File Optimization",
- difficulty: "medium"
- },
- {
- id: 'd2_10',
- question: "A company runs complex analytical queries on Amazon Redshift. Some queries scan fact tables with billions of rows. The data engineer notices queries don't use zone maps effectively. What is the MOST likely cause?",
- options: [
- "Sort keys are not defined on frequently filtered columns",
- "Distribution style is set to EVEN",
- "The cluster has too few nodes",
- "Workload Management (WLM) queues are misconfigured"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "Zone maps work with sort keys to skip blocks of data. Without proper sort keys on filtered columns, Redshift cannot use zone maps effectively for query optimization.",
- domain: 2,
- topic: "Redshift Performance",
- difficulty: "hard"
- },
- {
- id: 'd2_11',
- question: "A data lake stores customer data that must comply with GDPR right-to-erasure requirements. Data is stored in Parquet format on S3 and cataloged in Glue. How should deletion requests be handled efficiently?",
- options: [
- "Rewrite entire Parquet files without deleted records",
- "Use S3 Object Lock to prevent deletion",
- "Implement Delta Lake or Apache Iceberg for record-level operations",
- "Delete entire partitions containing the customer"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "Delta Lake or Apache Iceberg support ACID transactions and efficient record-level deletes. They track deleted records in metadata without rewriting entire files.",
- domain: 2,
- topic: "Data Lake ACID",
- difficulty: "hard"
- },
- {
- id: 'd2_12',
- question: "An analytics workload on Redshift has unpredictable query patterns. Sometimes there are no queries for hours, then sudden bursts of complex queries. Which pricing model is MOST cost-effective?",
- options: [
- "Provisioned Redshift cluster with reserved instances",
- "Redshift Serverless",
- "Provisioned cluster with Concurrency Scaling",
- "Multiple provisioned clusters in different regions"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Redshift Serverless automatically scales up during query bursts and scales to zero during idle periods. This is ideal for unpredictable workloads, charging only for actual usage.",
- domain: 2,
- topic: "Cost Optimization",
- difficulty: "medium"
- },
- {
- id: 'd2_13',
- question: "A data engineer needs to query data across S3 data lake and Redshift tables in a single query. Which feature enables this?",
- options: [
- "Redshift Federated Query",
- "Redshift Spectrum",
- "AWS Glue ETL",
- "Amazon Athena Federated Query"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Redshift Spectrum allows querying S3 data directly from Redshift using external tables. You can join Spectrum external tables with native Redshift tables in a single query.",
- domain: 2,
- topic: "Data Federation",
- difficulty: "medium"
- },
- {
- id: 'd2_14',
- question: "A company migrates from on-premises Hadoop to AWS. The existing data lake uses Hive with ORC files. They want to minimize changes to existing ETL jobs. Which AWS service should host the metastore?",
- options: [
- "AWS Glue Data Catalog with Hive-compatible interface",
- "Amazon RDS MySQL as external Hive metastore",
- "Amazon DynamoDB for metadata storage",
- "AWS Lake Formation with new metadata format"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "AWS Glue Data Catalog is Hive metastore compatible. It can serve as a drop-in replacement for Hive metastore, minimizing changes to existing ETL jobs that use HiveContext or SparkSQL.",
- domain: 2,
- topic: "Metadata Management",
- difficulty: "medium"
- },
- {
- id: 'd2_15',
- question: "A data lake has grown to 500 TB across 10,000 tables. Different teams need access to different subsets of data. Managing individual IAM policies has become complex. Which solution simplifies access control?",
- options: [
- "Create IAM groups for each team",
- "Use S3 bucket policies with prefixes",
- "Implement Lake Formation with tag-based access control",
- "Set up VPC endpoints for each team"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "Lake Formation tag-based access control (LF-TBAC) allows tagging resources and defining access based on tags. This scales much better than managing individual resource policies.",
- domain: 2,
- topic: "Access Control",
- difficulty: "medium"
- },
- {
- id: 'd2_16',
- question: "A Redshift cluster stores 10 TB of data with 5 years of history. 90% of queries access only the last 3 months of data. How should data be organized for optimal cost and performance?",
- options: [
- "Archive old data to S3, keep recent data in Redshift, use Spectrum for historical queries",
- "Keep all data in Redshift with interleaved sort keys",
- "Move all data to S3 and use Athena exclusively",
- "Create separate Redshift clusters for historical and recent data"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "Archiving older data to S3 and querying via Spectrum reduces Redshift storage costs while keeping recent hot data in Redshift for performance.",
- domain: 2,
- topic: "Hot/Cold Data",
- difficulty: "hard"
- },
- {
- id: 'd2_17',
- question: "A DynamoDB table serves both transactional and analytical workloads. Analytical queries cause throttling on the main table. Which solution addresses this without affecting transactions?",
- options: [
- "Increase provisioned capacity",
- "Enable DynamoDB Global Tables",
- "Export data to S3 and use Athena for analytics",
- "Create a Global Secondary Index for analytical queries"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "Exporting DynamoDB data to S3 and using Athena separates analytical workloads from transactional. This prevents analytics from consuming capacity that impacts transactional queries.",
- domain: 2,
- topic: "Workload Separation",
- difficulty: "medium"
- },
- {
- id: 'd2_18',
- question: "A data lake team needs to maintain multiple versions of datasets for reproducibility. Users should be able to query historical states of data. Which approach provides this capability?",
- options: [
- "S3 versioning with Athena",
- "Apache Iceberg tables with time travel",
- "Glue job bookmarks",
- "S3 replication to backup bucket"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Apache Iceberg supports time travel queries allowing users to query data as of a specific timestamp or snapshot. This provides data versioning with efficient storage and query capabilities.",
- domain: 2,
- topic: "Data Versioning",
- difficulty: "hard"
- }
- ],
-
- // Domain 3: Data Operations and Support (22% - 15 questions)
- 3: [
- {
- id: 'd3_1',
- question: "A data pipeline runs daily at 2 AM using Amazon MWAA (Managed Airflow). The pipeline has started failing intermittently. What should the data engineer use to identify the root cause?",
- options: [
- "Check AWS CloudTrail logs only",
- "Review Airflow task logs in CloudWatch Logs and MWAA metrics",
- "Enable VPC Flow Logs",
- "Check S3 access logs"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Amazon MWAA integrates with CloudWatch Logs for Airflow task logs and provides MWAA-specific metrics in CloudWatch. This combination provides visibility into task execution for troubleshooting.",
- domain: 3,
- topic: "Monitoring and Logging",
- difficulty: "medium"
- },
- {
- id: 'd3_2',
- question: "A company needs to implement data quality checks in their AWS Glue ETL pipeline. They want to validate that email addresses match a valid format and customer_id has no null values. What should they use?",
- options: [
- "AWS Glue DataBrew data profiling",
- "AWS Glue Data Quality with DQDL rules",
- "Custom PySpark validation code in Glue",
- "Amazon Athena queries after loading"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "AWS Glue Data Quality with Data Quality Definition Language (DQDL) allows defining validation rules like CustomSql, ColumnValues, and Completeness checks natively in Glue ETL.",
- domain: 3,
- topic: "Data Quality",
- difficulty: "medium"
- },
- {
- id: 'd3_3',
- question: "An ETL pipeline uses AWS Step Functions to orchestrate AWS Glue jobs. The team needs to receive alerts when any job fails. What is the MOST efficient way to implement this?",
- options: [
- "Add a Catch state that invokes Lambda to send SNS notifications",
- "Create CloudWatch Alarms for each Glue job",
- "Poll Step Functions execution status using Lambda",
- "Enable AWS Config rules for Step Functions"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "Using a Catch state in Step Functions that invokes Lambda to send SNS notifications provides centralized error handling at the workflow level and immediate notifications.",
- domain: 3,
- topic: "Alerting",
- difficulty: "medium"
- },
- {
- id: 'd3_4',
- question: "A data engineer needs to optimize costs for an AWS Glue ETL workload that runs 4 hours daily. The job uses 20 DPUs. What cost optimization strategy should be considered?",
- options: [
- "Use Glue Auto Scaling to optimize DPU usage",
- "Run the job on Amazon EMR instead",
- "Increase DPUs to complete faster",
- "Use Glue version 0.9 for lower costs"
- ],
- correctAnswer: 0,
- correctAnswers: [0],
- type: "single",
- explanation: "AWS Glue Auto Scaling automatically adjusts workers based on actual resource needs. This can significantly reduce costs by using fewer DPUs when full capacity isn't needed.",
- domain: 3,
- topic: "Cost Optimization",
- difficulty: "medium"
- },
- {
- id: 'd3_5',
- question: "A company needs to implement a data pipeline that triggers when new files arrive in S3. The pipeline should process files in order and handle retries automatically. Which orchestration approach is recommended?",
- options: [
- "S3 Event → SNS → Lambda",
- "S3 Event → EventBridge → Step Functions",
- "S3 Event → SQS → Lambda (polling)",
- "CloudWatch Events → Lambda → Glue"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "S3 Event to EventBridge to Step Functions provides reliable event capture with built-in retry logic, error handling, and execution state management.",
- domain: 3,
- topic: "Event-Driven Architecture",
- difficulty: "hard"
- },
- {
- id: 'd3_6',
- question: "A data team runs 50+ Glue jobs with complex dependencies. Managing job scheduling and dependencies has become difficult. Which solution provides better orchestration?",
- options: [
- "AWS Glue Workflows",
- "EventBridge Scheduler with Lambda",
- "Amazon MWAA (Managed Apache Airflow)",
- "Step Functions Standard Workflows"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "Amazon MWAA provides a full-featured workflow orchestration with rich DAG visualization, dependency management, and extensive operator ecosystem for complex job dependencies.",
- domain: 3,
- topic: "Workflow Orchestration",
- difficulty: "medium"
- },
- {
- id: 'd3_7',
- question: "A streaming pipeline occasionally drops records during high-volume periods. The data engineer needs to identify which records were lost and from which source. Which approach enables this investigation?",
- options: [
- "Enable CloudWatch detailed monitoring",
- "Implement record counting at each pipeline stage with reconciliation",
- "Increase Kinesis shard count",
- "Enable S3 server access logging"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Implementing record counting at each stage allows reconciliation between source and destination. This identifies exactly where and how many records are lost in the pipeline.",
- domain: 3,
- topic: "Data Reconciliation",
- difficulty: "hard"
- },
- {
- id: 'd3_8',
- question: "An AWS Glue job writes data to S3, but downstream consumers occasionally see incomplete data. The job logs show successful completion. What is the MOST likely cause?",
- options: [
- "Eventual consistency of S3",
- "Glue job is not committing the transaction",
- "Files are being written without _SUCCESS marker or manifest",
- "CloudWatch logs are delayed"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "Without a _SUCCESS marker or manifest file, downstream consumers may read incomplete data as Glue writes multiple files. The marker signals job completion for safe consumption.",
- domain: 3,
- topic: "Job Completion Signaling",
- difficulty: "hard"
- },
- {
- id: 'd3_9',
- question: "A company needs to run Spark jobs on EMR but wants to minimize cluster management overhead. Jobs run for 2-4 hours daily. Which EMR deployment option is MOST suitable?",
- options: [
- "EMR on EC2 with long-running cluster",
- "EMR on EKS",
- "EMR Serverless",
- "EMR on EC2 with transient clusters"
- ],
- correctAnswer: 2,
- correctAnswers: [2],
- type: "single",
- explanation: "EMR Serverless eliminates cluster management entirely. Jobs run without provisioning or managing clusters. For 2-4 hour daily jobs, this provides minimal overhead and pay-per-use billing.",
- domain: 3,
- topic: "Managed Services",
- difficulty: "medium"
- },
- {
- id: 'd3_10',
- question: "A data pipeline must meet an SLA of 99.9% availability. The pipeline uses Lambda, Glue, and S3. Which monitoring strategy ensures SLA compliance?",
- options: [
- "CloudWatch Logs only",
- "Custom dashboard with CloudWatch Metrics, Alarms, and automated remediation",
- "AWS X-Ray tracing only",
- "Manual log review daily"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "A comprehensive monitoring strategy with CloudWatch Metrics for uptime tracking, Alarms for immediate detection, and automated remediation ensures 99.9% SLA compliance.",
- domain: 3,
- topic: "SLA Monitoring",
- difficulty: "medium"
- },
- {
- id: 'd3_11',
- question: "A Glue job processes data from multiple S3 prefixes. The job should only process new files since the last successful run. Which feature enables this?",
- options: [
- "S3 Inventory",
- "AWS Glue job bookmarks",
- "S3 event notifications",
- "AWS Glue triggers"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Glue job bookmarks track which files have been processed, allowing the job to automatically process only new or unprocessed files in subsequent runs.",
- domain: 3,
- topic: "Incremental Processing",
- difficulty: "easy"
- },
- {
- id: 'd3_12',
- question: "A data pipeline team needs to implement blue-green deployment for their ETL jobs. How should they approach this for AWS Glue jobs?",
- options: [
- "Use Glue versions feature",
- "Maintain separate job definitions with version suffixes and swap aliases",
- "Use AWS CodePipeline with Glue deployment actions",
- "Clone jobs manually before updates"
- ],
- correctAnswer: 1,
- correctAnswers: [1],
- type: "single",
- explanation: "Maintaining separate job definitions (blue/green versions) with workflow aliases allows safe deployment. Test the new version, then swap the alias to point to green, enabling instant rollback.",
- domain: 3,
- topic: "Deployment Strategy",
- difficulty: "hard"
- },
- {
- id: 'd3_13',
- question: "An MWAA environment runs out of memory when running many parallel tasks. Task logs show OOM errors. What should be modified? (Select TWO)",
- options: [
- "Increase MWAA environment size",
- "Reduce parallelism configuration in Airflow",
- "Use larger S3 bucket",
- "Enable MWAA auto-scaling",
- "Reduce DAG complexity"
- ],
- correctAnswer: null,
- correctAnswers: [0, 1],
- type: "multiple",
- selectCount: 2,
- explanation: "Increasing MWAA environment size provides more memory. Reducing parallelism limits concurrent tasks, preventing memory exhaustion. Both directly address OOM issues.",
- domain: 3,
- topic: "MWAA Tuning",
- difficulty: "hard"
- },
- {
- id: 'd3_14',
- question: "A data team wants to implement CI/CD for their Glue ETL scripts stored in Git. Which AWS service best integrates with Git for automated Glue deployments?",
- options: [
- "AWS CodePipeline with CodeBuild",
- "AWS CloudFormation StackSets",
- "AWS Systems Manager Automation",
- "AWS Glue DevEndpoints"
- ],
- correctAnswer: 0,