From 4e0f700171db534d2de414577f36122e7261cd67 Mon Sep 17 00:00:00 2001 From: sbeesla-gds <129942405+sbeesla-gds@users.noreply.github.com> Date: Wed, 20 Dec 2023 10:55:05 +0000 Subject: [PATCH] No int test/dac 1919 raw layer single table (#479) * raw layer events single table schema * raw layer events single table schema * created state machine definition * created state machine resource * created state machine resource --- iac/main/resources/raw.yml | 17 ++++ iac/main/resources/state-machine.yml | 22 +++++ ...er_consolidated_schema_processing.asl.json | 102 +++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 statemachine/txma_raw_layer_consolidated_schema_processing.asl.json diff --git a/iac/main/resources/raw.yml b/iac/main/resources/raw.yml index 014d29251..7908e2feb 100644 --- a/iac/main/resources/raw.yml +++ b/iac/main/resources/raw.yml @@ -910,3 +910,20 @@ RawGlueDatabase: CatalogId: !Sub ${AWS::AccountId} DatabaseInput: Name: !Sub ${Environment}-${RawGlueDatabaseName} + +RawLayerSingleTableCrawler: + Type: AWS::Glue::Crawler + Properties: + Name: txma_raw_layer_events_schema_combined + Role: !GetAtt RawGlueCrawlerRole.Arn + Targets: + S3Targets: + - Path: !Sub 's3://${RawLayerBucket}/txma/' + DatabaseName: !Ref RawGlueDatabase + CrawlerSecurityConfiguration: !Ref GlueSecurityConfig + RecrawlPolicy: + RecrawlBehavior: CRAWL_EVERYTHING + SchemaChangePolicy: + UpdateBehavior: UPDATE_IN_DATABASE + DeleteBehavior: DELETE_FROM_DATABASE + Configuration: '{"Version":1,"CrawlerOutput":{"Partitions":{"AddOrUpdateBehavior":"InheritFromTable"}}, "Grouping": {"TableGroupingPolicy": "CombineCompatibleSchemas"}}' diff --git a/iac/main/resources/state-machine.yml b/iac/main/resources/state-machine.yml index c93b708e3..58c8381e4 100644 --- a/iac/main/resources/state-machine.yml +++ b/iac/main/resources/state-machine.yml @@ -247,6 +247,7 @@ StepFunctionRole: Resource: - !Sub 
'arn:aws:states:${AWS::Region}:${AWS::AccountId}:stateMachine:${Environment}-dap-raw-to-stage-process' - !Sub 'arn:aws:states:${AWS::Region}:${AWS::AccountId}:stateMachine:${Environment}-dap-redshift-processing' + - !Sub 'arn:aws:states:${AWS::Region}:${AWS::AccountId}:stateMachine:${Environment}-dap-txma-raw-consolidated-schema-to-stage-process' Action: - states:ListExecutions - states:StartExecution @@ -660,3 +661,24 @@ StepFunctionRedshiftProcessRole: - redshift-data:GetStatementResult - redshift-data:DescribeStatement - redshift-data:ListStatements + +TxmaRawLayerConsolidatedSchemaProcessingStateMachine: + Type: AWS::Serverless::StateMachine # More info about State Machine Resource: https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/sam-resource-statemachine.html + Properties: + DefinitionUri: statemachine/txma_raw_layer_consolidated_schema_processing.asl.json + Name: !Sub ${Environment}-dap-txma-raw-consolidated-schema-to-stage-process + Role: !GetAtt StepFunctionRole.Arn + Logging: + Destinations: + - CloudWatchLogsLogGroup: + LogGroupArn: !GetAtt AthenaRawLayerProcessingLogGroup.Arn + IncludeExecutionData: true + Level: ALL + DefinitionSubstitutions: + GlueCrawlerRawName: !Ref RawLayerSingleTableCrawler + Events: + DailySchedule: + Type: Schedule # More info about Schedule Event Source: https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/sam-property-statemachine-schedule.html + Properties: + State: !If [IsTest, DISABLED, ENABLED] # disable the schedule in test; Enabled only takes a literal boolean, State accepts Fn::If + Schedule: cron(0 6 * * ? 
*) diff --git a/statemachine/txma_raw_layer_consolidated_schema_processing.asl.json b/statemachine/txma_raw_layer_consolidated_schema_processing.asl.json new file mode 100644 index 000000000..cd74521bd --- /dev/null +++ b/statemachine/txma_raw_layer_consolidated_schema_processing.asl.json @@ -0,0 +1,102 @@ +{ + "Comment": "AWS Step Functions state machine that processes raw datasets (consolidated schema) from TxMA into the staging layer of Athena", + "StartAt": "ListExecutions", + "States": { + "ListExecutions": { + "Type": "Task", + "Next": "ValidateRunningInstances", + "Parameters": { + "StateMachineArn.$": "$$.StateMachine.Id", + "StatusFilter": "RUNNING" + }, + "Resource": "arn:aws:states:::aws-sdk:sfn:listExecutions", + "ResultSelector": { + "runningExecutionsCount.$": "States.ArrayLength($.Executions)" + } + }, + "ValidateRunningInstances": { + "Type": "Choice", + "Choices": [ + { + "Variable": "$.runningExecutionsCount", + "NumericGreaterThan": 1, + "Next": "RunningInstanceDetected" + } + ], + "Default": "StartRawCrawler" + }, + "RunningInstanceDetected": { + "Type": "Fail", + "Error": "RunningInstanceDetected" + }, + "StartRawCrawler": { + "Type": "Task", + "Resource": "arn:aws:states:::aws-sdk:glue:startCrawler", + "Parameters": { + "Name": "${GlueCrawlerRawName}" + }, + "Next": "MonitorRawCrawlerStatus", + "ResultPath": "$.rawCrawlerResult" + }, + "MonitorRawCrawlerStatus": { + "Type": "Task", + "Resource": "arn:aws:states:::aws-sdk:glue:getCrawler", + "Parameters": { + "Name": "${GlueCrawlerRawName}" + }, + "ResultPath": "$.crawlerRawStatus", + "Next": "CheckRawCrawlerStatus" + }, + "CheckRawCrawlerStatus": { + "Type": "Choice", + "Comment": "Keep polling while the crawler is RUNNING or STOPPING. READY only means the crawler is idle, so LastCrawl.Status decides whether the run succeeded; anything else falls through to CrawlerFailed.", + "Choices": [ + { + "Or": [ + { + "Variable": "$.crawlerRawStatus.Crawler.State", + "StringEquals": "RUNNING" + }, + { + "Variable": "$.crawlerRawStatus.Crawler.State", + "StringEquals": "STOPPING" + } + ], + "Next": "WaitForRawCrawler" + }, + { + "And": [ + { + "Variable": "$.crawlerRawStatus.Crawler.State", + "StringEquals": "READY" + }, + { + 
"Variable": "$.crawlerRawStatus.Crawler.LastCrawl.Status", + "StringEquals": "SUCCEEDED" + } + ], + "Next": "RawCrawlerFinished" + } + ], + "Default": "CrawlerFailed" + }, + "WaitForRawCrawler": { + "Type": "Wait", + "Seconds": 30, + "Next": "MonitorRawCrawlerStatus" + }, + "RawCrawlerFinished": { + "Type": "Pass", + "Next": "StopProcessing" + }, + "StopProcessing": { + "Type": "Pass", + "End": true + }, + "CrawlerFailed": { + "Type": "Fail", + "Error": "CrawlerFailed", + "Cause": "The Glue crawler execution failed." + } + } +}