datazone
10/1/25 · About 3 min
(2025-09-30) Target: Crawler can run successfully
- S3 bucket
create s3:
name: c3l-engage-ai-raw-dataset-t - Data
- Database
create glue database:
aws glue create-database \
--database-input '{"Name":"my_glue_db","Description":"Glue database for S3 tables"}' \
--profile dev-admin
Glue execution role
arn:aws:iam::123456789012:role/AWSGlueServiceRole-S3Access
Glue crawler
create crawler:
aws glue create-crawler \
--name my-glue-crawler \
--role arn:aws:iam::123456789012:role/AWSGlueServiceRole-S3Access \
--database-name my_glue_db \
--targets '{"S3Targets":[{"Path":"s3://my-test-bucket-12345/"}]}'
Info
crawler: engage_ai_crawler_glue_database
crawler_execution_role: service-role/AWSGlueServiceRole-engage_ai_crawler_role
Start Crawler
start crawler:
aws glue start-crawler --name engage_ai_crawler_glue_database --profile dev-admin
Warning
The root user is not permitted to start the crawler.
- LF permissions
Nothing to do
(2025-10-01) Target: DataZone can view the Glue database
Step 1: Prepare our data
Glue
- Database: engage_ai_crawler_glue_database
- Tables:
| Table name |
|---|
| engage_ai_raw_data_2025_sp6_20250921_log_csv |
| engage_ai_raw_data_2025_sp6_2525_4640_duedate_csv |
| engage_ai_raw_data_2025_sp6_videolog_2020915_0921_csv |
| engage_ai_raw_data_2025_sp_video_overview_csv |
- Glue Data Catalog:
Datazone
- Domain
aws datazone list-domains --profile dev-admin
{
"items": [
{
"arn": "arn:aws:datazone:ap-southeast-2:723609007760:domain/dzd-bvogwir6r1n3s2",
"createdAt": "2025-09-26T14:00:16.558000+09:30",
"domainVersion": "V2",
"id": "dzd-bvogwir6r1n3s2",
"managedAccountId": "723609007760",
"name": "Engage_AI_domain-09-26-2025-135935",
"portalUrl": "https://dzd-bvogwir6r1n3s2.sagemaker.ap-southeast-2.on.aws",
"status": "AVAILABLE"
}
]
}
- Project
aws datazone list-projects --domain-identifier dzd-bvogwir6r1n3s2 --profile dev-admin
{
"items": [
{
"createdAt": "2025-09-26T04:30:24.541196+00:00",
"createdBy": "fc7f1d14-f412-4796-9222-3ef1552d0518",
"domainId": "dzd-bvogwir6r1n3s2",
"domainUnitId": "4s3svwa44wh6le",
"id": "avk4h1aqkcwj36",
"name": "GenerativeAIModelGovernanceProject",
"projectStatus": "ACTIVE",
"updatedAt": "2025-09-26T04:30:27.091206+00:00"
},
{
"createdAt": "2025-09-26T04:47:50.837395+00:00",
"createdBy": "fc7f1d14-f412-4796-9222-3ef1552d0518",
"description": "",
"domainId": "dzd-bvogwir6r1n3s2",
"domainUnitId": "4s3svwa44wh6le",
"id": "bvudb3cihl3naa",
"name": "Engage_AI_Project_mg0czj0g",
"projectStatus": "ACTIVE",
"updatedAt": "2025-09-26T06:09:11.888217+00:00"
}
]
}
- datasource
command line
aws datazone list-data-sources \
--domain-identifier dzd-bvogwir6r1n3s2 \
--project-identifier bvudb3cihl3naa \
--region ap-southeast-2 \
--profile dev-admin
meta
{
"items": [
{
"connectionId": "dtugvipetrmaoy",
"createdAt": "2025-10-01T02:42:20.552386+00:00",
"dataSourceId": "4sdumylv8cx89u",
"description": "engage_ai_crawler_output_glue_table",
"domainId": "dzd-bvogwir6r1n3s2",
"enableSetting": "ENABLED",
"lastRunAssetCount": 0,
"name": "engage_ai_crawler_output_glue_table",
"status": "READY",
"type": "GLUE",
"updatedAt": "2025-10-01T05:04:20.455228+00:00"
},
{
"connectionId": "dtugvipetrmaoy",
"createdAt": "2025-09-26T04:52:42.848591+00:00",
"dataSourceId": "d854oj1618mkrm",
"domainId": "dzd-bvogwir6r1n3s2",
"enableSetting": "ENABLED",
"lastRunAssetCount": 0,
"lastRunAt": "2025-10-01T04:53:30.274819+00:00",
"lastRunStatus": "SUCCESS",
"name": "723609007760-AwsDataCatalog-engage_ai_glue_db-default-datasource",
"schedule": {
"schedule": "cron(52 4 ? * * *)"
},
"status": "READY",
"type": "GLUE",
"updatedAt": "2025-10-01T05:05:24.256431+00:00"
},
{
"connectionId": "564wv5p8ba4gpu",
"createdAt": "2025-09-26T04:52:17.158276+00:00",
"dataSourceId": "57u8esxfxqrj1e",
"domainId": "dzd-bvogwir6r1n3s2",
"enableSetting": "ENABLED",
"lastRunAssetCount": 0,
"lastRunAt": "2025-10-01T04:59:43.312507+00:00",
"lastRunStatus": "RUNNING",
"name": "Tooling-default-sagemaker-modelpackagegroup-datasource",
"schedule": {
"schedule": "cron(52 4 * * ? *)"
},
"status": "RUNNING",
"type": "SAGEMAKER",
"updatedAt": "2025-10-01T04:59:43.495764+00:00"
}
]
}
- Environment
command
aws datazone list-environments \
--domain-identifier dzd-bvogwir6r1n3s2 \
--project-identifier bvudb3cihl3naa \
--profile dev-admin
meta
{
"items": [
{
"awsAccountId": "723609007760",
"awsAccountRegion": "ap-southeast-2",
"createdAt": "2025-09-26T04:47:53.938905+00:00",
"createdBy": "SYSTEM",
"description": "Configuration for the Tooling",
"domainId": "dzd-bvogwir6r1n3s2",
"environmentConfigurationId": "478ec968-2a42-484b-aecf-dae8b60668b8",
"id": "c1gwqadiijv5sy",
"name": "Tooling",
"projectId": "bvudb3cihl3naa",
"provider": "Amazon SageMaker",
"status": "ACTIVE",
"updatedAt": "2025-09-26T04:52:19.262598+00:00"
},
{
"awsAccountId": "723609007760",
"awsAccountRegion": "ap-southeast-2",
"createdAt": "2025-09-26T04:52:26.656347+00:00",
"createdBy": "SYSTEM",
"description": "Creates databases in Amazon SageMaker Lakehouse for storing tables in S3 and Amazon Athena resources for your SQL workloads",
"domainId": "dzd-bvogwir6r1n3s2",
"environmentConfigurationId": "c6cdbf30-a922-4bc6-9516-23ec1c946da5",
"id": "4rtzqbu8msmsrm",
"name": "Lakehouse Database",
"projectId": "bvudb3cihl3naa",
"provider": "Amazon SageMaker",
"status": "ACTIVE",
"updatedAt": "2025-09-26T04:52:43.413313+00:00"
}
]
}
Step 2: Link DataZone and Glue
Prerequisites
Execution role:
- Can read the Glue Data Catalog and S3 metadata
- The trust policy contains datazone.amazonaws.com
Execution role
arn:aws:iam::723609007760:role/service-role/AmazonSageMakerDomainExecution
1. Create a data source for the Glue database
Create connection
aws datazone create-connection \
--domain-identifier dzd-bvogwir6r1n3s2 \
--environment-identifier 4rtzqbu8msmsrm \
--name engageai-glue-connection \
--profile dev-admin
v1 (with --environment-identifier)
aws datazone create-data-source \
--domain-identifier dzd-bvogwir6r1n3s2 \
--project-identifier bvudb3cihl3naa \
--environment-identifier 4rtzqbu8msmsrm \
--name engageai-dzd-source \
--type GLUE \
--configuration '{
"glueRunConfiguration": {
"catalogName": "AwsDataCatalog",
"dataAccessRole": "arn:aws:iam::723609007760:role/service-role/AmazonSageMakerDomainExecution",
"relationalFilterConfigurations": [
{
"databaseName": "engage_ai_glue_database"
}
]
}
}' \
--profile dev-admin
v2 (without --environment-identifier)
aws datazone create-data-source \
--domain-identifier dzd-bvogwir6r1n3s2 \
--project-identifier bvudb3cihl3naa \
--name engageai-dzd-source \
--type GLUE \
--configuration '{
"glueRunConfiguration": {
"catalogName": "AwsDataCatalog",
"dataAccessRole": "arn:aws:iam::723609007760:role/service-role/AmazonSageMakerDomainExecution",
"relationalFilterConfigurations": [
{
"databaseName": "engage_ai_glue_database"
}
]
}
}' \
--profile dev-admin
2. Sync the data source (replace the placeholder domain and data source IDs below with real values)
aws datazone start-data-source-ingestion-job \
--domain-identifier dzd-xxxxxxxx \
--data-source-identifier glue-source-mydb
Check the progress
aws datazone get-data-source-ingestion-job \
--domain-identifier dzd-xxxxxxxx \
--data-source-identifier glue-source-mydb \
--identifier <job-id>