#########################################################################
## Common Variables
import os
import tarfile
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
# from sagemaker import get_execution_role  # use this when running inside SageMaker
def get_execution_role():
    # Hardcoded IAM role for running outside SageMaker Studio/notebooks,
    # where sagemaker.get_execution_role() is not available
    return "SageMaker-ExecutionRole-20241030T121452"
sess = boto3.Session()
sm = sess.client("sagemaker")
role = get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)
bucket = "shared-hs-mlops-bucket" #sagemaker_session.default_bucket()
region = boto3.Session().region_name
model_package_group_name = "PipelineModelPackageGroup"
prefix = "mas-pipeline-model-example"
pipeline_name = "mas-serial-inference-pipeline" # SageMaker Pipeline name
raw_dir = os.path.join(os.getcwd(), "data", "raw")
os.makedirs(raw_dir, exist_ok=True)  # the to_csv call below fails if this directory does not exist
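# Optional sanity check (a sketch, not part of the original example): verify
# that the shared bucket is reachable with the current credentials before any
# uploads. head_bucket raises botocore.exceptions.ClientError if the bucket is
# missing or access is denied.
boto3.client("s3").head_bucket(Bucket=bucket)
print(f"role={role} bucket={bucket} region={region}")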
#########################################################################
## Download Data
s3 = boto3.client("s3")
s3.download_file(
f"sagemaker-example-files-prod-{region}",
"datasets/tabular/california_housing/cal_housing.tgz",
"cal_housing.tgz",
)
# Extract the archive; the shell magic "!tar -zxf cal_housing.tgz" only works
# inside a notebook, so use tarfile as the plain-Python equivalent.
with tarfile.open("cal_housing.tgz") as tar:
    tar.extractall()
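# The archive is expected to expand to CaliforniaHousing/cal_housing.data
# (the path read below); fail fast if extraction produced something else.
assert os.path.exists("CaliforniaHousing/cal_housing.data"), "extraction failed"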
columns = [
    "longitude", "latitude", "housingMedianAge",
    "totalRooms", "totalBedrooms", "population",
    "households", "medianIncome", "medianHouseValue",
]
cal_housing_df = pd.read_csv("CaliforniaHousing/cal_housing.data", names=columns, header=None)
cal_housing_df["medianHouseValue"] /= 500000 # Scaling target down to avoid overcomplicating the example
cal_housing_df.to_csv(f"./data/raw/raw_data_all.csv", header=True, index=False)
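# Quick look at the prepared data (illustrative only): the full California
# housing dataset has 20,640 rows, and after dividing by 500000 the target
# should sit in roughly the [0, 1] range.
print(cal_housing_df.shape)
print(cal_housing_df["medianHouseValue"].describe())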
rawdata_s3_prefix = f"{prefix}/data/raw"
#########################################################################
## Upload to S3
raw_s3 = sagemaker_session.upload_data(
    path="./data/raw/",
    bucket=bucket,
    key_prefix=rawdata_s3_prefix,
)
print(raw_s3)  # s3://shared-hs-mlops-bucket/mas-pipeline-model-example/data/raw
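# Optional: confirm the upload by listing the keys that actually landed under
# the prefix (a sketch; output depends on the bucket contents).
for obj in boto3.client("s3").list_objects_v2(Bucket=bucket, Prefix=rawdata_s3_prefix).get("Contents", []):
    print(obj["Key"], obj["Size"])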