카테고리
Feature
생성자
U
Untitled‣
S3(no Catalog) on StarRocks
‣
S3(Glue) on StarRocks
‣
Hudi on StarRocks
‣
Iceberg on StarRocks
- 결론
- S3(Hive native)
- Iceberg
- Hudi
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Resolve the job arguments and bootstrap the Spark / Glue objects for this run.
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Source: load table `hive_nyc_final` from database `datalake` in the
# AWS Glue Data Catalog as a DynamicFrame.
source_dynamic_frame = glueContext.create_dynamic_frame.from_catalog(
    database="datalake",
    table_name="hive_nyc_final",
    transformation_ctx="AWSGlueDataCatalog_node1744735871592",
)

# Sink: Hudi writer options, passed verbatim to the Spark "hudi" data source.
hudi_write_options = {
    "hoodie.table.name": "nyc_hudi",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
    "hoodie.datasource.write.operation": "bulk_insert",
    "hoodie.datasource.write.recordkey.field": "pu_zone",
    "hoodie.datasource.write.precombine.field": "tpep_pickup_datetime",
    "hoodie.datasource.write.hive_style_partitioning": "true",
    "hoodie.parquet.compression.codec": "gzip",
}
hudi_target_path = "s3://data-bucket-edkim/datalab/test/nyc_hudi/"

# Convert the DynamicFrame to a Spark DataFrame and write it to S3 in Hudi
# format using save mode "append".
source_df = source_dynamic_frame.toDF()
hudi_writer = source_df.write.format("hudi").options(**hudi_write_options)
hudi_writer.mode("append").save(hudi_target_path)
job.commit()
-- By Instance Profile
-- Registers a Hudi external catalog that uses AWS Glue as its metastore.
-- Glue and S3 access both set use_instance_profile = "true" and also name an
-- IAM role ARN; replace <role_arn> before running.
-- NOTE(review): per the StarRocks docs, supplying iam_role_arn together with
-- use_instance_profile = "true" appears to select assumed-role authentication
-- rather than plain instance-profile access -- confirm which one is intended.
-- This file defines hudi_catalog_glue twice (once per credential method);
-- run only one of the two statements.
CREATE EXTERNAL CATALOG hudi_catalog_glue
PROPERTIES
(
    -- Catalog type and metastore
    "type"                          = "hudi",
    "hive.metastore.type"           = "glue",

    -- AWS Glue access
    "aws.glue.use_instance_profile" = "true",
    "aws.glue.iam_role_arn"         = "<role_arn>",
    "aws.glue.region"               = "us-west-2",

    -- Amazon S3 access
    "aws.s3.use_instance_profile"   = "true",
    "aws.s3.iam_role_arn"           = "<role_arn>",
    "aws.s3.region"                 = "us-west-2"
);
-- By Access Key
-- Registers a Hudi external catalog that uses AWS Glue as its metastore and
-- authenticates to both Glue and S3 with an IAM user's access key / secret key
-- (use_instance_profile = "false"). Replace the <...> placeholders before
-- running, and keep real keys out of version control.
-- This file defines hudi_catalog_glue twice (once per credential method);
-- run only one of the two statements.
CREATE EXTERNAL CATALOG hudi_catalog_glue
PROPERTIES
(
    -- Catalog type and metastore
    "type"                          = "hudi",
    "hive.metastore.type"           = "glue",

    -- AWS Glue access
    "aws.glue.use_instance_profile" = "false",
    "aws.glue.access_key"           = "<iam_user_access_key>",
    "aws.glue.secret_key"           = "<iam_user_secret_key>",
    "aws.glue.region"               = "us-west-2",

    -- Amazon S3 access
    "aws.s3.use_instance_profile"   = "false",
    "aws.s3.access_key"             = "<iam_user_access_key>",
    "aws.s3.secret_key"             = "<iam_user_secret_key>",
    "aws.s3.region"                 = "us-west-2"
);
-- Smoke test: fetch 10 rows through the Hudi catalog to confirm it can read the table.
-- SELECT * is kept because this is an ad-hoc sample; with no ORDER BY, which
-- 10 rows come back is not fixed.
-- NOTE(review): the Glue script in this file writes hoodie.table.name "nyc_hudi";
-- the table queried here is nyc_hudi_snappy -- presumably a separately created
-- table, confirm.
SELECT *
FROM hudi_catalog_glue.datalake.nyc_hudi_snappy
LIMIT 10;