diff --git a/packages/@aws-cdk/aws-glue-alpha/README.md b/packages/@aws-cdk/aws-glue-alpha/README.md index cbbea0b1dbf84..9db34421f78d3 100644 --- a/packages/@aws-cdk/aws-glue-alpha/README.md +++ b/packages/@aws-cdk/aws-glue-alpha/README.md @@ -286,6 +286,39 @@ new glue.S3Table(this, 'MyTable', { }); ``` +### Partition Projection + +From the [Athena documentation](https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html): +> You can use partition projection in Athena to speed up query processing of highly partitioned tables and automate partition management. +> In partition projection, Athena calculates partition values and locations using the table properties that you configure directly on your table in AWS Glue. The table properties allow Athena to 'project', or determine, the necessary partition information instead of having to do a more time-consuming metadata lookup in the AWS Glue Data Catalog. Because in-memory operations are often faster than remote operations, partition projection can reduce the runtime of queries against highly partitioned tables. Depending on the specific characteristics of the query and underlying data, partition projection can significantly reduce query runtime for queries that are constrained on partition metadata retrieval. + +```ts +declare const myDatabase: glue.Database; +const partitionProjection = new glue.DatePartitionProjection( + 'datehour', + 's3://DOC-EXAMPLE-BUCKET/prefix/${datehour}/', + '2021/01/01,NOW', + 'yyyy/MM/dd', + 1, + glue.DateIntervalUnit.DAYS, +); + +new glue.S3Table(this, 'MyTable', { + database: myDatabase, + columns: [{ + name: 'col1', + type: glue.Schema.STRING, + }], + partitionKeys: [{ + name: 'datehour', + type: glue.Schema.STRING, + }], + partitionProjection, + dataFormat: glue.DataFormat.JSON, +}); +``` + + ### Partition Indexes Another way to improve query performance is to specify partition indexes. 
If no partition indexes are @@ -552,7 +585,7 @@ new glue.S3Table(this, 'MyTable', { |------------------------------------- |---------- |------------------------------------------------------------------- | | array(itemType: Type) | Function | An array of some other type | | map(keyType: Type, valueType: Type) | Function | A map of some primitive key type to any value type | -| struct(collumns: Column[]) | Function | Nested structure containing individually named and typed collumns | +| struct(columns: Column[]) | Function | Nested structure containing individually named and typed columns | ## Data Quality Ruleset diff --git a/packages/@aws-cdk/aws-glue-alpha/lib/index.ts b/packages/@aws-cdk/aws-glue-alpha/lib/index.ts index 1b9514c14625e..8f3fa639b99e2 100644 --- a/packages/@aws-cdk/aws-glue-alpha/lib/index.ts +++ b/packages/@aws-cdk/aws-glue-alpha/lib/index.ts @@ -12,5 +12,6 @@ export * from './s3-table'; export * from './schema'; export * from './security-configuration'; export * from './storage-parameter'; +export * from './partition-projection'; export * from './table-base'; export * from './table-deprecated'; diff --git a/packages/@aws-cdk/aws-glue-alpha/lib/partition-projection.ts b/packages/@aws-cdk/aws-glue-alpha/lib/partition-projection.ts new file mode 100644 index 0000000000000..efca5350caeaf --- /dev/null +++ b/packages/@aws-cdk/aws-glue-alpha/lib/partition-projection.ts @@ -0,0 +1,294 @@ +/** +* The partition projection type. 
/**
 * The partition projection type.
 *
 * The enum value is the literal written to the `projection.<column>.type`
 * table parameter.
 *
 * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
 */
export enum PartitionProjectionType {
  /**
   * Enum projection: partition values come from an enumerated list of values.
   */
  ENUM_TYPE = 'enum',
  /**
   * Integer projection: partition values are integers within a range.
   */
  INTEGER_TYPE = 'integer',
  /**
   * Date projection: partition values are dates within a range.
   */
  DATE_TYPE = 'date',
  /**
   * Injected projection.
   */
  INJECTED_TYPE = 'injected',
}

/**
 * Build the table parameter name `projection.<key>` for one setting of a
 * projection, where `<key>` is whatever `getParameterKey` returns.
 */
function projectionKey(projection: PartitionProjection, paramName: string): string {
  return `projection.${projection.getParameterKey(paramName)}`;
}

/**
 * Serialize an optional numeric setting.
 *
 * Returns `undefined` when the value is not set. `0` is also treated as
 * "not set": the settings serialized here are documented as positive integers,
 * so `0` is never a meaningful value to emit.
 */
function optionalNumberToString(value?: number): string | undefined {
  return value ? String(value) : undefined;
}

/**
 * Base class for a partition projection on a single partition column.
 *
 * A projection is rendered into Glue table parameters by `toOutputFormat()`.
 *
 * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html
 */
export abstract class PartitionProjection {
  constructor(
    /**
     * The type of the partition projection.
     * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html
     */
    public readonly type: PartitionProjectionType,
    /**
     * Required. The name of the partition column this projection applies to.
     */
    public readonly columnName: string,
    /**
     * The prefix format of the S3 bucket and keys that store the partitions.
     * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html#partition-projection-specifying-custom-s3-storage-locations
     */
    public readonly storageLocationTemplate: string) {}

  /**
   * Get the parameter key for the partition projection.
   *
   * The returned key is `<columnName>.<paramName>`; it does not include the
   * leading `projection.` segment of the final table parameter name.
   *
   * @param paramName the name of the parameter
   * @returns the parameter key for the partition projection
   */
  public getParameterKey(paramName: string): string {
    return `${this.columnName}.${paramName}`;
  }

  /**
   * Create the output format for the partition projection.
   *
   * The base implementation returns the parameters shared by every projection
   * type: `projection.enabled`, `storage.location.template` and
   * `projection.<column>.type`. Subclasses add their type-specific settings on
   * top of this result. A new object is returned on every call, so callers may
   * mutate it freely.
   *
   * @returns the table parameters for the partition projection
   */
  public toOutputFormat(): any {
    const parameters: Record<string, string | boolean> = {
      'projection.enabled': true,
      'storage.location.template': this.storageLocationTemplate,
      [projectionKey(this, 'type')]: this.type,
    };
    return parameters;
  }
}

/**
 * A time unit word that represents the serialized form of a ChronoUnit.
 * Possible values are YEARS, MONTHS, WEEKS, DAYS, HOURS, MINUTES, SECONDS, or MILLISECONDS. These values are case insensitive.
 *
 * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html#partition-projection-date-type
 */
export enum DateIntervalUnit {
  /**
   * The interval is expressed in years.
   */
  YEARS = 'YEARS',
  /**
   * The interval is expressed in months.
   */
  MONTHS = 'MONTHS',
  /**
   * The interval is expressed in weeks.
   */
  WEEKS = 'WEEKS',
  /**
   * The interval is expressed in days.
   */
  DAYS = 'DAYS',
  /**
   * The interval is expressed in hours.
   */
  HOURS = 'HOURS',
  /**
   * The interval is expressed in minutes.
   */
  MINUTES = 'MINUTES',
  /**
   * The interval is expressed in seconds.
   */
  SECONDS = 'SECONDS',
  /**
   * The interval is expressed in milliseconds.
   */
  MILLISECONDS = 'MILLISECONDS',
}

/**
 * Partition projection for a date-typed partition column.
 *
 * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html#partition-projection-date-type
 */
export class DatePartitionProjection extends PartitionProjection {
  /**
   * @param columnName Required. The name of the partition column this projection applies to.
   * @param storageLocationTemplate The prefix format of the S3 bucket and keys that store the partitions.
   * @param range Required. The minimum and maximum range values, comma separated.
   * @param format Required. The date format string used by the range values.
   * @param interval The interval between successive partition values.
   * @param intervalUnit The time unit of `interval`.
   */
  constructor(
    columnName: string,
    storageLocationTemplate: string,
    /**
     * Required. A two-element, comma-separated list which provides the minimum and maximum range values for the column columnName. These values are inclusive and can use any format compatible with the Java java.time.* date types. Both the minimum and maximum values must use the same format. The format specified in the .format property must be the format used for these values.
     *
     * This column can also contain relative date strings, formatted in this regular expression pattern:
     *
     * \s*NOW\s*(([\+\-])\s*([0-9]+)\s*(YEARS?|MONTHS?|WEEKS?|DAYS?|HOURS?|MINUTES?|SECONDS?)\s*)?
     *
     * White spaces are allowed, but in date literals are considered part of the date strings themselves.
     */
    public readonly range: string,
    /**
     * Required. A date format string based on the Java date format DateTimeFormatter. Can be any supported Java.time.* type.
     */
    public readonly format: string,
    /**
     * A positive integer that specifies the interval between successive partition values for column columnName. For example, a range value of 2017-01,2018-12 with an interval value of 1 and an interval.unit value of MONTHS produces the values 2017-01, 2017-02, 2017-03, and so on. The same range value with an interval value of 2 and an interval.unit value of MONTHS produces the values 2017-01, 2017-03, 2017-05, and so on. Leading and trailing white space is allowed.
     *
     * When the provided dates are at single-day or single-month precision, the interval is optional and defaults to 1 day or 1 month, respectively. Otherwise, interval is required.
     */
    public readonly interval?: number,
    /**
     * A time unit word that represents the serialized form of a ChronoUnit. Possible values are YEARS, MONTHS, WEEKS, DAYS, HOURS, MINUTES, SECONDS, or MILLISECONDS. These values are case insensitive.
     */
    public readonly intervalUnit?: DateIntervalUnit,
  ) {
    super(
      PartitionProjectionType.DATE_TYPE,
      columnName,
      storageLocationTemplate);
  }

  /**
   * Create the output format for the partition projection.
   *
   * Adds `format` and `range` to the common parameters, plus `interval` and
   * `interval.unit` when they are set. Unset optional settings are omitted
   * rather than emitted with an `undefined` value.
   *
   * @returns the table parameters for the partition projection
   */
  public toOutputFormat(): any {
    const parameters: Record<string, string | boolean> = super.toOutputFormat();
    parameters[projectionKey(this, 'format')] = this.format;
    parameters[projectionKey(this, 'range')] = this.range;

    const interval = optionalNumberToString(this.interval);
    if (interval !== undefined) {
      parameters[projectionKey(this, 'interval')] = interval;
    }
    if (this.intervalUnit) {
      parameters[projectionKey(this, 'interval.unit')] = this.intervalUnit;
    }
    return parameters;
  }
}

/**
 * Partition projection for an integer-typed partition column.
 *
 * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html#partition-projection-integer-type
 */
export class IntegerPartitionProjection extends PartitionProjection {
  /**
   * @param columnName Required. The name of the partition column this projection applies to.
   * @param storageLocationTemplate The prefix format of the S3 bucket and keys that store the partitions.
   * @param range Required. The minimum and maximum range values, comma separated.
   * @param interval Optional. The interval between successive partition values.
   * @param digits Optional. The number of digits in the rendered partition value.
   */
  constructor(
    columnName: string,
    storageLocationTemplate: string,
    /**
     * Required. A two-element comma-separated list that provides the minimum and maximum range values to be returned by queries on the column columnName. Note that the values must be separated by a comma, not a hyphen. These values are inclusive, can be negative, and can have leading zeroes. Leading and trailing white space is allowed.
     */
    public readonly range: string,
    /**
     * Optional. A positive integer that specifies the interval between successive partition values for the column columnName. For example, a range value of "1,3" with an interval value of "1" produces the values 1, 2, and 3. The same range value with an interval value of "2" produces the values 1 and 3, skipping 2. Leading and trailing white space is allowed. The default is 1.
     */
    public readonly interval?: number,
    /**
     * Optional. A positive integer that specifies the number of digits to include in the partition value's final representation for column columnName. For example, a range value of "1,3" that has a digits value of "1" produces the values 1, 2, and 3. The same range value with a digits value of "2" produces the values 01, 02, and 03. Leading and trailing white space is allowed. The default is no static number of digits and no leading zeroes.
     */
    public readonly digits?: number) {
    super(
      PartitionProjectionType.INTEGER_TYPE,
      columnName,
      storageLocationTemplate);
  }

  /**
   * Create the output format for the partition projection.
   *
   * Adds `range` to the common parameters, plus `interval` and `digits` when
   * they are set. Unset optional settings are omitted rather than emitted with
   * an `undefined` value.
   *
   * @returns the table parameters for the partition projection
   */
  public toOutputFormat(): any {
    const parameters: Record<string, string | boolean> = super.toOutputFormat();
    parameters[projectionKey(this, 'range')] = this.range;

    const interval = optionalNumberToString(this.interval);
    if (interval !== undefined) {
      parameters[projectionKey(this, 'interval')] = interval;
    }
    const digits = optionalNumberToString(this.digits);
    if (digits !== undefined) {
      parameters[projectionKey(this, 'digits')] = digits;
    }
    return parameters;
  }
}

/**
 * Partition projection for a partition column with an enumerated set of values.
 *
 * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html#partition-projection-enum-type
 */
export class EnumPartitionProjection extends PartitionProjection {
  /**
   * @param columnName Required. The name of the partition column this projection applies to.
   * @param storageLocationTemplate The prefix format of the S3 bucket and keys that store the partitions.
   * @param values Required. The comma-separated list of enumerated partition values.
   */
  constructor(
    columnName: string,
    storageLocationTemplate: string,
    /**
     * Required. A comma-separated list of enumerated partition values for column columnName. Any white space is considered part of an enum value.
     */
    public readonly values: string) {
    super(
      PartitionProjectionType.ENUM_TYPE,
      columnName,
      storageLocationTemplate,
    );
  }

  /**
   * Create the output format for the partition projection.
   *
   * Adds `values` to the common parameters.
   *
   * @returns the table parameters for the partition projection
   */
  public toOutputFormat(): any {
    const parameters: Record<string, string | boolean> = super.toOutputFormat();
    parameters[projectionKey(this, 'values')] = this.values;
    return parameters;
  }
}

/**
 * Partition projection of the injected type.
 *
 * This type has no settings of its own, so it relies on the inherited
 * `toOutputFormat()`, which emits only the common parameters.
 *
 * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html#partition-projection-injected-type
 */
export class InjectedPartitionProjection extends PartitionProjection {
  /**
   * @param columnName Required. The name of the partition column this projection applies to.
   * @param storageLocationTemplate The prefix format of the S3 bucket and keys that store the partitions.
   * @see https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html#partition-projection-specifying-custom-s3-storage-locations
   */
  constructor(columnName: string, storageLocationTemplate: string) {
    super(
      PartitionProjectionType.INJECTED_TYPE,
      columnName,
      storageLocationTemplate);
  }
}
@@ -54,6 +55,13 @@ export interface S3TableProps extends TableBaseProps { */ readonly s3Prefix?: string; + /** + * Optional Partition Projection for this table. + * TODO: Add the option for multiple partition projections. + * @default - No partition projection. + */ + readonly partitionProjection?: PartitionProjection; + /** * The kind of encryption to secure the data with. * @@ -137,11 +145,11 @@ export class S3Table extends TableBase { partitionKeys: renderColumns(props.partitionKeys), - parameters: { + parameters: Object.assign({ 'classification': props.dataFormat.classificationString?.value, 'has_encrypted_data': true, 'partition_filtering.enabled': props.enablePartitionFiltering, - }, + }, props.partitionProjection ? props.partitionProjection.toOutputFormat() : {}), storageDescriptor: { location: `s3://${this.bucket.bucketName}/${this.s3Prefix}`, compressed: this.compressed, diff --git a/packages/@aws-cdk/aws-glue-alpha/test/integ.table.ts b/packages/@aws-cdk/aws-glue-alpha/test/integ.table.ts index 7c175b5bfa3bc..4df3132630b44 100644 --- a/packages/@aws-cdk/aws-glue-alpha/test/integ.table.ts +++ b/packages/@aws-cdk/aws-glue-alpha/test/integ.table.ts @@ -124,6 +124,22 @@ new glue.S3Table(stack, 'MyTableWithStorageDescriptorParameters', { ], }); +new glue.S3Table(stack, 'MyTableWithPartitionProjectionParameters', { + database, + bucket, + tableName: 'table_with_partition_projection_parameters', + columns, + dataFormat: glue.DataFormat.JSON, + partitionProjection: new glue.DatePartitionProjection( + 'columnName', + 's3://DOC-EXAMPLE-BUCKET/prefix/${columnName}/', + 'NOW-3YEARS,NOW', + 'dd-MM-yyyy', + 1, + glue.DateIntervalUnit.DAYS, + ), +}); + new glue.Table(stack, 'MyDeprecatedTable', { database, bucket, diff --git a/packages/@aws-cdk/aws-glue-alpha/test/integ.table.ts.snapshot/aws-cdk-glue.template.json b/packages/@aws-cdk/aws-glue-alpha/test/integ.table.ts.snapshot/aws-cdk-glue.template.json new file mode 100644 index 0000000000000..2dc8cc5485ab5 --- 
/dev/null +++ b/packages/@aws-cdk/aws-glue-alpha/test/integ.table.ts.snapshot/aws-cdk-glue.template.json @@ -0,0 +1,1117 @@ +{ + "Resources": { + "DataBucketE3889A50": { + "Type": "AWS::S3::Bucket", + "UpdateReplacePolicy": "Delete", + "DeletionPolicy": "Delete" + }, + "MyDatabase1E2517DB": { + "Type": "AWS::Glue::Database", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseInput": { + "Name": "my_database" + } + } + }, + "AVROTable58646ABF": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "avro_table generated by CDK", + "Name": "avro_table", + "Parameters": { + "classification": "avro", + "has_encrypted_data": true + }, + "PartitionKeys": [ + { + "Name": "year", + "Type": "smallint" + } + ], + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.apache.hadoop.hive.serde2.avro.AvroSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "CSVTableE499CABA": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "csv_table generated by CDK", + "Name": "csv_table", + "Parameters": { + "classification": "csv", + "has_encrypted_data": true + }, + 
"PartitionKeys": [ + { + "Name": "year", + "Type": "smallint" + } + ], + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.apache.hadoop.hive.serde2.OpenCSVSerde" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "JSONTable00348F1D": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "json_table generated by CDK", + "Name": "json_table", + "Parameters": { + "classification": "json", + "has_encrypted_data": true + }, + "PartitionKeys": [ + { + "Name": "year", + "Type": "smallint" + } + ], + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + 
"ParquetTableE84E985F": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "parquet_table generated by CDK", + "Name": "parquet_table", + "Parameters": { + "classification": "parquet", + "has_encrypted_data": true + }, + "PartitionKeys": [ + { + "Name": "year", + "Type": "smallint" + } + ], + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "MyKey6AB29FA6": { + "Type": "AWS::KMS::Key", + "Properties": { + "KeyPolicy": { + "Statement": [ + { + "Action": "kms:*", + "Effect": "Allow", + "Principal": { + "AWS": { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":iam::", + { + "Ref": "AWS::AccountId" + }, + ":root" + ] + ] + } + }, + "Resource": "*" + } + ], + "Version": "2012-10-17" + } + }, + "UpdateReplacePolicy": "Delete", + "DeletionPolicy": "Delete" + }, + "MyEncryptedTableBucket7B28486D": { + "Type": "AWS::S3::Bucket", + "Properties": { + "BucketEncryption": { + "ServerSideEncryptionConfiguration": [ + { + "ServerSideEncryptionByDefault": { + "KMSMasterKeyID": { + "Fn::GetAtt": [ + "MyKey6AB29FA6", + "Arn" + ] + }, + "SSEAlgorithm": "aws:kms" + } + } + ] + } + }, + 
"UpdateReplacePolicy": "Retain", + "DeletionPolicy": "Retain" + }, + "MyEncryptedTable981A88C6": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "my_encrypted_table generated by CDK", + "Name": "my_encrypted_table", + "Parameters": { + "classification": "json", + "has_encrypted_data": true + }, + "PartitionKeys": [ + { + "Name": "year", + "Type": "smallint" + } + ], + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "MyEncryptedTableBucket7B28486D" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "MyPartitionFilteredTable324BA27A": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "partition_filtered_table generated by CDK", + "Name": "partition_filtered_table", + "Parameters": { + "classification": "json", + "has_encrypted_data": true, + "partition_filtering.enabled": true + }, + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + 
"Compressed": false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "MyTableWithConnectionTable4BCA8495": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "connection_table generated by CDK", + "Name": "connection_table", + "Parameters": { + "classification": "json", + "has_encrypted_data": true + }, + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "MyTableWithStorageDescriptorParametersTable1A347345": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "table_with_storage_descriptor_parameters generated by CDK", + "Name": "table_with_storage_descriptor_parameters", + "Parameters": { + "classification": "json", + "has_encrypted_data": true + }, 
+ "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Parameters": { + "skip.header.line.count": "1", + "compression_type": "gzip", + "foo": "bar", + "separatorChar": ",", + "write.parallel": "off" + }, + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "MyTableWithPartitionProjectionParametersTable5A53BD2E": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "table_with_partition_projection_parameters generated by CDK", + "Name": "table_with_partition_projection_parameters", + "Parameters": { + "classification": "json", + "has_encrypted_data": true, + "projection.enabled": true, + "storage.location.template": "s3://DOC-EXAMPLE-BUCKET/prefix/${columnName}/", + "projection.columnName.type": "date", + "projection.columnName.format": "dd-MM-yyyy", + "projection.columnName.range": "NOW-3YEARS,NOW", + "projection.columnName.interval": "1", + "projection.columnName.interval.unit": "DAYS" + }, + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": 
false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "MyDeprecatedTableAA0364FD": { + "Type": "AWS::Glue::Table", + "Properties": { + "CatalogId": { + "Ref": "AWS::AccountId" + }, + "DatabaseName": { + "Ref": "MyDatabase1E2517DB" + }, + "TableInput": { + "Description": "deprecated_table generated by CDK", + "Name": "deprecated_table", + "Parameters": { + "classification": "json", + "has_encrypted_data": true + }, + "StorageDescriptor": { + "Columns": [ + { + "Name": "col1", + "Type": "string" + }, + { + "Comment": "col2 comment", + "Name": "col2", + "Type": "string" + }, + { + "Name": "col3", + "Type": "array" + }, + { + "Name": "col4", + "Type": "map" + }, + { + "Name": "col5", + "Type": "struct" + } + ], + "Compressed": false, + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "Location": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "DataBucketE3889A50" + }, + "/" + ] + ] + }, + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "SerdeInfo": { + "SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe" + }, + "StoredAsSubDirectories": false + }, + "TableType": "EXTERNAL_TABLE" + } + } + }, + "MyUserDC45028B": { + "Type": "AWS::IAM::User" + }, + "MyUserDefaultPolicy7B897426": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "glue:BatchCreatePartition", + "glue:BatchDeletePartition", + "glue:BatchGetPartition", + "glue:CreatePartition", + "glue:DeletePartition", + "glue:GetPartition", + "glue:GetPartitions", + "glue:GetTable", + "glue:GetTableVersion", + "glue:GetTableVersions", + 
"glue:GetTables", + "glue:UpdatePartition" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":glue:", + { + "Ref": "AWS::Region" + }, + ":", + { + "Ref": "AWS::AccountId" + }, + ":table/", + { + "Ref": "MyDatabase1E2517DB" + }, + "/", + { + "Ref": "CSVTableE499CABA" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":glue:", + { + "Ref": "AWS::Region" + }, + ":", + { + "Ref": "AWS::AccountId" + }, + ":table/", + { + "Ref": "MyDatabase1E2517DB" + }, + "/", + { + "Ref": "MyEncryptedTable981A88C6" + } + ] + ] + } + ] + }, + { + "Action": [ + "s3:Abort*", + "s3:DeleteObject*", + "s3:GetBucket*", + "s3:GetObject*", + "s3:List*", + "s3:PutObject", + "s3:PutObjectLegalHold", + "s3:PutObjectRetention", + "s3:PutObjectTagging", + "s3:PutObjectVersionTagging" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::GetAtt": [ + "DataBucketE3889A50", + "Arn" + ] + }, + { + "Fn::GetAtt": [ + "MyEncryptedTableBucket7B28486D", + "Arn" + ] + }, + { + "Fn::Join": [ + "", + [ + { + "Fn::GetAtt": [ + "DataBucketE3889A50", + "Arn" + ] + }, + "/*" + ] + ] + }, + { + "Fn::Join": [ + "", + [ + { + "Fn::GetAtt": [ + "MyEncryptedTableBucket7B28486D", + "Arn" + ] + }, + "/*" + ] + ] + } + ] + }, + { + "Action": [ + "kms:Decrypt", + "kms:DescribeKey", + "kms:Encrypt", + "kms:GenerateDataKey*", + "kms:ReEncrypt*" + ], + "Effect": "Allow", + "Resource": { + "Fn::GetAtt": [ + "MyKey6AB29FA6", + "Arn" + ] + } + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "MyUserDefaultPolicy7B897426", + "Users": [ + { + "Ref": "MyUserDC45028B" + } + ] + } + }, + "AnotherUser254B09E3": { + "Type": "AWS::IAM::User" + }, + "AnotherUserDefaultPolicyDBDB9923": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "glue:BatchCreatePartition", + "glue:BatchDeletePartition", + "glue:BatchGetPartition", + "glue:CreatePartition", + 
"glue:DeletePartition", + "glue:GetPartition", + "glue:GetPartitions", + "glue:GetTable", + "glue:GetTableVersion", + "glue:GetTableVersions", + "glue:GetTables", + "glue:UpdatePartition" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":glue:", + { + "Ref": "AWS::Region" + }, + ":", + { + "Ref": "AWS::AccountId" + }, + ":table/", + { + "Ref": "MyDatabase1E2517DB" + }, + "/", + { + "Ref": "AVROTable58646ABF" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":glue:", + { + "Ref": "AWS::Region" + }, + ":", + { + "Ref": "AWS::AccountId" + }, + ":table/", + { + "Ref": "MyDatabase1E2517DB" + }, + "/", + { + "Ref": "JSONTable00348F1D" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":glue:", + { + "Ref": "AWS::Region" + }, + ":", + { + "Ref": "AWS::AccountId" + }, + ":table/", + { + "Ref": "MyDatabase1E2517DB" + }, + "/", + { + "Ref": "ParquetTableE84E985F" + } + ] + ] + } + ] + }, + { + "Action": [ + "s3:Abort*", + "s3:DeleteObject*", + "s3:GetBucket*", + "s3:GetObject*", + "s3:List*", + "s3:PutObject", + "s3:PutObjectLegalHold", + "s3:PutObjectRetention", + "s3:PutObjectTagging", + "s3:PutObjectVersionTagging" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::GetAtt": [ + "DataBucketE3889A50", + "Arn" + ] + }, + { + "Fn::Join": [ + "", + [ + { + "Fn::GetAtt": [ + "DataBucketE3889A50", + "Arn" + ] + }, + "/*" + ] + ] + } + ] + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "AnotherUserDefaultPolicyDBDB9923", + "Users": [ + { + "Ref": "AnotherUser254B09E3" + } + ] + } + } + }, + "Parameters": { + "BootstrapVersion": { + "Type": "AWS::SSM::Parameter::Value", + "Default": "/cdk-bootstrap/hnb659fds/version", + "Description": "Version of the CDK Bootstrap resources in this environment, automatically retrieved from SSM Parameter Store. 
[cdk:skip]" + } + }, + "Rules": { + "CheckBootstrapVersion": { + "Assertions": [ + { + "Assert": { + "Fn::Not": [ + { + "Fn::Contains": [ + [ + "1", + "2", + "3", + "4", + "5" + ], + { + "Ref": "BootstrapVersion" + } + ] + } + ] + }, + "AssertDescription": "CDK bootstrap stack version 6 required. Please run 'cdk bootstrap' with a recent version of the CDK CLI." + } + ] + } + } +} \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue-alpha/test/partition-projection.test.ts b/packages/@aws-cdk/aws-glue-alpha/test/partition-projection.test.ts new file mode 100644 index 0000000000000..e4bff7e119e6c --- /dev/null +++ b/packages/@aws-cdk/aws-glue-alpha/test/partition-projection.test.ts @@ -0,0 +1,52 @@ +import { + DatePartitionProjection, + EnumPartitionProjection, + IntegerPartitionProjection, +} from '../lib'; + +// console.log(proj.toOutputFormat()); +describe('partition projection output', () => { + test('Date partition projection', () => { + const proj = new DatePartitionProjection( + 'date', + 's3://DOC-EXAMPLE-BUCKET/prefix/${date}/', + 'NOW-3YEARS,NOW', + 'yyyy-MM-dd'); + expect(proj.toOutputFormat()).toEqual({ + 'projection.enabled': true, + 'projection.date.type': 'date', + 'projection.date.format': 'yyyy-MM-dd', + 'projection.date.range': 'NOW-3YEARS,NOW', + 'storage.location.template': 's3://DOC-EXAMPLE-BUCKET/prefix/${date}/', + }); + }); + + test('Integer partition projection', () => { + const proj = new IntegerPartitionProjection( + 'hour', + 's3://DOC-EXAMPLE-BUCKET/prefix/${hour}/', + '0,23', + 2); + expect(proj.toOutputFormat()).toEqual({ + 'projection.enabled': true, + 'projection.hour.type': 'integer', + 'projection.hour.range': '0,23', + 'projection.hour.interval': '2', + 'storage.location.template': 's3://DOC-EXAMPLE-BUCKET/prefix/${hour}/', + }); + }); + + test('Enum partition projection', () => { + const proj = new EnumPartitionProjection( + 'unit', + 's3://DOC-EXAMPLE-BUCKET/prefix/${unit}/', + 'A,B,C'); + 
expect(proj.toOutputFormat()).toEqual({ + 'projection.enabled': true, + 'projection.unit.type': 'enum', + 'projection.unit.values': 'A,B,C', + 'storage.location.template': 's3://DOC-EXAMPLE-BUCKET/prefix/${unit}/', + }); + }); + +}); \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue-alpha/test/s3-table.test.ts b/packages/@aws-cdk/aws-glue-alpha/test/s3-table.test.ts index c5e498ad61c41..9f4513c8ff187 100644 --- a/packages/@aws-cdk/aws-glue-alpha/test/s3-table.test.ts +++ b/packages/@aws-cdk/aws-glue-alpha/test/s3-table.test.ts @@ -4,6 +4,7 @@ import * as iam from 'aws-cdk-lib/aws-iam'; import * as kms from 'aws-cdk-lib/aws-kms'; import * as s3 from 'aws-cdk-lib/aws-s3'; import * as glue from '../lib'; +import { DateIntervalUnit, DatePartitionProjection } from '../lib'; test('encrypted table: SSE-S3', () => { const stack = new cdk.Stack(); @@ -1095,6 +1096,145 @@ describe('validate', () => { }); }); +describe('Partition-Projection', () => { + test('Date partition projection', () => { + const stack = new cdk.Stack(); + const database = new glue.Database(stack, 'Database'); + const partitionProjection = new DatePartitionProjection( + 'columnName', + 's3://DOC-EXAMPLE-BUCKET/prefix/${columnName}/', + 'NOW-3YEARS,NOW', + 'dd-MM-yyyy', + 1, + DateIntervalUnit.DAYS, + ); + + new glue.S3Table(stack, 'Table', { + database, + columns: [{ + name: 'col', + type: glue.Schema.STRING, + }], + partitionProjection: partitionProjection, + dataFormat: glue.DataFormat.JSON, + }); + + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Table', { + TableInput: { + Parameters: { + 'projection.enabled': true, + 'projection.columnName.type': 'date', + 'projection.columnName.range': 'NOW-3YEARS,NOW', + 'projection.columnName.format': 'dd-MM-yyyy', + 'projection.columnName.interval': '1', + 'projection.columnName.interval.unit': 'DAYS', + 'storage.location.template': 's3://DOC-EXAMPLE-BUCKET/prefix/${columnName}/', + }, + }, + }); + }); + + test('Date partition 
projection with bucket', () => { + const stack = new cdk.Stack(); + const database = new glue.Database(stack, 'Database'); + const bucket = new s3.Bucket(stack, 'Bucket'); + const partitionProjection = new DatePartitionProjection( + 'columnName', + 's3://DOC-EXAMPLE-BUCKET/prefix/${columnName}/', + '2021/01/01,NOW', + 'yyyy/MM/dd', + 1, + DateIntervalUnit.DAYS, + ); + + new glue.S3Table(stack, 'Table', { + tableName: 'glue-table-on-s3', + description: 'The raw data from firehose', + database, + columns: [{ + name: 'col', + type: glue.Schema.STRING, + }], + partitionKeys: [{ + name: 'columnName', + type: glue.Schema.STRING, + }], + bucket: bucket, + s3Prefix: 'events/table=event-table/', + storedAsSubDirectories: true, + partitionProjection: partitionProjection, + storageParameters: [ + glue.StorageParameter.compressionType(glue.CompressionType.GZIP), + ], + dataFormat: glue.DataFormat.JSON, + enablePartitionFiltering: true, + compressed: true, + }); + + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Table', { + CatalogId: { + Ref: 'AWS::AccountId', + }, + DatabaseName: { + Ref: 'DatabaseB269D8BB', + }, + TableInput: { + Name: 'glue-table-on-s3', + Description: 'The raw data from firehose', + Parameters: { + 'classification': 'json', + 'has_encrypted_data': true, + 'projection.enabled': true, + 'projection.columnName.type': 'date', + 'projection.columnName.range': '2021/01/01,NOW', + 'projection.columnName.format': 'yyyy/MM/dd', + 'projection.columnName.interval': '1', + 'projection.columnName.interval.unit': 'DAYS', + 'storage.location.template': 's3://DOC-EXAMPLE-BUCKET/prefix/${columnName}/', + }, + PartitionKeys: [ + { + Name: 'columnName', + Type: 'string', + }, + ], + StorageDescriptor: { + Columns: [ + { + Name: 'col', + Type: 'string', + }, + ], + Compressed: true, + InputFormat: 'org.apache.hadoop.mapred.TextInputFormat', + Location: { + 'Fn::Join': [ + '', + [ + 's3://', + { + Ref: 'Bucket83908E77', + }, + '/events/table=event-table/', + ], + 
], + }, + OutputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat', + Parameters: { + compression_type: 'gzip', + }, + SerdeInfo: { + SerializationLibrary: 'org.openx.data.jsonserde.JsonSerDe', + }, + StoredAsSubDirectories: true, + }, + TableType: 'EXTERNAL_TABLE', + }, + }); + }); + +}); + function createTable(props: Pick>): void { const stack = new cdk.Stack(); new glue.S3Table(stack, 'table', {