Query rewrite for partition skipping index (#1690)
* Add skipping data file index
* Fix query plan integral check failure
* Add comments for critical code path
* Add check for filtering conditions that cannot be rewritten
* Add logging
* Add more comments and logging
* Add Flint enabled config
* Add more ITs for query rewrite
* Refactor to use Flint data type mapping
* Polish doc and comments for PR review
* Addressed PR comments

Signed-off-by: Chen Dai <daichen@amazon.com>
Showing 12 changed files with 324 additions and 29 deletions.
...int-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSparkOptimizer.scala (42 additions, 0 deletions)
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.flint.spark

import scala.collection.JavaConverters._

import org.opensearch.flint.spark.skipping.ApplyFlintSparkSkippingIndex

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.flint.config.FlintSparkConf

/**
 * Flint Spark optimizer that manages all Flint-related optimizer rules.
 * @param spark
 *   Spark session
 */
class FlintSparkOptimizer(spark: SparkSession) extends Rule[LogicalPlan] {

  /** Flint Spark API */
  private val flint: FlintSpark = new FlintSpark(spark)

  /** Only one Flint optimizer rule for now. Need to estimate cost if more than one in future. */
  private val rule = new ApplyFlintSparkSkippingIndex(flint)

  override def apply(plan: LogicalPlan): LogicalPlan = {
    if (isOptimizerEnabled) {
      rule.apply(plan)
    } else {
      plan
    }
  }

  private def isOptimizerEnabled: Boolean = {
    val flintConf = new FlintSparkConf(spark.conf.getAll.asJava)
    flintConf.isOptimizerEnabled
  }
}
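The registration hook for this rule is not among the files shown here. As a minimal sketch, assuming the rule is wired in through Spark's standard extension mechanism (the extension class name below is hypothetical; only SparkSessionExtensions.injectOptimizerRule is standard Spark API):

import org.apache.spark.sql.SparkSessionExtensions

// Hypothetical wiring, not part of this diff: inject FlintSparkOptimizer as a
// logical optimizer rule so it runs on every query plan.
class FlintSparkExtensions extends (SparkSessionExtensions => Unit) {
  override def apply(extensions: SparkSessionExtensions): Unit = {
    extensions.injectOptimizerRule { spark => new FlintSparkOptimizer(spark) }
  }
}

An extension like this would then be enabled by setting spark.sql.extensions to the extension class name.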
...ion/src/main/scala/org/opensearch/flint/spark/skipping/ApplyFlintSparkSkippingIndex.scala (85 additions, 0 deletions)
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.flint.spark.skipping

import org.opensearch.flint.spark.FlintSpark
import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.{getSkippingIndexName, FILE_PATH_COLUMN, SKIPPING_INDEX_TYPE}

import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.catalyst.expressions.{And, Predicate}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE

/**
 * Flint Spark skipping index apply rule that rewrites an applicable query's filtering condition
 * and table scan operator to leverage the additional skipping data structure, accelerating the
 * query by significantly reducing the data scanned.
 *
 * @param flint
 *   Flint Spark API
 */
class ApplyFlintSparkSkippingIndex(flint: FlintSpark) extends Rule[LogicalPlan] {

  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    case filter @ Filter( // TODO: abstract pattern match logic for different table support
          condition: Predicate,
          relation @ LogicalRelation(
            baseRelation @ HadoopFsRelation(location, _, _, _, _, _),
            _,
            Some(table),
            false)) if !location.isInstanceOf[FlintSparkSkippingFileIndex] =>
      val indexName = getSkippingIndexName(table.identifier.table) // TODO: database name
      val index = flint.describeIndex(indexName)
      if (index.exists(_.kind == SKIPPING_INDEX_TYPE)) {
        val skippingIndex = index.get.asInstanceOf[FlintSparkSkippingIndex]
        val indexPred = rewriteToIndexPredicate(skippingIndex, condition)

        /*
         * Replace original file index with Flint skipping file index:
         *  Filter(a=b)
         *  |- LogicalRelation(A)
         *     |- HadoopFsRelation
         *        |- FileIndex <== replaced with FlintSkippingFileIndex
         */
        if (indexPred.isDefined) {
          val filterByIndex = buildFilterIndexQuery(skippingIndex, indexPred.get)
          val fileIndex = new FlintSparkSkippingFileIndex(location, filterByIndex)
          val indexRelation = baseRelation.copy(location = fileIndex)(baseRelation.sparkSession)
          filter.copy(child = relation.copy(relation = indexRelation))
        } else {
          filter
        }
      } else {
        filter
      }
  }

  private def rewriteToIndexPredicate(
      index: FlintSparkSkippingIndex,
      condition: Predicate): Option[Predicate] = {

    // TODO: currently only handles conjunctions, namely a condition that consists of
    //  one or more expressions concatenated by AND only
    index.indexedColumns
      .flatMap(index => index.rewritePredicate(condition))
      .reduceOption(And(_, _))
  }

  private def buildFilterIndexQuery(
      index: FlintSparkSkippingIndex,
      rewrittenPredicate: Predicate): DataFrame = {

    // Get file list based on the rewritten predicates on index data
    flint.spark.read
      .format(FLINT_DATASOURCE)
      .load(index.name())
      .filter(new Column(rewrittenPredicate))
      .select(FILE_PATH_COLUMN)
  }
}
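To make the rewrite concrete, here is an illustrative query that would match the pattern above, assuming the rule is active and a skipping index exists for the source table; the table path and column names are made up:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("skipping-index-demo").getOrCreate()

// Conjunctive predicates only, per the TODO above; OR conditions are not rewritten.
val df = spark.read
  .format("parquet")
  .load("s3://my-bucket/alb_logs") // hypothetical source location
  .filter("year = 2023 AND month = 4")

// If the rule fired, the scan's FileIndex has been replaced by a
// FlintSparkSkippingFileIndex, so only index-selected files are listed.
df.explain(true)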
...tion/src/main/scala/org/opensearch/flint/spark/skipping/FlintSparkSkippingFileIndex.scala (55 additions, 0 deletions)
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.flint.spark.skipping

import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources.{FileIndex, PartitionDirectory}
import org.apache.spark.sql.types.StructType

/**
 * File index that skips source files based on the files selected by the Flint skipping index.
 *
 * @param baseFileIndex
 *   original file index
 * @param filterByIndex
 *   pushed down filtering on index data
 */
class FlintSparkSkippingFileIndex(baseFileIndex: FileIndex, filterByIndex: DataFrame)
    extends FileIndex {

  override def listFiles(
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {

    val selectedFiles =
      filterByIndex.collect
        .map(_.getString(0))
        .toSet

    // TODO: figure out if the list file call can be avoided
    val partitions = baseFileIndex.listFiles(partitionFilters, dataFilters)
    partitions
      .map(p => p.copy(files = p.files.filter(f => isFileNotSkipped(selectedFiles, f))))
      .filter(p => p.files.nonEmpty)
  }

  override def rootPaths: Seq[Path] = baseFileIndex.rootPaths

  override def inputFiles: Array[String] = baseFileIndex.inputFiles

  override def refresh(): Unit = baseFileIndex.refresh()

  override def sizeInBytes: Long = baseFileIndex.sizeInBytes

  override def partitionSchema: StructType = baseFileIndex.partitionSchema

  private def isFileNotSkipped(selectedFiles: Set[String], f: FileStatus) = {
    selectedFiles.contains(f.getPath.toUri.toString)
  }
}
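One assumption worth calling out: isFileNotSkipped compares FileStatus.getPath.toUri.toString against the strings collected from FILE_PATH_COLUMN, so the paths stored at index build time must match that URI form exactly, or every file would be treated as skipped. A small sketch of the contract, with illustrative values:

import org.apache.hadoop.fs.{FileStatus, Path}

// Illustrative values only: the index must store the same URI string that the
// file system listing reports for each file.
val status = new FileStatus(
  0L, false, 1, 0L, 0L, new Path("s3://my-bucket/alb_logs/part-00000.parquet"))
val selectedFiles = Set("s3://my-bucket/alb_logs/part-00000.parquet")

assert(selectedFiles.contains(status.getPath.toUri.toString)) // file is kept, not skipped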