Skip to content

Commit

Permalink
Merge branch 'main' into add_jdk_21
Browse files Browse the repository at this point in the history
Signed-off-by: Martin Gaievski <gaievski@amazon.com>
  • Loading branch information
martin-gaievski committed Dec 6, 2023
2 parents 2a27d01 + c338b94 commit f02cccd
Show file tree
Hide file tree
Showing 7 changed files with 757 additions and 17 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/copy-linked-issue-labels.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copies labels from linked issues onto the pull request so PR triage state
# stays in sync with the issue it resolves.
name: Copy labels from linked issues
on:
  pull_request_target:
    types: [opened, edited, review_requested, synchronize, reopened, ready_for_review]

jobs:
  copy-issue-labels:
    # Run only in the canonical repository, not in forks.
    if: github.repository == 'opensearch-project/neural-search'
    runs-on: ubuntu-latest
    permissions:
      issues: read
      contents: read
      pull-requests: write
    steps:
      - name: copy-issue-labels
        uses: michalvankodev/copy-issue-labels@v1.3.0
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          # Triage-workflow labels are issue-local and must not be copied to PRs.
          labels-to-exclude: |
            untriaged
            triaged
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Features
### Enhancements
### Bug Fixes
Fix async actions are left in neural_sparse query ([438](https://github.com/opensearch-project/neural-search/pull/438))
Fixed exception for case when Hybrid query being wrapped into bool query ([#490](https://github.com/opensearch-project/neural-search/pull/490))
- Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438))
- Fixed exception for case when Hybrid query being wrapped into bool query ([#490](https://github.com/opensearch-project/neural-search/pull/490))
- Hybrid query and nested type fields ([#498](https://github.com/opensearch-project/neural-search/pull/498))
### Infrastructure
### Documentation
### Maintenance
Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ testClusters.integTest {

// Increase heap size from default of 512mb (now Xms 1g / Xmx 4g). When heap size is 512mb, our integ tests sporadically fail due
// to ml-commons memory circuit breaker exception
jvmArgs("-Xms2g", "-Xmx2g")
jvmArgs("-Xms1g", "-Xmx4g")
}

// Remote Integration Tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,19 @@
import java.util.Objects;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.search.TotalHits;
import org.opensearch.common.lucene.search.TopDocsAndMaxScore;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.mapper.MapperService;
import org.opensearch.index.mapper.SeqNoFieldMapper;
import org.opensearch.index.search.NestedHelper;
import org.opensearch.neuralsearch.query.HybridQuery;
import org.opensearch.neuralsearch.search.HitsThresholdChecker;
import org.opensearch.neuralsearch.search.HybridTopScoreDocCollector;
Expand Down Expand Up @@ -60,12 +67,120 @@ public boolean searchWith(
final boolean hasFilterCollector,
final boolean hasTimeout
) throws IOException {
if (query instanceof HybridQuery) {
return searchWithCollector(searchContext, searcher, query, collectors, hasFilterCollector, hasTimeout);
if (isHybridQuery(query, searchContext)) {
Query hybridQuery = extractHybridQuery(searchContext, query);
return searchWithCollector(searchContext, searcher, hybridQuery, collectors, hasFilterCollector, hasTimeout);
}
validateQuery(searchContext, query);
return super.searchWith(searchContext, searcher, query, collectors, hasFilterCollector, hasTimeout);
}

private boolean isHybridQuery(final Query query, final SearchContext searchContext) {
    // Direct hybrid query at the top level is the common, fast-path case.
    if (query instanceof HybridQuery) {
        return true;
    }
    if (!isWrappedHybridQuery(query) || !hasNestedFieldOrNestedDocs(query, searchContext)) {
        return false;
    }
    /* At this point the query is a hybrid query wrapped into a Bool query by core OpenSearch code
       https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/search/DefaultSearchContext.java#L367-L370.
       Main reason for accepting that is a performance trade-off: at time of writing we are ok with losing some performance
       if that unblocks hybrid query for indexes with nested field types.
       In such case we consider the query a valid hybrid query. Later in the code we extract it and execute it as the main
       query for this search request.
       Below is a sample structure of such a query:
       Boolean {
          should: {
             hybrid: {
                  sub_query1 {}
                  sub_query2 {}
             }
          }
          filter: {
             exists: {
                  field: "_primary_term"
             }
          }
       }
       TODO Need to add logic for passing hybrid sub-queries through the same logic in core to ensure there is no latency regression */
    // The isWrappedHybridQuery check above already proved this is a BooleanQuery; every clause other than the
    // hybrid one must be the core-added filter on the _primary_term field existence.
    for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
        if (clause.getQuery() instanceof HybridQuery) {
            continue;
        }
        boolean isPrimaryTermExistsFilter = clause.getOccur() == BooleanClause.Occur.FILTER
            && clause.getQuery() instanceof FieldExistsQuery
            && SeqNoFieldMapper.PRIMARY_TERM_NAME.equals(((FieldExistsQuery) clause.getQuery()).getField());
        if (!isPrimaryTermExistsFilter) {
            return false;
        }
    }
    return true;
}

private boolean hasNestedFieldOrNestedDocs(final Query query, final SearchContext searchContext) {
    // True only when the index mapping declares nested fields AND this query may actually match nested docs.
    MapperService mapperService = searchContext.mapperService();
    if (!mapperService.hasNested()) {
        return false;
    }
    return new NestedHelper(mapperService).mightMatchNestedDocs(query);
}

private boolean isWrappedHybridQuery(final Query query) {
    // A "wrapped" hybrid query is a BooleanQuery that carries a HybridQuery in at least one of its clauses.
    if (!(query instanceof BooleanQuery)) {
        return false;
    }
    for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
        if (clause.getQuery() instanceof HybridQuery) {
            return true;
        }
    }
    return false;
}

/**
 * Unwraps a hybrid query from the bool query that core OpenSearch builds around it for indexes
 * with nested fields, so the hybrid query can be executed as the main query for this request.
 * The original code checked {@code clauses().size() > 0} and then {@code booleanClauses.isEmpty()}
 * again inside the branch, making one of the checks unreachable; this version performs the
 * emptiness check exactly once with identical observable behavior.
 *
 * @param searchContext current search request context, used to detect nested fields
 * @param query the query produced by the query phase
 * @return the inner {@link HybridQuery} when {@code query} is a recognized bool wrapper, otherwise {@code query} unchanged
 * @throws IllegalStateException when the wrapper bool query does not have the hybrid query as its first clause
 */
private Query extractHybridQuery(final SearchContext searchContext, final Query query) {
    if (hasNestedFieldOrNestedDocs(query, searchContext) && isWrappedHybridQuery(query)) {
        List<BooleanClause> booleanClauses = ((BooleanQuery) query).clauses();
        // an empty bool query has nothing to unwrap; fall through and execute it as-is
        if (!booleanClauses.isEmpty()) {
            // core places the hybrid query as the first clause; anything else means the structure is unexpected
            if (!(booleanClauses.get(0).getQuery() instanceof HybridQuery)) {
                throw new IllegalStateException("cannot process hybrid query due to incorrect structure of top level bool query");
            }
            // extract hybrid query and replace bool with hybrid query
            return booleanClauses.get(0).getQuery();
        }
    }
    return query;
}

/**
 * Validate the query from the neural-search plugin point of view. The current main goal of validation
 * is to block cases when a hybrid query is wrapped into other compound queries.
 * For example, for a bool query like the one below we need to throw an error:
 * bool: {
 *   should: [
 *     match: {},
 *     hybrid: {
 *       sub_query1 {}
 *       sub_query2 {}
 *     }
 *   ]
 * }
 * TODO add similar validation for other compound type queries like dis_max, constant_score etc.
 *
 * @param query query to validate
 */
private void validateQuery(final SearchContext searchContext, final Query query) {
    // only bool queries can hide a hybrid query inside their clauses
    if (!(query instanceof BooleanQuery)) {
        return;
    }
    for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
        validateNestedBooleanQuery(clause.getQuery(), getMaxDepthLimit(searchContext));
    }
}

private void validateNestedBooleanQuery(final Query query, final int level) {
    // a hybrid query found at any nesting depth is invalid — it must be the top-level query
    if (query instanceof HybridQuery) {
        throw new IllegalArgumentException("hybrid query must be a top level query and cannot be wrapped into other queries");
    }
    if (level <= 0) {
        // ideally we should throw an error here but this code is on the main search workflow path and that might block
        // execution of some queries. Instead, we're silently exit and allow such query to execute and potentially produce incorrect
        // results in case hybrid query is wrapped into such bool query
        log.error("reached max nested query limit, cannot process bool query with that many nested clauses");
        return;
    }
    if (!(query instanceof BooleanQuery)) {
        return;
    }
    // recurse into each clause with one less level of allowed depth
    for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
        validateNestedBooleanQuery(clause.getQuery(), level - 1);
    }
}

@VisibleForTesting
protected boolean searchWithCollector(
final SearchContext searchContext,
Expand Down Expand Up @@ -209,4 +324,9 @@ private float getMaxScore(final List<TopDocs> topDocs) {
private DocValueFormat[] getSortValueFormats(final SortAndFormats sortAndFormats) {
    // null-safe accessor: an absent sort specification yields no formats
    if (sortAndFormats == null) {
        return null;
    }
    return sortAndFormats.formats;
}

private int getMaxDepthLimit(final SearchContext searchContext) {
    // reuse the index mapping depth limit setting as the cap for bool-query nesting validation
    final Settings settingsForIndex = searchContext.getQueryShardContext().getIndexSettings().getSettings();
    final long depthLimit = MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(settingsForIndex);
    return (int) depthLimit;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,18 @@ protected void addKnnDoc(String index, String docId, List<String> vectorFieldNam
addKnnDoc(index, docId, vectorFieldNames, vectors, Collections.emptyList(), Collections.emptyList());
}

/**
 * Add a set of knn vectors and text fields to an index, without any nested fields.
 * Convenience overload that delegates to the full variant with empty nested-field lists.
 *
 * @param index name of the index to add the document to
 * @param docId document id
 * @param vectorFieldNames list of vector fields to be added
 * @param vectors list of vectors corresponding to those fields
 * @param textFieldNames list of text fields to be added
 * @param texts list of text values corresponding to those fields
 */
@SneakyThrows
protected void addKnnDoc(
    String index,
    String docId,
    List<String> vectorFieldNames,
    List<Object[]> vectors,
    List<String> textFieldNames,
    List<String> texts
) {
    addKnnDoc(index, docId, vectorFieldNames, vectors, textFieldNames, texts, Collections.emptyList(), Collections.emptyList());
}

/**
* Add a set of knn vectors and text to an index
*
Expand All @@ -422,6 +434,8 @@ protected void addKnnDoc(String index, String docId, List<String> vectorFieldNam
* @param vectors List of vectors corresponding to those fields
* @param textFieldNames List of text fields to be added
* @param texts List of text corresponding to those fields
* @param nestedFieldNames List of nested fields to be added
* @param nestedFields List of fields and values corresponding to those fields
*/
@SneakyThrows
protected void addKnnDoc(
Expand All @@ -430,7 +444,9 @@ protected void addKnnDoc(
List<String> vectorFieldNames,
List<Object[]> vectors,
List<String> textFieldNames,
List<String> texts
List<String> texts,
List<String> nestedFieldNames,
List<Map<String, String>> nestedFields
) {
Request request = new Request("POST", "/" + index + "/_doc/" + docId + "?refresh=true");
XContentBuilder builder = XContentFactory.jsonBuilder().startObject();
Expand All @@ -441,6 +457,16 @@ protected void addKnnDoc(
for (int i = 0; i < textFieldNames.size(); i++) {
builder.field(textFieldNames.get(i), texts.get(i));
}

for (int i = 0; i < nestedFieldNames.size(); i++) {
builder.field(nestedFieldNames.get(i));
builder.startObject();
Map<String, String> nestedValues = nestedFields.get(i);
for (Map.Entry<String, String> entry : nestedValues.entrySet()) {
builder.field(entry.getKey(), entry.getValue());
}
builder.endObject();
}
builder.endObject();

request.setJsonEntity(builder.toString());
Expand Down Expand Up @@ -523,7 +549,16 @@ protected boolean checkComplete(Map<String, Object> node) {
}

@SneakyThrows
private String buildIndexConfiguration(List<KNNFieldConfig> knnFieldConfigs, int numberOfShards) {
// Builds index configuration without nested fields; delegates to the nested-fields variant
// with an empty nested-field list. Widened from private to protected so subclasses can reuse it.
protected String buildIndexConfiguration(final List<KNNFieldConfig> knnFieldConfigs, final int numberOfShards) {
    return buildIndexConfiguration(knnFieldConfigs, Collections.emptyList(), numberOfShards);
}

@SneakyThrows
protected String buildIndexConfiguration(
final List<KNNFieldConfig> knnFieldConfigs,
final List<String> nestedFields,
final int numberOfShards
) {
XContentBuilder xContentBuilder = XContentFactory.jsonBuilder()
.startObject()
.startObject("settings")
Expand All @@ -544,6 +579,11 @@ private String buildIndexConfiguration(List<KNNFieldConfig> knnFieldConfigs, int
.endObject()
.endObject();
}

for (String nestedField : nestedFields) {
xContentBuilder.startObject(nestedField).field("type", "nested").endObject();
}

xContentBuilder.endObject().endObject().endObject();
return xContentBuilder.toString();
}
Expand Down
Loading

0 comments on commit f02cccd

Please sign in to comment.