This is an automated email from the ASF dual-hosted git repository.
paulk pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/groovy-website.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 988209b additional examples
988209b is described below
commit 988209bcb369a40ac0cebf0d0bebd2abec19c484
Author: Paul King <[email protected]>
AuthorDate: Sat Nov 23 07:56:47 2024 +1000
additional examples
---
site/src/site/blog/groovy-lucene.adoc | 45 ++++++++++++++++++++++++++---------
1 file changed, 34 insertions(+), 11 deletions(-)
diff --git a/site/src/site/blog/groovy-lucene.adoc
b/site/src/site/blog/groovy-lucene.adoc
index e31fc0e..91722aa 100644
--- a/site/src/site/blog/groovy-lucene.adoc
+++ b/site/src/site/blog/groovy-lucene.adoc
@@ -699,6 +699,13 @@ assert results.totalHits.value() == 1 &&
== More complex queries
+As a final example, we chose earlier to extract project names at index time.
+We could have instead used the normal analyzer at the cost of needing more
+complex span queries to pull out our project names at search time.
+Let's have a look at what the code for that scenario could look like.
+
+First, we'll do indexing with the `StandardAnalyzer`.
+
[source,groovy]
----
var analyzer = new StandardAnalyzer()
@@ -722,22 +729,36 @@ new IndexWriter(indexDir, config).withCloseable { writer
->
}
----
+Now our queries will need to be more complex. We have a few options up our sleeve,
+but we'll choose to put together our queries using some low-level query classes.
+We'll look for "apache commons <namepart>"
+or "(apache|eclipse) <namepart>",
+where _namepart_ is the project name
+without the foundation prefix.
+
[source,groovy]
----
IndexReader reader = DirectoryReader.open(indexDir)
var searcher = new IndexSearcher(reader)
-var namepart = new SpanMultiTermQueryWrapper(new RegexpQuery(new
Term("content", '''(
-math|spark|lucene|collections|deeplearning4j
-|beam|wayang|csv|io|numbers|ignite|mxnet|age
-|nlpcraft|pekko|hugegraph|tinkerpop|commons
-|cli|opennlp|ofbiz|codec|kie|flink
-)'''.replaceAll('\n', ''))))
-
-var (apache, commons) = ['apache', 'commons'].collect{ new Term('content', it)
}
-var apacheCommons = new SpanNearQuery([new SpanTermQuery(apache), new
SpanTermQuery(commons), namepart] as SpanQuery[], 0, true)
-
-var foundation = new SpanMultiTermQueryWrapper(new RegexpQuery(new
Term("content", "(apache|eclipse)")))
+var projects = [
+ 'math', 'spark', 'lucene', 'collections', 'deeplearning4j',
+ 'beam', 'wayang', 'csv', 'io', 'numbers', 'ignite', 'mxnet', 'age',
+ 'nlpcraft', 'pekko', 'hugegraph', 'tinkerpop', 'commons',
+ 'cli', 'opennlp', 'ofbiz', 'codec', 'kie', 'flink'
+]
+var namepart = new SpanMultiTermQueryWrapper(new RegexpQuery(
+ new Term('content', "(${projects.join('|')})")))
+
+// look for apache commons <namepart>
+SpanQuery[] spanTerms = ['apache', 'commons'].collect{
+ new SpanTermQuery(new Term('content', it))
+} + namepart
+var apacheCommons = new SpanNearQuery(spanTerms, 0, true)
+
+// look for (apache|eclipse) <namepart>
+var foundation = new SpanMultiTermQueryWrapper(new RegexpQuery(
+ new Term('content', '(apache|eclipse)')))
var otherProject = new SpanNearQuery([foundation, namepart] as SpanQuery[], 0,
true)
var builder = new BooleanQuery.Builder(minimumNumberShouldMatch: 1)
@@ -748,6 +769,8 @@ var results = searcher.search(query, 30)
println "Total documents with hits for $query --> $results.totalHits"
----
+When we run this we see the same number of hits as before:
+
----
Total documents with hits for
(spanNear([SpanMultiTermQueryWrapper(content:/(apache|eclipse)/),
SpanMultiTermQueryWrapper(content:/(math|spark|lucene|collections|deeplearning4j|beam|wayang|csv|io|numbers|ignite|mxnet|age|nlpcraft|pekko|hugegraph|tinkerpop|commons|cli|opennlp|ofbiz|codec|kie|flink)/)],
0, true) spanNear([content:apache, content:commons,
SpanMultiTermQueryWrapper(content:/(math|spark|lucene|collections|deeplearning4j|beam|wayang|csv|io|numbers|ignite|mxnet|age|nlpcraft|pek
[...]
----