Revision: 16393 http://gate.svn.sourceforge.net/gate/?rev=16393&view=rev Author: valyt Date: 2012-11-30 17:28:03 +0000 (Fri, 30 Nov 2012) Log Message: ----------- More work on TermsQueries: - provided a Swapper implementation to be used when reordering TermsResultSet (TRS) instances - added support for grouping the insides of a TRS by term description. - when grouped by description, TRSs now store the old term strings so that they can map back to them if needed. - the grouping implementation is also capable of merging multiple TRSs (e.g. coming from multiple sub-indexes of a FederateIndex answering the same TermsQuery). In this case, mapping to the old term strings is also preserved, and new term strings are'invented' (the term position is used).
Modified Paths: -------------- mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java mimir/trunk/mimir-core/src/gate/mimir/search/terms/AndTermsQuery.java mimir/trunk/mimir-core/src/gate/mimir/search/terms/LimitTermsQuery.java mimir/trunk/mimir-core/src/gate/mimir/search/terms/OrTermsQuery.java mimir/trunk/mimir-core/src/gate/mimir/search/terms/SortedTermsQuery.java mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsResultSet.java Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java =================================================================== --- mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java 2012-11-30 15:17:09 UTC (rev 16392) +++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractIndexTermsQuery.java 2012-11-30 17:28:03 UTC (rev 16393) @@ -279,12 +279,14 @@ termId = documentIterator.nextDocument(); } // construct the result - return new TermsResultSet( + TermsResultSet res = new TermsResultSet( termStrings.toArray(new String[termStrings.size()]), null, countsEnabled ? termCounts.toIntArray() : null, describeAnnotations ? termDescriptions.toArray(new String[termDescriptions.size()]) : null); + if(describeAnnotations) res = TermsResultSet.groupByDescription(res); + return res; } /** Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java =================================================================== --- mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java 2012-11-30 15:17:09 UTC (rev 16392) +++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/AbstractTermsQuery.java 2012-11-30 17:28:03 UTC (rev 16393) @@ -17,9 +17,6 @@ import java.io.IOException; import gate.mimir.search.QueryEngine; -import it.unimi.dsi.fastutil.Arrays; -import it.unimi.dsi.fastutil.Swapper; -import it.unimi.dsi.fastutil.ints.AbstractIntComparator; /** @@ -42,35 +39,4 @@ // TODO Auto-generated method stub return null; } - - /** - * Sorts the arrays inside a {@link TermsResultSet} using the termString for - * comparison. - * @param trs - */ - public static void sortTermsResultSetByTermString(final TermsResultSet trs) { - Arrays.quickSort(0, trs.termStrings.length, new AbstractIntComparator() { - @Override - public int compare(int k1, int k2) { - return trs.termStrings[k1].compareTo(trs.termStrings[k2]); - } - }, new Swapper() { - @Override - public void swap(int a, int b) { - String termString = trs.termStrings[a]; - trs.termStrings[a] = trs.termStrings[b]; - trs.termStrings[b] = termString; - if(trs.termCounts != null) { - int termCount = trs.termCounts[a]; - trs.termCounts[a] = trs.termCounts[b]; - trs.termCounts[b] = termCount; - } - if(trs.termLengths != null) { - int termLength = trs.termLengths[a]; - trs.termLengths[a] = trs.termLengths[b]; - trs.termLengths[b] = termLength; - } - } - }); - } } Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/AndTermsQuery.java =================================================================== --- mimir/trunk/mimir-core/src/gate/mimir/search/terms/AndTermsQuery.java 2012-11-30 15:17:09 UTC (rev 16392) +++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/AndTermsQuery.java 2012-11-30 17:28:03 UTC (rev 16393) @@ -56,13 +56,16 @@ boolean lengthsAvailable = false; boolean countsAvailable = true; boolean descriptionsAvaialble = false; + boolean origTermsAvailable = false; for(int i = 0; i < resSets.length; i++) { if(resSets[i].termStrings.length == 0) return TermsResultSet.EMPTY; // this implementation requires that all sub-queries return terms in a // consistent order, so we sort them lexicographically by termString - sortTermsResultSetByTermString(resSets[i]); + TermsResultSet.sortTermsResultSetByTermString(resSets[i]); // at least one sub-query must provide lengths, for us to be able to if(resSets[i].termLengths != null) lengthsAvailable = true; + // at least one sub-query must provide original terms, for us to be able to + if(resSets[i].originalTermStrings != null) origTermsAvailable = true; // at least one sub-query must provide descriptions, for us to be able to if(resSets[i].termDescriptions != null) descriptionsAvaialble = true; // all sub-queries must provide counts, for us to be able to @@ -91,6 +94,8 @@ ObjectArrayList<String> termStrings = new ObjectArrayList<String>(); ObjectArrayList<String> termDescriptions = descriptionsAvaialble ? new ObjectArrayList<String>() : null; + ObjectArrayList<String[][]> origTerms = origTermsAvailable ? + new ObjectArrayList<String[][]>() : null; IntArrayList termCounts = countsAvailable ? new IntArrayList() : null; IntArrayList termLengths = lengthsAvailable ? new IntArrayList() : null; // merge the inputs @@ -141,6 +146,18 @@ } termDescriptions.add(termDescription); } + // extract original terms + if(origTermsAvailable) { + String[][] origTerm = null; + for(int i = 0; + i < resSets.length && origTerm == null; + i++) { + if(resSets[i].originalTermStrings != null){ + origTerm = resSets[i].originalTermStrings[indexes[i]]; + } + } + origTerms.add(origTerm); + } // and start fresh currRunner = 0; indexes[currRunner]++; @@ -170,11 +187,16 @@ } } // top while // construct the result - return new TermsResultSet( + TermsResultSet res = new TermsResultSet( termStrings.toArray(new String[termStrings.size()]), lengthsAvailable? termLengths.toIntArray() : null, countsAvailable ? termCounts.toIntArray() : null, descriptionsAvaialble ? termDescriptions.toArray(new String[termDescriptions.size()]): null); + if(origTermsAvailable){ + res.originalTermStrings = origTerms.toArray( + new String[origTerms.size()][][]); + } + return res; } } Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/LimitTermsQuery.java =================================================================== --- mimir/trunk/mimir-core/src/gate/mimir/search/terms/LimitTermsQuery.java 2012-11-30 15:17:09 UTC (rev 16392) +++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/LimitTermsQuery.java 2012-11-30 17:28:03 UTC (rev 16393) @@ -71,8 +71,15 @@ termDescriptions = new String[limit]; System.arraycopy(trs.termDescriptions, 0, termDescriptions, 0, limit); } - return new TermsResultSet(termStrings, termLengths, termCounts, + + TermsResultSet res = new TermsResultSet(termStrings, termLengths, termCounts, termDescriptions); + if(trs.originalTermStrings != null) { + res.originalTermStrings = new String[limit][][]; + System.arraycopy(trs.originalTermStrings, 0, + res.originalTermStrings, 0, limit); + } + return res; } else { return trs; } Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/OrTermsQuery.java =================================================================== --- mimir/trunk/mimir-core/src/gate/mimir/search/terms/OrTermsQuery.java 2012-11-30 15:17:09 UTC (rev 16392) +++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/OrTermsQuery.java 2012-11-30 17:28:03 UTC (rev 16393) @@ -64,11 +64,12 @@ int[] termIndex = new int[resSets.length]; boolean lengthsAvailable = true; boolean descriptionsAvailable = true; + boolean origTermsAvailable = true; boolean countsAvailable = true; for(int i = 0; i < resSets.length; i++) { // this implementation requires that all sub-queries return terms in a // consistent order, so we sort them lexicographically by termString - sortTermsResultSetByTermString(resSets[i]); + TermsResultSet.sortTermsResultSetByTermString(resSets[i]); if(resSets[i].termStrings.length > 0){ termIndex[i] = 0; currentTerm[i] = resSets[i].termStrings[termIndex[i]]; @@ -79,12 +80,15 @@ if(resSets[i].termLengths == null) lengthsAvailable = false; if(resSets[i].termCounts == null) countsAvailable = false; if(resSets[i].termDescriptions == null) descriptionsAvailable = false; + if(resSets[i].originalTermStrings == null) origTermsAvailable = false; } // prepare local data ObjectArrayList<String> termStrings = new ObjectArrayList<String>(); ObjectArrayList<String> termDescriptions = descriptionsAvailable ? new ObjectArrayList<String>() : null; + ObjectArrayList<String[][]> origTerms = origTermsAvailable ? + new ObjectArrayList<String[][]>() : null; IntArrayList termLengths = lengthsAvailable ? new IntArrayList() : null; IntArrayList termCounts = countsAvailable ? new IntArrayList() : null; int front[] = new int[resSets.length]; @@ -99,6 +103,9 @@ if(descriptionsAvailable) { termDescriptions.add(resSets[first].termDescriptions[termIndex[first]]); } + if(origTermsAvailable) { + origTerms.add(resSets[first].originalTermStrings[termIndex[first]]); + } if(countsAvailable) { // sum all counts int frontSize = queue.front(front); @@ -125,11 +132,16 @@ } } // construct the result - return new TermsResultSet( + TermsResultSet res = new TermsResultSet( termStrings.toArray(new String[termStrings.size()]), lengthsAvailable ? termLengths.toIntArray() : null, countsAvailable ? termCounts.toIntArray() : null, descriptionsAvailable ? termDescriptions.toArray(new String[termDescriptions.size()]) : null); + if(origTermsAvailable){ + res.originalTermStrings = origTerms.toArray( + new String[origTerms.size()][][]); + } + return res; } } Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/SortedTermsQuery.java =================================================================== --- mimir/trunk/mimir-core/src/gate/mimir/search/terms/SortedTermsQuery.java 2012-11-30 15:17:09 UTC (rev 16392) +++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/SortedTermsQuery.java 2012-11-30 17:28:03 UTC (rev 16393) @@ -15,7 +15,6 @@ package gate.mimir.search.terms; import it.unimi.dsi.fastutil.Arrays; -import it.unimi.dsi.fastutil.Swapper; import it.unimi.dsi.fastutil.ints.IntComparator; /** @@ -146,31 +145,7 @@ } return retval; } - }, new Swapper() { - @Override - public void swap(int a, int b) { - if(trs.termStrings != null) { - String termString = trs.termStrings[a]; - trs.termStrings[a] = trs.termStrings[b]; - trs.termStrings[b] = termString; - } - if(trs.termLengths != null) { - int termLen = trs.termLengths[a]; - trs.termLengths[a] = trs.termLengths[b]; - trs.termLengths[b] = termLen; - } - if(trs.termCounts != null) { - int termCount = trs.termCounts[a]; - trs.termCounts[a] = trs.termCounts[b]; - trs.termCounts[b] = termCount; - } - if(trs.termDescriptions != null) { - String termDescription = trs.termDescriptions[a]; - trs.termDescriptions[a] = trs.termDescriptions[b]; - trs.termDescriptions[b] = termDescription; - } - } - }); + }, new TermsResultSet.Swapper(trs)); return trs; } } Modified: mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsResultSet.java =================================================================== --- mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsResultSet.java 2012-11-30 15:17:09 UTC (rev 16392) +++ mimir/trunk/mimir-core/src/gate/mimir/search/terms/TermsResultSet.java 2012-11-30 17:28:03 UTC (rev 16393) @@ -16,6 +16,15 @@ import gate.mimir.SemanticAnnotationHelper; +import it.unimi.dsi.fastutil.Arrays; +import it.unimi.dsi.fastutil.ints.AbstractIntComparator; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.Object2ObjectLinkedOpenHashMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectMap; +import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.fastutil.objects.ObjectIterator; + import java.io.Serializable; /** @@ -43,8 +52,17 @@ * {@link #termCounts} and {@link #termDescriptions}. */ public final String[] termStrings; + + /** + * This field is populated by the + * {@link #groupByDescription(TermsResultSet...)} method. It contains term + * strings from the original result sets indexed by position in this result + * set, and the index of the results set. For example + * originalTermStrings[i][j] is a String[], containing all the term strings + * associated with termDescriptions[i] in the j<sup>th</sup> result set. + */ + public String[][][] originalTermStrings; - /** * For annotation indexes, the term string is simply a URI in whatever format * is used by the {@link SemanticAnnotationHelper} that was used to index the @@ -60,7 +78,7 @@ * {@link #termStrings} and {@link #termIds}. */ public final int[] termCounts; - + public TermsResultSet(String[] termStrings, int[] termLengths, int[] termCounts, String[] termDescriptions) { super(); @@ -76,4 +94,259 @@ public static final TermsResultSet EMPTY = new TermsResultSet( new String[]{}, new int[] {}, new int[]{}, new String[]{}); + + /** + * Given a position in {@link #termDescriptions}, this method computes all + * term strings that had that description in each of the sub-indexes of the + * federated index that produced this result set. + * @param termPosition the term for which the original term strings are being + * requested. + * @return An array where element at position i is an array containing all the + * term strings (in the dictionary of sub-index i) that had the given term + * description when the original query was answered by sub-index i, or null + * if original terms strings are not available. + */ + public String[][] getSubIndexTerms(int termPosition) { + return (originalTermStrings != null) ? + originalTermStrings[termPosition] : null; + } + + /** + * Tries to locate the correct term position and calls + * {@link #getSubIndexTerms(int)}. + * @param termString + * @return + */ + public String[][] getSubIndexTerms(String termString) { + int termPos = -1; + try { + termPos = Integer.parseInt(termString); + } catch (Exception e) {} + if(termStrings[termPos].equals(termString)) { + return getSubIndexTerms(termPos); + } else{ + // could not convert it: leave it unchanged + return new String[][]{{termString}}; + } + } + + /** + * Sorts the arrays inside a {@link TermsResultSet} using the termString for + * comparison. + * @param trs + */ + public static void sortTermsResultSetByTermString(final TermsResultSet trs) { + Arrays.quickSort(0, trs.termStrings.length, new AbstractIntComparator() { + @Override + public int compare(int k1, int k2) { + return trs.termStrings[k1].compareTo(trs.termStrings[k2]); + } + }, new Swapper(trs)); + } + + /** + * This method re-arranges the data included in one or more + * {@link TermsResultSet} values so that each term description occurs only + * once in the {@link #termDescriptions} array. + * + * A {@link TermsResultSet} obtained when calling + * {@link TermsQuery#execute(gate.mimir.search.QueryEngine)} may include the + * same description for multiple term strings: depending on the implementation + * used to describe terms, distinct terms may end up with the same + * description. This could cause confusion when the output is presented to + * the user, as they would have no way to distinguish between the different + * terms. + * + * When executing a terms query against a federated index, each sub-index + * returns its own result set. Terms originating in different sub-indexes can + * have the same description. + * + * This method combines these into a unified result set that preserves the + * right term ID to term description mappings by populating the + * {@link #originalTermStrings} array. + * + * @param resSets the result sets produced by the sub-indexes of a federated + * index. + * @return the combined result set. + */ + public static TermsResultSet groupByDescription(TermsResultSet... resSets) { + boolean descriptionsAvaialble = true; + boolean countsAvaialble = true; + boolean lengthsAvaialble = false; + for(TermsResultSet trs : resSets) { + if(trs.termDescriptions == null) { + descriptionsAvaialble = false; + } + if(trs.termCounts == null) { + countsAvaialble = false; + } + if(trs.termLengths != null) { + lengthsAvaialble = true; + } + } + + Object2ObjectOpenHashMap<String, TermData> desc2TermData = + new Object2ObjectOpenHashMap<String, TermData>(); + + for(int subIndexPos = 0; subIndexPos < resSets.length; subIndexPos++) { + TermsResultSet trs = resSets[subIndexPos]; + for(int i = 0; i < trs.termStrings.length; i++) { + String description = descriptionsAvaialble ? + trs.termDescriptions[i] : trs.termStrings[i]; +// String string = descriptionsAvaialble ? trs.termStrings[i] : null; + // get all the strings describing the current term + String[] strings = null; + if(trs.originalTermStrings != null) { + // old TRS already has original term strings + if(trs.originalTermStrings[i].length == 1) { + // old TRS was not federated + strings = trs.originalTermStrings[i][0]; + } else { + // old TRS was federated: get the term strings from the correct sub-index + strings = trs.originalTermStrings[i][subIndexPos]; + } + } else { + // no old original term strings: use the actual term string + strings = descriptionsAvaialble ? + new String[]{trs.termStrings[i]} : null; + } + + TermData tData = desc2TermData.get(description); + if(tData == null) { + tData = new TermData(description, resSets.length); + desc2TermData.put(description, tData); + } + if(descriptionsAvaialble && strings != null){ + for(String s : strings) tData.addString(subIndexPos, s); +// tData.addString(subIndexPos, string); + } + if(countsAvaialble) { + tData.count += trs.termCounts[i]; + } + if(lengthsAvaialble && trs.termLengths != null && tData.length < 0) { + tData.length = trs.termLengths[i]; + } + } + } + // produce the compound result set + String[] newStrings = new String[desc2TermData.size()]; + String[] newDescriptions = descriptionsAvaialble ? + new String[desc2TermData.size()] : null; + int[] newCounts = countsAvaialble ? new int[desc2TermData.size()] : null; + int[] newLenghts = lengthsAvaialble ? new int[desc2TermData.size()] : null; + String[][][] originalTermStrings = descriptionsAvaialble ? + new String[desc2TermData.size()][][] : null; + ObjectIterator<Object2ObjectMap.Entry<String, TermData>> iter = + desc2TermData.object2ObjectEntrySet().fastIterator(); + int pos = 0; + while(iter.hasNext()) { + TermData tData = iter.next().getValue(); + if(descriptionsAvaialble) { + newDescriptions[pos] = tData.description; + originalTermStrings[pos] = tData.getStrings(); + // term string does not actually mean anything; we use the term position + // instead + newStrings[pos] = Integer.toString(pos); + } else { + newStrings[pos] = tData.description; + } + if(countsAvaialble) newCounts[pos] = tData.count; + if(lengthsAvaialble) newLenghts[pos] = tData.length; + pos++; + } + + TermsResultSet res = new TermsResultSet(newStrings, newLenghts, newCounts, + newDescriptions); + res.originalTermStrings = originalTermStrings; + return res; + } + + /** + * Class used internally to store the term data when grouping terms results sets. + * See {@link TermsResultSet#groupByDescription(TermsResultSet...)}. + */ + private static class TermData { + private String description; + private int count; + private int length; + + /** + * The number of result sets being combined + */ + private int arity; + + /** + * An array of size {@link #arity}, element at position i containing the + * term strings in the result set at position i, for this term description. + */ + private ObjectArrayList<String>[] strings; + + public TermData(String description, int arity) { + super(); + this.description = description; + this.arity = arity; + strings = new ObjectArrayList[arity]; + this.count = 0; + this.length = -1; + } + + /** + * Adds a new term string for the sub-index at a given position. + * @param position + * @param string + */ + public void addString(int position, String string) { + if(strings[position] == null) { + strings[position] = new ObjectArrayList<String>(); + } + strings[position].add(string); + } + + public String[][] getStrings() { + String[][] res = new String[strings.length][]; + for(int i = 0; i < strings.length; i++) { + res[i] = strings[i].toArray(new String[strings[i].size()]); + } + return res; + } + } + + /** + * A {@link it.unimi.dsi.fastutil.Swapper} implementation for + * {@link TermsResultSet}s. + */ + public static class Swapper implements it.unimi.dsi.fastutil.Swapper { + private TermsResultSet trs; + + public Swapper(TermsResultSet trs) { + this.trs = trs; + } + + @Override + public void swap(int a, int b) { + String termString = trs.termStrings[a]; + trs.termStrings[a] = trs.termStrings[b]; + trs.termStrings[b] = termString; + if(trs.termCounts != null) { + int termCount = trs.termCounts[a]; + trs.termCounts[a] = trs.termCounts[b]; + trs.termCounts[b] = termCount; + } + if(trs.termLengths != null) { + int termLength = trs.termLengths[a]; + trs.termLengths[a] = trs.termLengths[b]; + trs.termLengths[b] = termLength; + } + if(trs.termDescriptions != null) { + String termDesc = trs.termDescriptions[a]; + trs.termDescriptions[a] = trs.termDescriptions[b]; + trs.termDescriptions[b] = termDesc; + } + if(trs.originalTermStrings != null) { + String[][] origTSs = trs.originalTermStrings[a]; + trs.originalTermStrings[a] = trs.originalTermStrings[b]; + trs.originalTermStrings[b] = origTSs; + } + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Keep yourself connected to Go Parallel: TUNE You got it built. Now make it sing. Tune shows you how. http://goparallel.sourceforge.net _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs