I implemented a ranked multi-term search routine in searchGenerator();
the patch is attached.
Pros:
- makes the search put the more valuable matches at the top of the
search results
- makes multi-term searching behave the way more people expect (like
Google)
Cons:
- because you have to sort the search results after you have collected all
of them, it slows down the results and partially defeats the purpose of
using a generator at all.
- the more terms you add, the slower it gets, for obvious reasons.
Thoughts on better ways to do this are welcome.
-sv
Index: __init__.py
===================================================================
RCS file: /home/groups/yum/cvs/yum/yum/__init__.py,v
retrieving revision 1.328
diff -u -r1.328 __init__.py
--- __init__.py 29 May 2007 15:52:33 -0000 1.328
+++ __init__.py 5 Jun 2007 04:22:47 -0000
@@ -1151,12 +1151,6 @@
def searchGenerator(self, fields, criteria):
"""Generator method to lighten memory load for some searches.
This is the preferred search function to use."""
- # FIXME - regex or globs - pick one
- # convert the fields
- # check the criteria for %
- # maybe convert globs to sql?
- # get back results, for each of the results run the old query and
- # render results
sql_fields = []
for f in fields:
if RPM_TO_SQLITE.has_key(f):
@@ -1164,14 +1158,34 @@
else:
sql_fields.append(f)
+ scores = {}
+ my_sets = {}
+ matched_values = {}
+
+ def __sortbyVal(x, y):
+ (k, v) = x
+ (k2, v2) = y
+ if v > v2:
+ return 1
+ if v < v2:
+ return -1
+ if v == v2:
+ return 0
+
+ # go through each item in the criteria list
+ # figure out if it matches and what it matches
+ # tally up the scores for the pkgs
+ # yield the results in order of most terms matched first
+
for s in criteria:
narrowed_list = []
+ my_sets[s] = []
if s.find('%') != -1:
continue
for sack in self.pkgSack.sacks.values():
narrowed_list.extend(sack.searchPrimaryFields(sql_fields, s))
-
+
for po in narrowed_list:
tmpvalues = []
for field in fields:
@@ -1180,8 +1194,9 @@
tmpvalues.append(value)
if len(tmpvalues) > 0:
- yield (po, tmpvalues)
-
+ matched_values[po] = tmpvalues
+ my_sets[s].append(po)
+
for po in self.rpmdb:
tmpvalues = []
for field in fields:
@@ -1190,7 +1205,31 @@
tmpvalues.append(value)
if len(tmpvalues) > 0:
- yield (po, tmpvalues)
+ matched_values[po] = tmpvalues
+ my_sets[s].append(po)
+
+ for pkg in matched_values.keys():
+ if scores.has_key(pkg):
+ continue
+ count = 0
+
+ for this_set in my_sets.values():
+ if pkg in this_set:
+ count+=1
+
+ scores[pkg] = count
+
+ i = scores.items()
+ i.sort(__sortbyVal)
+ i.reverse()
+
+ for (pkg,count) in i:
+ if matched_values.has_key(pkg):
+ yield (pkg, matched_values[pkg])
+ else:
+ print pkg
+
+
def searchPackages(self, fields, criteria, callback=None):
"""Search specified fields for matches to criteria
_______________________________________________
Yum-devel mailing list
[email protected]
https://lists.dulug.duke.edu/mailman/listinfo/yum-devel