William Grant has proposed merging lp:~wgrant/launchpad/ts-rank into lp:launchpad.
Commit message: Switch all ts2.rank callsites to pg_catalog.ts_rank. Requested reviews: Launchpad code reviewers (launchpad-reviewers) For more details, see: https://code.launchpad.net/~wgrant/launchpad/ts-rank/+merge/345504 ts2 is long-deprecated and going away, and pg_catalog.rank is a window function. -- Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/ts-rank into lp:launchpad.
=== modified file 'lib/lp/registry/model/distribution.py' --- lib/lp/registry/model/distribution.py 2016-11-12 21:47:32 +0000 +++ lib/lp/registry/model/distribution.py 2018-05-14 09:30:49 +0000 @@ -993,7 +993,7 @@ find_spec = ( DistributionSourcePackageCache, SourcePackageName, - SQL('rank(fti, ftq(?)) AS rank', params=(text,)), + SQL('ts_rank(fti, ftq(?)) AS rank', params=(text,)), ) origin = [ DistributionSourcePackageCache, === modified file 'lib/lp/registry/model/distroseries.py' --- lib/lp/registry/model/distroseries.py 2018-01-26 13:47:51 +0000 +++ lib/lp/registry/model/distroseries.py 2018-05-14 09:30:49 +0000 @@ -1234,7 +1234,7 @@ find_spec = ( DistroSeriesPackageCache, BinaryPackageName, - SQL('rank(fti, ftq(%s)) AS rank' % sqlvalues(text))) + SQL('ts_rank(fti, ftq(%s)) AS rank' % sqlvalues(text))) origin = [ DistroSeriesPackageCache, Join( === modified file 'lib/lp/registry/model/pillar.py' --- lib/lp/registry/model/pillar.py 2017-11-10 14:07:52 +0000 +++ lib/lp/registry/model/pillar.py 2018-05-14 09:30:49 +0000 @@ -214,7 +214,7 @@ # row should get the highest search rank (9999999). # Each row in the PillarName table will join with only one # of either the Product, Project, or Distribution tables, - # so the coalesce() is necessary to find the rank() which + # so the coalesce() is necessary to find the ts_rank() which # is not null. result.order_by(SQL(''' (CASE WHEN PillarName.name = lower(%(text)s) @@ -222,9 +222,9 @@ OR lower(Project.title) = lower(%(text)s) OR lower(Distribution.title) = lower(%(text)s) THEN 9999999 - ELSE coalesce(rank(Product.fti, ftq(%(text)s)), - rank(Project.fti, ftq(%(text)s)), - rank(Distribution.fti, ftq(%(text)s))) + ELSE coalesce(ts_rank(Product.fti, ftq(%(text)s)), + ts_rank(Project.fti, ftq(%(text)s)), + ts_rank(Distribution.fti, ftq(%(text)s))) END) DESC, PillarName.name ''' % sqlvalues(text=text))) # People shouldn't be calling this method with too big limits === modified file 'lib/lp/registry/vocabularies.py' --- lib/lp/registry/vocabularies.py 2016-09-19 13:44:28 +0000 +++ lib/lp/registry/vocabularies.py 2018-05-14 09:30:49 +0000 @@ -321,7 +321,7 @@ getUtility(ILaunchBag).user), *vocab_filter) order_by = SQL( '(CASE name WHEN %s THEN 1 ' - ' ELSE rank(fti, ftq(%s)) END) DESC, displayname, name' + ' ELSE ts_rank(fti, ftq(%s)) END) DESC, displayname, name' % (fti_query, fti_query)) return IStore(Product).find(self._table, where_clause).order_by( order_by).config(limit=100) @@ -628,7 +628,7 @@ when person.name like lower(?) || '%%' then 0.6 when lower(person.displayname) like lower(?) || '%%' then 0.5 - else rank(fti, ftq(?)) + else ts_rank(fti, ftq(?)) end) as rank FROM Person WHERE Person.name LIKE lower(?) || '%%' === modified file 'lib/lp/services/database/doc/textsearching.txt' --- lib/lp/services/database/doc/textsearching.txt 2016-11-14 11:45:46 +0000 +++ lib/lp/services/database/doc/textsearching.txt 2018-05-14 09:30:49 +0000 @@ -563,11 +563,11 @@ ------- We have ranking information stored in the indexes, as specified in fti.py. -The rank of a result is calculated using the tsearch2 rank() function. +The rank of a result is calculated using the ts_rank() function. >>> runsql(r""" ... SELECT - ... name, rank(fti, ftq('gnome')) AS rank + ... name, ts_rank(fti, ftq('gnome')) AS rank ... FROM product ... WHERE fti @@ ftq('gnome') ... ORDER BY rank DESC, name @@ -581,7 +581,7 @@ You can also build complex multi table queries and mush all the ranked results together. This query does a full text search on the Bug and Message tables, as well as substring name searches on -SourcepackageName.name and Product.name. The rank() function returns an +SourcepackageName.name and Product.name. The ts_rank() function returns an float between 0 and 1, so I just chose some arbitrary constants for name matches that seemed appropriate. It is also doing a full text search against the Product table, and manually lowering the rank (again using @@ -589,7 +589,7 @@ >>> runsql(r""" ... SELECT title, max(ranking) FROM ( - ... SELECT Bug.title,rank(Bug.fti||Message.fti,ftq('firefox')) + ... SELECT Bug.title,ts_rank(Bug.fti||Message.fti,ftq('firefox')) ... AS ranking ... FROM Bug, BugMessage, Message ... WHERE Bug.id = BugMessage.bug AND Message.id = BugMessage.message @@ -607,7 +607,7 @@ ... AND BugTask.product = Product.id ... AND Product.name LIKE lower('%firefox%') ... UNION - ... SELECT Bug.title, rank(Product.fti, ftq('firefox')) - 0.3 + ... SELECT Bug.title, ts_rank(Product.fti, ftq('firefox')) - 0.3 ... AS ranking ... FROM Bug, BugTask, Product ... WHERE Bug.id = BugTask.bug @@ -657,10 +657,10 @@ Implementing something similar with tsearch2 is straightforward: tsearch2 to_tsquery() already removes stop-words (it also stems the -words). Relevance can be computed using the rank() or rank_cd() +words). Relevance can be computed using the ts_rank() or ts_rank_cd() functions. These are not TD-IDF scoring functions, but they take into -account where the words appeared (in the case of rank()) or proximity -of the words (in the case of rank_cd()). Both scoring functions can +account where the words appeared (in the case of ts_rank()) or proximity +of the words (in the case of ts_rank_cd()). Both scoring functions can normalize based on document length. So the only part left to implement is the >50% filtering part. Howevert the > 50% filtering is very expensive, and so is processing every single returned item (> 200000 for common queries === modified file 'lib/lp/services/database/nl_search.py' --- lib/lp/services/database/nl_search.py 2011-12-30 06:14:56 +0000 +++ lib/lp/services/database/nl_search.py 2018-05-14 09:30:49 +0000 @@ -116,11 +116,11 @@ eliminated from the query. That term eliminatation is only done when there are 5 candidate rows or more. - The remaining terms are then ORed together. One should use the rank() or - rank_cd() function to order the results from running that query. This will - make rows that use more of the terms and for which the terms are found - closer in the text at the top of the list, while still returning rows that - use only some of the terms. + The remaining terms are then ORed together. One should use the + ts_rank() or ts_rank_cd() function to order the results from running + that query. This will make rows that use more of the terms and for + which the terms are found closer in the text at the top of the list, + while still returning rows that use only some of the terms. :terms: Some candidate search terms. === modified file 'lib/lp/services/database/stormexpr.py' --- lib/lp/services/database/stormexpr.py 2016-06-10 22:02:37 +0000 +++ lib/lp/services/database/stormexpr.py 2018-05-14 09:30:49 +0000 @@ -340,5 +340,5 @@ def rank_by_fti(table, text, ftq=True): table, query_fragment = determine_table_and_fragment(table, ftq) return SQL( - '-rank(%s.fti, %s)' % (table.name, query_fragment), params=(text,), + '-ts_rank(%s.fti, %s)' % (table.name, query_fragment), params=(text,), tables=(table,))
_______________________________________________ Mailing list: https://launchpad.net/~launchpad-reviewers Post to : launchpad-reviewers@lists.launchpad.net Unsubscribe : https://launchpad.net/~launchpad-reviewers More help : https://help.launchpad.net/ListHelp