Thanks for looking into this, Geert!

It looks promising, but it actually took longer to execute: 3 minutes
and 29 seconds. Looking at the profile tab in the console, I see that the
"-$uris" expression takes more than 99% of the time.

Any suggestions based on this?

Regards,
Johan


On Fri, Jun 26, 2015 at 1:39 PM Geert Josten <[email protected]>
wrote:

>  Hi Johan,
>
>  Your approach will do n+1 index lookups, where n is the number of
> documents you have. It will be more efficient to do just one co-occurrence
> lookup, and then do the calculation in memory. Here is a bit of code that
> generates sample docs, does a count the classical way (for comparison), and
> then uses an approach that leverages a range index on file/@size:
>
>  xquery version "1.0-ml";
>
>  for $i in (1 to 100)
> let $xml := <mets id="{$i}">{
>   for $j in (1 to xdmp:random($i))
>   return <file size="{$j}"/>
> }</mets>
> return
>   xdmp:document-insert("/mets/file"||$i||".xml", $xml, (), "mets")
>
> ;
>
>  (
> for $doc in collection()
> let $size-count := count($doc//@size)
> order by $size-count descending
> return xdmp:node-uri($doc) || " - " || $size-count
> )[1 to 10]
>
>  ;
> "--------"
> ;
>
>  let $uris := map:map()
> let $_ :=
>   for $tuple in
>     cts:value-tuples(
>       (
>         cts:uri-reference(),
>         cts:element-attribute-reference(xs:QName("file"), xs:QName("size"))
>       )
>     )
>   let $uri := $tuple[1]
>   let $value := $tuple[2]
>   let $freq := cts:frequency($tuple)
>   return map:put($uris, $uri, sum((map:get($uris, $uri), $freq)))
> let $counts := -$uris
> let $top-keys :=
>   for $key in map:keys($counts)
>   order by xs:int($key) descending
>   return $key
> return (
>   for $key in $top-keys
>   for $value in map:get($counts, $key)
>   return $value || " - " || $key
> )[1 to 10]
>
>  I don’t have enough volume to properly test performance, but I’d expect
> the cts:value-tuples approach to run within seconds.
>
>  Cheers,
> Geert
>
>   From: Johan Mörén <[email protected]>
> Reply-To: MarkLogic Developer Discussion <[email protected]>
> Date: Friday, June 26, 2015 at 12:07 PM
> To: MarkLogic Developer Discussion <[email protected]>
> Subject: [MarkLogic Dev General] Find the document(s) with max
> occurrences of an element-attribute reference
>
>   Hi!
>
>  I want to find out which document(s) have the highest number of
> occurrences of a particular element-attribute, regardless of its value. In
> this case //mets:file/@SIZE
>
>  I have a range index on this, and operations like getting the count,
> sum, avg, min and max work fine and are very performant. But they operate
> on the whole database or a subset constrained by another cts:query.
>
>  The included code gives me what I want, but it is not very performant.
>
>  Running this query on a database with 300k documents returns in 1 minute
> and 25 seconds.
>
>  Is there a better way to solve this apart from including the count of
> mets:file/@SIZE as a separate element in each document?
>
>  --- code --
> xquery version "1.0-ml";
> declare namespace mets="http://www.loc.gov/METS/";;
> declare variable $sizeRef :=
> cts:element-attribute-reference(xs:QName("mets:file"), xs:QName("SIZE"));
>
>  declare function local:updateMax($theMap, $size, $uri) {
>   let $currentMax := (map:get($theMap, "max"),0)[1]
>   let $currentUris := (map:get($theMap, "uris"))
>   return if($size > $currentMax) then (
>   map:put($theMap, "max", $size),
>   map:put($theMap, "uris", $uri)
>   ) else if ($size = $currentMax) then (
>     map:put($theMap, "uris", ($currentUris, $uri))
>   ) else ()
> };
>
>  let $map := map:new((
>     map:entry("max",0),
>     map:entry("uris",())
>   ))
>
>  let $puts := for $uri in cts:uris()
>   return local:updateMax($map, cts:count-aggregate($sizeRef,(),
> cts:document-query($uri)), $uri)
>
> return $map
>
>  --- end code ---
>
>  Regards,
> Johan Mörén
> National Library of Sweden
>
>
>    _______________________________________________
> General mailing list
> [email protected]
> Manage your subscription at:
> http://developer.marklogic.com/mailman/listinfo/general
>
_______________________________________________
General mailing list
[email protected]
Manage your subscription at: 
http://developer.marklogic.com/mailman/listinfo/general

Reply via email to