[ https://issues.apache.org/jira/browse/TRAFODION-2259?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15546963#comment-15546963 ]
ASF GitHub Bot commented on TRAFODION-2259: ------------------------------------------- Github user DaveBirdsall commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/743#discussion_r81872336 --- Diff: core/sql/sort/Topn.cpp --- @@ -0,0 +1,337 @@ +/********************************************************************** +// @@@ START COPYRIGHT @@@ +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// @@@ END COPYRIGHT @@@ +**********************************************************************/ +/* -*-C++-*- +****************************************************************************** +* +* File: TopN.cpp +* +* Description: This file contains the implementation of all member functions +* of the class TopN. +* +* 1. Sort would initially maintain Top N array of elements to being with. +* 2. Read records into TopN array. +* 3. Once TopN array is full, heapify the array into max heap. Top node in the heap is always the highest node. +* 4. Subsequent record read either gets discarded( if greater than top node) or replace top node( if lesser then top node) . if replaced top node, re-balance the heap. +* 5. Repeat steps 4 until last record is read. +* 6. sort the final heap using heap sort. +*******************************************************************************/ + +#include <iostream> +#include <fstream> + +#ifndef DEBUG +#undef NDEBUG +#define NDEBUG +#endif +#include "ex_stdh.h" +#include "Topn.h" +#include "ScratchSpace.h" +#include "logmxevent.h" +#include "SortUtil.h" +#include "ex_ex.h" +#include "ExStats.h" + +//------------------------------------------------------------------------ +// Class Constructor. +//------------------------------------------------------------------------ +TopN::TopN(ULng32 runsize, ULng32 sortmaxmem, ULng32 recsize, + NABoolean doNotallocRec, ULng32 keysize, + SortScratchSpace* scratch, NABoolean iterSort, + CollHeap* heap, SortError* sorterror, Lng32 explainNodeId, SortUtil* sortutil): + SortAlgo(runsize, recsize, doNotallocRec, keysize, scratch, explainNodeId), + loopIndex_(0), heap_(heap), sortError_(sorterror), + sortUtil_(sortutil) +{ + //runsize is TopN size. Fixed. + allocRunSize_ = runsize; + + //Actual run size after all elements read. + runSize_ = 0; + + isHeapified_ = FALSE; + + //allocateMemory failureIsFatal is defaulted to TRUE means allocation failure results in + //longjump to jump handler defined in ex_sort.cpp. Only applicable on NSK. + topNKeys_ = (RecKeyBuffer *)heap_->allocateMemory(sizeof(RecKeyBuffer) * allocRunSize_); + + // Below asserts useful in debug mode. Also asserts if longjmp did not happen. + //ex_assert(rootRecord_!= NULL, "Sort: Initial rootRecord_ allocation failed"); + ex_assert(topNKeys_ != NULL, "Sort: Initial topNKeys_ allocation failed"); + + recNum_ = 0; + ExOperStats *stat = sortUtil_->config()->getCallingTcb()->getStatsEntry(); + if (stat) + bmoStats_ = stat->castToExBMOStats(); + else + bmoStats_ = NULL; + if (bmoStats_) + bmoStats_->updateBMOHeapUsage((NAHeap *)heap_); +} + + +//------------------------------------------------------------------------ +// Class Destructor: Delete all the heap space pointed by pointers in Qsort +//------------------------------------------------------------------------ +TopN::~TopN(void) +{ + + for (int i = 0; i < runSize_; i++) + topNKeys_[i].rec_->releaseTupp(); + + + if (topNKeys_ != NULL) { + NADELETEBASIC(topNKeys_, heap_); + topNKeys_ = NULL; + } + if (bmoStats_) + bmoStats_->updateBMOHeapUsage((NAHeap *)heap_); + +} + +Lng32 TopN::sortSend(void *rec, ULng32 len, void* tupp) +{ + //if heap not built means, TopN array has more slots + //available to fill. + if(! isHeapified_) + { + ex_assert(loopIndex_ >= 0, "TopN::sortSend: loopIndex_ is < 0"); + ex_assert(loopIndex_ < allocRunSize_, "TopN::sortSend: loopIndex_ > allocRunSize_"); + ex_assert(rec != NULL, "TopN::sortSend: rec is NULL"); + + Record * insertRec = new Record(rec, len, tupp, heap_, sortError_); + + topNKeys_[loopIndex_].key_ = insertRec->extractKey(keySize_, sortUtil_->config()->numberOfBytesForRecordSize()); + topNKeys_[loopIndex_].rec_ = insertRec; + if (++loopIndex_ == allocRunSize_) + { + //Reaching here means, we have filled up the array. + //Now heapify the array to start accepting/eliminating new elements from now on. + + //Note lookIndex_ contains the current number of filled elements. + + buildHeap(); + } + return SORT_SUCCESS; + } + + //Reaching here means, heap is already build. + //Check if the new rec belongs to this heap by comparing the + //new rec key with the root node of the heap ( root node is always the greatest). + insertRec(rec, len, tupp); + return SORT_SUCCESS; + } + + +void TopN::buildHeap() +{ + if(!isHeapified_) + { + //loopIndex_ is now <= TopN + runSize_ = loopIndex_; + + satisfyHeap(); + + isHeapified_ = TRUE; + } +} + +//Satisfy Heap makes sure the heap is balanced maxHeap. +//Note this does not mean heap is sorted. It just makes sure +//the higher level nodes are greater than lower level nodes. +//Topmost node or root will be the highest. +void TopN::satisfyHeap() +{ + for (int i = (runSize_/2 ); i >= 0; i--) + siftDown(topNKeys_, i, runSize_-1); +} + + +void TopN::insertRec(void *rec, ULng32 len, void* tupp) +{ + ex_assert(isHeapified_, "TopN::insertRec: not heapified"); + + int root = 0; //Always, root node is the zero'th element in array. + + Record * insertRec = new Record(rec, len, tupp, heap_, sortError_); + insertRecKey_.key_ = insertRec->extractKey(keySize_, sortUtil_->config()->numberOfBytesForRecordSize()); + insertRecKey_.rec_ = insertRec; + + if (compare(topNKeys_[root].key_ ,insertRecKey_.key_) == KEY1_IS_GREATER) + { + swap( &topNKeys_[root],&insertRecKey_); --- End diff -- It's places like this where the mixture of tabs and non-tabs makes the code hard to read due to odd indentation. > Sort TopN operator > ------------------ > > Key: TRAFODION-2259 > URL: https://issues.apache.org/jira/browse/TRAFODION-2259 > Project: Apache Trafodion > Issue Type: Improvement > Components: sql-exe > Affects Versions: 2.1-incubating > Reporter: Prashanth Vasudev > Assignee: Prashanth Vasudev > > Sort operator consumes all records before producing sorted records. For > certain use cases where only Top N records are required, today sort consumes > all records into memory and overflows( spills ) to disk. This impacts > performance. > if topN is pushed down to sort, only required memory can be allocated and > sort would only hold topN records in memory. Once all the records are read, > sorted records in topN is returned. -- This message was sent by Atlassian JIRA (v6.3.4#6332)