Can you print a log message inside the CellStoreScannerV0 constructor and
destructor to make sure that the scanners are actually getting destructed?
In the destructor, you can see the statement:
if (m_block.base != 0)
Global::block_cache->checkin(m_file_id, m_block.offset);
If the scanner is not getting destructed, then the last scanned block will
not get checked in (above statement), so its reference count never drops
to zero and the block never becomes releasable.
- Doug
On Fri, Mar 6, 2009 at 6:12 PM, Mateusz Berezecki <[email protected]> wrote:
>
> On Sat, Mar 7, 2009 at 3:01 AM, Doug Judd <[email protected]> wrote:
> > Is it possible that you're creating scanners, but not destroying them?
> > That would cause this kind of problem. Can you post the code that does
> > queries in the second phase?
>
> Here it is:
>
> TablePtr tweets = c->open_table("tweets");
>
> ScanSpec sss;
> sss.max_versions = 1;
> sss.columns.push_back("text");
> sss.return_deletes = false;
>
> TableScannerPtr tweet_scanner = tweets->create_scanner(sss);
>
> while (tweet_scanner->next(cell1))
> {
> DocumentMap document;
>
> /* convert text from whatever encoding we got to unicode (utf16),
> next break it up into tokens, and store or increment counts in
> the words[word_id]
> */
>
> cout << "Computing TFIDF and preparing output..." << endl;
> DocumentMap::const_iterator doc_it;
> string output;
>
> unsigned int sum = 0;
> for (doc_it = document.begin(); doc_it != document.end(); ++doc_it)
> {
> sum += doc_it->second;
> }
>
> for (doc_it = document.begin(); doc_it != document.end(); ++doc_it)
> {
> /* compute TF */
> cout << "All terms: " << sum << endl;
> double tf = doc_it->second / (double)sum;
> cout << "TF: " << tf << endl;
>
> /* compute IDF */
> double idf = term_idf(doc_it->first, c);
> cout << "IDF: " << idf << endl;
>
> double tfidf = tf * idf;
>
> if (tfidf_map.find(doc_it->first) == tfidf_map.end())
> tfidf_map[doc_it->first] = 0;
> tfidf_map[doc_it->first] = tfidf_map[doc_it->first] + tfidf;
> cout << "TFIDF for term: " << doc_it->first << " = " <<
> tfidf_map[doc_it->first] << endl;
>
> if (output.size() > 0)
> {
> output += ",";
> }
> output += lexical_cast<string>(doc_it->first) + ":" +
> lexical_cast<string>(doc_it->second);
> }
>
> cout << cell1.row_key << ": " << output << endl;
> }
>
>
> where idf computation is done by
>
> typedef std::map<UnicodeString, unsigned long> WordMap;
> typedef std::map<unsigned long, unsigned long> DocumentMap;
> typedef std::map<unsigned int, double> TFIDFMap;
> typedef std::map<unsigned int, double> IDFMap;
> typedef std::map<unsigned int, string> ReverseMap;
>
> double term_idf(unsigned int term_id, ClientPtr& c)
> {
> double score = 0.0;
> static IDFMap idf_map;
> static ReverseMap rev_map;
> static TablePtr index_table = c->open_table("twitter_word_index");
> static TablePtr reverse_map = c->open_table("id_word_map");
>
> if (idf_map.find(term_id) != idf_map.end())
> {
> return idf_map[term_id];
> }
>
> Cell cell;
> string key2;
> if (rev_map.find(term_id) == rev_map.end())
> {
> ScanSpec ss1;
> string doc_key = lexical_cast<string>(term_id);
>
> ss1.row_intervals.push_back(RowInterval(doc_key.c_str(), true,
> doc_key.c_str(), true));
> ss1.max_versions = 1;
> ss1.columns.push_back("word");
> ss1.return_deletes = false;
>
> cout << "reverse_map->create_scanner" << endl;
> TableScannerPtr scanner = reverse_map->create_scanner(ss1);
>
> cout << "scanner->next" << endl;
> if (!scanner->next(cell))
> {
> return 0.0;
> }
>
> key2 = string((char *)cell.value, cell.value_len);
> rev_map[term_id] = key2;
> }
>
> key2 = rev_map[term_id];
> cout << "Scanning for term '" << term_id << "' (" << key2 << ")"<<endl;
>
> ScanSpec ss2;
> ss2.max_versions = 1;
> ss2.columns.push_back("locations");
> ss2.return_deletes = false;
> ss2.row_intervals.push_back(RowInterval(key2.c_str(), true,
> key2.c_str(), true));
>
> cout << "index_table->create_scanner" << endl;
> TableScannerPtr scanner2 = index_table->create_scanner(ss2);
>
> cout << "scanner2->next" << endl;
> if (!scanner2->next(cell))
> {
> return 0.0;
> }
>
> /* parse the locations list to get the IDF */
> string locations((char *)cell.value, cell.value_len);
> vector<string> values;
> split(values, locations, is_any_of(","));
>
> idf_map[term_id] = double(values.size());
> return idf_map[term_id];
> }
>
> Mateusz
>
> >
>
--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups
"Hypertable Development" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/hypertable-dev?hl=en
-~----------~----~----~----~------~----~------~--~---