On Sat, Mar 7, 2009 at 3:01 AM, Doug Judd <[email protected]> wrote:
> Is it possible that you're creating scanners, but not destroying them? That
> would cause this kind of problem. Can you post the code that does queries
> in the second phase?
Here it is:
// Scan the "text" column of every row in the "tweets" table and, for each row,
// print per-term TF / IDF / accumulated TF-IDF diagnostics followed by a
// "row_key: term:count,term:count,..." summary line.
// Relies on names declared outside this snippet: c, cell1 and tfidf_map.
TablePtr tweets = c->open_table("tweets");

ScanSpec sss;
sss.max_versions = 1;
sss.columns.push_back("text");
sss.return_deletes = false;

TableScannerPtr tweet_scanner = tweets->create_scanner(sss);
while (tweet_scanner->next(cell1))
{
    DocumentMap document;
    /* convert text from whatever encoding we got to unicode (utf16),
       next break it up into tokens, and store or increment counts in
       the words[word_id]
     */
    cout << "Computing TFIDF and preparing output..." << endl;

    // First pass: total number of term occurrences in this document.
    unsigned int sum = 0;
    for (DocumentMap::const_iterator entry = document.begin();
         entry != document.end(); ++entry)
    {
        sum += entry->second;
    }

    // Second pass: per-term statistics plus the comma-separated summary.
    string output;
    for (DocumentMap::const_iterator entry = document.begin();
         entry != document.end(); ++entry)
    {
        const unsigned long term = entry->first;
        const unsigned long count = entry->second;

        /* compute TF */
        cout << "All terms: " << sum << endl;
        const double tf = count / static_cast<double>(sum);
        cout << "TF: " << tf << endl;

        /* compute IDF */
        const double idf = term_idf(term, c);
        cout << "IDF: " << idf << endl;

        // tfidf_map accumulates across all rows scanned so far.
        const double tfidf = tf * idf;
        if (tfidf_map.find(term) == tfidf_map.end())
        {
            tfidf_map[term] = 0;
        }
        tfidf_map[term] = tfidf_map[term] + tfidf;
        cout << "TFIDF for term: " << term << " = "
             << tfidf_map[term] << endl;

        if (!output.empty())
        {
            output += ",";
        }
        output += lexical_cast<string>(term);
        output += ":";
        output += lexical_cast<string>(count);
    }
    cout << cell1.row_key << ": " << output << endl;
}
where the IDF computation is done by:
// Container aliases used by the TF-IDF code.

// Keyed by a UnicodeString; not used in the code shown here
// (presumably token -> id or count -- TODO confirm against the tokenizer).
typedef std::map<UnicodeString, unsigned long> WordMap;
// Per-document map: the key is passed to term_idf() as the term id and the
// value is summed as that term's occurrence count.
// NOTE(review): keys are unsigned long but term_idf() takes unsigned int.
typedef std::map<unsigned long, unsigned long> DocumentMap;
// Term id -> double; presumably the type of tfidf_map, whose declaration is
// not shown -- TODO confirm.
typedef std::map<unsigned int, double> TFIDFMap;
// Term id -> value cached by term_idf() (its static idf_map).
typedef std::map<unsigned int, double> IDFMap;
// Term id -> word string cached by term_idf() (its static rev_map).
typedef std::map<unsigned int, string> ReverseMap;
double term_idf(unsigned int term_id, ClientPtr& c)
{
double score = 0.0;
static IDFMap idf_map;
static ReverseMap rev_map;
static TablePtr index_table = c->open_table("twitter_word_index");
static TablePtr reverse_map = c->open_table("id_word_map");
if (idf_map.find(term_id) != idf_map.end())
{
return idf_map[term_id];
}
Cell cell;
string key2;
if (rev_map.find(term_id) == rev_map.end())
{
ScanSpec ss1;
string doc_key = lexical_cast<string>(term_id);
ss1.row_intervals.push_back(RowInterval(doc_key.c_str(), true,
doc_key.c_str(), true));
ss1.max_versions = 1;
ss1.columns.push_back("word");
ss1.return_deletes = false;
cout << "reverse_map->create_scanner" << endl;
TableScannerPtr scanner = reverse_map->create_scanner(ss1);
cout << "scanner->next" << endl;
if (!scanner->next(cell))
{
return 0.0;
}
key2 = string((char *)cell.value, cell.value_len);
rev_map[term_id] = key2;
}
key2 = rev_map[term_id];
cout << "Scanning for term '" << term_id << "' (" << key2 << ")"<<endl;
ScanSpec ss2;
ss2.max_versions = 1;
ss2.columns.push_back("locations");
ss2.return_deletes = false;
ss2.row_intervals.push_back(RowInterval(key2.c_str(), true,
key2.c_str(), true));
cout << "index_table->create_scanner" << endl;
TableScannerPtr scanner2 = index_table->create_scanner(ss2);
cout << "scanner2->next" << endl;
if (!scanner2->next(cell))
{
return 0.0;
}
/* parse the locations list to get the IDF */
string locations((char *)cell.value, cell.value_len);
vector<string> values;
split(values, locations, is_any_of(","));
idf_map[term_id] = double(values.size());
return idf_map[term_id];
}
Mateusz
--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups
"Hypertable Development" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to
[email protected]
For more options, visit this group at
http://groups.google.com/group/hypertable-dev?hl=en
-~----------~----~----~----~------~----~------~--~---