On Sat, Mar 7, 2009 at 3:01 AM, Doug Judd <[email protected]> wrote:
> Is it possible that you're creating scanners, but not destroying them?  That
> would cause this kind of problem.  Can you post the code that does queries
> in the second phase?

Here it is:

  // Full scan of the "tweets" table: latest version of the "text" column only,
  // delete markers excluded.
  TablePtr tweets = c->open_table("tweets");

  ScanSpec sss;
  sss.columns.push_back("text");
  sss.max_versions = 1;
  sss.return_deletes = false;

  TableScannerPtr tweet_scanner = tweets->create_scanner(sss);

  while (tweet_scanner->next(cell1))
  {
    // Per-tweet table of term id -> occurrence count.
    DocumentMap document;

    /* convert text from whatever encoding we got to unicode (utf16),
       next break it up into tokens, and store or increment counts in
       the words[word_id]
       (the code doing this was left out of the posted excerpt)
    */

    cout << "Computing TFIDF and preparing output..." << endl;

    // Total number of term occurrences in this tweet; denominator for TF.
    unsigned int sum = 0;
    for (DocumentMap::const_iterator it = document.begin();
         it != document.end(); ++it)
    {
      sum += it->second;
    }

    // Comma-separated "term:count" list printed once per tweet.
    string output;

    for (DocumentMap::const_iterator it = document.begin();
         it != document.end(); ++it)
    {
      const DocumentMap::key_type &term = it->first;
      const DocumentMap::mapped_type &count = it->second;

      /* compute TF */
      cout << "All terms: " << sum << endl;
      const double tf = count / static_cast<double>(sum);
      cout << "TF: " << tf << endl;

      /* compute IDF */
      const double idf = term_idf(term, c);
      cout << "IDF: " << idf << endl;

      // Accumulate this tweet's contribution; operator[] creates a
      // zero-valued entry the first time a term is seen.
      tfidf_map[term] += tf * idf;
      cout << "TFIDF for term: " << term << " = " << tfidf_map[term] << endl;

      if (!output.empty())
      {
        output += ",";
      }
      output += lexical_cast<string>(term);
      output += ":";
      output += lexical_cast<string>(count);
    }

    cout << cell1.row_key << ": " << output << endl;
  }


where idf computation is done by

// Keyed by a UnicodeString word. Not referenced in the code shown here;
// presumably word -> id or word -> count -- TODO confirm against its users.
typedef std::map<UnicodeString, unsigned long> WordMap;
// Per-document term table. The scan loop passes the key to term_idf() as the
// term id and sums/divides the value as that term's occurrence count.
// NOTE(review): the key is unsigned long, while term_idf() and the maps below
// key on unsigned int, so ids are implicitly narrowed wherever unsigned long
// is the wider type.
typedef std::map<unsigned long, unsigned long> DocumentMap;
// Term id -> double. Presumably the type of the tfidf_map accumulator used in
// the scan loop (its declaration is not shown) -- TODO confirm.
typedef std::map<unsigned int, double> TFIDFMap;
// Term id -> value computed and cached by term_idf().
typedef std::map<unsigned int, double> IDFMap;
// Term id -> word string, filled by term_idf() from the "id_word_map" table.
typedef std::map<unsigned int, string> ReverseMap;

double term_idf(unsigned int term_id, ClientPtr& c)
{
  double score = 0.0;
  static IDFMap idf_map;
  static ReverseMap rev_map;
  static TablePtr index_table = c->open_table("twitter_word_index");
  static TablePtr reverse_map = c->open_table("id_word_map");

  if (idf_map.find(term_id) != idf_map.end())
  {
    return idf_map[term_id];
  }

  Cell cell;
  string key2;
  if (rev_map.find(term_id) == rev_map.end())
  {
    ScanSpec ss1;
    string doc_key = lexical_cast<string>(term_id);

    ss1.row_intervals.push_back(RowInterval(doc_key.c_str(), true,
                doc_key.c_str(), true));
    ss1.max_versions = 1;
    ss1.columns.push_back("word");
    ss1.return_deletes = false;

    cout << "reverse_map->create_scanner" << endl;
    TableScannerPtr scanner = reverse_map->create_scanner(ss1);

    cout << "scanner->next" << endl;
    if (!scanner->next(cell))
    {
      return 0.0;
    }

    key2 = string((char *)cell.value, cell.value_len);
    rev_map[term_id] = key2;
  }

  key2 = rev_map[term_id];
  cout << "Scanning for term '" << term_id << "' (" << key2 << ")"<<endl;

  ScanSpec ss2;
  ss2.max_versions = 1;
  ss2.columns.push_back("locations");
  ss2.return_deletes = false;
  ss2.row_intervals.push_back(RowInterval(key2.c_str(), true,
        key2.c_str(), true));

  cout << "index_table->create_scanner" << endl;
  TableScannerPtr scanner2 = index_table->create_scanner(ss2);

  cout << "scanner2->next" << endl;
  if (!scanner2->next(cell))
  {
    return 0.0;
  }

  /* parse the locations list to get the IDF */
  string locations((char *)cell.value, cell.value_len);
  vector<string> values;
  split(values, locations, is_any_of(","));

  idf_map[term_id] = double(values.size());
  return idf_map[term_id];
}

Mateusz

--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups 
"Hypertable Development" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to 
[email protected]
For more options, visit this group at 
http://groups.google.com/group/hypertable-dev?hl=en
-~----------~----~----~----~------~----~------~--~---

Reply via email to