Generate main problems
----------------------

                 Key: NUTCH-1269
                 URL: https://issues.apache.org/jira/browse/NUTCH-1269
             Project: Nutch
          Issue Type: Improvement
          Components: generator
    Affects Versions: 1.4
         Environment: software
            Reporter: behnam nikbakht


There are some problems with the current Generate method when the maxNumSegments and 
maxHostCount options are used:
1. The sizes of the generated segments differ.
2. With the maxHostCount option, it is unclear whether the limit was applied or not.
3. URLs from one host are distributed non-uniformly between the segments.
We changed Generator.java as described below.
In the Selector class:
    private int maxNumSegments;
    private int segmentSize;
    private int maxHostCount;
public void config
...
      maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
      segmentSize=(int)job.getInt(GENERATOR_TOP_N, 10000000)/maxNumSegments;
      maxHostCount=job.getInt("GENERATE_MAX_PER_HOST", 100);  
...
public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
        OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
        throws IOException {
        int limit2=(int)((limit*3)/2);
      while (values.hasNext()) {
        if(count == limit)
                break;
        if (count % segmentSize == 0 ) {
          if (currentsegmentnum < maxNumSegments-1){
            currentsegmentnum++;
          }
          else
                currentsegmentnum=0;
        }

        boolean full=true;
        for(int jk=0;jk<maxNumSegments;jk++){
                if (segCounts[jk]<segmentSize){
                        full=false;
                }
        }
        if(full){
                break;
        }
        SelectorEntry entry = values.next();
        Text url = entry.url;
                //logWrite("Generated3:"+limit+"-"+count+"-"+url.toString());
        String urlString = url.toString();
        URL u = null;
        String hostordomain = null;
        try {
          if (normalise && normalizers != null) {
            urlString = normalizers.normalize(urlString,
                URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
          }
       
          u = new URL(urlString);
          if (byDomain) {
            hostordomain = URLUtil.getDomainName(u);
          } else {
            hostordomain = new URL(urlString).getHost();
          }
 
        hostordomain = hostordomain.toLowerCase();

        boolean countLimit=true;
        // only filter if we are counting hosts or domains
             int[] hostCount = hostCounts.get(hostordomain);
             //host count: {a,b,c,d} means that from this host there are a urls 
in segment 0 and b urls in seg 1 and ...
             if (hostCount == null) {
                 hostCount = new int[maxNumSegments];
                 for(int kl=0;kl<hostCount.length;kl++)
                         hostCount[kl]=0;
                 hostCounts.put(hostordomain, hostCount);
             }  
                 int selectedSeg=currentsegmentnum;
                 int minCount=hostCount[selectedSeg];
                 for(int jk=0;jk<maxNumSegments;jk++){
                         if(hostCount[jk]<minCount){
                                 minCount=hostCount[jk];
                                 selectedSeg=jk;
                         }
                }
                if(hostCount[selectedSeg]<=maxHostCount){
                        count++;
                        entry.segnum = new IntWritable(selectedSeg);
                        hostCount[selectedSeg]++;
                        output.collect(key, entry);
                }

        } catch (Exception e) {
          LOG.warn("Malformed URL: '" + urlString + "', skipping ("
                logWrite("Generate-malform:"+hostordomain+"-"+url.toString());
              + StringUtils.stringifyException(e) + ")");
          //continue;
        }
      }
    }
    

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators: 
https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira

        

Reply via email to