[ https://issues.apache.org/jira/browse/NUTCH-1269?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Julien Nioche reassigned NUTCH-1269: ------------------------------------ Assignee: Julien Nioche > Generate main problems > ---------------------- > > Key: NUTCH-1269 > URL: https://issues.apache.org/jira/browse/NUTCH-1269 > Project: Nutch > Issue Type: Improvement > Components: generator > Affects Versions: 1.4 > Environment: software > Reporter: behnam nikbakht > Assignee: Julien Nioche > Labels: Generate, MaxHostCount, MaxNumSegments > Fix For: 1.9 > > Attachments: NUTCH-1269-v.2.patch, NUTCH-1269.patch > > > There are some problems with the current Generate method when the maxNumSegments and > maxHostCount options are used: > 1. The sizes of the generated segments differ. > 2. With the maxHostCount option, it is unclear whether it was applied or not. > 3. URLs from one host are distributed non-uniformly between segments. > We changed Generator.java as described below: > In the Selector class: > private int maxNumSegments; > private int segmentSize; > private int maxHostCount; > public void config > ... > maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1); > segmentSize=(int)job.getInt(GENERATOR_TOP_N, 10000000)/maxNumSegments; > maxHostCount=job.getInt("GENERATE_MAX_PER_HOST", 100); > ...
> public void reduce(FloatWritable key, Iterator<SelectorEntry> values, > OutputCollector<FloatWritable,SelectorEntry> output, Reporter > reporter) > throws IOException { > int limit2=(int)((limit*3)/2); > while (values.hasNext()) { > if(count == limit) > break; > if (count % segmentSize == 0 ) { > if (currentsegmentnum < maxNumSegments-1){ > currentsegmentnum++; > } > else > currentsegmentnum=0; > } > boolean full=true; > for(int jk=0;jk<maxNumSegments;jk++){ > if (segCounts[jk]<segmentSize){ > full=false; > } > } > if(full){ > break; > } > SelectorEntry entry = values.next(); > Text url = entry.url; > //logWrite("Generated3:"+limit+"-"+count+"-"+url.toString()); > String urlString = url.toString(); > URL u = null; > String hostordomain = null; > try { > if (normalise && normalizers != null) { > urlString = normalizers.normalize(urlString, > URLNormalizers.SCOPE_GENERATE_HOST_COUNT); > } > > u = new URL(urlString); > if (byDomain) { > hostordomain = URLUtil.getDomainName(u); > } else { > hostordomain = new URL(urlString).getHost(); > } > > hostordomain = hostordomain.toLowerCase(); > boolean countLimit=true; > // only filter if we are counting hosts or domains > int[] hostCount = hostCounts.get(hostordomain); > //host count: {a,b,c,d} means that from this host there are a > urls in segment 0 and b urls in seg 1 and ... 
> if (hostCount == null) { > hostCount = new int[maxNumSegments]; > for(int kl=0;kl<hostCount.length;kl++) > hostCount[kl]=0; > hostCounts.put(hostordomain, hostCount); > } > int selectedSeg=currentsegmentnum; > int minCount=hostCount[selectedSeg]; > for(int jk=0;jk<maxNumSegments;jk++){ > if(hostCount[jk]<minCount){ > minCount=hostCount[jk]; > selectedSeg=jk; > } > } > if(hostCount[selectedSeg]<=maxHostCount){ > count++; > entry.segnum = new IntWritable(selectedSeg); > hostCount[selectedSeg]++; > output.collect(key, entry); > } > } catch (Exception e) { > LOG.warn("Malformed URL: '" + urlString + "', skipping (" > logWrite("Generate-malform:"+hostordomain+"-"+url.toString()); > + StringUtils.stringifyException(e) + ")"); > //continue; > } > } > } > -- This message was sent by Atlassian JIRA (v6.2#6252)