Thank you, but shouldn't this be a part of the "analyzer"? Lucene has analyzers that do this by default, why not Nutch? Thanks, Frank.
/*
 * On 2/20/06, Howie Wang <[EMAIL PROTECTED]> wrote:
 *
 * I threw this code together a while ago and it seems to work for me.
 * The performance could probably be improved, but if anyone wants,
 * they're free to check it in. It goes under
 * src/java/org/apache/nutch/util/AccentReplacer.java.
 *
 * Howie
 */

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Intended package when checked in: org.apache.nutch.util

import java.util.HashMap;
import java.util.Map;

/**
 * Strips accents from Latin-1 letters in a string.
 *
 * <p>Two spellings of an accented letter are recognized:
 * <ul>
 *   <li>the raw character itself (code points 0-255), translated through a
 *       256-entry lookup table, e.g. {@code '\u00e9'} becomes {@code 'e'};</li>
 *   <li>an HTML character entity, either named ({@code &eacute;}) or numeric
 *       decimal ({@code &#233;}), translated through a map from the entity
 *       text to its replacement.</li>
 * </ul>
 *
 * <p>The ligatures AE / ae expand to two letters ("Ae" / "ae"). Because the
 * per-character table can only hold a single replacement char, they are only
 * rewritten when they appear as entities; the raw characters pass through
 * unchanged.
 */
public class AccentReplacer {

  /** Size of the per-character table: one slot for every Latin-1 code point. */
  private static final int TABLE_SIZE = 256;

  /**
   * Longest entity that is recognized, in chars, counting the leading
   * {@code '&'} and the trailing {@code ';'} (e.g. {@code &Agrave;}).
   */
  private static final int MAX_ENTITY_LENGTH = 8;

  /** Indexed by char value; holds the unaccented replacement (or the char itself). */
  private final char[] translateChars = new char[TABLE_SIZE];

  /** Maps a complete entity such as {@code "&eacute;"} to its replacement text. */
  private final Map<String, String> translateMap = new HashMap<String, String>();

  /** Creates a replacer with both translation tables populated. */
  public AccentReplacer() {
    // Call the private builder rather than the public, overridable method so a
    // subclass override can never run against a half-constructed instance.
    buildTables();
  }

  /**
   * Rebuilds both translation tables from scratch. The constructor already
   * populates them, so calling this is only needed to reset the instance.
   */
  public void initializeAccentReplacement() {
    buildTables();
  }

  /**
   * Returns a copy of {@code s} in which every recognized accented character
   * or HTML entity has been replaced by its unaccented equivalent.
   *
   * @param s the text to process; may be {@code null}
   * @return the processed text, or {@code null} if {@code s} is {@code null}
   */
  public String replaceAccents(String s) {
    if (s == null) {
      return null;
    }

    StringBuffer sb = new StringBuffer(s);
    int pos = 0;
    while (pos < sb.length()) {
      char current = sb.charAt(pos);
      if (current == '&') {
        // Possible entity: look for the closing ';' a short distance ahead.
        int end = findChar(sb, ';', pos, MAX_ENTITY_LENGTH);
        if (end >= 0) {
          String replacement = translateMap.get(sb.substring(pos, end));
          if (replacement != null) {
            sb.replace(pos, end, replacement);
          }
        }
      } else if (current < translateChars.length) {
        sb.setCharAt(pos, translateChars[current]);
      }
      // Replacements are plain ASCII letters, so stepping one char at a time
      // over freshly inserted text leaves it untouched.
      pos++;
    }
    return sb.toString();
  }

  /**
   * Searches {@code sb} for {@code ch}, examining at most {@code maxChars}
   * characters beginning at index {@code start}.
   *
   * @param sb       the buffer to search
   * @param ch       the character to look for
   * @param start    index of the first character to examine
   * @param maxChars maximum number of characters to examine
   * @return the index just past the first occurrence of {@code ch}
   *         (suitable as an exclusive end index), or -1 if it is not found
   *         within the window
   */
  public int findChar(StringBuffer sb, char ch, int start, int maxChars) {
    int end = Math.min(start + maxChars, sb.length());
    for (int i = start; i < end; i++) {
      if (sb.charAt(i) == ch) {
        return i + 1;
      }
    }
    return -1;
  }

  /** Resets both tables and registers every supported accented letter. */
  private void buildTables() {
    translateMap.clear();

    // Start from the identity mapping for all 256 slots.
    for (int i = 0; i < translateChars.length; i++) {
      translateChars[i] = (char) i;
    }

    // Each call covers a run of consecutive Latin-1 code points that share
    // one replacement. Gaps in the numbering (e.g. 208, 215, 216) are
    // characters that are deliberately left alone.
    registerRun(192, "A", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring");
    registerRun(198, "Ae", "AElig");
    registerRun(199, "C", "Ccedil");
    registerRun(200, "E", "Egrave", "Eacute", "Ecirc", "Euml");
    registerRun(204, "I", "Igrave", "Iacute", "Icirc", "Iuml");
    registerRun(209, "N", "Ntilde");
    registerRun(210, "O", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml");
    registerRun(217, "U", "Ugrave", "Uacute", "Ucirc", "Uuml");
    registerRun(221, "Y", "Yacute");

    registerRun(224, "a", "agrave", "aacute", "acirc", "atilde", "auml", "aring");
    registerRun(230, "ae", "aelig");
    registerRun(231, "c", "ccedil");
    registerRun(232, "e", "egrave", "eacute", "ecirc", "euml");
    registerRun(236, "i", "igrave", "iacute", "icirc", "iuml");
    registerRun(241, "n", "ntilde");
    registerRun(242, "o", "ograve", "oacute", "ocirc", "otilde", "ouml");
    registerRun(249, "u", "ugrave", "uacute", "ucirc", "uuml");
    registerRun(253, "y", "yacute");
    registerRun(255, "y", "yuml");
  }

  /**
   * Registers consecutive code points, starting at {@code firstCodePoint},
   * that all map to {@code replacement}. For each one the named entity, the
   * numeric entity and (for single-char replacements) the raw character are
   * registered together so the three spellings cannot drift apart.
   */
  private void registerRun(int firstCodePoint, String replacement, String... entityNames) {
    for (int i = 0; i < entityNames.length; i++) {
      int codePoint = firstCodePoint + i;
      translateMap.put("&" + entityNames[i] + ";", replacement);
      translateMap.put("&#" + codePoint + ";", replacement);
      if (replacement.length() == 1) {
        translateChars[codePoint] = replacement.charAt(0);
      }
    }
  }
}
