Author: sshafroi Date: 2008-11-14 19:24:41 +0100 (Fri, 14 Nov 2008) New Revision: 6932
Modified: trunk/result-spi/src/main/java/no/sesat/search/result/StringChopper.java Log: Issue SKER4947: (StringChopper will not handle cdata) New version of StringChopper. My initial tests shows that this is about twice as fast as the old version, depending on the input. StringChopper is a hot-spot in Sesat and this is therefor a welcome speedup. Modified: trunk/result-spi/src/main/java/no/sesat/search/result/StringChopper.java =================================================================== --- trunk/result-spi/src/main/java/no/sesat/search/result/StringChopper.java 2008-11-14 15:05:28 UTC (rev 6931) +++ trunk/result-spi/src/main/java/no/sesat/search/result/StringChopper.java 2008-11-14 18:24:41 UTC (rev 6932) @@ -1,8 +1,5 @@ -/* Copyright (2005-2008) Schibsted Søk AS +/* Copyright (2008) Schibsted Søk AS * This file is part of SESAT. - * You can use, redistribute, and/or modify it, under the terms of the SESAT License. - * You should have received a copy of the SESAT License along with this program. - * If not, see https://dev.sesat.no/confluence/display/SESAT/SESAT+License * * SESAT is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by @@ -16,207 +13,192 @@ * * You should have received a copy of the GNU Affero General Public License * along with SESAT. If not, see <http://www.gnu.org/licenses/>. - * - * StringChopper.java - * - * Created on June 22, 2006, 5:10 PM - * */ package no.sesat.search.result; -import java.util.LinkedList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.log4j.Logger; +import java.util.Stack; -/** My favourite dish of ChopSuey. - * - * @version $Id$ - * - */ -public final class StringChopper { +public class StringChopper { - // Constants ----------------------------------------------------- + private enum State { + none, tag, startTag, endTag, cdata, comment, declaration + }; - private static final Logger LOG = Logger.getLogger(StringChopper.class); - - private static final String DEBUG_CHOPSUEY = "Chopped it up to "; - - private static final Pattern openTag = Pattern.compile("<[^?!][^<]*>"); - private static final Pattern closeTag = Pattern.compile("</[^<]+>"); - private static final Pattern singleTag = Pattern.compile("<[^<]+/>"); - - // Attributes ---------------------------------------------------- - - // Static -------------------------------------------------------- - /** - * null safe. - * @param s + * Truncate s to the given length at closest space or xml tag. Any xml tags will be closed/balanced. + * + * @param input The string that should be truncated. * @param length - * @return + * @return The truncated string */ - public static String chop(final String s, final int length) { - return chop(s, length, false); + public static String chop(final String input, final int length) { + return chop(input, length, false); } /** - * null safe. - * @param s - * @param length - * @param chopWord allowed to chop a word in half - * @return + * Truncate s to the given length or to closest space/tag depending on chop. Any xml tags will be closed/balanced. + * @param input The string that should be truncated. + * @param length max length of string (if choped the string will be '...' longer then max.) + * @param chop If words should be choped, or if we chop inbetween spaces. + * @return The truncated string */ - public static String chop(final String s, final int length, final boolean chopWord) { + public static String chop(final String input, final int length, final boolean chop) { - if(null != s){ + if (input == null) + return null; - final StringBuilder choppedString = new StringBuilder(s); + Stack<Integer> stack = new Stack<Integer>(); + char[] s = input.toCharArray(); + StringBuilder res = new StringBuilder(s.length); + State state = State.none; + int count = 0; + int i = 0; - int laOriginalCount = 0, raOriginalCount = 0, markupLength = 0; - boolean insideMarkup = false; - for(int i = 0; i < choppedString.length(); ++i){ - if( '<' == choppedString.charAt(i) ){ ++laOriginalCount; insideMarkup = true;} + main: for (; i < s.length; i++) { + char c = s[i]; + switch (state) { + case none: + if (c == '<') { + state = State.tag; + } else { + count++; + if (count == length) { + res.append(c); + break main; + } + } + break; - if (insideMarkup) { - ++markupLength; + case tag: + if (c == '/') { + state = State.endTag; + } else if (c == '!') { + // ![CDATA[ + if (s.length > (i + 7) && s[i + 1] == '[' && (s[i + 2] == 'C' || s[i + 2] == 'c') + && (s[i + 3] == 'D' || s[i + 3] == 'd') && (s[i + 4] == 'A' || s[i + 4] == 'a') + && (s[i + 5] == 'T' || s[i + 5] == 't') && (s[i + 6] == 'A' || s[i + 6] == 'a') + && s[i + 7] == '[') { + state = State.cdata; + res.append("![CDATA["); + i += 7; + continue; + } + // !-- + else if (s.length > (i + 2) && s[i + 1] == '-' && s[i + 2] == '-') { + state = State.comment; + res.append("!--"); + i += 2; + continue; + } + } else if (c == '?') { + state = State.declaration; + } else { + stack.push(i); + state = State.startTag; } + break; - if( '>' == choppedString.charAt(i) ){ ++raOriginalCount; insideMarkup = false;} - - } - - // if we have more left than right arrows - while(laOriginalCount > raOriginalCount){ - choppedString.append('>'); - ++raOriginalCount; - markupLength = 0; // Be safe, use original length if markup unbalanced. - } - - // We're interested in limiting the length of the rendered string excluding the length of the markup. - final int maxLength = length + markupLength; - - if(length > 0 && choppedString.length() > maxLength){ - - // chop the string first - choppedString.setLength(maxLength); - - // if we chopped a tag in half remove the half left over. - int laCount = 0, raCount = 0; - for(int i = 0; i < choppedString.length(); ++i){ - if( '<' == choppedString.charAt(i) ){ ++laCount; } - else if( '>' == choppedString.charAt(i) ){ ++raCount; } + case startTag: + if (c == '/') { + if (s.length > (i + 1) && s[i + 1] == '>') { + state = State.none; + res.append("/>"); + i += 1; + if(!stack.isEmpty()) + stack.pop(); + continue; + } + } else if (c == '>') { + state = State.none; } + break; - // if we have more left than right arrows - if( laCount > raCount ){ - choppedString.setLength(choppedString.lastIndexOf("<")); + case endTag: + if (c == '>') { + state = State.none; + if(!stack.isEmpty()) + stack.pop(); } + break; - // append the dot-dot-dot - switch( choppedString.length() >0 ? choppedString.charAt( choppedString.length() - 1 ) : ' '){ - case '.': - final String toString = choppedString.toString(); - if( !toString.endsWith("...")){ - if( toString.endsWith("..")){ - choppedString.append('.'); - }else { - choppedString.append(".."); - } - } - break; - default: - if(!chopWord){ - final int lastSpace = choppedString.lastIndexOf(" "); + case cdata: - if (lastSpace >= 0) { - choppedString.setLength(lastSpace + 1); - } - } - choppedString.append("..."); - break; + if (c == ']') {// ]]> + if (s.length > (i + 2) && s[i + 1] == ']' && s[i + 2] == '>') { + state = State.none; + res.append("]]>"); + i += 2; + continue; + } + } else { + count++; + if (count == length) { + res.append(c); + break main; + } } + break; - } - - if(0 < laOriginalCount){ - // balance opening tags if the chop happened inbetween open and close tags. - //LOG.debug("");LOG.debug("Balancing " + choppedString); - - final LinkedList<String> tags = new LinkedList<String>(); - final LinkedList<int[]> tagsToRemove = new LinkedList<int[]>(); - - final Matcher matcher = openTag.matcher(choppedString); - - while( matcher.find() ){ - if( closeTag.matcher(matcher.group()).find()) { - - if(tags.size() > 0 && matcher.group().equalsIgnoreCase(tags.getFirst().replaceFirst("<", "</"))){ - - //LOG.debug("Found closing tag " + matcher.group()); - tags.removeFirst(); - - }else{ - - // we've found a premature closing tag. remove it. - //LOG.debug("Found unmatched closing tag " + matcher.group()); - tagsToRemove.addFirst(new int[]{matcher.start(), matcher.end()}); - } - - }else if( singleTag.matcher(matcher.group()).find() ){ - - //LOG.debug("Ignoring single tag " + matcher.group()); - }else{ - - // Removing attributes etc to find the correct closing tag. - //LOG.debug("Found opening tag " + matcher.group()); - //LOG.debug(" adding to stack: " + matcher.group().replaceFirst(" [^>]+", "")); - tags.addFirst(matcher.group().replaceFirst(" [^>]+", "")); + case comment: + if (c == '-') { + // --> + if (s.length > (i + 2) && s[i + 1] == '-' && s[i + 2] == '>') { + state = State.none; + res.append("-->"); + i += 2; + continue; } } + break; - // remove tags that had no opening - for(final int[] startEnd : tagsToRemove){ - - //LOG.debug("Removing " + matcher.group()); - choppedString.delete(startEnd[0], startEnd[1]); + case declaration: + if (c == '?') { + if (s.length > (i + 1) && s[i + 1] == '>') { + state = State.none; + res.append("?>"); + i += 1; + continue; + } } + break; + } + res.append(c); + } - // add tags to balance - for(final String tag : tags){ - - //LOG.debug("Adding " + tag.replaceFirst("<", "</")); - choppedString.append(tag.replaceFirst("<", "</")); + // append dots + dot: if (i < s.length - 1) { + if (chop) { + res.append("..."); + } else { + for (int k = i; k > 0; k--) { + if (s[k] == ' ' || s[k] == ((state == State.cdata) ? '[' : '>')) { + res.setLength(k + 1); + res.append("..."); + break dot; + } } + res.append("..."); } - LOG.trace(DEBUG_CHOPSUEY + choppedString); + } - return choppedString.toString(); + // close CDATA if we are in one + if (state == State.cdata) { + res.append("]]>"); } - return null; - } - // Constructors -------------------------------------------------- + // close all other open tags + while (!stack.isEmpty()) { + int j = stack.pop(); + char c = s[j]; + res.append("</"); + while (s.length > j && c != '>') { + res.append(c); + c = s[++j]; + } + res.append('>'); + } - /** Creates a new instance of StringChopper */ - private StringChopper(){ + return res.toString(); } - - // Public -------------------------------------------------------- - - // Z implementation ---------------------------------------------- - - // Y overrides --------------------------------------------------- - - // Package protected --------------------------------------------- - - // Protected ----------------------------------------------------- - - // Private ------------------------------------------------------- - - // Inner classes ------------------------------------------------- - - - } _______________________________________________ Kernel-commits mailing list [email protected] http://sesat.no/mailman/listinfo/kernel-commits
