Hi to all,

I don't have much time to work on Droids right now. But I saw that Ryan
committed the Tika patch, and I have been working on a better integration
with Tika. Attached is the .java file with the new version of the
LinkExtraction class. I changed a lot of things, so I think it is better to
send it to you this way instead of as a diff.

I hope to have time next year to work more actively on this
project. For now I will help with everything I can.

Salu10.
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.droids.tika;

import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.droids.LinkTask;
import org.apache.droids.api.Link;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX content handler that collects the outgoing links of a document.
 * <p>
 * The handler is configured with a map of element name to attribute name
 * (see {@link #setElements(Map)}) and with the link of the document being
 * parsed (see {@link #setBase(Link)}). Every configured attribute found
 * while parsing is resolved against the base URI and, if it has not been seen
 * before in the current document, stored as a {@link LinkTask} one level
 * deeper than the base.
 * <p>
 * The collected links are not cleared in {@link #startDocument()}, so links
 * accumulate when the same instance handles several documents. Instances hold
 * mutable parsing state and perform no synchronization.
 */
public class LinkExtractor extends DefaultHandler {

  /**
   * Scheme prefix of script pseudo-links, which cannot be fetched
   */
  private static final String JAVASCRIPT_SCHEME = "javascript:";

  protected final Log log = LogFactory.getLog(this.getClass());

  /**
   * List of links
   */
  private Collection<Link> links = new ArrayList<Link>();

  /**
   * Map with the pair label-attribute for the accepted items
   */
  private Map<String, String> elements;

  /**
   * Base url for host reference
   */
  private Link base = null;

  /**
   * Set of URIs already seen in the current document
   */
  private Set<String> history = null;

  /**
   * The parsed link that has been found but not yet registered
   */
  private URI link = null;

  /**
   * Prepares the handler for a new document: starts a fresh history seeded
   * with the base URI, so that links back to the document itself are not
   * reported as outlinks.
   *
   * @throws SAXException if no base link has been set
   */
  @Override
  public void startDocument() throws SAXException {
    if (base == null) {
      throw new SAXException("Base link must be set before parsing");
    }
    // Do not let a link left over from an aborted parse leak into this one.
    link = null;
    history = new HashSet<String>();
    history.add(base.getURI().toString());
  }

  /**
   * Checks the element against the configured element/attribute pairs and
   * remembers the link of every pair that matches. Element names are compared
   * ignoring case.
   *
   * @param uri namespace URI of the element (unused)
   * @param loc local name of the element; when empty, {@code raw} is used
   * @param raw qualified name of the element
   * @param att attributes of the element
   */
  @Override
  public void startElement(String uri, String loc, String raw, Attributes att)
      throws SAXException {
    if (elements == null) {
      return;
    }
    // SAX reports an empty local name when namespace processing is off.
    String name = (loc == null || loc.length() == 0) ? raw : loc;
    for (Map.Entry<String, String> entry : elements.entrySet()) {
      String elem = entry.getKey();
      if (!elem.equalsIgnoreCase(name)) {
        continue;
      }
      String target = att.getValue(entry.getValue());
      if (target == null) {
        continue;
      }
      // A previous link that was never followed by text (e.g. an image or an
      // empty anchor) must be registered before it is overwritten.
      flushPendingLink("");
      link = getURI(target);
      log.debug("Found element: " + elem + " with link: " + link);
    }
  }

  /**
   * Registers the pending link, if any, using the character data as its text.
   */
  @Override
  public void characters(char[] chars, int start, int length)
      throws SAXException {
    if (link != null) {
      flushPendingLink(new String(chars, start, length));
    }
  }

  /**
   * Registers a still pending link, discards the history and logs the number
   * of links collected so far.
   */
  @Override
  public void endDocument() throws SAXException {
    // The last link of the document may not have been followed by any text.
    flushPendingLink("");
    history = null;
    log.debug("Found " + links.size() + " outliks");
  }

  /**
   * Registers the pending link as an outlink of the base, one level deeper
   * than the base, unless its URI has already been seen. Does nothing when no
   * link is pending.
   *
   * @param value text associated with the link; currently not used
   */
  public void addOutlinkURI(String value) {
    if (link == null) {
      return;
    }
    if (history == null) {
      history = new HashSet<String>();
    }
    if (links == null) {
      links = new ArrayList<Link>();
    }
    if (history.add(link.toString())) {
      links.add(new LinkTask(base, link, base.getDepth() + 1));
      // Parentheses are required: without them the 1 is appended as text.
      log.debug("Added outlink: " + link + " with depth: "
          + (base.getDepth() + 1));
    }
  }

  /**
   * Sets the link of the document that is going to be parsed. Extracted
   * links are resolved against its URI.
   *
   * @param base link of the document being parsed
   */
  public void setBase(Link base) {
    this.base = base;
  }

  /**
   * Returns the links collected so far.
   *
   * @return the internal, modifiable collection of links
   */
  public Collection<Link> getLinks() {
    return links;
  }

  /**
   * Returns the configured element/attribute pairs.
   *
   * @return map of element name to the attribute holding its link
   */
  public Map<String, String> getElements() {
    return elements;
  }

  /**
   * Sets the element/attribute pairs that are inspected for links.
   *
   * @param elements map of element name to the attribute holding its link
   */
  public void setElements(Map<String, String> elements) {
    this.elements = elements;
  }

  /**
   * Registers the pending link, if there is one, and clears it.
   *
   * @param text text associated with the link
   */
  private void flushPendingLink(String text) {
    if (link != null) {
      addOutlinkURI(text);
      link = null;
    }
  }

  /**
   * Turns an attribute value into an absolute URI without fragment.
   *
   * @param target raw attribute value
   * @return the resolved URI, or {@code null} if the value is a script
   *         pseudo-link, refers only to the current document, or is not a
   *         valid URI
   */
  private URI getURI(String target) {
    String candidate = target.trim();
    // Compare the scheme ignoring case without depending on the default
    // locale, and include the colon so that ordinary paths which merely start
    // with the word "javascript" are still followed.
    if (candidate.regionMatches(true, 0, JAVASCRIPT_SCHEME, 0,
        JAVASCRIPT_SCHEME.length())) {
      return null;
    }
    int fragmentStart = candidate.indexOf('#');
    if (fragmentStart >= 0) {
      candidate = candidate.substring(0, fragmentStart);
    }
    if (candidate.length() == 0) {
      // Empty or fragment-only reference: it points at the current document,
      // so there is nothing new to visit.
      return null;
    }
    try {
      // resolve() returns absolute references unchanged, so no separate
      // handling of absolute and relative targets is needed.
      return base.getURI().resolve(candidate);
    } catch (RuntimeException e) {
      // Best effort: one malformed link must not abort the whole parse.
      log.error("URI not valid: " + target, e);
      return null;
    }
  }
}

Reply via email to