Hi,

I have experienced problems when saving documents to a file. The domain I was trying to crawl was "http://www.dn.no/finans/". This is what I ended up with:
///////////////////// START //////////////////////////
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.droids.handle;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Handler;

/**
 * Handler which is writing the stream to the file system.
 * <p>
 * Before using make sure you have set the export directory
 * {@link #setOutputDir(String outputDir)} and whether you want to use the host
 * as prefix {@link #setIncludeHost(boolean includeHost)}.
 * <p>
 * The target file name is built by plain string concatenation of the output
 * directory, optionally the host, and the URI path, so the output directory
 * should normally end with a path separator. URIs that denote a directory-like
 * resource (empty path, trailing slash, or a last segment without extension)
 * are stored in a file named {@code <ROOT>.html} inside that directory, so that
 * a document and its children never compete for the same file system entry.
 *
 * @version 1.0
 *
 */
public class Save extends WriterHandler implements Handler {

  /** File name used for URIs that are treated as a directory. */
  private static final String ROOT_FILE = "<ROOT>.html";

  private String outputDir = null;

  private boolean includeHost = false;

  /** Size in bytes of the buffer used to copy the content to disk. */
  protected int bufferSize=8192;

  /**
   * Saves the content of the given entity to a file derived from the URI.
   *
   * @param uri
   *                the location the content was obtained from; used to build
   *                the target file name
   * @param entity
   *                the entity whose content is written to disk
   * @throws IOException
   *                 if the content cannot be read or the file cannot be
   *                 written
   */
  public void handle(URI uri, ContentEntity entity) throws IOException {
    InputStream instream = entity.obtainContent();
    try {
      writeOutput(uri, instream);
    } finally {
      instream.close();
    }
  }

  /**
   * Copies the stream into the file that corresponds to the given URI.
   *
   * @param uri
   *                the URI the target file name is derived from
   * @param stream
   *                the content to save; it is not closed by this method
   * @throws IOException
   *                 if the output directory is not set, the URI cannot be
   *                 mapped to a safe file name, or reading/writing fails
   */
  private void writeOutput(URI uri, InputStream stream) throws IOException {
    if (outputDir == null) {
      // Concatenating a null directory would silently write to "null...".
      throw new IOException("Cannot save " + uri
          + ": the output directory has not been set");
    }
    String file = buildFileName(uri);
    log.info("Trying to save " + uri + " to " + file);
    File cache = new File(file);
    createFile(cache);

    OutputStream output = new BufferedOutputStream(new FileOutputStream(cache));
    try {
      byte[] buffer = new byte[bufferSize];
      int length = stream.read(buffer);
      while (length > -1) {
        output.write(buffer, 0, length);
        length = stream.read(buffer);
      }
      output.flush();
    } finally {
      // Always release the file handle, even when the copy fails half way.
      output.close();
    }
  }

  /**
   * Maps the URI to the name of the file the content is stored in.
   *
   * @param uri
   *                the URI to map
   * @return the output directory followed by the (optional) host and the path
   * @throws IOException
   *                 if the path contains a parent directory reference
   */
  private String buildFileName(URI uri) throws IOException {
    // normalize() collapses harmless "a/../b" sequences; whatever ".." is
    // left afterwards would climb out of the output directory.
    String path = uri.normalize().getPath();
    if (path == null || path.length() == 0) {
      // e.g. "http://host" - treat it like the root directory of the host
      path = "/";
    }
    if (containsParentReference(path)) {
      throw new IOException("Refusing to save " + uri
          + ": the path points outside of the output directory");
    }

    if (path.endsWith("/")) {
      path += ROOT_FILE;
    } else if (path.matches(".*/\\w+")) {
      // Last segment has no extension: it may later turn out to be the
      // parent of other documents, so store it as a directory entry.
      path += "/" + ROOT_FILE;
    }

    String relative = path.startsWith("/") ? path.substring(1) : path;
    String host = uri.getHost();
    if (includeHost && host != null) {
      return outputDir + host + "/" + relative;
    }
    return outputDir + relative;
  }

  /**
   * Tells whether one of the segments of the path is "..".
   *
   * @param path
   *                the decoded URI path
   * @return true if the path contains a parent directory reference
   */
  private static boolean containsParentReference(String path) {
    String[] segments = path.split("[/\\\\]");
    for (int i = 0; i < segments.length; i++) {
      if ("..".equals(segments[i])) {
        return true;
      }
    }
    return false;
  }

  /**
   * Makes sure the given file and all its parent directories exist.
   *
   * @param cache
   *                the file to create
   * @throws IOException
   *                 if a parent directory or the file cannot be created
   */
  private static void createFile(File cache) throws IOException {
    File parent = cache.getAbsoluteFile().getParentFile();
    // mkdirs() also returns false when the directory already exists (for
    // instance because another thread created it in the meantime), hence the
    // second isDirectory() check before giving up.
    if (parent != null && !parent.isDirectory() && !parent.mkdirs()
        && !parent.isDirectory()) {
      throw new IOException("Could not create directory " + parent);
    }
    if (!cache.isDirectory()) {
      cache.createNewFile();
    }
  }

  /**
   * Get the directory where we want to save the stream.
   *
   * @return directory where we want to save the stream.
   */
  public String getOutputDir() {
    return outputDir;
  }

  /**
   * Set the directory where we want to save the stream.
   *
   * @param outputDir
   *                the directory where we want to save the stream.
   */
  public void setOutputDir(String outputDir) {
    this.outputDir = outputDir;
  }

  /**
   * Do we want to prefix the export dir with the host name.
   *
   * @return true if we want to use the prefix; false otherwise
   */
  public boolean isIncludeHost() {
    return includeHost;
  }

  /**
   * Do we want to prefix the export dir with the host name.
   *
   * @param includeHost
   *                true if we want to use the prefix; false otherwise
   */
  public void setIncludeHost(boolean includeHost) {
    this.includeHost = includeHost;
  }

}

/////////////////////// END //////////////////////////

What I did was pass in the URI (as that makes it thread safe) and handle documents that are part of a document hierarchy by adding <ROOT>.html. I'm not sure that it is 100% robust, but it is better.

Keep up the good work!

BR

Stein K

Reply via email to