On Tue, Jun 3, 2008 at 6:17 AM, Lin Guo <[EMAIL PROTECTED]> wrote: > I am wondering whether it is possible to deserialize the keys and values in a > hadoop output file where the output format is SequenceFileOutputFormat.
I wrote some code to do this, samples attached. -Stuart
/* SeqKeyList.java - print list of keys in a SequenceFile * * Copyright (C) 2008 Stuart Sierra * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You * may obtain a copy of the License at * http:www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ import java.nio.charset.Charset; /* From hadoop-*-core.jar, http://hadoop.apache.org/ * Developed with Hadoop 0.16.3. */ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.BytesWritable; /** Prints the contents of a SequenceFile. * * @author Stuart Sierra, http://stuartsierra.com/ */ public class SeqFilePrinter { private String inputFile; private LocalSetup setup; public SeqFilePrinter() throws Exception { setup = new LocalSetup(); } /** Set the name of the input sequence file. * * @param filename a local path string */ public void setInput(String filename) { inputFile = filename; } /** Runs the process. Keys are printed to standard output; * information about the sequence file is printed to standard * error. */ public void execute() throws Exception { Path path = new Path(inputFile); SequenceFile.Reader reader = new SequenceFile.Reader(setup.getLocalFileSystem(), path, setup.getConf()); try { System.out.println("Key type is " + reader.getKeyClassName()); System.out.println("Value type is " + reader.getValueClassName()); if (reader.isCompressed()) { System.err.println("Values are compressed."); if (reader.isBlockCompressed()) { System.err.println("Records are block-compressed."); } System.err.println("Compression type is " + reader.getCompressionCodec().getClass().getName()); } System.out.println(""); Writable key = (Writable)(reader.getKeyClass().newInstance()); Writable val = (Writable)(reader.getValueClass().newInstance()); String value; while (reader.next(key, val)) { System.out.println("============================================================"); System.out.println("KEY:\t" + key.toString()); if (val instanceof BytesWritable) { BytesWritable v = (BytesWritable)val; value = new String(v.get(), 0, v.getSize()); } else { value = val.toString(); } System.out.println("VALUE:\n" + value); } } finally { reader.close(); } } public static void main(String[] args) { if (args.length != 1) { exitWithHelp(); } try { SeqFilePrinter me = new SeqFilePrinter(); me.setInput(args[0]); me.execute(); } catch (Exception e) { e.printStackTrace(); exitWithHelp(); } } /** Prints usage instructions to standard error and exits. */ public static void exitWithHelp() { System.err.println("Usage: java SeqFilePrinter <sequence-file>\n" + "Prints the contents of the sequence file."); System.exit(1); } }
/* LocalSetup.java -- support for the Hadoop API outside of Hadoop * * Copyright (C) 2008 Stuart Sierra * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You * may obtain a copy of the License at * http:www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ /* From hadoop-*-core.jar, http://hadoop.apache.org/ * Developed with Hadoop 0.16.3. */ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; /** Provides Hadoop configuration and local file system objects for * other classes. This is for situations where you want to use some * part of the Hadoop code outside of the Hadoop Map/Reduce * framework. * * @author Stuart Sierra, http://stuartsierra.com/ */ public class LocalSetup { private FileSystem fileSystem; private Configuration config; /** Sets up Configuration and LocalFileSystem instances for * Hadoop. Throws Exception if they fail. Does not load any * Hadoop XML configuration files, just sets the minimum * configuration necessary to use the local file system. */ public LocalSetup() throws Exception { config = new Configuration(); /* Normally set in hadoop-default.xml, without it you get * "java.io.IOException: No FileSystem for scheme: file" */ config.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); fileSystem = FileSystem.get(config); if (fileSystem.getConf() == null) { /* This happens if the FileSystem is not properly * initialized, causes NullPointerException later. */ throw new Exception("LocalFileSystem configuration is null"); } } /** Returns a Hadoop Configuration instance for use in Hadoop API * calls. */ public Configuration getConf() { return config; } /** Returns a Hadoop FileSystem instance that provides access to * the local filesystem. */ public FileSystem getLocalFileSystem() { return fileSystem; } }
