Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java Thu Jan 29 05:38:59 2015 @@ -40,565 +40,556 @@ import org.apache.commons.net.ftp.FTPRep import org.apache.commons.net.ftp.FTPConnectionClosedException; /*********************************************** - * Client.java encapsulates functionalities necessary for nutch to - * get dir list and retrieve file from an FTP server. - * This class takes care of all low level details of interacting - * with an FTP server and provides a convenient higher level interface. - * + * Client.java encapsulates functionalities necessary for nutch to get dir list + * and retrieve file from an FTP server. This class takes care of all low level + * details of interacting with an FTP server and provides a convenient higher + * level interface. + * * Modified from FtpClient.java in apache commons-net. * - * Notes by John Xing: - * ftp server implementations are hardly uniform and none seems to follow - * RFCs whole-heartedly. We have no choice, but assume common denominator - * as following: - * (1) Use stream mode for data transfer. Block mode will be better for - * multiple file downloading and partial file downloading. However - * not every ftpd has block mode support. - * (2) Use passive mode for data connection. - * So Nutch will work if we run behind firewall. - * (3) Data connection is opened/closed per ftp command for the reasons - * listed in (1). There are ftp servers out there, - * when partial downloading is enforced by closing data channel - * socket on our client side, the server side immediately closes - * control channel (socket). Our codes deal with such a bad behavior. - * (4) LIST is used to obtain remote file attributes if possible. - * MDTM & SIZE would be nice, but not as ubiquitously implemented as LIST. - * (5) Avoid using ABOR in single thread? Do not use it at all. - * - * About exceptions: - * Some specific exceptions are re-thrown as one of FtpException*.java - * In fact, each function throws FtpException*.java or pass IOException. - * + * Notes by John Xing: ftp server implementations are hardly uniform and none + * seems to follow RFCs whole-heartedly. We have no choice, but assume common + * denominator as following: (1) Use stream mode for data transfer. Block mode + * will be better for multiple file downloading and partial file downloading. + * However not every ftpd has block mode support. (2) Use passive mode for data + * connection. So Nutch will work if we run behind firewall. (3) Data connection + * is opened/closed per ftp command for the reasons listed in (1). There are ftp + * servers out there, when partial downloading is enforced by closing data + * channel socket on our client side, the server side immediately closes control + * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used + * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but + * not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single + * thread? Do not use it at all. + * + * About exceptions: Some specific exceptions are re-thrown as one of + * FtpException*.java In fact, each function throws FtpException*.java or pass + * IOException. + * * @author John Xing ***********************************************/ -public class Client extends FTP -{ - private int __dataTimeout; - private int __passivePort; - private String __passiveHost; -// private int __fileType, __fileFormat; - private boolean __remoteVerificationEnabled; -// private FTPFileEntryParser __entryParser; - private String __systemName; - - /** Public default constructor */ - public Client() - { - __initDefaults(); - __dataTimeout = -1; - __remoteVerificationEnabled = true; - } - - // defaults when initialize - private void __initDefaults() - { - __passiveHost = null; - __passivePort = -1; - __systemName = null; -// __fileType = FTP.ASCII_FILE_TYPE; -// __fileFormat = FTP.NON_PRINT_TEXT_FORMAT; -// __entryParser = null; - } - - // parse reply for pass() - private void __parsePassiveModeReply(String reply) - throws MalformedServerReplyException - { - int i, index, lastIndex; - String octet1, octet2; - StringBuffer host; - - reply = reply.substring(reply.indexOf('(') + 1, - reply.indexOf(')')).trim(); - - host = new StringBuffer(24); - lastIndex = 0; - index = reply.indexOf(','); - host.append(reply.substring(lastIndex, index)); - - for (i = 0; i < 3; i++) - { - host.append('.'); - lastIndex = index + 1; - index = reply.indexOf(',', lastIndex); - host.append(reply.substring(lastIndex, index)); - } - - lastIndex = index + 1; - index = reply.indexOf(',', lastIndex); - - octet1 = reply.substring(lastIndex, index); - octet2 = reply.substring(index + 1); - - // index and lastIndex now used as temporaries - try - { - index = Integer.parseInt(octet1); - lastIndex = Integer.parseInt(octet2); - } - catch (NumberFormatException e) - { - throw new MalformedServerReplyException( - "Could not parse passive host information.\nServer Reply: " + reply); - } - - index <<= 8; - index |= lastIndex; - - __passiveHost = host.toString(); - __passivePort = index; - } - - /** - * open a passive data connection socket - * @param command - * @param arg - * @return - * @throws IOException - * @throws FtpExceptionCanNotHaveDataConnection - */ - protected Socket __openPassiveDataConnection(int command, String arg) +public class Client extends FTP { + private int __dataTimeout; + private int __passivePort; + private String __passiveHost; + // private int __fileType, __fileFormat; + private boolean __remoteVerificationEnabled; + // private FTPFileEntryParser __entryParser; + private String __systemName; + + /** Public default constructor */ + public Client() { + __initDefaults(); + __dataTimeout = -1; + __remoteVerificationEnabled = true; + } + + // defaults when initialize + private void __initDefaults() { + __passiveHost = null; + __passivePort = -1; + __systemName = null; + // __fileType = FTP.ASCII_FILE_TYPE; + // __fileFormat = FTP.NON_PRINT_TEXT_FORMAT; + // __entryParser = null; + } + + // parse reply for pass() + private void __parsePassiveModeReply(String reply) + throws MalformedServerReplyException { + int i, index, lastIndex; + String octet1, octet2; + StringBuffer host; + + reply = reply.substring(reply.indexOf('(') + 1, reply.indexOf(')')).trim(); + + host = new StringBuffer(24); + lastIndex = 0; + index = reply.indexOf(','); + host.append(reply.substring(lastIndex, index)); + + for (i = 0; i < 3; i++) { + host.append('.'); + lastIndex = index + 1; + index = reply.indexOf(',', lastIndex); + host.append(reply.substring(lastIndex, index)); + } + + lastIndex = index + 1; + index = reply.indexOf(',', lastIndex); + + octet1 = reply.substring(lastIndex, index); + octet2 = reply.substring(index + 1); + + // index and lastIndex now used as temporaries + try { + index = Integer.parseInt(octet1); + lastIndex = Integer.parseInt(octet2); + } catch (NumberFormatException e) { + throw new MalformedServerReplyException( + "Could not parse passive host information.\nServer Reply: " + reply); + } + + index <<= 8; + index |= lastIndex; + + __passiveHost = host.toString(); + __passivePort = index; + } + + /** + * open a passive data connection socket + * + * @param command + * @param arg + * @return + * @throws IOException + * @throws FtpExceptionCanNotHaveDataConnection + */ + protected Socket __openPassiveDataConnection(int command, String arg) throws IOException, FtpExceptionCanNotHaveDataConnection { - Socket socket; + Socket socket; -// // 20040317, xing, accommodate ill-behaved servers, see below -// int port_previous = __passivePort; + // // 20040317, xing, accommodate ill-behaved servers, see below + // int port_previous = __passivePort; - if (pasv() != FTPReply.ENTERING_PASSIVE_MODE) - throw new FtpExceptionCanNotHaveDataConnection( - "pasv() failed. " + getReplyString()); - - try { - __parsePassiveModeReply(getReplyStrings()[0]); - } catch (MalformedServerReplyException e) { - throw new FtpExceptionCanNotHaveDataConnection(e.getMessage()); - } - -// // 20040317, xing, accommodate ill-behaved servers, see above -// int count = 0; -// System.err.println("__passivePort "+__passivePort); -// System.err.println("port_previous "+port_previous); -// while (__passivePort == port_previous) { -// // just quit if too many tries. make it an exception here? -// if (count++ > 10) -// return null; -// // slow down further for each new try -// Thread.sleep(500*count); -// if (pasv() != FTPReply.ENTERING_PASSIVE_MODE) -// throw new FtpExceptionCanNotHaveDataConnection( -// "pasv() failed. " + getReplyString()); -// //return null; -// try { -// __parsePassiveModeReply(getReplyStrings()[0]); -// } catch (MalformedServerReplyException e) { -// throw new FtpExceptionCanNotHaveDataConnection(e.getMessage()); -// } -// } - - socket = _socketFactory_.createSocket(__passiveHost, __passivePort); - - if (!FTPReply.isPositivePreliminary(sendCommand(command, arg))) { - socket.close(); - return null; - } - - if (__remoteVerificationEnabled && !verifyRemote(socket)) - { - InetAddress host1, host2; - - host1 = socket.getInetAddress(); - host2 = getRemoteAddress(); - - socket.close(); - - // our precaution - throw new FtpExceptionCanNotHaveDataConnection( - "Host attempting data connection " + host1.getHostAddress() + - " is not same as server " + host2.getHostAddress() + - " So we intentionally close it for security precaution." - ); - } - - if (__dataTimeout >= 0) - socket.setSoTimeout(__dataTimeout); - - return socket; - } - - /*** - * Sets the timeout in milliseconds to use for data connection. - * set immediately after opening the data connection. - ***/ - public void setDataTimeout(int timeout) - { - __dataTimeout = timeout; - } - - /*** - * Closes the connection to the FTP server and restores - * connection parameters to the default values. - * <p> - * @exception IOException If an error occurs while disconnecting. - ***/ - public void disconnect() throws IOException - { - __initDefaults(); - super.disconnect(); - // no worry for data connection, since we always close it - // in every ftp command that invloves data connection - } - - /*** - * Enable or disable verification that the remote host taking part - * of a data connection is the same as the host to which the control - * connection is attached. The default is for verification to be - * enabled. You may set this value at any time, whether the - * FTPClient is currently connected or not. - * <p> - * @param enable True to enable verification, false to disable verification. - ***/ - public void setRemoteVerificationEnabled(boolean enable) - { - __remoteVerificationEnabled = enable; - } - - /*** - * Return whether or not verification of the remote host participating - * in data connections is enabled. The default behavior is for - * verification to be enabled. - * <p> - * @return True if verification is enabled, false if not. - ***/ - public boolean isRemoteVerificationEnabled() - { - return __remoteVerificationEnabled; - } - - /*** - * Login to the FTP server using the provided username and password. - * <p> - * @param username The username to login under. - * @param password The password to use. - * @return True if successfully completed, false if not. - * @exception FTPConnectionClosedException - * If the FTP server prematurely closes the connection as a result - * of the client being idle or some other reason causing the server - * to send FTP reply code 421. This exception may be caught either - * as an IOException or independently as itself. - * @exception IOException If an I/O error occurs while either sending a - * command to the server or receiving a reply from the server. - ***/ - public boolean login(String username, String password) throws IOException - { - user(username); - - if (FTPReply.isPositiveCompletion(getReplyCode())) - return true; - - // If we get here, we either have an error code, or an intermmediate - // reply requesting password. - if (!FTPReply.isPositiveIntermediate(getReplyCode())) - return false; - - return FTPReply.isPositiveCompletion(pass(password)); - } - - /*** - * Logout of the FTP server by sending the QUIT command. - * <p> - * @return True if successfully completed, false if not. - * @exception FTPConnectionClosedException - * If the FTP server prematurely closes the connection as a result - * of the client being idle or some other reason causing the server - * to send FTP reply code 421. This exception may be caught either - * as an IOException or independently as itself. - * @exception IOException If an I/O error occurs while either sending a - * command to the server or receiving a reply from the server. - ***/ - public boolean logout() throws IOException - { - return FTPReply.isPositiveCompletion(quit()); - } - - /** - * retrieve list reply for path - * @param path - * @param entries - * @param limit - * @param parser - * @throws IOException - * @throws FtpExceptionCanNotHaveDataConnection - * @throws FtpExceptionUnknownForcedDataClose - * @throws FtpExceptionControlClosedByForcedDataClose - */ - public void retrieveList(String path, List<FTPFile> entries, int limit, - FTPFileEntryParser parser) - throws IOException, - FtpExceptionCanNotHaveDataConnection, - FtpExceptionUnknownForcedDataClose, - FtpExceptionControlClosedByForcedDataClose { - Socket socket = __openPassiveDataConnection(FTPCommand.LIST, path); + if (pasv() != FTPReply.ENTERING_PASSIVE_MODE) + throw new FtpExceptionCanNotHaveDataConnection("pasv() failed. " + + getReplyString()); + + try { + __parsePassiveModeReply(getReplyStrings()[0]); + } catch (MalformedServerReplyException e) { + throw new FtpExceptionCanNotHaveDataConnection(e.getMessage()); + } + + // // 20040317, xing, accommodate ill-behaved servers, see above + // int count = 0; + // System.err.println("__passivePort "+__passivePort); + // System.err.println("port_previous "+port_previous); + // while (__passivePort == port_previous) { + // // just quit if too many tries. make it an exception here? + // if (count++ > 10) + // return null; + // // slow down further for each new try + // Thread.sleep(500*count); + // if (pasv() != FTPReply.ENTERING_PASSIVE_MODE) + // throw new FtpExceptionCanNotHaveDataConnection( + // "pasv() failed. " + getReplyString()); + // //return null; + // try { + // __parsePassiveModeReply(getReplyStrings()[0]); + // } catch (MalformedServerReplyException e) { + // throw new FtpExceptionCanNotHaveDataConnection(e.getMessage()); + // } + // } - if (socket == null) - throw new FtpExceptionCanNotHaveDataConnection("LIST " - + ((path == null) ? "" : path)); + socket = _socketFactory_.createSocket(__passiveHost, __passivePort); - BufferedReader reader = - new BufferedReader(new InputStreamReader(socket.getInputStream())); + if (!FTPReply.isPositivePreliminary(sendCommand(command, arg))) { + socket.close(); + return null; + } - // force-close data channel socket, when download limit is reached -// boolean mandatory_close = false; + if (__remoteVerificationEnabled && !verifyRemote(socket)) { + InetAddress host1, host2; - //List entries = new LinkedList(); - int count = 0; - String line = parser.readNextEntry(reader); - while (line != null) { - FTPFile ftpFile = parser.parseFTPEntry(line); - // skip non-formatted lines - if (ftpFile == null) { - line = parser.readNextEntry(reader); - continue; - } - entries.add(ftpFile); - count += line.length(); - // impose download limit if limit >= 0, otherwise no limit - // here, cut off is up to the line when total bytes is just over limit - if (limit >= 0 && count > limit) { -// mandatory_close = true; - break; - } - line = parser.readNextEntry(reader); - } + host1 = socket.getInetAddress(); + host2 = getRemoteAddress(); - //if (mandatory_close) - // you always close here, no matter mandatory_close or not. - // however different ftp servers respond differently, see below. socket.close(); - // scenarios: - // (1) mandatory_close is false, download limit not reached - // no special care here - // (2) mandatory_close is true, download limit is reached - // different servers have different reply codes: - - try { - int reply = getReply(); - if (!_notBadReply(reply)) - throw new FtpExceptionUnknownForcedDataClose(getReplyString()); - } catch (FTPConnectionClosedException e) { - // some ftp servers will close control channel if data channel socket - // is closed by our end before all data has been read out. Check: - // tux414.q-tam.hp.com FTP server (hp.com version whp02) - // so must catch FTPConnectionClosedException thrown by getReply() above - //disconnect(); - throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage()); - } - - } + // our precaution + throw new FtpExceptionCanNotHaveDataConnection( + "Host attempting data connection " + host1.getHostAddress() + + " is not same as server " + host2.getHostAddress() + + " So we intentionally close it for security precaution."); + } + + if (__dataTimeout >= 0) + socket.setSoTimeout(__dataTimeout); + + return socket; + } + + /*** + * Sets the timeout in milliseconds to use for data connection. set + * immediately after opening the data connection. + ***/ + public void setDataTimeout(int timeout) { + __dataTimeout = timeout; + } + + /*** + * Closes the connection to the FTP server and restores connection parameters + * to the default values. + * <p> + * + * @exception IOException + * If an error occurs while disconnecting. + ***/ + public void disconnect() throws IOException { + __initDefaults(); + super.disconnect(); + // no worry for data connection, since we always close it + // in every ftp command that invloves data connection + } + + /*** + * Enable or disable verification that the remote host taking part of a data + * connection is the same as the host to which the control connection is + * attached. The default is for verification to be enabled. You may set this + * value at any time, whether the FTPClient is currently connected or not. + * <p> + * + * @param enable + * True to enable verification, false to disable verification. + ***/ + public void setRemoteVerificationEnabled(boolean enable) { + __remoteVerificationEnabled = enable; + } + + /*** + * Return whether or not verification of the remote host participating in data + * connections is enabled. The default behavior is for verification to be + * enabled. + * <p> + * + * @return True if verification is enabled, false if not. + ***/ + public boolean isRemoteVerificationEnabled() { + return __remoteVerificationEnabled; + } + + /*** + * Login to the FTP server using the provided username and password. + * <p> + * + * @param username + * The username to login under. + * @param password + * The password to use. + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public boolean login(String username, String password) throws IOException { + user(username); - /** - * retrieve file for path - * @param path - * @param os - * @param limit - * @throws IOException - * @throws FtpExceptionCanNotHaveDataConnection - * @throws FtpExceptionUnknownForcedDataClose - * @throws FtpExceptionControlClosedByForcedDataClose - */ - public void retrieveFile(String path, OutputStream os, int limit) - throws IOException, - FtpExceptionCanNotHaveDataConnection, - FtpExceptionUnknownForcedDataClose, - FtpExceptionControlClosedByForcedDataClose { + if (FTPReply.isPositiveCompletion(getReplyCode())) + return true; - Socket socket = __openPassiveDataConnection(FTPCommand.RETR, path); + // If we get here, we either have an error code, or an intermmediate + // reply requesting password. + if (!FTPReply.isPositiveIntermediate(getReplyCode())) + return false; + + return FTPReply.isPositiveCompletion(pass(password)); + } + + /*** + * Logout of the FTP server by sending the QUIT command. + * <p> + * + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public boolean logout() throws IOException { + return FTPReply.isPositiveCompletion(quit()); + } + + /** + * retrieve list reply for path + * + * @param path + * @param entries + * @param limit + * @param parser + * @throws IOException + * @throws FtpExceptionCanNotHaveDataConnection + * @throws FtpExceptionUnknownForcedDataClose + * @throws FtpExceptionControlClosedByForcedDataClose + */ + public void retrieveList(String path, List<FTPFile> entries, int limit, + FTPFileEntryParser parser) throws IOException, + FtpExceptionCanNotHaveDataConnection, FtpExceptionUnknownForcedDataClose, + FtpExceptionControlClosedByForcedDataClose { + Socket socket = __openPassiveDataConnection(FTPCommand.LIST, path); - if (socket == null) - throw new FtpExceptionCanNotHaveDataConnection("RETR " + if (socket == null) + throw new FtpExceptionCanNotHaveDataConnection("LIST " + ((path == null) ? "" : path)); - InputStream input = socket.getInputStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader( + socket.getInputStream())); + + // force-close data channel socket, when download limit is reached + // boolean mandatory_close = false; - // 20040318, xing, treat everything as BINARY_FILE_TYPE for now - // do we ever need ASCII_FILE_TYPE? - //if (__fileType == ASCII_FILE_TYPE) - // input = new FromNetASCIIInputStream(input); - - // fixme, should we instruct server here for binary file type? - - // force-close data channel socket - // boolean mandatory_close = false; - - int len; int count = 0; - byte[] buf = - new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE]; - while((len=input.read(buf,0,buf.length)) != -1){ - count += len; - // impose download limit if limit >= 0, otherwise no limit - // here, cut off is exactly of limit bytes - if (limit >= 0 && count > limit) { - os.write(buf,0,len-(count-limit)); - // mandatory_close = true; - break; - } - os.write(buf,0,len); - os.flush(); + // List entries = new LinkedList(); + int count = 0; + String line = parser.readNextEntry(reader); + while (line != null) { + FTPFile ftpFile = parser.parseFTPEntry(line); + // skip non-formatted lines + if (ftpFile == null) { + line = parser.readNextEntry(reader); + continue; + } + entries.add(ftpFile); + count += line.length(); + // impose download limit if limit >= 0, otherwise no limit + // here, cut off is up to the line when total bytes is just over limit + if (limit >= 0 && count > limit) { + // mandatory_close = true; + break; } + line = parser.readNextEntry(reader); + } - //if (mandatory_close) - // you always close here, no matter mandatory_close or not. - // however different ftp servers respond differently, see below. - socket.close(); + // if (mandatory_close) + // you always close here, no matter mandatory_close or not. + // however different ftp servers respond differently, see below. + socket.close(); + + // scenarios: + // (1) mandatory_close is false, download limit not reached + // no special care here + // (2) mandatory_close is true, download limit is reached + // different servers have different reply codes: + + try { + int reply = getReply(); + if (!_notBadReply(reply)) + throw new FtpExceptionUnknownForcedDataClose(getReplyString()); + } catch (FTPConnectionClosedException e) { + // some ftp servers will close control channel if data channel socket + // is closed by our end before all data has been read out. Check: + // tux414.q-tam.hp.com FTP server (hp.com version whp02) + // so must catch FTPConnectionClosedException thrown by getReply() above + // disconnect(); + throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage()); + } + + } + + /** + * retrieve file for path + * + * @param path + * @param os + * @param limit + * @throws IOException + * @throws FtpExceptionCanNotHaveDataConnection + * @throws FtpExceptionUnknownForcedDataClose + * @throws FtpExceptionControlClosedByForcedDataClose + */ + public void retrieveFile(String path, OutputStream os, int limit) + throws IOException, FtpExceptionCanNotHaveDataConnection, + FtpExceptionUnknownForcedDataClose, + FtpExceptionControlClosedByForcedDataClose { - // scenarios: - // (1) mandatory_close is false, download limit not reached - // no special care here - // (2) mandatory_close is true, download limit is reached - // different servers have different reply codes: - - // do not need this - //sendCommand("ABOR"); - - try { - int reply = getReply(); - if (!_notBadReply(reply)) - throw new FtpExceptionUnknownForcedDataClose(getReplyString()); - } catch (FTPConnectionClosedException e) { - // some ftp servers will close control channel if data channel socket - // is closed by our end before all data has been read out. Check: - // tux414.q-tam.hp.com FTP server (hp.com version whp02) - // so must catch FTPConnectionClosedException thrown by getReply() above - //disconnect(); - throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage()); - } + Socket socket = __openPassiveDataConnection(FTPCommand.RETR, path); + + if (socket == null) + throw new FtpExceptionCanNotHaveDataConnection("RETR " + + ((path == null) ? "" : path)); + + InputStream input = socket.getInputStream(); + // 20040318, xing, treat everything as BINARY_FILE_TYPE for now + // do we ever need ASCII_FILE_TYPE? + // if (__fileType == ASCII_FILE_TYPE) + // input = new FromNetASCIIInputStream(input); + + // fixme, should we instruct server here for binary file type? + + // force-close data channel socket + // boolean mandatory_close = false; + + int len; + int count = 0; + byte[] buf = new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE]; + while ((len = input.read(buf, 0, buf.length)) != -1) { + count += len; + // impose download limit if limit >= 0, otherwise no limit + // here, cut off is exactly of limit bytes + if (limit >= 0 && count > limit) { + os.write(buf, 0, len - (count - limit)); + // mandatory_close = true; + break; + } + os.write(buf, 0, len); + os.flush(); } - /** - * reply check after closing data connection - * @param reply - * @return - */ - private boolean _notBadReply(int reply) { - - if (FTPReply.isPositiveCompletion(reply)) { - // do nothing - } else if (reply == 426) { // FTPReply.TRANSFER_ABORTED + // if (mandatory_close) + // you always close here, no matter mandatory_close or not. + // however different ftp servers respond differently, see below. + socket.close(); + + // scenarios: + // (1) mandatory_close is false, download limit not reached + // no special care here + // (2) mandatory_close is true, download limit is reached + // different servers have different reply codes: + + // do not need this + // sendCommand("ABOR"); + + try { + int reply = getReply(); + if (!_notBadReply(reply)) + throw new FtpExceptionUnknownForcedDataClose(getReplyString()); + } catch (FTPConnectionClosedException e) { + // some ftp servers will close control channel if data channel socket + // is closed by our end before all data has been read out. Check: + // tux414.q-tam.hp.com FTP server (hp.com version whp02) + // so must catch FTPConnectionClosedException thrown by getReply() above + // disconnect(); + throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage()); + } + + } + + /** + * reply check after closing data connection + * + * @param reply + * @return + */ + private boolean _notBadReply(int reply) { + + if (FTPReply.isPositiveCompletion(reply)) { + // do nothing + } else if (reply == 426) { // FTPReply.TRANSFER_ABORTED // some ftp servers reply 426, e.g., // foggy FTP server (Version wu-2.6.2(2) - // there is second reply witing? no! - //getReply(); - } else if (reply == 450) { // FTPReply.FILE_ACTION_NOT_TAKEN + // there is second reply witing? no! + // getReply(); + } else if (reply == 450) { // FTPReply.FILE_ACTION_NOT_TAKEN // some ftp servers reply 450, e.g., // ProFTPD [ftp.kernel.org] - // there is second reply witing? no! - //getReply(); - } else if (reply == 451) { // FTPReply.ACTION_ABORTED + // there is second reply witing? no! + // getReply(); + } else if (reply == 451) { // FTPReply.ACTION_ABORTED // some ftp servers reply 451, e.g., // ProFTPD [ftp.kernel.org] - // there is second reply witing? no! - //getReply(); - } else if (reply == 451) { // FTPReply.ACTION_ABORTED - } else { + // there is second reply witing? no! + // getReply(); + } else if (reply == 451) { // FTPReply.ACTION_ABORTED + } else { // what other kind of ftp server out there? - return false; - } + return false; + } + + return true; + } + /*** + * Sets the file type to be transferred. This should be one of + * <code> FTP.ASCII_FILE_TYPE </code>, <code> FTP.IMAGE_FILE_TYPE </code>, + * etc. The file type only needs to be set when you want to change the type. + * After changing it, the new type stays in effect until you change it again. + * The default file type is <code> FTP.ASCII_FILE_TYPE </code> if this method + * is never called. + * <p> + * + * @param fileType + * The <code> _FILE_TYPE </code> constant indcating the type of file. + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public boolean setFileType(int fileType) throws IOException { + if (FTPReply.isPositiveCompletion(type(fileType))) { + /* + * __fileType = fileType; __fileFormat = FTP.NON_PRINT_TEXT_FORMAT; + */ return true; } + return false; + } - /*** - * Sets the file type to be transferred. This should be one of - * <code> FTP.ASCII_FILE_TYPE </code>, <code> FTP.IMAGE_FILE_TYPE </code>, - * etc. The file type only needs to be set when you want to change the - * type. After changing it, the new type stays in effect until you change - * it again. The default file type is <code> FTP.ASCII_FILE_TYPE </code> - * if this method is never called. - * <p> - * @param fileType The <code> _FILE_TYPE </code> constant indcating the - * type of file. - * @return True if successfully completed, false if not. - * @exception FTPConnectionClosedException - * If the FTP server prematurely closes the connection as a result - * of the client being idle or some other reason causing the server - * to send FTP reply code 421. This exception may be caught either - * as an IOException or independently as itself. - * @exception IOException If an I/O error occurs while either sending a - * command to the server or receiving a reply from the server. - ***/ - public boolean setFileType(int fileType) throws IOException - { - if (FTPReply.isPositiveCompletion(type(fileType))) - { -/* __fileType = fileType; - __fileFormat = FTP.NON_PRINT_TEXT_FORMAT;*/ - return true; - } - return false; - } - - /*** - * Fetches the system type name from the server and returns the string. - * This value is cached for the duration of the connection after the - * first call to this method. In other words, only the first time - * that you invoke this method will it issue a SYST command to the - * FTP server. FTPClient will remember the value and return the - * cached value until a call to disconnect. - * <p> - * @return The system type name obtained from the server. null if the - * information could not be obtained. - * @exception FTPConnectionClosedException - * If the FTP server prematurely closes the connection as a result - * of the client being idle or some other reason causing the server - * to send FTP reply code 421. This exception may be caught either - * as an IOException or independently as itself. - * @exception IOException If an I/O error occurs while either sending a - * command to the server or receiving a reply from the server. - ***/ - public String getSystemName() - throws IOException, FtpExceptionBadSystResponse - { - //if (syst() == FTPReply.NAME_SYSTEM_TYPE) - // Technically, we should expect a NAME_SYSTEM_TYPE response, but - // in practice FTP servers deviate, so we soften the condition to - // a positive completion. - if (__systemName == null && FTPReply.isPositiveCompletion(syst())) { - __systemName = (getReplyStrings()[0]).substring(4); - } else { - throw new FtpExceptionBadSystResponse( - "Bad response of SYST: " + getReplyString()); - } - - return __systemName; - } - - /*** - * Sends a NOOP command to the FTP server. This is useful for preventing - * server timeouts. - * <p> - * @return True if successfully completed, false if not. - * @exception FTPConnectionClosedException - * If the FTP server prematurely closes the connection as a result - * of the client being idle or some other reason causing the server - * to send FTP reply code 421. This exception may be caught either - * as an IOException or independently as itself. - * @exception IOException If an I/O error occurs while either sending a - * command to the server or receiving a reply from the server. - ***/ - public boolean sendNoOp() throws IOException - { - return FTPReply.isPositiveCompletion(noop()); - } - -// client.stat(path); -// client.sendCommand("STAT"); -// client.sendCommand("STAT",path); -// client.sendCommand("MDTM",path); -// client.sendCommand("SIZE",path); -// client.sendCommand("HELP","SITE"); -// client.sendCommand("SYST"); -// client.setRestartOffset(120); + /*** + * Fetches the system type name from the server and returns the string. This + * value is cached for the duration of the connection after the first call to + * this method. In other words, only the first time that you invoke this + * method will it issue a SYST command to the FTP server. FTPClient will + * remember the value and return the cached value until a call to disconnect. + * <p> + * + * @return The system type name obtained from the server. null if the + * information could not be obtained. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public String getSystemName() throws IOException, FtpExceptionBadSystResponse { + // if (syst() == FTPReply.NAME_SYSTEM_TYPE) + // Technically, we should expect a NAME_SYSTEM_TYPE response, but + // in practice FTP servers deviate, so we soften the condition to + // a positive completion. + if (__systemName == null && FTPReply.isPositiveCompletion(syst())) { + __systemName = (getReplyStrings()[0]).substring(4); + } else { + throw new FtpExceptionBadSystResponse("Bad response of SYST: " + + getReplyString()); + } + + return __systemName; + } + + /*** + * Sends a NOOP command to the FTP server. This is useful for preventing + * server timeouts. + * <p> + * + * @return True if successfully completed, false if not. + * @exception FTPConnectionClosedException + * If the FTP server prematurely closes the connection as a + * result of the client being idle or some other reason causing + * the server to send FTP reply code 421. This exception may be + * caught either as an IOException or independently as itself. + * @exception IOException + * If an I/O error occurs while either sending a command to the + * server or receiving a reply from the server. + ***/ + public boolean sendNoOp() throws IOException { + return FTPReply.isPositiveCompletion(noop()); + } + + // client.stat(path); + // client.sendCommand("STAT"); + // client.sendCommand("STAT",path); + // client.sendCommand("MDTM",path); + // client.sendCommand("SIZE",path); + // client.sendCommand("HELP","SITE"); + // client.sendCommand("SYST"); + // client.setRestartOffset(120); }
Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Thu Jan 29 05:38:59 2015 @@ -39,13 +39,12 @@ import java.net.URL; import java.io.IOException; /** - * This class is a protocol plugin used for ftp: scheme. - * It creates {@link FtpResponse} object and gets the content of the url from it. + * This class is a protocol plugin used for ftp: scheme. It creates + * {@link FtpResponse} object and gets the content of the url from it. * Configurable parameters are {@code ftp.username}, {@code ftp.password}, - * {@code ftp.content.limit}, {@code ftp.timeout}, - * {@code ftp.server.timeout}, {@code ftp.password}, - * {@code ftp.keep.connection} and {@code ftp.follow.talk}. - * For details see "FTP properties" section in {@code nutch-default.xml}. + * {@code ftp.content.limit}, {@code ftp.timeout}, {@code ftp.server.timeout}, + * {@code ftp.password}, {@code ftp.keep.connection} and {@code ftp.follow.talk} + * . For details see "FTP properties" section in {@code nutch-default.xml}. */ public class Ftp implements Protocol { @@ -60,7 +59,7 @@ public class Ftp implements Protocol { int maxContentLength; String userName; - String passWord; + String passWord; // typical/default server timeout is 120*1000 millisec. // better be conservative here @@ -107,12 +106,14 @@ public class Ftp implements Protocol { this.keepConnection = keepConnection; } - /** - * Creates a {@link FtpResponse} object corresponding to the url and - * returns a {@link ProtocolOutput} object as per the content received + /** + * Creates a {@link FtpResponse} object corresponding to the url and returns a + * {@link ProtocolOutput} object as per the content received * - * @param url Text containing the ftp url - * @param datum The CrawlDatum object corresponding to the url + * @param url + * Text containing the ftp url + * @param datum + * The CrawlDatum object corresponding to the url * * @return {@link ProtocolOutput} object for the url */ @@ -120,36 +121,36 @@ public class Ftp implements Protocol { String urlString = url.toString(); try { URL u = new URL(urlString); - + int redirects = 0; - + while (true) { FtpResponse response; - response = new FtpResponse(u, datum, this, getConf()); // make a request - + response = new FtpResponse(u, datum, this, getConf()); // make a request + int code = response.getCode(); - - if (code == 200) { // got a good response - return new ProtocolOutput(response.toContent()); // return it - - } else if (code >= 300 && code < 400) { // handle redirect + + if (code == 200) { // got a good response + return new ProtocolOutput(response.toContent()); // return it + + } else if (code >= 300 && code < 400) { // handle redirect if (redirects == MAX_REDIRECTS) throw new FtpException("Too many redirects: " + url); u = new URL(response.getHeader("Location")); - redirects++; + redirects++; if (LOG.isTraceEnabled()) { - LOG.trace("redirect to " + u); + LOG.trace("redirect to " + u); } - } else { // convert to exception + } else { // convert to exception throw new FtpError(code); } - } + } } catch (Exception e) { return new ProtocolOutput(null, new ProtocolStatus(e)); } } - protected void finalize () { + protected void finalize() { try { if (this.client != null && this.client.isConnected()) { this.client.logout(); @@ -176,7 +177,7 @@ public class Ftp implements Protocol { System.err.println(usage); System.exit(-1); } - + for (int i = 0; i < args.length; i++) { if (args[i].equals("-logLevel")) { logLevel = args[++i]; @@ -190,7 +191,7 @@ public class Ftp implements Protocol { maxContentLength = Integer.parseInt(args[++i]); } else if (args[i].equals("-dumpContent")) { dumpContent = true; - } else if (i != args.length-1) { + } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); } else { @@ -210,15 +211,16 @@ public class Ftp implements Protocol { ftp.setMaxContentLength(maxContentLength); // set log level - //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); + // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); - Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + Content content = ftp.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); - System.err.println("Content-Length: " + - content.getMetadata().get(Response.CONTENT_LENGTH)); - System.err.println("Last-Modified: " + - content.getMetadata().get(Response.LAST_MODIFIED)); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } @@ -248,7 +250,7 @@ public class Ftp implements Protocol { return this.conf; } - /** + /** * Get the robots rules for a given url */ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { @@ -259,4 +261,3 @@ public class Ftp implements Protocol { return BUFFER_SIZE; } } - Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java Thu Jan 29 05:38:59 2015 @@ -17,13 +17,16 @@ package org.apache.nutch.protocol.ftp; -/** Thrown for Ftp error codes. +/** + * Thrown for Ftp error codes. */ public class FtpError extends FtpException { private int code; - - public int getCode(int code) { return code; } + + public int getCode(int code) { + return code; + } public FtpError(int code) { super("Ftp Error: " + code); Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java Thu Jan 29 05:38:59 2015 @@ -20,9 +20,9 @@ package org.apache.nutch.protocol.ftp; import org.apache.nutch.protocol.ProtocolException; /*** - * Superclass for important exceptions thrown during FTP talk, - * that must be handled with care. - * + * Superclass for important exceptions thrown during FTP talk, that must be + * handled with care. + * * @author John Xing */ public class FtpException extends ProtocolException { Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java Thu Jan 29 05:38:59 2015 @@ -19,7 +19,7 @@ package org.apache.nutch.protocol.ftp; /** * Exception indicating bad reply of SYST command. - * + * * @author John Xing */ public class FtpExceptionBadSystResponse extends FtpException { Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java Thu Jan 29 05:38:59 2015 @@ -19,7 +19,7 @@ package org.apache.nutch.protocol.ftp; /** * Exception indicating failure of opening data connection. - * + * * @author John Xing */ public class FtpExceptionCanNotHaveDataConnection extends FtpException { Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java Thu Jan 29 05:38:59 2015 @@ -18,9 +18,9 @@ package org.apache.nutch.protocol.ftp; /** - * Exception indicating control channel is closed by server end, due to - * forced closure of data channel at client (our) end. - * + * Exception indicating control channel is closed by server end, due to forced + * closure of data channel at client (our) end. + * * @author John Xing */ public class FtpExceptionControlClosedByForcedDataClose extends FtpException { Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java Thu Jan 29 05:38:59 2015 @@ -18,9 +18,9 @@ package org.apache.nutch.protocol.ftp; /** - * Exception indicating unrecognizable reply from server after - * forced closure of data channel by client (our) side. - * + * Exception indicating unrecognizable reply from server after forced closure of + * data channel by client (our) side. + * * @author John Xing */ public class FtpExceptionUnknownForcedDataClose extends FtpException { Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java Thu Jan 29 05:38:59 2015 @@ -37,15 +37,12 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; /** - * FtpResponse.java mimics ftp replies as http response. - * It tries its best to follow http's way for headers, response codes - * as well as exceptions. - * - * Comments: - * In this class, all FtpException*.java thrown by Client.java - * and some important commons-net exceptions passed by Client.java - * must have been properly dealt with. They'd better not be leaked - * to the caller of this class. + * FtpResponse.java mimics ftp replies as http response. It tries its best to + * follow http's way for headers, response codes as well as exceptions. + * + * Comments: In this class, all FtpException*.java thrown by Client.java and + * some important commons-net exceptions passed by Client.java must have been + * properly dealt with. They'd better not be leaked to the caller of this class. */ public class FtpResponse { @@ -60,23 +57,26 @@ public class FtpResponse { private Configuration conf; /** Returns the response code. */ - public int getCode() { return code; } + public int getCode() { + return code; + } /** Returns the value of a named header. */ public String getHeader(String name) { return headers.get(name); } - public byte[] getContent() { return content; } + public byte[] getContent() { + return content; + } public Content toContent() { return new Content(orig, base, (content != null ? content : EMPTY_CONTENT), - getHeader(Response.CONTENT_TYPE), - headers, this.conf); + getHeader(Response.CONTENT_TYPE), headers, this.conf); } public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf) - throws FtpException, IOException { + throws FtpException, IOException { this.orig = url.toString(); this.base = url.toString(); @@ -98,17 +98,16 @@ public class FtpResponse { if (ftp.followTalk) { if (Ftp.LOG.isInfoEnabled()) { - Ftp.LOG.info("fetching "+url); + Ftp.LOG.info("fetching " + url); } } else { if (Ftp.LOG.isTraceEnabled()) { - Ftp.LOG.trace("fetching "+url); + Ftp.LOG.trace("fetching " + url); } } InetAddress addr = InetAddress.getByName(url.getHost()); - if (addr != null - && conf.getBoolean("store.ip.address", false) == true) { + if (addr != null && conf.getBoolean("store.ip.address", false) == true) { headers.add("_ip_", addr.getHostAddress()); } @@ -116,7 +115,7 @@ public class FtpResponse { // should start anew. if (ftp.client != null && ftp.keepConnection && ftp.renewalTime < System.currentTimeMillis()) { - if (Ftp.LOG.isInfoEnabled()) { + if (Ftp.LOG.isInfoEnabled()) { Ftp.LOG.info("delete client because idled too long"); } ftp.client = null; @@ -130,8 +129,9 @@ public class FtpResponse { // the real client ftp.client = new Client(); // when to renew, take the lesser - //ftp.renewalTime = System.currentTimeMillis() - // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout); + // ftp.renewalTime = System.currentTimeMillis() + // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : + // ftp.serverTimeout); // timeout for control connection ftp.client.setDefaultTimeout(ftp.timeout); @@ -140,8 +140,8 @@ public class FtpResponse { // follow ftp talk? if (ftp.followTalk) - ftp.client.addProtocolCommandListener( - new PrintCommandListener(Ftp.LOG)); + ftp.client.addProtocolCommandListener(new PrintCommandListener( + Ftp.LOG)); } // quit from previous site if at a different site now @@ -149,8 +149,8 @@ public class FtpResponse { InetAddress remoteAddress = ftp.client.getRemoteAddress(); if (!addr.equals(remoteAddress)) { if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("disconnect from "+remoteAddress - +" before connect to "+addr); + Ftp.LOG.info("disconnect from " + remoteAddress + + " before connect to " + addr); } // quit from current site ftp.client.logout(); @@ -162,22 +162,22 @@ public class FtpResponse { if (!ftp.client.isConnected()) { if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("connect to "+addr); + Ftp.LOG.info("connect to " + addr); } ftp.client.connect(addr); if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) { ftp.client.disconnect(); if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.client.connect() failed: " - + addr + " " + ftp.client.getReplyString()); + Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " " + + ftp.client.getReplyString()); } this.code = 500; // http Internal Server Error return; } if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("log into "+addr); + Ftp.LOG.info("log into " + addr); } if (!ftp.client.login(ftp.userName, ftp.passWord)) { @@ -188,9 +188,9 @@ public class FtpResponse { // (not dealt with here at all) . ftp.client.disconnect(); if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.client.login() failed: "+addr); + Ftp.LOG.warn("ftp.client.login() failed: " + addr); } - this.code = 401; // http Unauthorized + this.code = 401; // http Unauthorized return; } @@ -199,14 +199,14 @@ public class FtpResponse { ftp.client.logout(); ftp.client.disconnect(); if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.client.setFileType() failed: "+addr); + Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr); } this.code = 500; // http Internal Server Error return; } if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("set parser for "+addr); + Ftp.LOG.info("set parser for " + addr); } // SYST is valid only after login @@ -217,17 +217,18 @@ public class FtpResponse { if (parserKey.startsWith("UNKNOWN Type: L8")) parserKey = "UNIX Type: L8"; ftp.parser = (new DefaultFTPFileEntryParserFactory()) - .createFileEntryParser(parserKey); + .createFileEntryParser(parserKey); } catch (FtpExceptionBadSystResponse e) { if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.client.getSystemName() failed: "+addr+" "+e); + Ftp.LOG + .warn("ftp.client.getSystemName() failed: " + addr + " " + e); } ftp.parser = null; } catch (ParserInitializationException e) { // ParserInitializationException is RuntimeException defined in // org.apache.commons.net.ftp.parser.ParserInitializationException if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("createFileEntryParser() failed. "+addr+" "+e); + Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e); } ftp.parser = null; } finally { @@ -235,7 +236,7 @@ public class FtpResponse { // do not log as severe, otherwise // FetcherThread/RequestScheduler will abort if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn("ftp.parser is null: "+addr); + Ftp.LOG.warn("ftp.parser is null: " + addr); } ftp.client.logout(); ftp.client.disconnect(); @@ -261,10 +262,11 @@ public class FtpResponse { // reset next renewalTime, take the lesser if (ftp.client != null && ftp.keepConnection) { ftp.renewalTime = System.currentTimeMillis() - + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout); + + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout + : ftp.serverTimeout); if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { Ftp.LOG.info("reset renewalTime to " - + HttpDateFormat.toString(ftp.renewalTime)); + + HttpDateFormat.toString(ftp.renewalTime)); } } @@ -272,12 +274,12 @@ public class FtpResponse { // may have deleted ftp.client if (ftp.client != null && !ftp.keepConnection) { if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("disconnect from "+addr); + Ftp.LOG.info("disconnect from " + addr); } ftp.client.logout(); ftp.client.disconnect(); } - + } catch (Exception e) { if (Ftp.LOG.isWarnEnabled()) { Ftp.LOG.warn("Error: ", e); @@ -290,21 +292,21 @@ public class FtpResponse { ftp.client = null; // or do explicit garbage collection? // System.gc(); -// can we be less dramatic, using the following instead? -// probably unnecessary for our practical purpose here -// try { -// ftp.client.logout(); -// ftp.client.disconnect(); -// } + // can we be less dramatic, using the following instead? + // probably unnecessary for our practical purpose here + // try { + // ftp.client.logout(); + // ftp.client.disconnect(); + // } throw new FtpException(e); - //throw e; + // throw e; } } // get ftp file as http response private void getFileAsHttpResponse(String path, long lastModified) - throws IOException { + throws IOException { ByteArrayOutputStream os = null; List<FTPFile> list = null; @@ -316,9 +318,9 @@ public class FtpResponse { FTPFile ftpFile = (FTPFile) list.get(0); this.headers.set(Response.CONTENT_LENGTH, - new Long(ftpFile.getSize()).toString()); + new Long(ftpFile.getSize()).toString()); this.headers.set(Response.LAST_MODIFIED, - HttpDateFormat.toString(ftpFile.getTimestamp())); + HttpDateFormat.toString(ftpFile.getTimestamp())); // don't retrieve the file if not changed. if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) { code = 304; @@ -329,11 +331,11 @@ public class FtpResponse { this.content = os.toByteArray(); -// // approximate bytes sent and read -// if (this.httpAccounting != null) { -// this.httpAccounting.incrementBytesSent(path.length()); -// this.httpAccounting.incrementBytesRead(this.content.length); -// } + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } this.code = 200; // http OK @@ -342,64 +344,64 @@ public class FtpResponse { // control connection is off, clean up // ftp.client.disconnect(); if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("delete client because server cut off control channel: "+e); + Ftp.LOG.info("delete client because server cut off control channel: " + + e); } ftp.client = null; // in case this FtpExceptionControlClosedByForcedDataClose is // thrown by retrieveList() (not retrieveFile()) above, if (os == null) { // indicating throwing by retrieveList() - //throw new FtpException("fail to get attibutes: "+path); + // throw new FtpException("fail to get attibutes: "+path); if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn( - "Please try larger maxContentLength for ftp.client.retrieveList(). " - + e); + Ftp.LOG + .warn("Please try larger maxContentLength for ftp.client.retrieveList(). " + + e); } // in a way, this is our request fault - this.code = 400; // http Bad request + this.code = 400; // http Bad request return; } FTPFile ftpFile = (FTPFile) list.get(0); this.headers.set(Response.CONTENT_LENGTH, - new Long(ftpFile.getSize()).toString()); - //this.headers.put("content-type", "text/html"); + new Long(ftpFile.getSize()).toString()); + // this.headers.put("content-type", "text/html"); this.headers.set(Response.LAST_MODIFIED, - HttpDateFormat.toString(ftpFile.getTimestamp())); + HttpDateFormat.toString(ftpFile.getTimestamp())); this.content = os.toByteArray(); if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) { code = 304; return; } -// // approximate bytes sent and read -// if (this.httpAccounting != null) { -// this.httpAccounting.incrementBytesSent(path.length()); -// this.httpAccounting.incrementBytesRead(this.content.length); -// } + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } this.code = 200; // http OK } catch (FtpExceptionCanNotHaveDataConnection e) { if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { - // it is not a file, but dir, so redirect as a dir + // it is not a file, but dir, so redirect as a dir this.headers.set(Response.LOCATION, path + "/"); - this.code = 300; // http redirect + this.code = 300; // http redirect // fixme, should we do ftp.client.cwd("/"), back to top dir? } else { - // it is not a dir either - this.code = 404; // http Not Found + // it is not a dir either + this.code = 404; // http Not Found } } catch (FtpExceptionUnknownForcedDataClose e) { // Please note control channel is still live. // in a way, this is our request fault if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn( - "Unrecognized reply after forced close of data channel. " - + "If this is acceptable, please modify Client.java accordingly. " - + e); + Ftp.LOG.warn("Unrecognized reply after forced close of data channel. " + + "If this is acceptable, please modify Client.java accordingly. " + + e); } this.code = 400; // http Bad Request } @@ -408,14 +410,14 @@ public class FtpResponse { // get ftp dir list as http response private void getDirAsHttpResponse(String path, long lastModified) - throws IOException { + throws IOException { List<FTPFile> list = new LinkedList<FTPFile>(); try { // change to that dir first if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) { - this.code = 404; // http Not Found + this.code = 404; // http Not Found return; } @@ -424,15 +426,15 @@ public class FtpResponse { ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser); this.content = list2html(list, path, "/".equals(path) ? false : true); this.headers.set(Response.CONTENT_LENGTH, - new Integer(this.content.length).toString()); + new Integer(this.content.length).toString()); this.headers.set(Response.CONTENT_TYPE, "text/html"); // this.headers.put("Last-Modified", null); -// // approximate bytes sent and read -// if (this.httpAccounting != null) { -// this.httpAccounting.incrementBytesSent(path.length()); -// this.httpAccounting.incrementBytesRead(this.content.length); -// } + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } this.code = 200; // http OK @@ -441,21 +443,22 @@ public class FtpResponse { // control connection is off, clean up // ftp.client.disconnect(); if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) { - Ftp.LOG.info("delete client because server cut off control channel: "+e); + Ftp.LOG.info("delete client because server cut off control channel: " + + e); } ftp.client = null; this.content = list2html(list, path, "/".equals(path) ? false : true); this.headers.set(Response.CONTENT_LENGTH, - new Integer(this.content.length).toString()); + new Integer(this.content.length).toString()); this.headers.set(Response.CONTENT_TYPE, "text/html"); // this.headers.put("Last-Modified", null); -// // approximate bytes sent and read -// if (this.httpAccounting != null) { -// this.httpAccounting.incrementBytesSent(path.length()); -// this.httpAccounting.incrementBytesRead(this.content.length); -// } + // // approximate bytes sent and read + // if (this.httpAccounting != null) { + // this.httpAccounting.incrementBytesSent(path.length()); + // this.httpAccounting.incrementBytesRead(this.content.length); + // } this.code = 200; // http OK @@ -463,32 +466,35 @@ public class FtpResponse { // Please note control channel is still live. // in a way, this is our request fault if (Ftp.LOG.isWarnEnabled()) { - Ftp.LOG.warn( - "Unrecognized reply after forced close of data channel. " - + "If this is acceptable, please modify Client.java accordingly. " - + e); + Ftp.LOG.warn("Unrecognized reply after forced close of data channel. " + + "If this is acceptable, please modify Client.java accordingly. " + + e); } this.code = 400; // http Bad Request } catch (FtpExceptionCanNotHaveDataConnection e) { - if (Ftp.LOG.isWarnEnabled()) { Ftp.LOG.warn(""+ e); } + if (Ftp.LOG.isWarnEnabled()) { + Ftp.LOG.warn("" + e); + } this.code = 500; // http Iternal Server Error } } // generate html page from ftp dir list - private byte[] list2html(List<FTPFile> list, String path, boolean includeDotDot) { + private byte[] list2html(List<FTPFile> list, String path, + boolean includeDotDot) { - //StringBuffer x = new StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>"); + // StringBuffer x = new + // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>"); StringBuffer x = new StringBuffer("<html><head>"); - x.append("<title>Index of "+path+"</title></head>\n"); - x.append("<body><h1>Index of "+path+"</h1><pre>\n"); + x.append("<title>Index of " + path + "</title></head>\n"); + x.append("<body><h1>Index of " + path + "</h1><pre>\n"); if (includeDotDot) { x.append("<a href='../'>../</a>\t-\t-\t-\n"); } - for (int i=0; i<list.size(); i++) { + for (int i = 0; i < list.size(); i++) { FTPFile f = (FTPFile) list.get(i); String name = f.getName(); String time = HttpDateFormat.toString(f.getTimestamp()); @@ -496,11 +502,11 @@ public class FtpResponse { // some ftp server LIST "." and "..", we skip them here if (name.equals(".") || name.equals("..")) continue; - x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t"); - x.append(time+"\t-\n"); + x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); + x.append(time + "\t-\n"); } else if (f.isFile()) { - x.append("<a href='"+name+ "'>"+name+"</a>\t"); - x.append(time+"\t"+f.getSize()+"\n"); + x.append("<a href='" + name + "'>" + name + "</a>\t"); + x.append(time + "\t" + f.getSize() + "\n"); } else { // ignore isSymbolicLink() // ignore isUnknown() Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Thu Jan 29 05:38:59 2015 @@ -33,55 +33,62 @@ import crawlercommons.robots.BaseRobotRu import crawlercommons.robots.SimpleRobotRules; /** - * This class is used for parsing robots for urls belonging to FTP protocol. - * It extends the generic {@link RobotRulesParser} class and contains - * Ftp protocol specific implementation for obtaining the robots file. + * This class is used for parsing robots for urls belonging to FTP protocol. It + * extends the generic {@link RobotRulesParser} class and contains Ftp protocol + * specific implementation for obtaining the robots file. */ public class FtpRobotRulesParser extends RobotRulesParser { private static final String CONTENT_TYPE = "text/plain"; - public static final Logger LOG = LoggerFactory.getLogger(FtpRobotRulesParser.class); + public static final Logger LOG = LoggerFactory + .getLogger(FtpRobotRulesParser.class); - FtpRobotRulesParser() { } + FtpRobotRulesParser() { + } public FtpRobotRulesParser(Configuration conf) { super(conf); } /** - * The hosts for which the caching of robots rules is yet to be done, - * it sends a Ftp request to the host corresponding to the {@link URL} - * passed, gets robots file, parses the rules and caches the rules object - * to avoid re-work in future. + * The hosts for which the caching of robots rules is yet to be done, it sends + * a Ftp request to the host corresponding to the {@link URL} passed, gets + * robots file, parses the rules and caches the rules object to avoid re-work + * in future. + * + * @param ftp + * The {@link Protocol} object + * @param url + * URL * - * @param ftp The {@link Protocol} object - * @param url URL - * - * @return robotRules A {@link BaseRobotRules} object for the rules + * @return robotRules A {@link BaseRobotRules} object for the rules */ public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) { - String protocol = url.getProtocol().toLowerCase(); // normalize to lower case - String host = url.getHost().toLowerCase(); // normalize to lower case + String protocol = url.getProtocol().toLowerCase(); // normalize to lower + // case + String host = url.getHost().toLowerCase(); // normalize to lower case - BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host); + BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + + host); boolean cacheRule = true; - if (robotRules == null) { // cache miss + if (robotRules == null) { // cache miss if (LOG.isTraceEnabled()) LOG.trace("cache miss " + url); try { Text robotsUrl = new Text(new URL(url, "/robots.txt").toString()); - ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new CrawlDatum()); + ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, + new CrawlDatum()); ProtocolStatus status = output.getStatus(); if (status.getCode() == ProtocolStatus.SUCCESS) { - robotRules = parseRules(url.toString(), output.getContent().getContent(), - CONTENT_TYPE, agentNames); - } else { - robotRules = EMPTY_RULES; // use default rules + robotRules = parseRules(url.toString(), output.getContent() + .getContent(), CONTENT_TYPE, agentNames); + } else { + robotRules = EMPTY_RULES; // use default rules } } catch (Throwable t) { if (LOG.isInfoEnabled()) { @@ -92,7 +99,7 @@ public class FtpRobotRulesParser extends } if (cacheRule) - CACHE.put(protocol + ":" + host, robotRules); // cache rules for host + CACHE.put(protocol + ":" + host, robotRules); // cache rules for host } return robotRules; } Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java Thu Jan 29 05:38:59 2015 @@ -28,45 +28,44 @@ import org.apache.commons.net.ProtocolCo /*** * This is a support class for logging all ftp command/reply traffic. - * + * * @author John Xing ***/ -public class PrintCommandListener implements ProtocolCommandListener -{ - private Logger __logger; +public class PrintCommandListener implements ProtocolCommandListener { + private Logger __logger; - public PrintCommandListener(Logger logger) - { - __logger = logger; - } + public PrintCommandListener(Logger logger) { + __logger = logger; + } - public void protocolCommandSent(ProtocolCommandEvent event) { - try { - __logIt(event); - } catch (IOException e) { - if (__logger.isInfoEnabled()) { - __logger.info("PrintCommandListener.protocolCommandSent(): "+e); - } + public void protocolCommandSent(ProtocolCommandEvent event) { + try { + __logIt(event); + } catch (IOException e) { + if (__logger.isInfoEnabled()) { + __logger.info("PrintCommandListener.protocolCommandSent(): " + e); } } + } - public void protocolReplyReceived(ProtocolCommandEvent event) { - try { - __logIt(event); - } catch (IOException e) { - if (__logger.isInfoEnabled()) { - __logger.info("PrintCommandListener.protocolReplyReceived(): "+e); - } + public void protocolReplyReceived(ProtocolCommandEvent event) { + try { + __logIt(event); + } catch (IOException e) { + if (__logger.isInfoEnabled()) { + __logger.info("PrintCommandListener.protocolReplyReceived(): " + e); } } + } - private void __logIt(ProtocolCommandEvent event) throws IOException { - if (!__logger.isInfoEnabled()) { return; } - BufferedReader br = - new BufferedReader(new StringReader(event.getMessage())); - String line; - while ((line = br.readLine()) != null) { - __logger.info("ftp> "+line); - } + private void __logIt(ProtocolCommandEvent event) throws IOException { + if (!__logger.isInfoEnabled()) { + return; + } + BufferedReader br = new BufferedReader(new StringReader(event.getMessage())); + String line; + while ((line = br.readLine()) != null) { + __logger.info("ftp> " + line); } + } } Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (original) +++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java Thu Jan 29 05:38:59 2015 @@ -34,12 +34,10 @@ import org.apache.nutch.protocol.Protoco import org.apache.nutch.protocol.http.api.HttpBase; import org.apache.nutch.util.NutchConfiguration; - public class Http extends HttpBase { public static final Logger LOG = LoggerFactory.getLogger(Http.class); - /** * Public default constructor. */ @@ -48,17 +46,17 @@ public class Http extends HttpBase { } /** - * Set the {@link org.apache.hadoop.conf.Configuration} - * object. + * Set the {@link org.apache.hadoop.conf.Configuration} object. + * * @param conf */ public void setConf(Configuration conf) { super.setConf(conf); -// Level logLevel = Level.WARNING; -// if (conf.getBoolean("http.verbose", false)) { -// logLevel = Level.FINE; -// } -// LOG.setLevel(logLevel); + // Level logLevel = Level.WARNING; + // if (conf.getBoolean("http.verbose", false)) { + // logLevel = Level.FINE; + // } + // LOG.setLevel(logLevel); } public static void main(String[] args) throws Exception { @@ -68,7 +66,7 @@ public class Http extends HttpBase { } protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) - throws ProtocolException, IOException { + throws ProtocolException, IOException { return new HttpResponse(this, url, datum); }