Github user mxm commented on a diff in the pull request: https://github.com/apache/flink/pull/2618#discussion_r83016682 --- Diff: flink-fs-tests/src/test/java/org/apache/flink/hdfstests/ContinuousFileProcessingTests.java --- @@ -336,237 +348,294 @@ public int compare(String o1, String o2) { Assert.assertEquals(expectedFileContents.get(fileIdx), cntntStr.toString()); } - for(org.apache.hadoop.fs.Path file: filesCreated) { + for (org.apache.hadoop.fs.Path file: filesCreated) { hdfs.delete(file, false); } } - private static class PathFilter extends FilePathFilter { - - @Override - public boolean filterPath(Path filePath) { - return filePath.getName().startsWith("**"); - } - } + //// Monitoring Function Tests ////// @Test public void testFilePathFiltering() throws Exception { - Set<String> uniqFilesFound = new HashSet<>(); Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>(); + Set<String> filesKept = new TreeSet<>(); // create the files to be discarded for (int i = 0; i < NO_OF_FILES; i++) { - Tuple2<org.apache.hadoop.fs.Path, String> file = fillWithData(hdfsURI, "**file", i, "This is test line."); + Tuple2<org.apache.hadoop.fs.Path, String> file = createFileAndFillWithData(hdfsURI, "**file", i, "This is test line."); filesCreated.add(file.f0); } // create the files to be kept for (int i = 0; i < NO_OF_FILES; i++) { - Tuple2<org.apache.hadoop.fs.Path, String> file = fillWithData(hdfsURI, "file", i, "This is test line."); + Tuple2<org.apache.hadoop.fs.Path, String> file = + createFileAndFillWithData(hdfsURI, "file", i, "This is test line."); filesCreated.add(file.f0); + filesKept.add(file.f0.getName()); } TextInputFormat format = new TextInputFormat(new Path(hdfsURI)); format.setFilesFilter(new PathFilter()); + ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, hdfsURI, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL); + final FileVerifyingSourceContext context = + new FileVerifyingSourceContext(new 
OneShotLatch(), monitoringFunction, 0, -1); + monitoringFunction.open(new Configuration()); - monitoringFunction.run(new TestingSourceContext(monitoringFunction, uniqFilesFound)); + monitoringFunction.run(context); - Assert.assertEquals(NO_OF_FILES, uniqFilesFound.size()); - for(int i = 0; i < NO_OF_FILES; i++) { - org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i); - Assert.assertTrue(uniqFilesFound.contains(file.toString())); - } + Assert.assertArrayEquals(filesKept.toArray(), context.getSeenFiles().toArray()); - for(org.apache.hadoop.fs.Path file: filesCreated) { + // finally delete the files created for the test. + for (org.apache.hadoop.fs.Path file: filesCreated) { hdfs.delete(file, false); } } + private static class PathFilter extends FilePathFilter { + @Override + public boolean filterPath(Path filePath) { + return filePath.getName().startsWith("**"); + } + } + @Test - public void testFileSplitMonitoringReprocessWithAppended() throws Exception { - final Set<String> uniqFilesFound = new HashSet<>(); + public void testSortingOnModTime() throws Exception { + final long[] modTimes = new long[NO_OF_FILES]; + final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES]; + + // create some files + for (int i = 0; i < NO_OF_FILES; i++) { + Tuple2<org.apache.hadoop.fs.Path, String> file = + createFileAndFillWithData(hdfsURI, "file", i, "This is test line."); + Thread.sleep(10); + + filesCreated[i] = file.f0; + modTimes[i] = hdfs.getFileStatus(file.f0).getModificationTime(); + } + + TextInputFormat format = new TextInputFormat(new Path(hdfsURI)); + format.setFilesFilter(FilePathFilter.createDefaultFilter()); - FileCreator fc = new FileCreator(INTERVAL, NO_OF_FILES); - fc.start(); + // this is just to verify that all splits have been forwarded later. 
+ FileInputSplit[] splits = format.createInputSplits(1); - Thread t = new Thread(new Runnable() { + ContinuousFileMonitoringFunction<String> monitoringFunction = + new ContinuousFileMonitoringFunction<>(format, hdfsURI, + FileProcessingMode.PROCESS_ONCE, 1, INTERVAL); + + ModTimeVerifyingSourceContext context = new ModTimeVerifyingSourceContext(modTimes); + + monitoringFunction.open(new Configuration()); + monitoringFunction.run(context); + Assert.assertEquals(splits.length, context.getCounter()); + + // delete the created files. + for (int i = 0; i < NO_OF_FILES; i++) { + hdfs.delete(filesCreated[i], false); + } + } + + @Test + public void testProcessOnce() throws Exception { + final OneShotLatch latch = new OneShotLatch(); + + // create a single file in the directory + Tuple2<org.apache.hadoop.fs.Path, String> bootstrap = + createFileAndFillWithData(hdfsURI, "file", NO_OF_FILES + 1, "This is test line."); + Assert.assertTrue(hdfs.exists(bootstrap.f0)); + + // the source is supposed to read only this file. 
+ final Set<String> filesToBeRead = new TreeSet<>(); + filesToBeRead.add(bootstrap.f0.getName()); + + TextInputFormat format = new TextInputFormat(new Path(hdfsURI)); + format.setFilesFilter(FilePathFilter.createDefaultFilter()); + + final ContinuousFileMonitoringFunction<String> monitoringFunction = + new ContinuousFileMonitoringFunction<>(format, hdfsURI, + FileProcessingMode.PROCESS_ONCE, 1, INTERVAL); + + final FileVerifyingSourceContext context = + new FileVerifyingSourceContext(latch, monitoringFunction, 1, -1); + + final Thread t = new Thread() { @Override public void run() { - TextInputFormat format = new TextInputFormat(new Path(hdfsURI)); - format.setFilesFilter(FilePathFilter.createDefaultFilter()); - ContinuousFileMonitoringFunction<String> monitoringFunction = - new ContinuousFileMonitoringFunction<>(format, hdfsURI, - FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL); - try { monitoringFunction.open(new Configuration()); - monitoringFunction.run(new TestingSourceContext(monitoringFunction, uniqFilesFound)); + monitoringFunction.run(context); } catch (Exception e) { - // do nothing as we interrupted the thread. + Assert.fail(e.getMessage()); } } - }); + }; t.start(); - // wait until the sink also sees all the splits. 
- synchronized (uniqFilesFound) { - uniqFilesFound.wait(); + if (!latch.isTriggered()) { + latch.await(); } - t.interrupt(); - fc.join(); - Assert.assertEquals(NO_OF_FILES, fc.getFilesCreated().size()); - Assert.assertEquals(NO_OF_FILES, uniqFilesFound.size()); - - Set<org.apache.hadoop.fs.Path> filesCreated = fc.getFilesCreated(); - Set<String> fileNamesCreated = new HashSet<>(); - for (org.apache.hadoop.fs.Path path: fc.getFilesCreated()) { - fileNamesCreated.add(path.toString()); + // create some additional files that would be processed in the case of PROCESS_CONTINUOUSLY + final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES]; + for (int i = 0; i < NO_OF_FILES; i++) { + Tuple2<org.apache.hadoop.fs.Path, String> ignoredFile = + createFileAndFillWithData(hdfsURI, "file", i, "This is test line."); + filesCreated[i] = ignoredFile.f0; } - for(String file: uniqFilesFound) { - Assert.assertTrue(fileNamesCreated.contains(file)); - } + // wait until the monitoring thread exits + t.join(); - for(org.apache.hadoop.fs.Path file: filesCreated) { - hdfs.delete(file, false); + Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray()); + + // finally delete the files created for the test. 
+ hdfs.delete(bootstrap.f0, false); + for (org.apache.hadoop.fs.Path path: filesCreated) { + hdfs.delete(path, false); } } @Test - public void testFileSplitMonitoringProcessOnce() throws Exception { - Set<String> uniqFilesFound = new HashSet<>(); - - FileCreator fc = new FileCreator(INTERVAL, 1); - Set<org.apache.hadoop.fs.Path> filesCreated = fc.getFilesCreated(); - fc.start(); - - // to make sure that at least one file is created - if (filesCreated.size() == 0) { - synchronized (filesCreated) { - if (filesCreated.size() == 0) { - filesCreated.wait(); - } - } - } - Assert.assertTrue(fc.getFilesCreated().size() >= 1); + public void testProcessContinuously() throws Exception { + final OneShotLatch latch = new OneShotLatch(); + + // create a single file in the directory + Tuple2<org.apache.hadoop.fs.Path, String> bootstrap = + createFileAndFillWithData(hdfsURI, "file", NO_OF_FILES + 1, "This is test line."); + Assert.assertTrue(hdfs.exists(bootstrap.f0)); + + // the source is supposed to read only this file. 
+ final Set<String> filesToBeRead = new TreeSet<>(); + filesToBeRead.add(bootstrap.f0.getName()); TextInputFormat format = new TextInputFormat(new Path(hdfsURI)); format.setFilesFilter(FilePathFilter.createDefaultFilter()); - ContinuousFileMonitoringFunction<String> monitoringFunction = + + final ContinuousFileMonitoringFunction<String> monitoringFunction = new ContinuousFileMonitoringFunction<>(format, hdfsURI, - FileProcessingMode.PROCESS_ONCE, 1, INTERVAL); + FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL); - monitoringFunction.open(new Configuration()); - monitoringFunction.run(new TestingSourceContext(monitoringFunction, uniqFilesFound)); + final int totalNoOfFilesToBeRead = 11; // 1 for the bootstrap + NO_OF_FILES + final FileVerifyingSourceContext context = new FileVerifyingSourceContext(latch, + monitoringFunction, 1, totalNoOfFilesToBeRead); - // wait until all the files are created - fc.join(); + final Thread t = new Thread() { - Assert.assertEquals(NO_OF_FILES, filesCreated.size()); + @Override + public void run() { + try { + monitoringFunction.open(new Configuration()); + monitoringFunction.run(context); + } catch (Exception e) { + Assert.fail(e.getMessage()); + } + } + }; + t.start(); - Set<String> fileNamesCreated = new HashSet<>(); - for (org.apache.hadoop.fs.Path path: fc.getFilesCreated()) { - fileNamesCreated.add(path.toString()); + if (!latch.isTriggered()) { + latch.await(); } - Assert.assertTrue(uniqFilesFound.size() >= 1 && uniqFilesFound.size() < fileNamesCreated.size()); - for(String file: uniqFilesFound) { - Assert.assertTrue(fileNamesCreated.contains(file)); + // create some additional files that would be processed in the case of PROCESS_CONTINUOUSLY --- End diff -- > // create some additional files that **should** be processed in the case of PROCESS_CONTINUOUSLY
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and would like it enabled, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---