Hi, I have provided another patch for CTAKES-450[1].
The fix provided in the original patch is correct. However, I also tried to address some small issues and do some refactoring, especially around the unit tests. If you think it requires more explanation, let me know. * extracted methods * removed dead code: sorted_segments.get(index - 1).getEnd(); * replaced deprecated junit.framework.Assert with org.junit.Assert * added asserts for all Segment elements in the JCas, so that in the future, if Segments aren't processed correctly, the failure is identified automatically * replaced System.out.println with a logger. I think this commit fixes CTAKES-450, so the issue can be closed. This diff is against the latest trunk, so applying the patch should be straightforward. I look forward to your feedback, Alex Zbarcea [1] - https://issues.apache.org/jira/browse/CTAKES-450
diff --git ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java index 7a56cee7..dd18815c 100644 --- ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java +++ ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java @@ -133,8 +133,16 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase { return p; } + private final Segment createSegment(JCas jCas, int begin, int end, String id) { + Segment segment = new Segment(jCas); + segment.setBegin(begin); + segment.setEnd(end); + segment.setId(id); + return segment; + } + @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { + public void process(JCas jCas) throws AnalysisEngineProcessException { String text = jCas.getDocumentText(); if (text == null) { String docId = DocumentIDAnnotationUtil.getDocumentID(jCas); @@ -146,20 +154,13 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase { // System.out.println("Pattern" + p); Matcher m = p.matcher(text); while (m.find()) { - Segment segment = new Segment(jCas); - segment.setBegin(m.start()); - segment.setEnd(m.end()); - segment.setId(id); + Segment segment = createSegment(jCas, m.start(), m.end(), id); sorted_segments.add(segment); } } - // If there are non segments, create a simple one that spans the - // entire doc - if (sorted_segments.size() <= 0) { - Segment header = new Segment(jCas); - header.setBegin(0); - header.setEnd(text.length()); - header.setId(SIMPLE_SEGMENT); + // If there are non segments, create a simple one that spans the entire doc + if (sorted_segments.isEmpty()) { + Segment header = createSegment(jCas, 0, text.length(), SIMPLE_SEGMENT); sorted_segments.add(header); } // TODO: this is kinda redundant, but needed the sections in sorted @@ -174,10 +175,6 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase { for (Segment s : sorted_segments) { int 
prevEnd = s.getEnd(); int nextBegin = text.length(); - if (index > 0) { - // handle case for first section - sorted_segments.get(index - 1).getEnd(); - } if (index + 1 < sorted_segments.size()) { // handle case for last section nextBegin = sorted_segments.get(index + 1).getBegin(); @@ -185,23 +182,16 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase { // Only create a segment if there is some text. // Handle the case where it's an empty segement if (nextBegin > prevEnd) { - Segment segment = new Segment(jCas); - segment.setBegin(prevEnd); - segment.setEnd(nextBegin); - segment.setId(s.getId()); + Segment segment = createSegment(jCas, prevEnd, nextBegin, s.getId()); segment.addToIndexes(); - segment.setPreferredText(section_names.get(s.getId())); - index++; + segment.setPreferredText(section_names.get(s.getId())); } // handle case where there is only a single SIMPLE_SEGMENT else if (nextBegin == prevEnd && nextBegin > 0 && index == 0) { - Segment segment = new Segment(jCas); - segment.setBegin(0); - segment.setEnd(nextBegin); - segment.setId(s.getId()); + Segment segment = createSegment(jCas, 0, nextBegin, s.getId()); segment.addToIndexes(); - index++; } + index++; } } } diff --git ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java index b00733a4..e7be6313 100644 --- ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java +++ ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java @@ -18,9 +18,8 @@ */ package org.apache.ctakes.core.ae; -import junit.framework.Assert; - import org.apache.ctakes.typesystem.type.textspan.Segment; +import org.apache.log4j.Logger; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.collection.CollectionReaderDescription; @@ -29,62 +28,111 @@ import 
org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.TypeSystemDescriptionFactory; import org.apache.uima.fit.pipeline.JCasIterable; +import org.apache.uima.fit.pipeline.JCasIterator; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.cleartk.util.cr.FilesCollectionReader; import org.junit.Test; +import java.util.Collection; +import java.util.Iterator; + +import static org.junit.Assert.*; + public class TestCDASegmentAnnotator { public static String INPUT_FILE = "../ctakes-regression-test/testdata/input/plaintext/doc2_07543210_sample_current.txt"; + static Logger LOGGER = Logger.getLogger(TestCDASegmentAnnotator.class); + + public static final void printSegments(JCas jCas) { + for (Segment segment : JCasUtil.select(jCas, Segment.class)) + LOGGER.info(String.format("Segment:%s\tBegin:%d\tEnd:%d\t%s", + segment.getId(), segment.getBegin(), segment.getEnd(), segment.getPreferredText())); + } + @Test - public void TestCDASegmentPipeLine() throws Exception { + public void TestCDASegmentPipeLine() throws ResourceInitializationException { TypeSystemDescription typeSystem = TypeSystemDescriptionFactory .createTypeSystemDescription(); - CollectionReaderDescription reader = CollectionReaderFactory - .createReaderDescription(FilesCollectionReader.class, - typeSystem, FilesCollectionReader.PARAM_ROOT_FILE, - INPUT_FILE); + CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription( + FilesCollectionReader.class, + typeSystem, + FilesCollectionReader.PARAM_ROOT_FILE, + INPUT_FILE); AnalysisEngineDescription sectionAnnotator = AnalysisEngineFactory .createEngineDescription(CDASegmentAnnotator.class, typeSystem); AnalysisEngineDescription dumpOutput = AnalysisEngineFactory.createEngineDescription( 
DumpOutputAE.class, typeSystem); + // SimplePipeline.runPipeline(reader, sectionAnnotator, dumpOutput); - JCasIterable casIter = new JCasIterable(reader, sectionAnnotator, - dumpOutput); - final String expected_hpi_section = "2.16.840.1.113883.10.20.22.2.20"; - final int expected_begin = 1634; - final int expected_end = 1696; - boolean section_exists = false; - int section_begin = 0; - int section_end = 0; - - for(JCas jCas : casIter){ - for (Segment segment : JCasUtil.select(jCas, Segment.class)) { - if (expected_hpi_section.equalsIgnoreCase(segment.getId())) { - section_exists = true; - section_begin = segment.getBegin(); - section_end = segment.getEnd(); - break; - } - } - } - Assert.assertEquals(section_exists, true); - Assert.assertEquals(expected_begin, section_begin); - Assert.assertEquals(expected_end, section_end); + JCasIterable casIter = new JCasIterable(reader, sectionAnnotator, dumpOutput); + JCasIterator casIt = casIter.iterator(); + + assertTrue(casIt.hasNext()); + JCas jCas = casIt.next(); + + // DEBUG: TestCDASegmentAnnotator.printSegments(jCas); + + // iterate through segments + Collection<Segment> segments = JCasUtil.select(jCas, Segment.class); + assertEquals("No. 
of segments are provided by: ctakes-regression-test/testdata/input/plaintext/doc2_07543210_sample_current.txt", + 6, segments.size()); + + Iterator<Segment> segIt = segments.iterator(); + + Segment segment = segIt.next(); + assertNotNull("Segment (0) should not be null", segment); + assertEquals("2.16.840.1.113883.10.20.22.1.1", segment.getId()); + assertEquals(92, segment.getBegin()); + assertEquals(159, segment.getEnd()); + assertEquals("Header", segment.getPreferredText()); + + segment = segIt.next(); + assertNotNull("Segment (1) should not be null", segment); + assertEquals("1.3.6.1.4.1.19376.1.5.3.1.1.13.2.1", segment.getId()); + assertEquals(176, segment.getBegin()); + assertEquals(1612, segment.getEnd()); + assertEquals("CHIEF COMPLAINT", segment.getPreferredText()); + + segment = segIt.next(); + assertNotNull("Segment (2) should not be null", segment); + assertEquals("2.16.840.1.113883.10.20.22.2.20", segment.getId()); + assertEquals(1634, segment.getBegin()); + assertEquals(1696, segment.getEnd()); + assertEquals("HISTORY OF PAST ILLNESS", segment.getPreferredText()); + + segment = segIt.next(); + assertNotNull("Segment (3) should not be null", segment); + assertEquals("2.16.840.1.113883.10.20.22.2.2.1", segment.getId()); + assertEquals(1711, segment.getBegin()); + assertEquals(2271, segment.getEnd()); + assertEquals("History of immunizations", segment.getPreferredText()); + + segment = segIt.next(); + assertNotNull("Segment (4) should not be null", segment); + assertEquals("2.16.840.1.113883.10.20.22.2.1.1", segment.getId()); + assertEquals(2307, segment.getBegin()); + assertEquals(3506, segment.getEnd()); + assertEquals("HISTORY OF MEDICATION USE", segment.getPreferredText()); + + segment = segIt.next(); + assertNotNull("Segment (5) should not be null", segment); + assertEquals("2.16.840.1.113883.10.20.22.2.15", segment.getId()); + assertEquals(3522, segment.getBegin()); + assertEquals(5608, segment.getEnd()); + assertEquals("Family History", 
segment.getPreferredText()); + + assertFalse("No other jCas should be found", casIt.hasNext()); } public static class DumpOutputAE extends JCasAnnotator_ImplBase { public void process(JCas jCas) throws AnalysisEngineProcessException { - for (Segment segment : JCasUtil.select(jCas, Segment.class)) { - System.out.println("Segment:" + segment.getId() + " Begin:" - + segment.getBegin() + " End:" + segment.getEnd()); - // System.out.println("Text" + segment.getCoveredText()); - } + TestCDASegmentAnnotator.printSegments(jCas); } } }