Hi,
I have provided another patch for CTAKES-450[1].
The fix provided in the original patch is correct. However, I tried to
address also small issues and some refactoring, especially addressing the
Unit Testing. If you think it requires more explanation, let me know.
* extracted methods
* removed dead code: sorted_segments.get(index + 1).getBegin();
* replaced deprecated junit.framework.Assert with org.junit.Assert
* assert for all elements in the jCAS, so in the future, if Segments aren't
processed well to be automatically identified
* replaced System.out.println with logger.
I think, this commit can fix and close CTAKES-450.
This diff is on top of latest trunk. As such, patching should be
straightforward.
I look forward to your feedback,
Alex Zbarcea
[1] - https://issues.apache.org/jira/browse/CTAKES-450
diff --git ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java
index 7a56cee7..dd18815c 100644
--- ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java
+++ ctakes-core/src/main/java/org/apache/ctakes/core/ae/CDASegmentAnnotator.java
@@ -133,8 +133,16 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase {
return p;
}
+ private final Segment createSegment(JCas jCas, int begin, int end, String id) {
+ Segment segment = new Segment(jCas);
+ segment.setBegin(begin);
+ segment.setEnd(end);
+ segment.setId(id);
+ return segment;
+ }
+
@Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
+ public void process(JCas jCas) throws AnalysisEngineProcessException {
String text = jCas.getDocumentText();
if (text == null) {
String docId = DocumentIDAnnotationUtil.getDocumentID(jCas);
@@ -146,20 +154,13 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase {
// System.out.println("Pattern" + p);
Matcher m = p.matcher(text);
while (m.find()) {
- Segment segment = new Segment(jCas);
- segment.setBegin(m.start());
- segment.setEnd(m.end());
- segment.setId(id);
+ Segment segment = createSegment(jCas, m.start(), m.end(), id);
sorted_segments.add(segment);
}
}
- // If there are non segments, create a simple one that spans the
- // entire doc
- if (sorted_segments.size() <= 0) {
- Segment header = new Segment(jCas);
- header.setBegin(0);
- header.setEnd(text.length());
- header.setId(SIMPLE_SEGMENT);
+ // If there are non segments, create a simple one that spans the entire doc
+ if (sorted_segments.isEmpty()) {
+ Segment header = createSegment(jCas, 0, text.length(), SIMPLE_SEGMENT);
sorted_segments.add(header);
}
// TODO: this is kinda redundant, but needed the sections in sorted
@@ -174,10 +175,6 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase {
for (Segment s : sorted_segments) {
int prevEnd = s.getEnd();
int nextBegin = text.length();
- if (index > 0) {
- // handle case for first section
- sorted_segments.get(index - 1).getEnd();
- }
if (index + 1 < sorted_segments.size()) {
// handle case for last section
nextBegin = sorted_segments.get(index + 1).getBegin();
@@ -185,23 +182,16 @@ public class CDASegmentAnnotator extends JCasAnnotator_ImplBase {
// Only create a segment if there is some text.
// Handle the case where it's an empty segement
if (nextBegin > prevEnd) {
- Segment segment = new Segment(jCas);
- segment.setBegin(prevEnd);
- segment.setEnd(nextBegin);
- segment.setId(s.getId());
+ Segment segment = createSegment(jCas, prevEnd, nextBegin, s.getId());
segment.addToIndexes();
- segment.setPreferredText(section_names.get(s.getId()));
- index++;
+ segment.setPreferredText(section_names.get(s.getId()));
}
// handle case where there is only a single SIMPLE_SEGMENT
else if (nextBegin == prevEnd && nextBegin > 0 && index == 0) {
- Segment segment = new Segment(jCas);
- segment.setBegin(0);
- segment.setEnd(nextBegin);
- segment.setId(s.getId());
+ Segment segment = createSegment(jCas, 0, nextBegin, s.getId());
segment.addToIndexes();
- index++;
}
+ index++;
}
}
}
diff --git ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java
index b00733a4..e7be6313 100644
--- ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java
+++ ctakes-core/src/test/java/org/apache/ctakes/core/ae/TestCDASegmentAnnotator.java
@@ -18,9 +18,8 @@
*/
package org.apache.ctakes.core.ae;
-import junit.framework.Assert;
-
import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.log4j.Logger;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionReaderDescription;
@@ -29,62 +28,111 @@ import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.fit.pipeline.JCasIterable;
+import org.apache.uima.fit.pipeline.JCasIterator;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.cleartk.util.cr.FilesCollectionReader;
import org.junit.Test;
+import java.util.Collection;
+import java.util.Iterator;
+
+import static org.junit.Assert.*;
+
public class TestCDASegmentAnnotator {
public static String INPUT_FILE = "../ctakes-regression-test/testdata/input/plaintext/doc2_07543210_sample_current.txt";
+ static Logger LOGGER = Logger.getLogger(TestCDASegmentAnnotator.class);
+
+ public static final void printSegments(JCas jCas) {
+ for (Segment segment : JCasUtil.select(jCas, Segment.class))
+ LOGGER.info(String.format("Segment:%s\tBegin:%d\tEnd:%d\t%s",
+ segment.getId(), segment.getBegin(), segment.getEnd(), segment.getPreferredText()));
+ }
+
@Test
- public void TestCDASegmentPipeLine() throws Exception {
+ public void TestCDASegmentPipeLine() throws ResourceInitializationException {
TypeSystemDescription typeSystem = TypeSystemDescriptionFactory
.createTypeSystemDescription();
- CollectionReaderDescription reader = CollectionReaderFactory
- .createReaderDescription(FilesCollectionReader.class,
- typeSystem, FilesCollectionReader.PARAM_ROOT_FILE,
- INPUT_FILE);
+ CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
+ FilesCollectionReader.class,
+ typeSystem,
+ FilesCollectionReader.PARAM_ROOT_FILE,
+ INPUT_FILE);
AnalysisEngineDescription sectionAnnotator = AnalysisEngineFactory
.createEngineDescription(CDASegmentAnnotator.class, typeSystem);
AnalysisEngineDescription dumpOutput = AnalysisEngineFactory.createEngineDescription(
DumpOutputAE.class, typeSystem);
+
// SimplePipeline.runPipeline(reader, sectionAnnotator, dumpOutput);
- JCasIterable casIter = new JCasIterable(reader, sectionAnnotator,
- dumpOutput);
- final String expected_hpi_section = "2.16.840.1.113883.10.20.22.2.20";
- final int expected_begin = 1634;
- final int expected_end = 1696;
- boolean section_exists = false;
- int section_begin = 0;
- int section_end = 0;
-
- for(JCas jCas : casIter){
- for (Segment segment : JCasUtil.select(jCas, Segment.class)) {
- if (expected_hpi_section.equalsIgnoreCase(segment.getId())) {
- section_exists = true;
- section_begin = segment.getBegin();
- section_end = segment.getEnd();
- break;
- }
- }
- }
- Assert.assertEquals(section_exists, true);
- Assert.assertEquals(expected_begin, section_begin);
- Assert.assertEquals(expected_end, section_end);
+ JCasIterable casIter = new JCasIterable(reader, sectionAnnotator, dumpOutput);
+ JCasIterator casIt = casIter.iterator();
+
+ assertTrue(casIt.hasNext());
+ JCas jCas = casIt.next();
+
+ // DEBUG: TestCDASegmentAnnotator.printSegments(jCas);
+
+ // iterate through segments
+ Collection<Segment> segments = JCasUtil.select(jCas, Segment.class);
+ assertEquals("No. of segments are provided by: ctakes-regression-test/testdata/input/plaintext/doc2_07543210_sample_current.txt",
+ 6, segments.size());
+
+ Iterator<Segment> segIt = segments.iterator();
+
+ Segment segment = segIt.next();
+ assertNotNull("Segment (0) should not be null", segment);
+ assertEquals("2.16.840.1.113883.10.20.22.1.1", segment.getId());
+ assertEquals(92, segment.getBegin());
+ assertEquals(159, segment.getEnd());
+ assertEquals("Header", segment.getPreferredText());
+
+ segment = segIt.next();
+ assertNotNull("Segment (1) should not be null", segment);
+ assertEquals("1.3.6.1.4.1.19376.1.5.3.1.1.13.2.1", segment.getId());
+ assertEquals(176, segment.getBegin());
+ assertEquals(1612, segment.getEnd());
+ assertEquals("CHIEF COMPLAINT", segment.getPreferredText());
+
+ segment = segIt.next();
+ assertNotNull("Segment (2) should not be null", segment);
+ assertEquals("2.16.840.1.113883.10.20.22.2.20", segment.getId());
+ assertEquals(1634, segment.getBegin());
+ assertEquals(1696, segment.getEnd());
+ assertEquals("HISTORY OF PAST ILLNESS", segment.getPreferredText());
+
+ segment = segIt.next();
+ assertNotNull("Segment (3) should not be null", segment);
+ assertEquals("2.16.840.1.113883.10.20.22.2.2.1", segment.getId());
+ assertEquals(1711, segment.getBegin());
+ assertEquals(2271, segment.getEnd());
+ assertEquals("History of immunizations", segment.getPreferredText());
+
+ segment = segIt.next();
+ assertNotNull("Segment (4) should not be null", segment);
+ assertEquals("2.16.840.1.113883.10.20.22.2.1.1", segment.getId());
+ assertEquals(2307, segment.getBegin());
+ assertEquals(3506, segment.getEnd());
+ assertEquals("HISTORY OF MEDICATION USE", segment.getPreferredText());
+
+ segment = segIt.next();
+ assertNotNull("Segment (5) should not be null", segment);
+ assertEquals("2.16.840.1.113883.10.20.22.2.15", segment.getId());
+ assertEquals(3522, segment.getBegin());
+ assertEquals(5608, segment.getEnd());
+ assertEquals("Family History", segment.getPreferredText());
+
+ assertFalse("No other jCas should be found", casIt.hasNext());
}
public static class DumpOutputAE extends JCasAnnotator_ImplBase {
public void process(JCas jCas) throws AnalysisEngineProcessException {
- for (Segment segment : JCasUtil.select(jCas, Segment.class)) {
- System.out.println("Segment:" + segment.getId() + " Begin:"
- + segment.getBegin() + " End:" + segment.getEnd());
- // System.out.println("Text" + segment.getCoveredText());
- }
+ TestCDASegmentAnnotator.printSegments(jCas);
}
}
}