Santhosh has uploaded a new change for review. https://gerrit.wikimedia.org/r/123596
Change subject: Segmentation: Handle the case of repeated references ...................................................................... Segmentation: Handle the case of repeated references Example: This is a sentence.[1][2][3][4] Added tests and simplified the text handler Change-Id: I992d9e02157e0649ff8bcb02992206c8a793548d --- M segmentation/languages/CXParser.js M segmentation/languages/hi/CXParserHi.js M tests/segmentation/SegmentationTests.json 3 files changed, 35 insertions(+), 14 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/96/123596/1 diff --git a/segmentation/languages/CXParser.js b/segmentation/languages/CXParser.js index 5713b2d..70c071c 100644 --- a/segmentation/languages/CXParser.js +++ b/segmentation/languages/CXParser.js @@ -90,6 +90,14 @@ }; /** + * Undo the end of a sentence + */ +CXParser.prototype.undoEndSentence = function () { + this.inSentence = true; + this.segmentedContent = this.segmentedContent.substr( 0, this.segmentedContent.length - 7 ); +}; + +/** * Text handler * @param {string} text */ @@ -107,7 +115,6 @@ var replacement, nextLetter; replacement = prevWord + sentenceSeparator; - //console.log([match, prevWord, sentenceSeparator, offset]); nextLetter = sentence[ offset + match.length ]; if ( prevWord && prevWord.length < 3 && prevWord[ 0 ].toUpperCase() === prevWord[ 0 ] || nextLetter && nextLetter.toLowerCase() === nextLetter ) { @@ -120,11 +127,6 @@ } text = text.replace( /(\w*)([.!?][\s])/g, textSplit ); - // content terminating with [.|!|?] 
- text = text.replace( /([.!?])$/, function ( match, p1 ) { - parser.seenSentenceEnd = true; - return p1; // + parser.endSentence(); - } ); this.print( text ); }; @@ -155,18 +157,35 @@ this.print( this.endSentence() ); } } + if ( tag.name === 'a' && !this.inSentence ) { // sentences starting with a link this.print( this.startSentence() ); } - if ( tag.name === 'span' && this.seenSentenceEnd && - tag.attributes.class === 'reference' && this.inSentence ) { + if ( tag.name === 'span' && + tag.attributes.class === 'reference' && ( this.inSentence || this.inReference ) ) { // Sentences staring with reference links. // Example: Sentence one.[1] Sentence two // Here [1] is not part of Sentence two. It is reference for Sentence one. + // It is also possible to have these references repeated n times + // Example: Sentence one.[1][2][3][4] Sentence two + if ( this.inReference ) { + // This is already in Reference state. That means, this is a + // case of multiple references. We need to remove the last sentence close + this.inSentence = true; + this.undoEndSentence(); + } this.inReference = true; } + + // Check if we need to reset inReference state. References contain an 'a' tag + // inside a 'span' tag + if ( this.inReference && !( tag.name === 'a' || tag.name === 'span' ) ) { + // Reset inReference + this.inReference = false; + } + // Start of tag this.print( '<' + tag.name ); @@ -208,7 +227,8 @@ } else { this.print( '</' + tag + '>' ); } - + // See if we have to print the left over </span> + // from reference handling if ( tag === 'span' && this.inReference ) { this.print( this.endSentence() ); } diff --git a/segmentation/languages/hi/CXParserHi.js b/segmentation/languages/hi/CXParserHi.js index 06aa299..8615f80 100644 --- a/segmentation/languages/hi/CXParserHi.js +++ b/segmentation/languages/hi/CXParserHi.js @@ -32,10 +32,6 @@ return replacement; } text = text.replace( /([a-zA-Zअ-ह]*)([।!?][\s])/g, textSplit ); - // content terminating with [.|!|?] 
- text = text.replace( /([।!?])$/, function ( match, p1 ) { - return p1 + parser.endSentence(); - } ); this.print( text ); }; diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json index 774039a..7fc0194 100644 --- a/tests/segmentation/SegmentationTests.json +++ b/tests/segmentation/SegmentationTests.json @@ -35,12 +35,17 @@ { "desc": "References can appear after period. Example: Hydrogen is a gas. [1] It is ...", "source": "<p>Sentence one. <span class=\"reference\"><a href=\"#\">reference</a></span> Starts with reference</p>", - "result": "<p id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\">Sentence one. </span><span class=\"cx-segment\" data-segmentid=\"2\"><span class=\"reference\"><a class=\"cx-link\" data-linkid=\"3\" href=\"#\">reference</a></span> Starts with reference</span></p>" + "result": "<p id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\">Sentence one. </span><span class=\"cx-segment\" data-segmentid=\"2\"><span class=\"reference\"><a class=\"cx-link\" data-linkid=\"3\" href=\"#\">reference</a></span></span><span class=\"cx-segment\" data-segmentid=\"4\"> Starts with reference</span></p>" }, { "desc": "References can appear after period without space. Example: Hydrogen is a gas.[1] It is ...", "source": "<p>Sentence one.<span class=\"reference\"><a href=\"#\">reference</a></span> Starts with reference</p>", "result": "<p id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\">Sentence one.<span class=\"reference\"><a class=\"cx-link\" data-linkid=\"2\" href=\"#\">reference</a></span></span><span class=\"cx-segment\" data-segmentid=\"3\"> Starts with reference</span></p>" + }, + { + "desc": "References can appear after period without space, repeated. 
Example: Hydrogen is a gas.[1][2][3] It is ...", + "source": "<p>Sentence one.<span class=\"reference\"><a href=\"#\">1</a></span><span class=\"reference\"><a href=\"#\">2</a></span><span class=\"reference\"><a href=\"#\">3</a></span> Starts with reference</p>", + "result": "<p id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\">Sentence one.<span class=\"reference\"><a class=\"cx-link\" data-linkid=\"2\" href=\"#\">1</a></span><span class=\"reference\"><a class=\"cx-link\" data-linkid=\"3\" href=\"#\">2</a></span><span class=\"reference\"><a class=\"cx-link\" data-linkid=\"4\" href=\"#\">3</a></span></span><span class=\"cx-segment\" data-segmentid=\"5\"> Starts with reference</span></p>" } ], "hi": [ -- To view, visit https://gerrit.wikimedia.org/r/123596 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I992d9e02157e0649ff8bcb02992206c8a793548d Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits