Santhosh has uploaded a new change for review. https://gerrit.wikimedia.org/r/123522
Change subject: Segmentation: Reference handling improvements ...................................................................... Segmentation: Reference handling improvements This patch tries to improve the segmentation of cases like: First sentence.[1] Another sentence. We cannot split at the period because [1] is a reference link for the first sentence. Change-Id: Id0a16d7ab77d9860e5690090f15f8f07679ca5c4 --- M segmentation/languages/CXParser.js M tests/segmentation/SegmentationTests.json 2 files changed, 18 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/22/123522/1 diff --git a/segmentation/languages/CXParser.js b/segmentation/languages/CXParser.js index a57502f..5713b2d 100644 --- a/segmentation/languages/CXParser.js +++ b/segmentation/languages/CXParser.js @@ -122,7 +122,8 @@ text = text.replace( /(\w*)([.!?][\s])/g, textSplit ); // content terminating with [.|!|?] text = text.replace( /([.!?])$/, function ( match, p1 ) { - return p1 + parser.endSentence(); + parser.seenSentenceEnd = true; + return p1; // + parser.endSentence(); } ); this.print( text ); }; @@ -159,6 +160,13 @@ this.print( this.startSentence() ); } + if ( tag.name === 'span' && this.seenSentenceEnd && + tag.attributes.class === 'reference' && this.inSentence ) { + // Sentences staring with reference links. + // Example: Sentence one.[1] Sentence two + // Here [1] is not part of Sentence two. It is reference for Sentence one. 
+ this.inReference = true; + } // Start of tag this.print( '<' + tag.name ); @@ -200,6 +208,10 @@ } else { this.print( '</' + tag + '>' ); } + + if ( tag === 'span' && this.inReference ) { + this.print( this.endSentence() ); + } }; module.exports = CXParser; diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json index b150841..774039a 100644 --- a/tests/segmentation/SegmentationTests.json +++ b/tests/segmentation/SegmentationTests.json @@ -36,6 +36,11 @@ "desc": "References can appear after period. Example: Hydrogen is a gas. [1] It is ...", "source": "<p>Sentence one. <span class=\"reference\"><a href=\"#\">reference</a></span> Starts with reference</p>", "result": "<p id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\">Sentence one. </span><span class=\"cx-segment\" data-segmentid=\"2\"><span class=\"reference\"><a class=\"cx-link\" data-linkid=\"3\" href=\"#\">reference</a></span> Starts with reference</span></p>" + }, + { + "desc": "References can appear after period without space. 
Example: Hydrogen is a gas.[1] It is ...", + "source": "<p>Sentence one.<span class=\"reference\"><a href=\"#\">reference</a></span> Starts with reference</p>", + "result": "<p id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\">Sentence one.<span class=\"reference\"><a class=\"cx-link\" data-linkid=\"2\" href=\"#\">reference</a></span></span><span class=\"cx-segment\" data-segmentid=\"3\"> Starts with reference</span></p>" } ], "hi": [ -- To view, visit https://gerrit.wikimedia.org/r/123522 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id0a16d7ab77d9860e5690090f15f8f07679ca5c4 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits