Santhosh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/123522

Change subject: Segmentation: Reference handling improvements
......................................................................

Segmentation: Reference handling improvements

This patch tries to improve the segmentation of cases like
First sentence.[1] Another sentence

We cannot split at the period because [1] is a reference link for
the first sentence.

Change-Id: Id0a16d7ab77d9860e5690090f15f8f07679ca5c4
---
M segmentation/languages/CXParser.js
M tests/segmentation/SegmentationTests.json
2 files changed, 18 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/22/123522/1

diff --git a/segmentation/languages/CXParser.js 
b/segmentation/languages/CXParser.js
index a57502f..5713b2d 100644
--- a/segmentation/languages/CXParser.js
+++ b/segmentation/languages/CXParser.js
@@ -122,7 +122,8 @@
        text = text.replace( /(\w*)([.!?][\s])/g, textSplit );
        // content terminating with [.|!|?]
        text = text.replace( /([.!?])$/, function ( match, p1 ) {
-               return p1 + parser.endSentence();
+               parser.seenSentenceEnd = true;
+               return p1; // + parser.endSentence();
        } );
        this.print( text );
 };
@@ -159,6 +160,13 @@
                this.print( this.startSentence() );
        }
 
+       if ( tag.name === 'span' && this.seenSentenceEnd &&
+               tag.attributes.class === 'reference' && this.inSentence ) {
+               // Sentences starting with reference links.
+               // Example: Sentence one.[1] Sentence two
+               // Here [1] is not part of Sentence two. It is reference for 
Sentence one.
+               this.inReference = true;
+       }
        // Start of tag
        this.print( '<' + tag.name );
 
@@ -200,6 +208,10 @@
        } else {
                this.print( '</' + tag + '>' );
        }
+
+       if ( tag === 'span' && this.inReference ) {
+               this.print( this.endSentence() );
+       }
 };
 
 module.exports = CXParser;
diff --git a/tests/segmentation/SegmentationTests.json 
b/tests/segmentation/SegmentationTests.json
index b150841..774039a 100644
--- a/tests/segmentation/SegmentationTests.json
+++ b/tests/segmentation/SegmentationTests.json
@@ -36,6 +36,11 @@
                        "desc": "References can appear after period. Example: 
Hydrogen is a gas. [1] It is ...",
                        "source": "<p>Sentence one. <span 
class=\"reference\"><a href=\"#\">reference</a></span> Starts with 
reference</p>",
                        "result": "<p id=\"0\"><span class=\"cx-segment\" 
data-segmentid=\"1\">Sentence one. </span><span class=\"cx-segment\" 
data-segmentid=\"2\"><span class=\"reference\"><a class=\"cx-link\" 
data-linkid=\"3\" href=\"#\">reference</a></span> Starts with 
reference</span></p>"
+               },
+               {
+                       "desc": "References can appear after period without 
space. Example: Hydrogen is a gas.[1] It is ...",
+                       "source": "<p>Sentence one.<span class=\"reference\"><a 
href=\"#\">reference</a></span> Starts with reference</p>",
+                       "result": "<p id=\"0\"><span class=\"cx-segment\" 
data-segmentid=\"1\">Sentence one.<span class=\"reference\"><a 
class=\"cx-link\" data-linkid=\"2\" href=\"#\">reference</a></span></span><span 
class=\"cx-segment\" data-segmentid=\"3\"> Starts with reference</span></p>"
                }
        ],
        "hi": [

-- 
To view, visit https://gerrit.wikimedia.org/r/123522
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id0a16d7ab77d9860e5690090f15f8f07679ca5c4
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to