Santhosh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/123596

Change subject: Segmentation: Handle the case of repeated references
......................................................................

Segmentation: Handle the case of repeated references

Example: This is a sentence.[1][2][3][4]
Added tests and simplified the text handler

Change-Id: I992d9e02157e0649ff8bcb02992206c8a793548d
---
M segmentation/languages/CXParser.js
M segmentation/languages/hi/CXParserHi.js
M tests/segmentation/SegmentationTests.json
3 files changed, 35 insertions(+), 14 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/96/123596/1

diff --git a/segmentation/languages/CXParser.js 
b/segmentation/languages/CXParser.js
index 5713b2d..70c071c 100644
--- a/segmentation/languages/CXParser.js
+++ b/segmentation/languages/CXParser.js
@@ -90,6 +90,14 @@
 };
 
 /**
+ * Undo the end of a sentence
+ */
+CXParser.prototype.undoEndSentence = function () {
+       this.inSentence = true;
+       this.segmentedContent = this.segmentedContent.substr( 0, 
this.segmentedContent.length - 7 );
+};
+
+/**
  * Text handler
  * @param {string} text
  */
@@ -107,7 +115,6 @@
                var replacement, nextLetter;
 
                replacement = prevWord + sentenceSeparator;
-               //console.log([match, prevWord, sentenceSeparator, offset]);
                nextLetter = sentence[ offset + match.length ];
                if ( prevWord && prevWord.length < 3 && prevWord[ 0 
].toUpperCase() === prevWord[ 0 ] ||
                        nextLetter && nextLetter.toLowerCase() === nextLetter ) 
{
@@ -120,11 +127,6 @@
        }
 
        text = text.replace( /(\w*)([.!?][\s])/g, textSplit );
-       // content terminating with [.|!|?]
-       text = text.replace( /([.!?])$/, function ( match, p1 ) {
-               parser.seenSentenceEnd = true;
-               return p1; // + parser.endSentence();
-       } );
        this.print( text );
 };
 
@@ -155,18 +157,35 @@
                        this.print( this.endSentence() );
                }
        }
+
        if ( tag.name === 'a' && !this.inSentence ) {
                // sentences starting with a link
                this.print( this.startSentence() );
        }
 
-       if ( tag.name === 'span' && this.seenSentenceEnd &&
-               tag.attributes.class === 'reference' && this.inSentence ) {
+       if ( tag.name === 'span' &&
+               tag.attributes.class === 'reference' && ( this.inSentence || 
this.inReference ) ) {
                // Sentences staring with reference links.
                // Example: Sentence one.[1] Sentence two
                // Here [1] is not part of Sentence two. It is reference for 
Sentence one.
+               // It is also possible to have these references repeated n times
+               // Example: Sentence one.[1][2][3][4] Sentence two
+               if ( this.inReference ) {
+                       // This is already in Reference state. That means, this 
is a
+                       // case of multiple references. We need to remove the 
last sentence close
+                       this.inSentence = true;
+                       this.undoEndSentence();
+               }
                this.inReference = true;
        }
+
+       // Check if we need to reset inReference state. References contain an 
'a' tag
+       // inside a 'span' tag
+       if ( this.inReference && !( tag.name === 'a' || tag.name === 'span' ) ) 
{
+               // Reset inReference
+               this.inReference = false;
+       }
+
        // Start of tag
        this.print( '<' + tag.name );
 
@@ -208,7 +227,8 @@
        } else {
                this.print( '</' + tag + '>' );
        }
-
+       // See if we have to print the left over </span>
+       // from reference handling
        if ( tag === 'span' && this.inReference ) {
                this.print( this.endSentence() );
        }
diff --git a/segmentation/languages/hi/CXParserHi.js 
b/segmentation/languages/hi/CXParserHi.js
index 06aa299..8615f80 100644
--- a/segmentation/languages/hi/CXParserHi.js
+++ b/segmentation/languages/hi/CXParserHi.js
@@ -32,10 +32,6 @@
                return replacement;
        }
        text = text.replace( /([a-zA-Zअ-ह]*)([।!?][\s])/g, textSplit );
-       // content terminating with [.|!|?]
-       text = text.replace( /([।!?])$/, function ( match, p1 ) {
-               return p1 + parser.endSentence();
-       } );
        this.print( text );
 };
 
diff --git a/tests/segmentation/SegmentationTests.json 
b/tests/segmentation/SegmentationTests.json
index 774039a..7fc0194 100644
--- a/tests/segmentation/SegmentationTests.json
+++ b/tests/segmentation/SegmentationTests.json
@@ -35,12 +35,17 @@
                {
                        "desc": "References can appear after period. Example: 
Hydrogen is a gas. [1] It is ...",
                        "source": "<p>Sentence one. <span 
class=\"reference\"><a href=\"#\">reference</a></span> Starts with 
reference</p>",
-                       "result": "<p id=\"0\"><span class=\"cx-segment\" 
data-segmentid=\"1\">Sentence one. </span><span class=\"cx-segment\" 
data-segmentid=\"2\"><span class=\"reference\"><a class=\"cx-link\" 
data-linkid=\"3\" href=\"#\">reference</a></span> Starts with 
reference</span></p>"
+                       "result": "<p id=\"0\"><span class=\"cx-segment\" 
data-segmentid=\"1\">Sentence one. </span><span class=\"cx-segment\" 
data-segmentid=\"2\"><span class=\"reference\"><a class=\"cx-link\" 
data-linkid=\"3\" href=\"#\">reference</a></span></span><span 
class=\"cx-segment\" data-segmentid=\"4\"> Starts with reference</span></p>"
                },
                {
                        "desc": "References can appear after period without 
space. Example: Hydrogen is a gas.[1] It is ...",
                        "source": "<p>Sentence one.<span class=\"reference\"><a 
href=\"#\">reference</a></span> Starts with reference</p>",
                        "result": "<p id=\"0\"><span class=\"cx-segment\" 
data-segmentid=\"1\">Sentence one.<span class=\"reference\"><a 
class=\"cx-link\" data-linkid=\"2\" href=\"#\">reference</a></span></span><span 
class=\"cx-segment\" data-segmentid=\"3\"> Starts with reference</span></p>"
+               },
+               {
+                       "desc": "References can appear after period without 
space, repeated. Example: Hydrogen is a gas.[1][2][3] It is ...",
+                       "source": "<p>Sentence one.<span class=\"reference\"><a 
href=\"#\">1</a></span><span class=\"reference\"><a 
href=\"#\">2</a></span><span class=\"reference\"><a href=\"#\">3</a></span> 
Starts with reference</p>",
+                       "result": "<p id=\"0\"><span class=\"cx-segment\" 
data-segmentid=\"1\">Sentence one.<span class=\"reference\"><a 
class=\"cx-link\" data-linkid=\"2\" href=\"#\">1</a></span><span 
class=\"reference\"><a class=\"cx-link\" data-linkid=\"3\" 
href=\"#\">2</a></span><span class=\"reference\"><a class=\"cx-link\" 
data-linkid=\"4\" href=\"#\">3</a></span></span><span class=\"cx-segment\" 
data-segmentid=\"5\"> Starts with reference</span></p>"
                }
        ],
        "hi": [

-- 
To view, visit https://gerrit.wikimedia.org/r/123596
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I992d9e02157e0649ff8bcb02992206c8a793548d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to