jenkins-bot has submitted this change and it was merged.

Change subject: Segmentation: More fixes related to references at end of 
sentence
......................................................................


Segmentation: More fixes related to references at end of sentence

References need special handling because they can appear after a period
but still be part of the previous sentence.

It is also possible for them to be repeated. Example: "...end.[1][2]"

To handle such repeated references, we undo the last segment close.

But there were some bugs in resetting the inReference flag, causing the
"undo end sentence" step to delete actual content.

This patch fixes them and adds more tests.

Change-Id: Icc4d1b6ddad804d5365a0001c7b93f3ac78c9b23
---
M segmentation/languages/CXParser.js
M tests/segmentation/SegmentationTests.json
A tests/segmentation/data/result-15.html
A tests/segmentation/data/result-ends-with-bracket.html
A tests/segmentation/data/result-ends-with-references-missing-letters.html
A tests/segmentation/data/test-15.html
A tests/segmentation/data/test-ends-with-bracket.html
A tests/segmentation/data/test-ends-with-references-missing-letters.html
8 files changed, 39 insertions(+), 10 deletions(-)

Approvals:
  KartikMistry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/segmentation/languages/CXParser.js 
b/segmentation/languages/CXParser.js
index bf1ce9e..076c500 100644
--- a/segmentation/languages/CXParser.js
+++ b/segmentation/languages/CXParser.js
@@ -1,6 +1,7 @@
 'use strict';
 
 var SAXParser = require( 'sax' ).SAXParser,
+       logger = require( __dirname + '/../../utils/Logger.js' ),
        util = require( 'util' );
 
 /**
@@ -79,6 +80,7 @@
 CXParser.prototype.startSentence = function () {
        this.inSentence = true;
        this.sawSentenceEndCandidate = false;
+       this.inReference = false;
        return '\n\t<span class="cx-segment" data-segmentid="' + ( 
this.segmentCount++ ) + '">';
 };
 
@@ -94,8 +96,15 @@
  * Undo end a sentence
  */
 CXParser.prototype.undoEndSentence = function () {
+       var lastClose;
        this.inSentence = true;
-       this.segmentedContent = this.segmentedContent.substr( 0, 
this.segmentedContent.length - 7 );
+       lastClose = this.segmentedContent.substr( this.segmentedContent.length 
- 7, this.segmentedContent.length );
+       // Make sure we are not deleting anything other than close tag.
+       if ( lastClose === '</span>' ) {
+               this.segmentedContent = this.segmentedContent.substr( 0, 
this.segmentedContent.length - 7 );
+       } else {
+               logger.warn( 'Error in undoEndSentence. Attempted deleting 
content' );
+       }
 };
 
 /**
@@ -174,7 +183,7 @@
 
        if ( this.sawSentenceEndCandidate ) {
                if ( tag.name === 'span' &&
-                       tag.attributes.class === 'reference' && ( 
this.inSentence || this.inReference ) && this.sawSentenceEndCandidate ) {
+                       tag.attributes.class === 'reference' && ( 
this.inSentence || this.inReference ) ) {
                        // Sentences staring with reference links.
                        // Example: Sentence one.[1] Sentence two
                        // Here [1] is not part of Sentence two. It is 
reference for Sentence one.
@@ -188,14 +197,13 @@
                        }
                        this.inReference = true;
                }
-
-               // Check if we need to reset inReference state. References 
contains an 'a' tag
-               // inside 'span' tag
-               if ( this.inReference && !( tag.name === 'a' || tag.name === 
'span' ) ) {
-                       // Reset inReference
-                       this.inReference = false;
-                       this.sawSentenceEndCandidate = false;
-               }
+       }
+       // Check if we need to reset inReference state. References contains an 
'a' tag
+       // inside 'span' tag
+       if ( this.inReference && tag.name !== 'a' && tag.name !== 'span' ) {
+               // Reset inReference
+               this.inReference = false;
+               this.sawSentenceEndCandidate = false;
        }
        // Start of tag
        this.print( '<' + tag.name );
diff --git a/tests/segmentation/SegmentationTests.json 
b/tests/segmentation/SegmentationTests.json
index 205bf1f..aec60ca 100644
--- a/tests/segmentation/SegmentationTests.json
+++ b/tests/segmentation/SegmentationTests.json
@@ -6,6 +6,11 @@
                        "result": "result-1.html"
                },
                {
+                       "desc": "Sentence ending with )",
+                       "source": "test-ends-with-bracket.html",
+                       "result": "result-ends-with-bracket.html"
+               },
+               {
                        "desc": "Exclamation, punctuation test",
                        "source": "test-2.html",
                        "result": "result-2.html"
@@ -69,6 +74,16 @@
                        "desc": "Paragraph from Debian article- repeating 
references",
                        "source": "test-debian-1.html",
                        "result": "result-debian-1.html"
+               },
+               {
+                       "desc": "References after closing bracket and period.",
+                       "source": "test-15.html",
+                       "result": "result-15.html"
+               },
+               {
+                       "desc": "Paragraph ending with reference and already 
having reference in between. The second reference should not be identified as 
repeating reference. If identified as repeating reference, 7 letters from last 
word will be missing.",
+                       "source": 
"test-ends-with-references-missing-letters.html",
+                       "result": 
"result-ends-with-references-missing-letters.html"
                }
        ],
        "hi": [
diff --git a/tests/segmentation/data/result-15.html 
b/tests/segmentation/data/result-15.html
new file mode 100644
index 0000000..b36bb5b
--- /dev/null
+++ b/tests/segmentation/data/result-15.html
@@ -0,0 +1 @@
+<p id="0"><span class="cx-segment" data-segmentid="1">When the GNU project 
first started they "had an <a class="cx-link" data-linkid="2" 
href="/wiki/Emacs" title="Emacs">Emacs</a> text editor with <a class="cx-link" 
data-linkid="3" href="/wiki/Lisp_(programming_language)" title="Lisp 
(programming language)">Lisp</a> for writing editor commands, a source level <a 
class="cx-link" data-linkid="4" href="/wiki/Debugger" 
title="Debugger">debugger</a>, a <a class="cx-link" data-linkid="5" 
href="/wiki/Yacc" title="Yacc" data-original-title="">yacc</a>-compatible <a 
class="cx-link" data-linkid="6" href="/wiki/Parsing" title="Parsing">parser</a> 
generator, and a <a class="cx-link" data-linkid="7" 
href="/wiki/Linker_(computing)" title="Linker (computing)">linker</a>".<span 
id="cite_ref-4" class="reference"><a class="cx-link" data-linkid="8" 
href="#cite_note-4">[4]</a></span></span><span class="cx-segment" 
data-segmentid="9"> The GNU system required its own C compiler and tools to be 
free software, so that these also had to be developed. </span><span 
class="cx-segment" data-segmentid="10">By June 1987 the project had accumulated 
and developed free software for an assembler, an almost finished portable 
optimizing C compiler (<a class="cx-link" data-linkid="11" 
href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" 
data-original-title="">GCC</a>), an editor (<a class="cx-link" data-linkid="12" 
href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities 
(such as <code>ls</code>, <code>grep</code>, <code>awk</code>, 
<code>make</code> and <code>ld</code>).<span id="cite_ref-5" 
class="reference"><a class="cx-link" data-linkid="13" href="#cite_note-5" 
title="" data-original-title="">[5]</a></span></span><span class="cx-segment" 
data-segmentid="14"> They had an initial kernel that needed more 
updates.</span></p>
diff --git a/tests/segmentation/data/result-ends-with-bracket.html 
b/tests/segmentation/data/result-ends-with-bracket.html
new file mode 100644
index 0000000..cb1b849
--- /dev/null
+++ b/tests/segmentation/data/result-ends-with-bracket.html
@@ -0,0 +1 @@
+<p id="0"><span class="cx-segment" data-segmentid="1">By June 1987 the project 
had accumulated and developed free software for an assembler, an almost 
finished portable optimizing C compiler (GCC), an editor (GNU Emacs), and 
various Unix utilities (such as <code>ls</code>, <code>grep</code>, 
<code>awk</code>, <code>make</code> and <code>ld</code>). </span><span 
class="cx-segment" data-segmentid="2">They had an initial kernel that needed 
more updates.</span></p>
diff --git 
a/tests/segmentation/data/result-ends-with-references-missing-letters.html 
b/tests/segmentation/data/result-ends-with-references-missing-letters.html
new file mode 100644
index 0000000..4af0e04
--- /dev/null
+++ b/tests/segmentation/data/result-ends-with-references-missing-letters.html
@@ -0,0 +1 @@
+<p id="0"><span class="cx-segment" data-segmentid="1">By June 1987 the project 
had accumulated and developed free software for an assembler, an almost 
finished portable optimizing C compiler (<a class="cx-link" data-linkid="2" 
href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" 
data-original-title="">GCC</a>), an editor (<a class="cx-link" data-linkid="3" 
href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities 
(such as ls, grep, awk, make and ld).<span id="cite_ref-5" class="reference"><a 
class="cx-link" data-linkid="4" href="#cite_note-5" title="" 
data-original-title="">[5]</a></span></span><span class="cx-segment" 
data-segmentid="5"> They had an initial kernel that needed more updates. 
</span><span class="cx-segment" data-segmentid="6">By June 1987 the project had 
accumulated and developed free software for an assembler, an almost finished 
portable optimizing C compiler (<a class="cx-link" data-linkid="7" 
href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" 
data-original-title="">GCC</a>), an editor (<a class="cx-link" data-linkid="8" 
href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities 
(such as ls, grep, awk, make and ld.<span id="cite_ref-5" class="reference"><a 
class="cx-link" data-linkid="9" href="#cite_note-5" title="" 
data-original-title="">[6]</a></span></span></p>
diff --git a/tests/segmentation/data/test-15.html 
b/tests/segmentation/data/test-15.html
new file mode 100644
index 0000000..d464ef8
--- /dev/null
+++ b/tests/segmentation/data/test-15.html
@@ -0,0 +1 @@
+<p>When the GNU project first started they "had an <a href="/wiki/Emacs" 
title="Emacs">Emacs</a> text editor with <a 
href="/wiki/Lisp_(programming_language)" title="Lisp (programming 
language)">Lisp</a> for writing editor commands, a source level <a 
href="/wiki/Debugger" title="Debugger">debugger</a>, a <a href="/wiki/Yacc" 
title="Yacc" data-original-title="">yacc</a>-compatible <a href="/wiki/Parsing" 
title="Parsing">parser</a> generator, and a <a href="/wiki/Linker_(computing)" 
title="Linker (computing)">linker</a>".<span id="cite_ref-4" 
class="reference"><a href="#cite_note-4">[4]</a></span> The GNU system required 
its own C compiler and tools to be free software, so that these also had to be 
developed. By June 1987 the project had accumulated and developed free software 
for an assembler, an almost finished portable optimizing C compiler (<a 
href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" 
data-original-title="">GCC</a>), an editor (<a href="/wiki/Emacs" 
title="Emacs">GNU Emacs</a>), and various Unix utilities (such as 
<code>ls</code>, <code>grep</code>, <code>awk</code>, <code>make</code> and 
<code>ld</code>).<span id="cite_ref-5" class="reference"><a href="#cite_note-5" 
title="" data-original-title="">[5]</a></span> They had an initial kernel that 
needed more updates.</p>
\ No newline at end of file
diff --git a/tests/segmentation/data/test-ends-with-bracket.html 
b/tests/segmentation/data/test-ends-with-bracket.html
new file mode 100644
index 0000000..b28319d
--- /dev/null
+++ b/tests/segmentation/data/test-ends-with-bracket.html
@@ -0,0 +1 @@
+<p>By June 1987 the project had accumulated and developed free software for an 
assembler, an almost finished portable optimizing C compiler (GCC), an editor 
(GNU Emacs), and various Unix utilities (such as <code>ls</code>, 
<code>grep</code>, <code>awk</code>, <code>make</code> and <code>ld</code>). 
They had an initial kernel that needed more updates.</p>
\ No newline at end of file
diff --git 
a/tests/segmentation/data/test-ends-with-references-missing-letters.html 
b/tests/segmentation/data/test-ends-with-references-missing-letters.html
new file mode 100644
index 0000000..79ec575
--- /dev/null
+++ b/tests/segmentation/data/test-ends-with-references-missing-letters.html
@@ -0,0 +1 @@
+<p>By June 1987 the project had accumulated and developed free software for an 
assembler, an almost finished portable optimizing C compiler (<a 
href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" 
data-original-title="">GCC</a>), an editor (<a href="/wiki/Emacs" 
title="Emacs">GNU Emacs</a>), and various Unix utilities (such as ls, grep, 
awk, make and ld).<span id="cite_ref-5" class="reference"><a 
href="#cite_note-5" title="" data-original-title="">[5]</a></span> They had an 
initial kernel that needed more updates. By June 1987 the project had 
accumulated and developed free software for an assembler, an almost finished 
portable optimizing C compiler (<a href="/wiki/GNU_Compiler_Collection" 
title="GNU Compiler Collection" data-original-title="">GCC</a>), an editor (<a 
href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities 
(such as ls, grep, awk, make and ld.<span id="cite_ref-5" class="reference"><a 
href="#cite_note-5" title="" data-original-title="">[6]</a></span></p>
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/125103
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Icc4d1b6ddad804d5365a0001c7b93f3ac78c9b23
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>
Gerrit-Reviewer: KartikMistry <kartik.mis...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to