jenkins-bot has submitted this change and it was merged. Change subject: Segmentation: More fixes related to references at end of sentence ......................................................................
Segmentation: More fixes related to references at end of sentence References need special handling because they can appear after a period but still be part of the previous sentence. It is also possible that they are repeated. Example: "...end.[1][2]" To handle such repeated references, we undo the last segment close. But there were some bugs in resetting the inReference flag, causing "undo end sentence" to delete actual content. This patch fixes them and adds more tests. Change-Id: Icc4d1b6ddad804d5365a0001c7b93f3ac78c9b23 --- M segmentation/languages/CXParser.js M tests/segmentation/SegmentationTests.json A tests/segmentation/data/result-15.html A tests/segmentation/data/result-ends-with-bracket.html A tests/segmentation/data/result-ends-with-references-missing-letters.html A tests/segmentation/data/test-15.html A tests/segmentation/data/test-ends-with-bracket.html A tests/segmentation/data/test-ends-with-references-missing-letters.html 8 files changed, 39 insertions(+), 10 deletions(-) Approvals: KartikMistry: Looks good to me, approved jenkins-bot: Verified diff --git a/segmentation/languages/CXParser.js b/segmentation/languages/CXParser.js index bf1ce9e..076c500 100644 --- a/segmentation/languages/CXParser.js +++ b/segmentation/languages/CXParser.js @@ -1,6 +1,7 @@ 'use strict'; var SAXParser = require( 'sax' ).SAXParser, + logger = require( __dirname + '/../../utils/Logger.js' ), util = require( 'util' ); /** @@ -79,6 +80,7 @@ CXParser.prototype.startSentence = function () { this.inSentence = true; this.sawSentenceEndCandidate = false; + this.inReference = false; return '\n\t<span class="cx-segment" data-segmentid="' + ( this.segmentCount++ ) + '">'; }; @@ -94,8 +96,15 @@ * Undo end a sentence */ CXParser.prototype.undoEndSentence = function () { + var lastClose; this.inSentence = true; - this.segmentedContent = this.segmentedContent.substr( 0, this.segmentedContent.length - 7 ); + lastClose = this.segmentedContent.substr( this.segmentedContent.length - 7, 
this.segmentedContent.length ); + // Make sure we are not deleting anything other than close tag. + if ( lastClose === '</span>' ) { + this.segmentedContent = this.segmentedContent.substr( 0, this.segmentedContent.length - 7 ); + } else { + logger.warn( 'Error in undoEndSentence. Attempted deleting content' ); + } }; /** @@ -174,7 +183,7 @@ if ( this.sawSentenceEndCandidate ) { if ( tag.name === 'span' && - tag.attributes.class === 'reference' && ( this.inSentence || this.inReference ) && this.sawSentenceEndCandidate ) { + tag.attributes.class === 'reference' && ( this.inSentence || this.inReference ) ) { // Sentences staring with reference links. // Example: Sentence one.[1] Sentence two // Here [1] is not part of Sentence two. It is reference for Sentence one. @@ -188,14 +197,13 @@ } this.inReference = true; } - - // Check if we need to reset inReference state. References contains an 'a' tag - // inside 'span' tag - if ( this.inReference && !( tag.name === 'a' || tag.name === 'span' ) ) { - // Reset inReference - this.inReference = false; - this.sawSentenceEndCandidate = false; - } + } + // Check if we need to reset inReference state. 
References contains an 'a' tag + // inside 'span' tag + if ( this.inReference && tag.name !== 'a' && tag.name !== 'span' ) { + // Reset inReference + this.inReference = false; + this.sawSentenceEndCandidate = false; } // Start of tag this.print( '<' + tag.name ); diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json index 205bf1f..aec60ca 100644 --- a/tests/segmentation/SegmentationTests.json +++ b/tests/segmentation/SegmentationTests.json @@ -6,6 +6,11 @@ "result": "result-1.html" }, { + "desc": "Sentence ending with )", + "source": "test-ends-with-bracket.html", + "result": "result-ends-with-bracket.html" + }, + { "desc": "Exclamation, punctuation test", "source": "test-2.html", "result": "result-2.html" @@ -69,6 +74,16 @@ "desc": "Paragraph from Debian article- repeating references", "source": "test-debian-1.html", "result": "result-debian-1.html" + }, + { + "desc": "References after closing bracket and period.", + "source": "test-15.html", + "result": "result-15.html" + }, + { + "desc": "Paragraph ending with reference and already having reference in between. The second reference should not be identified as repeating reference. 
If identified as repeating reference, 7 letters from last word will be missing.", + "source": "test-ends-with-references-missing-letters.html", + "result": "result-ends-with-references-missing-letters.html" } ], "hi": [ diff --git a/tests/segmentation/data/result-15.html b/tests/segmentation/data/result-15.html new file mode 100644 index 0000000..b36bb5b --- /dev/null +++ b/tests/segmentation/data/result-15.html @@ -0,0 +1 @@ +<p id="0"><span class="cx-segment" data-segmentid="1">When the GNU project first started they "had an <a class="cx-link" data-linkid="2" href="/wiki/Emacs" title="Emacs">Emacs</a> text editor with <a class="cx-link" data-linkid="3" href="/wiki/Lisp_(programming_language)" title="Lisp (programming language)">Lisp</a> for writing editor commands, a source level <a class="cx-link" data-linkid="4" href="/wiki/Debugger" title="Debugger">debugger</a>, a <a class="cx-link" data-linkid="5" href="/wiki/Yacc" title="Yacc" data-original-title="">yacc</a>-compatible <a class="cx-link" data-linkid="6" href="/wiki/Parsing" title="Parsing">parser</a> generator, and a <a class="cx-link" data-linkid="7" href="/wiki/Linker_(computing)" title="Linker (computing)">linker</a>".<span id="cite_ref-4" class="reference"><a class="cx-link" data-linkid="8" href="#cite_note-4">[4]</a></span></span><span class="cx-segment" data-segmentid="9"> The GNU system required its own C compiler and tools to be free software, so that these also had to be developed. 
</span><span class="cx-segment" data-segmentid="10">By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (<a class="cx-link" data-linkid="11" href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" data-original-title="">GCC</a>), an editor (<a class="cx-link" data-linkid="12" href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities (such as <code>ls</code>, <code>grep</code>, <code>awk</code>, <code>make</code> and <code>ld</code>).<span id="cite_ref-5" class="reference"><a class="cx-link" data-linkid="13" href="#cite_note-5" title="" data-original-title="">[5]</a></span></span><span class="cx-segment" data-segmentid="14"> They had an initial kernel that needed more updates.</span></p> diff --git a/tests/segmentation/data/result-ends-with-bracket.html b/tests/segmentation/data/result-ends-with-bracket.html new file mode 100644 index 0000000..cb1b849 --- /dev/null +++ b/tests/segmentation/data/result-ends-with-bracket.html @@ -0,0 +1 @@ +<p id="0"><span class="cx-segment" data-segmentid="1">By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (GCC), an editor (GNU Emacs), and various Unix utilities (such as <code>ls</code>, <code>grep</code>, <code>awk</code>, <code>make</code> and <code>ld</code>). 
</span><span class="cx-segment" data-segmentid="2">They had an initial kernel that needed more updates.</span></p> diff --git a/tests/segmentation/data/result-ends-with-references-missing-letters.html b/tests/segmentation/data/result-ends-with-references-missing-letters.html new file mode 100644 index 0000000..4af0e04 --- /dev/null +++ b/tests/segmentation/data/result-ends-with-references-missing-letters.html @@ -0,0 +1 @@ +<p id="0"><span class="cx-segment" data-segmentid="1">By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (<a class="cx-link" data-linkid="2" href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" data-original-title="">GCC</a>), an editor (<a class="cx-link" data-linkid="3" href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities (such as ls, grep, awk, make and ld).<span id="cite_ref-5" class="reference"><a class="cx-link" data-linkid="4" href="#cite_note-5" title="" data-original-title="">[5]</a></span></span><span class="cx-segment" data-segmentid="5"> They had an initial kernel that needed more updates. 
</span><span class="cx-segment" data-segmentid="6">By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (<a class="cx-link" data-linkid="7" href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" data-original-title="">GCC</a>), an editor (<a class="cx-link" data-linkid="8" href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities (such as ls, grep, awk, make and ld.<span id="cite_ref-5" class="reference"><a class="cx-link" data-linkid="9" href="#cite_note-5" title="" data-original-title="">[6]</a></span></span></p> diff --git a/tests/segmentation/data/test-15.html b/tests/segmentation/data/test-15.html new file mode 100644 index 0000000..d464ef8 --- /dev/null +++ b/tests/segmentation/data/test-15.html @@ -0,0 +1 @@ +<p>When the GNU project first started they "had an <a href="/wiki/Emacs" title="Emacs">Emacs</a> text editor with <a href="/wiki/Lisp_(programming_language)" title="Lisp (programming language)">Lisp</a> for writing editor commands, a source level <a href="/wiki/Debugger" title="Debugger">debugger</a>, a <a href="/wiki/Yacc" title="Yacc" data-original-title="">yacc</a>-compatible <a href="/wiki/Parsing" title="Parsing">parser</a> generator, and a <a href="/wiki/Linker_(computing)" title="Linker (computing)">linker</a>".<span id="cite_ref-4" class="reference"><a href="#cite_note-4">[4]</a></span> The GNU system required its own C compiler and tools to be free software, so that these also had to be developed. 
By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (<a href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" data-original-title="">GCC</a>), an editor (<a href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities (such as <code>ls</code>, <code>grep</code>, <code>awk</code>, <code>make</code> and <code>ld</code>).<span id="cite_ref-5" class="reference"><a href="#cite_note-5" title="" data-original-title="">[5]</a></span> They had an initial kernel that needed more updates.</p> \ No newline at end of file diff --git a/tests/segmentation/data/test-ends-with-bracket.html b/tests/segmentation/data/test-ends-with-bracket.html new file mode 100644 index 0000000..b28319d --- /dev/null +++ b/tests/segmentation/data/test-ends-with-bracket.html @@ -0,0 +1 @@ +<p>By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (GCC), an editor (GNU Emacs), and various Unix utilities (such as <code>ls</code>, <code>grep</code>, <code>awk</code>, <code>make</code> and <code>ld</code>). 
They had an initial kernel that needed more updates.</p> \ No newline at end of file diff --git a/tests/segmentation/data/test-ends-with-references-missing-letters.html b/tests/segmentation/data/test-ends-with-references-missing-letters.html new file mode 100644 index 0000000..79ec575 --- /dev/null +++ b/tests/segmentation/data/test-ends-with-references-missing-letters.html @@ -0,0 +1 @@ +<p>By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (<a href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" data-original-title="">GCC</a>), an editor (<a href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities (such as ls, grep, awk, make and ld).<span id="cite_ref-5" class="reference"><a href="#cite_note-5" title="" data-original-title="">[5]</a></span> They had an initial kernel that needed more updates. By June 1987 the project had accumulated and developed free software for an assembler, an almost finished portable optimizing C compiler (<a href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection" data-original-title="">GCC</a>), an editor (<a href="/wiki/Emacs" title="Emacs">GNU Emacs</a>), and various Unix utilities (such as ls, grep, awk, make and ld.<span id="cite_ref-5" class="reference"><a href="#cite_note-5" title="" data-original-title="">[6]</a></span></p> \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/125103 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Icc4d1b6ddad804d5365a0001c7b93f3ac78c9b23 Gerrit-PatchSet: 2 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> Gerrit-Reviewer: KartikMistry <kartik.mis...@gmail.com> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org 
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits