Santhosh has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/392790 )
Change subject: Section wrapping at cxserver for v2 page fetch api
......................................................................
Section wrapping at cxserver for v2 page fetch api
The v2 page fetch API will wrap translatable sections
in a <section> tag. Immediate children of <body>
will be considered translatable sections.
Bug: T177752
Change-Id: Ibb5937061c6980579d35cd24a0ba8205b109f8c7
---
M bin/segment
M lib/lineardoc/Doc.js
M lib/lineardoc/MwContextualizer.js
M lib/lineardoc/Parser.js
M lib/mw/MWPageLoader.js
M lib/routes/v2.js
A test/mw/SectionWrap.test.js
M test/segmentation/data/result-18.html
M test/segmentation/data/result-4.html
M test/segmentation/data/test-18.html
10 files changed, 108 insertions(+), 24 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver
refs/changes/90/392790/1
diff --git a/bin/segment b/bin/segment
index 2a18675..c23f7b0 100755
--- a/bin/segment
+++ b/bin/segment
@@ -11,7 +11,9 @@
}
function getParsedDoc( content ) {
- const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer() );
+ const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer(), {
+ wrapSections: true
+ } );
parser.init();
parser.write( content );
return parser.builder.doc;
diff --git a/lib/lineardoc/Doc.js b/lib/lineardoc/Doc.js
index 6c4047d..30d2909 100644
--- a/lib/lineardoc/Doc.js
+++ b/lib/lineardoc/Doc.js
@@ -77,10 +77,14 @@
Doc.prototype.segment = function ( getBoundaries ) {
var i, len, item, tag, textBlock, hash,
newDoc = new Doc(),
+ nextSectionId = 0,
nextId = 0;
// TODO: return different counters depending on type
- function getNextId( type ) {
+ function getNextId( type, tagName ) {
+ if ( tagName === 'section' ) {
+ return String( 'cxSourceSection' + nextSectionId++ );
+ }
if ( type === 'segment' || type === 'link' || type === 'block'
) {
return String( nextId++ );
} else {
@@ -115,7 +119,7 @@
).substr( 0, 30 );
}
} else {
- tag.attributes.id = getNextId( 'block' );
+ tag.attributes.id = getNextId( 'block',
tag.name );
}
newDoc.addItem( item.type, tag );
} else if ( this.items[ i ].type !== 'textblock' ) {
diff --git a/lib/lineardoc/MwContextualizer.js
b/lib/lineardoc/MwContextualizer.js
index c5d3742..d964443 100644
--- a/lib/lineardoc/MwContextualizer.js
+++ b/lib/lineardoc/MwContextualizer.js
@@ -1,7 +1,7 @@
'use strict';
const Contextualizer = require( './Contextualizer' );
-const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5',
'h6', 'p', 'pre' ];
+const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5',
'h6', 'p', 'pre', 'div', 'table', 'ol', 'li' ];
/**
* Contextualizer for MediaWiki DOM HTML
@@ -31,13 +31,18 @@
return 'media';
}
+ // A body tag with no enclosing context opens the section context
+ if ( context === undefined && tag.name === 'body' ) {
+ return 'section';
+ }
+
// And figure//figcaption is contentBranch
if ( context === 'media' && tag.name === 'figcaption' ) {
return 'contentBranch';
}
// And ContentBranchNodes are contentBranch
- if ( context === undefined && contentBranchNodeNames.indexOf(
tag.name ) > -1 ) {
+ if ( ( context === 'section' || context === undefined ) &&
contentBranchNodeNames.indexOf( tag.name ) > -1 ) {
return 'contentBranch';
}
diff --git a/lib/lineardoc/Parser.js b/lib/lineardoc/Parser.js
index d2410da..5291228 100644
--- a/lib/lineardoc/Parser.js
+++ b/lib/lineardoc/Parser.js
@@ -57,7 +57,7 @@
this.builder = this.rootBuilder;
// Stack of tags currently open
this.allTags = [];
- // context for each tag currently open;
undefined|'verbatim'|'media'|'contentBranch'
+ // context for each tag currently open;
undefined|'verbatim'|'media'|'contentBranch'|'section'
this.contexts = [];
};
@@ -78,6 +78,14 @@
} else if ( this.isInlineAnnotationTag( tag.name ) ) {
this.builder.pushInlineAnnotationTag( tag );
} else {
+ if ( this.options.wrapSections &&
this.contextualizer.getContext() === 'section' ) {
+ this.builder.pushBlockTag( {
+ name: 'section',
+ attributes: {
+ rel: 'cx:Section'
+ }
+ } );
+ }
this.builder.pushBlockTag( tag );
}
this.allTags.push( tag );
@@ -106,6 +114,9 @@
this.builder = this.builder.parent;
} else if ( !isAnn ) {
this.builder.popBlockTag( tagName );
+ if ( this.options.wrapSections &&
this.contextualizer.getContext() === 'section' ) {
+ this.builder.popBlockTag( 'section' );
+ }
} else {
throw new Error( 'Unexpected close tag: ' + tagName );
}
diff --git a/lib/mw/MWPageLoader.js b/lib/mw/MWPageLoader.js
index e9c5941..14597bb 100644
--- a/lib/mw/MWPageLoader.js
+++ b/lib/mw/MWPageLoader.js
@@ -5,11 +5,17 @@
CXSegmenter = require( '../segmentation/CXSegmenter' );
class MWPageLoader extends ApiRequest {
- getPage( page, revision ) {
+ /**
+ *
+ * @param {string} page The page title
+ * @param {string} revision The revision id
+ * @param {boolean} wrapSections Whether translatable sections should
be wrapped in <section> tag
+ * @return {Promise}
+ */
+ getPage( page, revision, wrapSections ) {
return this.fetch( page, revision ).then( ( response ) => {
- const parsedDoc = this.getParsedDoc( response.body );
- const segmentedDoc = new CXSegmenter().segment(
parsedDoc, this.sourceLanguage );
- // TODO: segmentedDoc.wrapSections();
+ const parsedDoc = this.getParsedDoc( response.body,
wrapSections );
+ let segmentedDoc = new CXSegmenter().segment(
parsedDoc, this.sourceLanguage );
return {
content: segmentedDoc.getHtml(),
revision: response.revision
@@ -17,8 +23,16 @@
} );
}
- getParsedDoc( content ) {
- const parser = new LinearDoc.Parser( new
LinearDoc.MwContextualizer() );
+ /**
+ *
+ * @param {string} content
+ * @param {boolean} wrapSections Whether translatable sections should
be wrapped in <section> tag
+ * @return {Object}
+ */
+ getParsedDoc( content, wrapSections ) {
+ const parser = new LinearDoc.Parser( new
LinearDoc.MwContextualizer(), {
+ wrapSections
+ } );
parser.init();
parser.write( content );
return parser.builder.doc;
diff --git a/lib/routes/v2.js b/lib/routes/v2.js
index 188d4ec..00aa567 100644
--- a/lib/routes/v2.js
+++ b/lib/routes/v2.js
@@ -45,7 +45,7 @@
} );
this.app.logger.log( 'debug', `Getting page
${sourceLanguage}:${title}` );
- return pageLoader.getPage( title, revision ).then(
+ return pageLoader.getPage( title, revision, true /*
wrapSections */ ).then(
( response ) => {
res.send( {
sourceLanguage,
diff --git a/test/mw/SectionWrap.test.js b/test/mw/SectionWrap.test.js
new file mode 100644
index 0000000..ac0b2a2
--- /dev/null
+++ b/test/mw/SectionWrap.test.js
@@ -0,0 +1,50 @@
+'use strict';
+
+const assert = require( '../utils/assert.js' ),
+ LinearDoc = require( '../../lib/lineardoc' );
+
+function normalize( html ) {
+ var normalizer = new LinearDoc.Normalizer();
+ normalizer.init();
+ normalizer.write( html.replace( /(\r\n|\n|\t|\r)/gm, '' ) );
+ return normalizer.getHtml();
+}
+
+function getParsedDoc( content ) {
+ const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer(), {
+ wrapSections: true
+ } );
+ parser.init();
+ parser.write( content );
+ return parser.builder.doc;
+}
+
+/* eslint-disable no-multi-str */
+const sourceHTML = `<body>
+ <p>Paragraph <b>bold</b> <a href="/wiki/Title">Title</a>.</p>
+ <h3>Heading</h3>
+ <table><tr><td>data</td></tr></table>
+ <div>Content<div>innerdiv</div></div>
+ <p>Content<div>Div in paragraph</div></p>
+ <ol><li>Item</li><li>Item</li></ol>
+ <link href="./Category:Oxygen#%20" id="mwCKQ" rel="mw:PageProp/Category" />
+ </body>`;
+
+const expectedSectionWrappedHTML = `<body>
+ <section rel="cx:Section"><p>Paragraph <b>bold</b> <a
href="/wiki/Title">Title</a>.</p></section>
+ <section rel="cx:Section"><h3>Heading</h3></section>
+ <section rel="cx:Section"><table><tr><td>data</td></tr></table></section>
+ <section rel="cx:Section"><div>Content<div>innerdiv</div></div></section>
+ <section rel="cx:Section"><p>Content<div>Div in
paragraph</div></p></section>
+ <section rel="cx:Section"><ol><li>Item</li><li>Item</li></ol></section>
+ <link href="./Category:Oxygen#%20" id="mwCKQ" rel="mw:PageProp/Category" />
+ </body>`;
+
+describe( 'Section wrapping test', () => {
+ const parsedDoc = getParsedDoc( sourceHTML );
+ const result = normalize( parsedDoc.getHtml() );
+ const expectedResultData = normalize( expectedSectionWrappedHTML );
+ it( 'should not have any errors when section wrapping', () => {
+ assert.deepEqual( result, expectedResultData );
+ } );
+} );
diff --git a/test/segmentation/data/result-18.html
b/test/segmentation/data/result-18.html
index de21b5d..ebdf783 100644
--- a/test/segmentation/data/result-18.html
+++ b/test/segmentation/data/result-18.html
@@ -1,3 +1 @@
-<div id="0"><p id="1"><span><span class="cx-segment"
data-segmentid="2">Foo.<link/> </span><span class="cx-segment"
data-segmentid="3">Bar.</span></span></p>
-<meta />
-<p id="4"><span class="cx-segment" data-segmentid="5">Baz.</span></p></div>
+<body id="0"><p id="1"><span><span class="cx-segment"
data-segmentid="2">Foo.<link /> </span><span class="cx-segment"
data-segmentid="3">Bar.</span></span></p><meta /><p id="4"><span
class="cx-segment" data-segmentid="5">Baz.</span></p></body>
\ No newline at end of file
diff --git a/test/segmentation/data/result-4.html
b/test/segmentation/data/result-4.html
index 494f294..0de936d 100644
--- a/test/segmentation/data/result-4.html
+++ b/test/segmentation/data/result-4.html
@@ -1,10 +1,10 @@
<div id="0">
- <div id="1">Some div</div>
- <h1 id="2">
- <span class="cx-segment" data-segmentid="3">Some heading</span>
+ <div id="1"><span class="cx-segment" data-segmentid="2">Some
div</span></div>
+ <h1 id="3">
+ <span class="cx-segment" data-segmentid="4">Some heading</span>
</h1>
- <p id="4">
- <span class="cx-segment" data-segmentid="5">This is first
sentence. </span>
- <span class="cx-segment" data-segmentid="6">This is second
sentence</span>
+ <p id="5">
+ <span class="cx-segment" data-segmentid="6">This is first
sentence. </span>
+ <span class="cx-segment" data-segmentid="7">This is second
sentence</span>
</p>
</div>
diff --git a/test/segmentation/data/test-18.html
b/test/segmentation/data/test-18.html
index a8f5591..7657f1d 100644
--- a/test/segmentation/data/test-18.html
+++ b/test/segmentation/data/test-18.html
@@ -1,5 +1,5 @@
-<div>
+<body>
<p><span>Foo.<link/> Bar.</span></p>
<meta/>
<p>Baz.</p>
-</div>
+</body>
--
To view, visit https://gerrit.wikimedia.org/r/392790
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibb5937061c6980579d35cd24a0ba8205b109f8c7
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits