BearND has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/404247 )
Change subject: summary: just parse the lead section ...................................................................... summary: just parse the lead section No need to try to parse other sections for the summary endpoint. Change-Id: I02828100fd1a81c94e1a842c3264caba51c634ed --- M lib/parsoidSections.js M lib/parsoidSectionsUsingDivs.js M lib/parsoidSectionsUsingSectionTags.js M lib/summary.js M routes/summary.js M test/lib/parsoid/parsoid-sections-div-element-test.js M test/lib/parsoid/parsoid-sections-section-elements-tests.js 7 files changed, 69 insertions(+), 16 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/47/404247/1 diff --git a/lib/parsoidSections.js b/lib/parsoidSections.js index f8e21d6..f785685 100644 --- a/lib/parsoidSections.js +++ b/lib/parsoidSections.js @@ -38,8 +38,23 @@ } } +/** + * Reduces the input Parsoid document to a new Document containing + * just the first section. + * @param {!document} doc the parsed DOM Document of the Parsoid output + * @return {!Document} a new DOM Document containing only the lead section + */ +function justLeadSection(doc) { + if (!hasParsoidSections(doc)) { + return parsoidSectionsUsingDivs.justLeadSection(doc); + } else { + return parsoidSectionsUsingSectionTags.justLeadSection(doc); + } +} + module.exports = { hasParsoidSections, addSectionDivs, - getSectionsText + getSectionsText, + justLeadSection }; diff --git a/lib/parsoidSectionsUsingDivs.js b/lib/parsoidSectionsUsingDivs.js index a5886dd..5d11837 100644 --- a/lib/parsoidSectionsUsingDivs.js +++ b/lib/parsoidSectionsUsingDivs.js @@ -1,11 +1,11 @@ 'use strict'; /* - This is an old way of doing sectioning, which is preserved only for the definitions endpoint. - TODO: We should move the definition parsing implementation to the new way - (parsoidSectionsUsingSectionTags.js) soon since when Parsoid deploys adding <section> tags the - definitions code may be negatively affected by that. + This is an old way of doing sectioning, which is preserved only just in case + we encounter a payload from an older Parsoid version. */ + +const domino = require('domino'); function parseNextSection(sectionDiv, startingNode) { let nextNode; @@ -96,7 +96,18 @@ return sections; } +/** + * @param {!document} doc the parsed DOM Document of the Parsoid output + * @return {!Document} a new DOM Document containing only the lead section + */ +function justLeadSection(doc) { + const sectionDiv = doc.createElement('div'); + parseNextSection(sectionDiv, doc.body.firstChild); + return domino.createDocument(sectionDiv.innerHTML); +} + module.exports = { addSectionDivs, - getSectionsText + getSectionsText, + justLeadSection }; diff --git a/lib/parsoidSectionsUsingSectionTags.js b/lib/parsoidSectionsUsingSectionTags.js index a30d537..aab9cd3 100644 --- a/lib/parsoidSectionsUsingSectionTags.js +++ b/lib/parsoidSectionsUsingSectionTags.js @@ -1,5 +1,6 @@ 'use strict'; +const domino = require('domino'); const parsoidDomUtils = require('parsoid-dom-utils'); const NodeType = require('./nodeType'); @@ -18,7 +19,7 @@ /** * Gets the section number from Parsoid. - * @param {!DOMElement} sectionElement a <section> DOM element + * @param {!Element} sectionElement a <section> DOM element * @return {int} the section number as reported by Parsoid */ function getSectionNumber(sectionElement) { @@ -66,7 +67,7 @@ * In this case, starts a new section object when a <section> tag is encountered that is a * direct descendant * @param {!Logger} logger the app's bunyan logger - * @param {!DOMNode} node the node to visit + * @param {!Node} node the node to visit * @param {!Object[]} allSections the array containing the results * @param {!Object} state some state to pass around */ @@ -103,7 +104,7 @@ * descending so we don't duplicate content since elsewhere outerHTML is used (which contains * content of sub-nodes). * @param {!Logger} logger the app's bunyan logger - * @param {!DOMElement} rootElement the root of the DOM tree which needs to be traversed + * @param {!Element} rootElement the root of the DOM tree which needs to be traversed * @param {!Object[]} allSections holds the results */ function traverseDF(logger, rootElement, allSections) { @@ -130,9 +131,19 @@ return allSections; } +/** + * @param {!document} doc the parsed DOM Document of the Parsoid output + * @return {!Document} a new DOM Document containing only the lead section + */ +function justLeadSection(doc) { + const currentSectionElement = doc.querySelector('section[data-mw-section-id]'); + return domino.createDocument(currentSectionElement.innerHTML); +} + module.exports = { addSectionTags, getSectionsText, + justLeadSection, testing: { parseSections: traverseDF, shouldLogInvalidSectionWarning, diff --git a/lib/summary.js b/lib/summary.js index 6e18df5..37e5450 100644 --- a/lib/summary.js +++ b/lib/summary.js @@ -77,10 +77,9 @@ * @param {!String} html page content and metadata from Parsoid * @param {!Object} revTid revision and tid from Parsoid * @param {!Object} meta metadata from MW API - * @param {!Logger} logger a bunyan logger * @return {!Object} a summary 2.0 spec-compliant page summary object */ -function buildSummary(domain, title, html, revTid, meta, logger) { +function buildSummary(domain, title, html, revTid, meta) { const isContentModelWikitext = meta.contentmodel === 'wikitext'; const isWhiteListedNamespace = SUMMARY_NS_WHITELIST.includes(meta.ns); const isRedirect = meta.redirect; @@ -98,10 +97,7 @@ } const doc = domino.createDocument(html); - parsoidSections.addSectionDivs(doc); - const sections = parsoidSections.getSectionsText(doc, logger); - - const leadSectionDoc = domino.createDocument(sections[0].text); + const leadSectionDoc = parsoidSections.justLeadSection(doc); const intro = transforms.extractLeadIntroduction(leadSectionDoc); const summary = intro.length ? transforms.summarize(intro) : { extract: '', extract_html: '' }; diff --git a/routes/summary.js b/routes/summary.js index 07577e3..9bd89af 100644 --- a/routes/summary.js +++ b/routes/summary.js @@ -32,7 +32,7 @@ const revTid = parsoid.getRevAndTidFromEtag(response.html.headers); const title = Title.newFromText(req.params.title, response.siteinfo); const summary = lib.buildSummary(req.params.domain, title, - response.html.body, revTid, response.meta, req.logger); + response.html.body, revTid, response.meta); res.status(summary.code); if (summary.code === 200) { delete summary.code; diff --git a/test/lib/parsoid/parsoid-sections-div-element-test.js b/test/lib/parsoid/parsoid-sections-div-element-test.js index 04622b9..df407ba 100644 --- a/test/lib/parsoid/parsoid-sections-div-element-test.js +++ b/test/lib/parsoid/parsoid-sections-div-element-test.js @@ -92,4 +92,14 @@ assertSection0(sections); assertSection1(sections, extraText); }); + + describe('justLeadSection', () => { + it('should just return the first section', () => { + const doc = domino.createDocument('<body><p>text0</p>' + + '<h2 id="foo">foo</h2>text1' + + '</body>'); + const leadSectionDoc = parsoid.justLeadSection(doc); + assert.deepEqual(leadSectionDoc.body.innerHTML, '<p>text0</p>'); + }); + }); }); diff --git a/test/lib/parsoid/parsoid-sections-section-elements-tests.js b/test/lib/parsoid/parsoid-sections-section-elements-tests.js index 36d8644..c05eb79 100644 --- a/test/lib/parsoid/parsoid-sections-section-elements-tests.js +++ b/test/lib/parsoid/parsoid-sections-section-elements-tests.js @@ -167,4 +167,14 @@ detail: 'Cannot find heading for section number 1.' }]]); }); + + describe('justLeadSection', () => { + it('should just return the first section', () => { + const doc = domino.createDocument( + '<section data-mw-section-id="0"><p>text0</p></section>' + + '<section data-mw-section-id="1"><h2 id="foo">foo</h2>text1</section>'); + const leadSectionDoc = parsoidSectionsUsingSectionTags.justLeadSection(doc); + assert.deepEqual(leadSectionDoc.body.innerHTML, '<p>text0</p>'); + }); + }); }); -- To view, visit https://gerrit.wikimedia.org/r/404247 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I02828100fd1a81c94e1a842c3264caba51c634ed Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/mobileapps Gerrit-Branch: master Gerrit-Owner: BearND <bsitzm...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits