[MediaWiki-commits] [Gerrit] mediawiki...mobileapps[master]: Hygiene: remove extracts.js
BearND has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/327396 ) Change subject: Hygiene: remove extracts.js .. Hygiene: remove extracts.js Follow-up of Idc70d6e989a67421ee5af1b0d2350cfc06a3da94. Should have removed that then. Change-Id: I35ee543be4c9ae6d14857fefc1ef42b64f18fa34 --- D lib/extract.js 1 file changed, 0 insertions(+), 112 deletions(-) Approvals: jenkins-bot: Verified Jdlrobson: Looks good to me, approved diff --git a/lib/extract.js b/lib/extract.js deleted file mode 100644 index 9142f97..000 --- a/lib/extract.js +++ /dev/null @@ -1,112 +0,0 @@ -'use strict'; - -/** - Article extracts - */ - -/** - * @param {string} [str] - * @return {string} str, less parenthetical expressions and their leading whitespace, if balanced. - */ -function removeParens(str) { -function count(paren) { -return ((str || '').match(new RegExp(`\\${paren}`, 'g')) || []).length; -} - -const openCount = count('('); -const closeCount = count(')'); -const regex = /\s*\([^()]*\)/g; -return openCount && openCount === closeCount ? removeParens(str.replace(regex, '')) : str; -} - -/** - * Find all matches of regex in text, calling callback with each match object - * - * TODO: remove when switching to Parsoid. Copied from: - * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/lineardoc/Utils.js - * - * @param {string} text The text to search - * @param {Regex} regex The regex to search; should be created for this function call - * @param {Function} callback Function to call with each match - * @return {Array} The return values from the callback - */ -function findAll(text, regex, callback) { -const boundaries = []; -do { -const match = regex.exec(text); -if (match === null) { -break; -} -const boundary = callback(text, match); -if (boundary !== null) { -boundaries.push(boundary); -} -} while (regex.test(text)); -return boundaries; -} - -/** - * Test a possible English sentence boundary match - * - * TODO: remove when switching to Parsoid. Copied from: - * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js - * - * @param {string} text The plaintext to segment - * @param {Object} match The possible boundary match (returned by regex.exec) - * @return {number|null} The boundary offset, or null if not a sentence boundary - */ -function findBoundary(text, match) { -const tail = text.slice(match.index + 1, text.length); -const head = text.slice(0, match.index); - -// Trailing non-final punctuation: not a sentence boundary -if (tail.match(/^[,;:]/)) { -return null; -} -// Next word character is number or lower-case: not a sentence boundary -if (tail.match(/^\W*[0-9a-z]/)) { -return null; -} - -// Do not break in abbreviations. Example D. John, St. Peter -const lastWord = head.match(/(\w*)$/)[0]; -// Exclude at most 2 letter abbreviations. Examples: T. Dr. St. Jr. Sr. Ms. Mr. -// But not all caps like "UK." as in "UK. Not US", -if (lastWord.length <= 2 && lastWord.match(/^\W*[A-Z][a-z]?$/) && tail.match(/^\W*[A-Z]/)) { -return null; -} - -// Include any closing punctuation and trailing space -return match.index + 1 + tail.match(/^['”"’]*\s*/)[0].length; -} - -/** - * Find English sentence boundaries - * - * TODO: remove when switching to Parsoid. Copied from: - * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js - * - * @param {string} text The plaintext to segment - * @returns {number[]} Sentence boundary offsets - */ -function getBoundaries(text) { -// Regex to find possible English sentence boundaries. -// Must not use a shared regex instance (re.lastIndex is used) -return findAll(text, /[.!?]/g, findBoundary); -} - -function format(extract) { -const MAX_SENTENCES = 2; -const cleanStr = removeParens(extract.replace(/\s+/g, ' ')); -const boundaries = getBoundaries(cleanStr); -const cleanStrEndIndex = boundaries[Math.min(boundaries.length, MAX_SENTENCES - 1)]; - -const ret = cleanStr.slice(0, cleanStrEndIndex).trim(); -if (ret !== '…' && ret !== '..') { -return ret; -} -} - -module.exports = { -format -}; -- To view, visit https://gerrit.wikimedia.org/r/327396 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I35ee543be4c9ae6d14857fefc1ef42b64f18fa34 Gerrit-PatchSet: 2 Gerrit-Project: mediawiki/services/mobileapps Gerrit-Branch: master Gerrit-Owner: BearND Gerrit-Reviewer: BearND Gerrit-Reviewer: Jdlrobson Gerrit-Reviewer: jenkins-bot <> ___
[MediaWiki-commits] [Gerrit] mediawiki...mobileapps[master]: Hygiene: remove extracts.js
BearND has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/327396 ) Change subject: Hygiene: remove extracts.js .. Hygiene: remove extracts.js Follow-up of Idc70d6e989a67421ee5af1b0d2350cfc06a3da94. Should have removed that then. Change-Id: I35ee543be4c9ae6d14857fefc1ef42b64f18fa34 --- D lib/extract.js 1 file changed, 0 insertions(+), 112 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/96/327396/1 diff --git a/lib/extract.js b/lib/extract.js deleted file mode 100644 index 9142f97..000 --- a/lib/extract.js +++ /dev/null @@ -1,112 +0,0 @@ -'use strict'; - -/** - Article extracts - */ - -/** - * @param {string} [str] - * @return {string} str, less parenthetical expressions and their leading whitespace, if balanced. - */ -function removeParens(str) { -function count(paren) { -return ((str || '').match(new RegExp(`\\${paren}`, 'g')) || []).length; -} - -const openCount = count('('); -const closeCount = count(')'); -const regex = /\s*\([^()]*\)/g; -return openCount && openCount === closeCount ? removeParens(str.replace(regex, '')) : str; -} - -/** - * Find all matches of regex in text, calling callback with each match object - * - * TODO: remove when switching to Parsoid. Copied from: - * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/lineardoc/Utils.js - * - * @param {string} text The text to search - * @param {Regex} regex The regex to search; should be created for this function call - * @param {Function} callback Function to call with each match - * @return {Array} The return values from the callback - */ -function findAll(text, regex, callback) { -const boundaries = []; -do { -const match = regex.exec(text); -if (match === null) { -break; -} -const boundary = callback(text, match); -if (boundary !== null) { -boundaries.push(boundary); -} -} while (regex.test(text)); -return boundaries; -} - -/** - * Test a possible English sentence boundary match - * - * TODO: remove when switching to Parsoid. Copied from: - * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js - * - * @param {string} text The plaintext to segment - * @param {Object} match The possible boundary match (returned by regex.exec) - * @return {number|null} The boundary offset, or null if not a sentence boundary - */ -function findBoundary(text, match) { -const tail = text.slice(match.index + 1, text.length); -const head = text.slice(0, match.index); - -// Trailing non-final punctuation: not a sentence boundary -if (tail.match(/^[,;:]/)) { -return null; -} -// Next word character is number or lower-case: not a sentence boundary -if (tail.match(/^\W*[0-9a-z]/)) { -return null; -} - -// Do not break in abbreviations. Example D. John, St. Peter -const lastWord = head.match(/(\w*)$/)[0]; -// Exclude at most 2 letter abbreviations. Examples: T. Dr. St. Jr. Sr. Ms. Mr. -// But not all caps like "UK." as in "UK. Not US", -if (lastWord.length <= 2 && lastWord.match(/^\W*[A-Z][a-z]?$/) && tail.match(/^\W*[A-Z]/)) { -return null; -} - -// Include any closing punctuation and trailing space -return match.index + 1 + tail.match(/^['”"’]*\s*/)[0].length; -} - -/** - * Find English sentence boundaries - * - * TODO: remove when switching to Parsoid. Copied from: - * https://github.com/wikimedia/mediawiki-services-cxserver/blob/0d21a808f7ab6b82086171af927467c1b9460626/segmentation/languages/SegmenterDefault.js - * - * @param {string} text The plaintext to segment - * @returns {number[]} Sentence boundary offsets - */ -function getBoundaries(text) { -// Regex to find possible English sentence boundaries. -// Must not use a shared regex instance (re.lastIndex is used) -return findAll(text, /[.!?]/g, findBoundary); -} - -function format(extract) { -const MAX_SENTENCES = 2; -const cleanStr = removeParens(extract.replace(/\s+/g, ' ')); -const boundaries = getBoundaries(cleanStr); -const cleanStrEndIndex = boundaries[Math.min(boundaries.length, MAX_SENTENCES - 1)]; - -const ret = cleanStr.slice(0, cleanStrEndIndex).trim(); -if (ret !== '…' && ret !== '..') { -return ret; -} -} - -module.exports = { -format -}; -- To view, visit https://gerrit.wikimedia.org/r/327396 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I35ee543be4c9ae6d14857fefc1ef42b64f18fa34 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/mobileapps Gerrit-Branch: master Gerrit-Owner: BearND ___ MediaWiki-commits mailing list MediaW