Santhosh has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/383109 )
Change subject: Combine page fetch, segmentation to a new class ...................................................................... Combine page fetch, segmentation to a new class * Simplify the API end point for page/:language/:title/:revision? in v1.js. Router is not supposed to data processing as it does now. * Move pageloader/PageLoader.js to mw/MWPage class which now does page fetch and segmentation call. In follow ups, we will add section wrapping call to ths class(T177752). * The new MWPage class inherits APIRequest. Move the remaining utility methods in apiUtil to APIRequest class so that api related methods are in single place. Known issues to be fixed: * The language parameter passed to the /page api is sometimes domain because of the way services is keeping cxserver behind xx.wikipedia.org/api. Using our domain template we need to understand which one is passed and derive language code from it. Change-Id: I2ad6e1f183c8fe69c960597a29632989965cce82 --- M app.js D lib/api-util.js M lib/mw/ApiRequest.js A lib/mw/MWPage.js D lib/pageloader/PageLoader.js M lib/routes/v1.js 6 files changed, 145 insertions(+), 169 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/09/383109/1 diff --git a/app.js b/app.js index 907fbe6..1795d04 100644 --- a/app.js +++ b/app.js @@ -7,7 +7,6 @@ bodyParser = require( 'body-parser' ), fs = BBPromise.promisifyAll( require( 'fs' ) ), sUtil = require( './lib/util' ), - apiUtil = require( './lib/api-util' ), packageInfo = require( './package.json' ), yaml = require( 'js-yaml' ); @@ -59,9 +58,6 @@ } } } - - // set up the request templates for the APIs - apiUtil.setupApiTemplates( app ); // set up the spec if ( !app.conf.spec ) { diff --git a/lib/api-util.js b/lib/api-util.js deleted file mode 100644 index 435cd37..0000000 --- a/lib/api-util.js +++ /dev/null @@ -1,80 +0,0 @@ -'use strict'; - -var preq = require( 'preq' ), - Template = require( 'swagger-router' ).Template; - -/** - * Calls the REST API with the supplied domain, path and request parameters - * - * @param {Object} app the application object - * @param {string} domain the domain to issue the request for - * @param {string} path the REST API path to contact without the leading slash - * @param {Object} [restReq={}] the object containing the REST request details - * @param {string} [restReq.method=get] the request method - * @param {Object} [restReq.query={}] the query string to send, if any - * @param {Object} [restReq.headers={}] the request headers to send - * @param {Object} [restReq.body=null] the body of the request, if any - * @return {Promise} a promise resolving as the response object from the REST API - * - */ -function restApiGet( app, domain, path, restReq ) { - var request; - restReq = restReq || {}; - path = path[ 0 ] === '/' ? path.slice( 1 ) : path; - - request = app.restbase_tpl.expand( { - request: { - method: restReq.method, - params: { - domain: domain, - path: path - }, - query: restReq.query, - headers: Object.assign( { - 'user-agent': app.conf.user_agent - }, restReq.headers ), - body: restReq.body - } - } ); - - return preq( request ); -} - -/** - * Sets up the request templates for MW and RESTBase API requests - * - * @param {Application} app the application object - */ -function setupApiTemplates( app ) { - /* eslint-disable camelcase */ - // set up the MW API request template - if ( !app.conf.mwapi_req ) { - app.conf.mwapi_req = { - uri: 'https://{{domain}}/w/api.php', - headers: { - 'user-agent': '{{user-agent}}' - }, - query: '{{ default(request.query, {}) }}', - body: '{{request.body}}' - }; - } - app.mwapi_tpl = new Template( app.conf.mwapi_req ); - - // set up the RESTBase request template - if ( !app.conf.restbase_req ) { - app.conf.restbase_req = { - method: '{{request.method}}', - uri: 'http://{{domain}}/api/rest_v1/{+path}', - query: '{{ default(request.query, {}) }}', - headers: '{{request.headers}}', - body: '{{request.body}}' - }; - } - app.restbase_tpl = new Template( app.conf.restbase_req ); - /* eslint-enable camelcase */ -} - -module.exports = { - restApiGet: restApiGet, - setupApiTemplates: setupApiTemplates -}; diff --git a/lib/mw/ApiRequest.js b/lib/mw/ApiRequest.js index 58c910d..492b5a3 100644 --- a/lib/mw/ApiRequest.js +++ b/lib/mw/ApiRequest.js @@ -1,10 +1,10 @@ 'use strict'; -const apiUtil = require( '../api-util.js' ), - preq = require( 'preq' ), +const preq = require( 'preq' ), cxUtil = require( '../util.js' ), Title = require( 'mediawiki-title' ).Title, cxutil = require( '../util.js' ), + Template = require( 'swagger-router' ).Template, languageDomainNameMapping = require( './../language-domain-mapping.json' ), HTTPError = cxUtil.HTTPError; @@ -20,7 +20,7 @@ // Source and target languages this.sourceLanguage = config.sourceLanguage; this.targetLanguage = config.targetLanguage; - apiUtil.setupApiTemplates( config.context ); + this.setupApiTemplates( config.context ); } /** @@ -132,14 +132,14 @@ } htmlToWikiText( html, language ) { - var restReq, path, domain = this.getDomain( language ); + const domain = this.getDomain( language ); if ( cxUtil.isPlainText( html ) ) { // Does not contain HTML elements. Save api call. return Promise.resolve( html ); } - restReq = { + const restReq = { method: 'post', body: { html }, headers: { @@ -147,21 +147,21 @@ accept: 'text/html; charset=utf-8; profile="https://www.mediawiki.org/wiki/Specs/HTML/1.5.0"' } }; - path = '/transform/html/to/wikitext'; + const path = '/transform/html/to/wikitext'; - return apiUtil.restApiGet( this.context, domain, path, restReq ) + return this.restApiGet( domain, path, restReq ) .then( ( response ) => response.body ); } wikitextToHTML( wikitext, language ) { - var restReq, path, domain = this.getDomain( language ); + const domain = this.getDomain( language ); if ( !wikitext || !wikitext.trim() ) { // Save api call. return Promise.resolve( wikitext ); } - restReq = { + const restReq = { method: 'post', body: { wikitext, @@ -173,11 +173,79 @@ accept: 'text/html; charset=utf-8; profile="https://www.mediawiki.org/wiki/Specs/HTML/1.5.0"' } }; - path = '/transform/wikitext/to/html'; + const path = '/transform/wikitext/to/html'; - return apiUtil.restApiGet( this.context, domain, path, restReq ) + return this.restApiGet( domain, path, restReq ) .then( ( response ) => response.body ); } + + /** + * Sets up the request templates for MW and RESTBase API requests + * + * @param {Application} app the application object + */ + setupApiTemplates( app ) { + /* eslint-disable camelcase */ + // set up the MW API request template + if ( !this.context.conf.mwapi_req ) { + this.context.conf.mwapi_req = { + uri: 'https://{{domain}}/w/api.php', + headers: { + 'user-agent': '{{user-agent}}' + }, + query: '{{ default(request.query, {}) }}', + body: '{{request.body}}' + }; + } + this.context.mwapi_tpl = new Template( this.context.conf.mwapi_req ); + + // set up the RESTBase request template + if ( !app.conf.restbase_req ) { + app.conf.restbase_req = { + method: '{{request.method}}', + uri: 'http://{{domain}}/api/rest_v1/{+path}', + query: '{{ default(request.query, {}) }}', + headers: '{{request.headers}}', + body: '{{request.body}}' + }; + } + app.restbase_tpl = new Template( this.context.conf.restbase_req ); + /* eslint-enable camelcase */ + } + + /** + * Calls the REST API with the supplied domain, path and request parameters + * + * @param {string} domain the domain to issue the request for + * @param {string} path the REST API path to contact without the leading slash + * @param {Object} [restReq={}] the object containing the REST request details + * @param {string} [restReq.method=get] the request method + * @param {Object} [restReq.query={}] the query string to send, if any + * @param {Object} [restReq.headers={}] the request headers to send + * @param {Object} [restReq.body=null] the body of the request, if any + * @return {Promise} a promise resolving as the response object from the REST API + * + */ + restApiGet( domain, path, restReq ) { + restReq = restReq || {}; + path = path[ 0 ] === '/' ? path.slice( 1 ) : path; + + const request = this.context.restbase_tpl.expand( { + request: { + method: restReq.method, + params: { + domain, + path + }, + query: restReq.query, + headers: Object.assign( { + 'user-agent': this.context.conf.user_agent + }, restReq.headers ), + body: restReq.body + } + } ); + return preq( request ); + } } /** diff --git a/lib/mw/MWPage.js b/lib/mw/MWPage.js new file mode 100644 index 0000000..73b8287 --- /dev/null +++ b/lib/mw/MWPage.js @@ -0,0 +1,57 @@ +'use strict'; + +const ApiRequest = require( '../mw/ApiRequest' ), + CXSegmenter = require( '../segmentation/CXSegmenter' ); + +class MWPage extends ApiRequest { + getPage( page, revision ) { + return this.fetch( page, revision ).then( ( response ) => { + let segmentedDoc = this.segment( response.body ); + // TODO: segmentedDoc.wrapSections(); + return { + content: segmentedDoc.getHtml(), + revision: response.revision + }; + } ); + } + + /** + * @param {string} page The page title + * @param {string} revision The revision id + * @return {Promise} + */ + fetch( page, revision ) { + let path = 'page/html/' + encodeURIComponent( page ); + const domain = this.getDomain( this.sourceLanguage ); + + if ( revision ) { + path += '/' + revision; + } + + const restReq = { + method: 'get', + headers: { + // See https://www.mediawiki.org/wiki/Specs/HTML/1.5.0 + accept: 'text/html; charset=utf-8; profile="https://www.mediawiki.org/wiki/Specs/HTML/1.5.0"' + } + }; + return this.restApiGet( domain, path, restReq ).then( ( response ) => { + return { + body: response.body, + // Restbase returns revision ID in etag header. + // Example: + // ETag: "123456/c4e494da-ee8f-11e4-83a1-8b80de1cde5f" + revision: response.headers.etag.split( '/' )[ 0 ].replace( '"', '' ) + }; + } ); + } + + segment( pageContent ) { + const segmenter = new CXSegmenter( pageContent, this.sourceLanguage ); + segmenter.segment(); + return segmenter.getSegmentedDoc(); + } + +} + +module.exports = MWPage; diff --git a/lib/pageloader/PageLoader.js b/lib/pageloader/PageLoader.js deleted file mode 100644 index b1c8256..0000000 --- a/lib/pageloader/PageLoader.js +++ /dev/null @@ -1,51 +0,0 @@ -'use strict'; - -const apiUtil = require( '../api-util' ); -const ApiRequest = require( '../mw/ApiRequest' ); - -function PageLoader( app ) { - this.app = app; - this.log = app.logger.log || function () {}; -} - -/** - * @param {string} page The page title - * @param {string} source The language code or the domain of the wiki - * @param {string} revision The revision id - * @return {Promise} - */ -PageLoader.prototype.load = function ( page, source, revision ) { - var path, domain, restReq; - - path = 'page/html/' + encodeURIComponent( page ); - if ( /.+\.org$/.test( source ) ) { - // We got an actual domain - domain = source; - } else { - domain = new ApiRequest( { context: this.app } ).getDomain( source ); - } - - if ( revision ) { - path += '/' + revision; - } - - restReq = { - method: 'get', - headers: { - // See https://www.mediawiki.org/wiki/Specs/HTML/1.5.0 - accept: 'text/html; charset=utf-8; profile="https://www.mediawiki.org/wiki/Specs/HTML/1.5.0"' - } - }; - - return apiUtil.restApiGet( this.app, domain, path, restReq ).then( function ( response ) { - return { - body: response.body, - // Restbase returns revision ID in etag header. - // Example: - // ETag: "123456/c4e494da-ee8f-11e4-83a1-8b80de1cde5f" - revision: response.headers.etag.split( '/' )[ 0 ].replace( '"', '' ) - }; - } ); -}; - -module.exports = PageLoader; diff --git a/lib/routes/v1.js b/lib/routes/v1.js index 8ba2cf7..405f7f4 100644 --- a/lib/routes/v1.js +++ b/lib/routes/v1.js @@ -58,39 +58,25 @@ var sourceLanguage = req.params.language, title = req.params.title, revision = req.params.revision, - CXSegmenter = require( __dirname + '/../segmentation/CXSegmenter.js' ), - PageLoader = require( __dirname + '/../pageloader/PageLoader.js' ), - pageloader = new PageLoader( this.app ); + MWPage = require( __dirname + '/../mw/MWPage' ), + page = new MWPage( { + context: this.app, + sourceLanguage + } ); - return pageloader.load( title, sourceLanguage, revision ).then( + return page.getPage( title, revision ).then( ( response ) => { - var segmenter, segmentedContent; - - try { - this.app.logger.log( 'debug', 'Fetch page', { - title: title, - sourceLanguage: sourceLanguage - } ); - segmenter = new CXSegmenter( response.body, sourceLanguage ); - segmenter.segment(); - segmentedContent = segmenter.getSegmentedContent(); - this.app.logger.log( 'debug', 'Segment page', { - title: title, - sourceLanguage: sourceLanguage - } ); - } catch ( error ) { - res.status( 500 ).end( `Page ${sourceLanguage} : ${title} could not be fetched or segmented: ` + error.toString() ); - } res.send( { sourceLanguage: sourceLanguage, title: title, revision: response.revision, - segmentedContent: segmentedContent + segmentedContent: response.content } ); this.app.logger.log( 'debug', 'Page sent' ); }, ( error ) => { - res.status( 404 ).end( 'Page ' + sourceLanguage + ':' + title + ' could not be found. ' + error.toString() ); + res.status( 404 ) + .end( `Page ${sourceLanguage}:${title} could not be found. ` + error.toString() ); } ); } -- To view, visit https://gerrit.wikimedia.org/r/383109 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2ad6e1f183c8fe69c960597a29632989965cce82 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits