Mvolz has uploaded a new change for review. https://gerrit.wikimedia.org/r/200314
Change subject: Change how pubmed and pmcs are requested ...................................................................... Change how pubmed and pmcs are requested The PubMed ID converter API was frequently not returning results for valid PMIDs, so instead we request metadata from pubmed URLs directly. CitoidService * Remove requestFromPubMed, which used the ID converter API to get a DOI * Replace with requestFromPMCID and requestFromPMID, which concatenate the id onto a PubMed URL, verify the server sends a 200 ok response code, and send to requestFromURL. Tests * Convert pmid test to a PMID which has no results in the converter API but has a valid URL. * Add test for PMC with PMC prefix * Add test for PMC without PMC prefix * Add test for invalid PMCID * Add test for invalid PMID Bug: T93335 Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5 --- M lib/CitoidService.js M package.json M test/features/errors/index.js M test/features/scraping/index.js 4 files changed, 114 insertions(+), 37 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid refs/changes/14/200314/1 diff --git a/lib/CitoidService.js b/lib/CitoidService.js index 9b3d79d..f801f65 100644 --- a/lib/CitoidService.js +++ b/lib/CitoidService.js @@ -131,32 +131,73 @@ urlOpts.search = res.headers.location; citoidService.requestFromURL(urlOpts, callback); } else { - citoidService.logger.log('debug/DOI', "Unable to resolve DOI " + doiOpts.search); - var message = 'Unable to resolve DOI'; + var message = 'Unable to resolve DOI ' + doiOpts.search; var error = new Error(message); + citoidService.logger.log('debug/DOI', message); callback(error, 404, {Error: message}); } }); }; /** - * Request citation metadata from a PubMed identifier. Supports PMID, PMCID, Manuscript ID and versioned identifiers - * @param {Object} opts options object containing PubMed identifier. PMCID identifiers must begin with 'PMC' + * Request citation metadata from a PMID identifier. 
+ * @param {Object} pmidOpts options object containing PMID * @param {Function} callback callback (error, statusCode, body) */ -CitoidService.prototype.requestFromPubMedID = function(opts, callback){ +CitoidService.prototype.requestFromPMID = function(pmidOpts, callback){ var citoidService = this; - pubMedRequest(opts.search, this.logger, function(error, obj){ - if(error) { - callback(error, null, null); + var baseURL = 'http://www.ncbi.nlm.nih.gov/pubmed/'; + var urlOpts = Object.assign({}, pmidOpts); // Shallow clone doiOpts + var pmidURL = baseURL + pmidOpts.search; + + urlOpts.search = pmidURL; + + citoidService.logger.log('debug/pmid', "Converting PMID " + pmidOpts.search + + 'to URL ' + urlOpts.search); + + // Check if url is 200 okay + http.get(pmidURL, function (res) { + if (res && res.statusCode === 200) { + citoidService.requestFromURL(urlOpts, callback); } else { - var doi = obj.records[0].doi; - citoidService.logger.log('debug/pubmed', "Got DOI " + doi); - opts.search = doi; - citoidService.requestFromDOI(opts, callback); + var message = 'Unable to locate resource with PMID ' + + pmidOpts.search; + var error = new Error(message); + citoidService.logger.log('debug/PMID', message); + callback(error, 404, {Error: message}); } }); }; + +/** + * Request citation metadata from a PMCID identifier. 
+ * @param {Object} pmcidOpts options object containing PMCID + * @param {Function} callback callback (error, statusCode, body) + */ +CitoidService.prototype.requestFromPMCID = function(pmcidOpts, callback){ + var citoidService = this; + var baseURL = 'http://www.ncbi.nlm.nih.gov/pmc/articles/'; + var urlOpts = Object.assign({}, pmcidOpts); // Shallow clone doiOpts + var pmcidURL = baseURL + pmcidOpts.search + '/'; + + urlOpts.search = pmcidURL; + + citoidService.logger.log('debug/pmcid', "Converting PMCID " + + pmcidOpts.search + 'to URL ' + urlOpts.search); + // Check if url is 200 okay + http.get(pmcidURL, function (res) { + if (res && res.statusCode === 200) { + citoidService.requestFromURL(urlOpts, callback); + } else { + var message = 'Unable to locate resource with PMCID ' + + pmcidOpts.search; + var error = new Error(message); + citoidService.logger.log('debug/PMCID', message); + callback(error, 404, {Error: message}); + } + }); +}; + /** * Determine type of string (doi, url) and callback on correct handler @@ -164,24 +205,20 @@ * @param {Function} callback callback(extractedValue, correctFunction) */ CitoidService.prototype.distinguish = function(rawSearchInput, callback){ - var reDOI, rePMID, rePMCID, rePMCID2, reHTTP, reWWW, - parsedURL, - matchDOI, matchPMID, matchPMCID, matchHTTP, matchWWW, - search = rawSearchInput.trim(); + var search = rawSearchInput.trim(); - reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all strings with http/s protocol are URLs - reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings with www substring are URLs - reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*'); - rePMID = new RegExp('^\\d{8}\\b'); - rePMCID = new RegExp('\\bPMC\\d{7}\\b'); - rePMCID2 = new RegExp('^\\d{7}\\b'); + var reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all strings with http/s protocol are URLs + var reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings with www substring are URLs + var reDOI = new 
RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*'); + var rePMID = new RegExp('^\\d{8}\\b'); + var rePMCID = new RegExp('\\bPMC\\d{7}\\b'); + var rePMCID2 = new RegExp('^\\d{7}\\b'); - matchHTTP = search.match(reHTTP); - matchDOI = search.match(reDOI); - matchPMID = search.match(rePMID); - matchPMCID = search.match(rePMCID); - matchWWW = search.match(reWWW); - + var matchHTTP = search.match(reHTTP); + var matchDOI = search.match(reDOI); + var matchPMID = search.match(rePMID); + var matchPMCID = search.match(rePMCID); + var matchWWW = search.match(reWWW); if (matchHTTP || matchWWW){ this.stats.increment('input.url'); @@ -191,18 +228,18 @@ callback(matchDOI[0], this.requestFromDOI.bind(this)); } else if (matchPMID) { this.stats.increment('input.pmid'); - callback(matchPMID[0], this.requestFromPubMedID.bind(this)); + callback(matchPMID[0], this.requestFromPMID.bind(this)); } else if (matchPMCID) { this.stats.increment('input.pmcid'); - callback(matchPMCID[0], this.requestFromPubMedID.bind(this)); + callback(matchPMCID[0], this.requestFromPMCID.bind(this)); } else { - matchPMCID = search.match(rePMCID2); + matchPMCID = search.match(rePMCID2); // Detects PMCIDs with no PMC prefix if (matchPMCID) { this.stats.increment('input.pmcid'); - callback('PMC' + matchPMCID[0], this.requestFromPubMedID.bind(this)); + callback('PMC' + matchPMCID[0], this.requestFromPMCID.bind(this)); } else { this.stats.increment('input.url'); - parsedURL = urlParse.parse(search); + var parsedURL = urlParse.parse(search); if (!parsedURL.protocol){ search = 'http://'+ search; } diff --git a/package.json b/package.json index 4bdfddc..1707ab2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "citoid", - "version": "0.2.3", + "version": "0.2.4", "description": "Converts search terms such as URL or DOI into citations.", "scripts": { "start": "service-runner", diff --git a/test/features/errors/index.js b/test/features/errors/index.js index b997ed9..b56e120 100644 --- 
a/test/features/errors/index.js +++ b/test/features/errors/index.js @@ -77,7 +77,32 @@ assert.status(res, 404); }, function(err) { assert.status(err, 404); - assert.deepEqual(err.body.Error, 'Unable to resolve DOI', + assert.deepEqual(err.body.Error, 'Unable to resolve DOI ' + doi, + 'Unexpected error message ' + err.body.Error); + }); + }); + + it('bad pmid', function() { + var pmid = '99999999'; + return server.query(pmid, 'mediawiki', 'en') + .then(function(res) { + assert.status(res, 404); + }, function(err) { + assert.status(err, 404); + assert.deepEqual(err.body.Error, + 'Unable to locate resource with PMID ' + pmid, + 'Unexpected error message ' + err.body.Error); + }); + }); + + it('bad pmcid', function() { + var pmcid = 'PMC9999999'; + return server.query(pmcid, 'mediawiki', 'en') + .then(function(res) { + assert.status(res, 404); + }, function(err) { + assert.status(err, 404); + assert.deepEqual(err.body.Error, 'Unable to locate resource with PMCID ' + pmcid, 'Unexpected error message ' + err.body.Error); }); }); diff --git a/test/features/scraping/index.js b/test/features/scraping/index.js index ff2e333..5538399 100644 --- a/test/features/scraping/index.js +++ b/test/features/scraping/index.js @@ -12,8 +12,23 @@ before(function () { return server.start(); }); - it('pmid', function() { - return server.query('23555203').then(function(res) { + //PMID on NIH website that is not found in the id converter api + it('pmid (not in id converter)', function() { + return server.query('14656957').then(function(res) { + assert.status(res, 200); + assert.checkCitation(res, 'Seventh report of the Joint National Committee on Prevention, Detection, Evaluation, and Treatment of High Blood Pressure'); + }); + }); + + it('pmcid with prefix', function() { + return server.query('PMC3605911').then(function(res) { + assert.status(res, 200); + assert.checkCitation(res, 'Viral Phylodynamics'); + }); + }); + + it('pmcid without prefix', function() { + return 
server.query('3605911').then(function(res) { assert.status(res, 200); assert.checkCitation(res, 'Viral Phylodynamics'); }); -- To view, visit https://gerrit.wikimedia.org/r/200314 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/citoid Gerrit-Branch: master Gerrit-Owner: Mvolz <mv...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits