Mvolz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/200314

Change subject: Change how pubmed and pmcs are requested
......................................................................

Change how pubmed and pmcs are requested

The PubMed ID converter API was frequently
not returning results for valid PMIDs, so
instead we request metadata from PubMed
URLs directly.

CitoidService

* Remove requestFromPubMedID, which used
the ID converter API to get a DOI
* Replace it with requestFromPMCID and
requestFromPMID, which concatenate the ID
onto a PubMed URL, verify that the server
sends a 200 OK response code, and pass the
URL to requestFromURL.

Tests

* Change the PMID test to use a PMID which has
no results in the converter API but has a
valid URL.
* Add test for PMC with PMC prefix
* Add test for PMC without PMC prefix
* Add test for invalid PMCID
* Add test for invalid PMID

Bug: T93335
Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5
---
M lib/CitoidService.js
M package.json
M test/features/errors/index.js
M test/features/scraping/index.js
4 files changed, 114 insertions(+), 37 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid 
refs/changes/14/200314/1

diff --git a/lib/CitoidService.js b/lib/CitoidService.js
index 9b3d79d..f801f65 100644
--- a/lib/CitoidService.js
+++ b/lib/CitoidService.js
@@ -131,32 +131,73 @@
                        urlOpts.search = res.headers.location;
                        citoidService.requestFromURL(urlOpts, callback);
                } else {
-                       citoidService.logger.log('debug/DOI', "Unable to 
resolve DOI " + doiOpts.search);
-                       var message = 'Unable to resolve DOI';
+                       var message = 'Unable to resolve DOI ' + doiOpts.search;
                        var error = new Error(message);
+                       citoidService.logger.log('debug/DOI', message);
                        callback(error, 404, {Error: message});
                }
        });
 };
 
 /**
- * Request citation metadata from a PubMed identifier. Supports PMID, PMCID, 
Manuscript ID and versioned identifiers
- * @param  {Object}   opts       options object containing PubMed identifier. 
PMCID identifiers must begin with 'PMC'
+ * Request citation metadata from a PMID identifier.
+ * @param  {Object}   pmidOpts   options object containing PMID
  * @param  {Function} callback   callback (error, statusCode, body)
  */
-CitoidService.prototype.requestFromPubMedID = function(opts, callback){
+CitoidService.prototype.requestFromPMID = function(pmidOpts, callback){
        var citoidService = this;
-       pubMedRequest(opts.search, this.logger, function(error, obj){
-               if(error) {
-                       callback(error, null, null);
+       var baseURL = 'http://www.ncbi.nlm.nih.gov/pubmed/';
+       var urlOpts =  Object.assign({}, pmidOpts); // Shallow clone doiOpts
+       var pmidURL = baseURL + pmidOpts.search;
+
+       urlOpts.search = pmidURL;
+
+       citoidService.logger.log('debug/pmid', "Converting PMID " + 
pmidOpts.search +
+               'to URL ' + urlOpts.search);
+
+       // Check if url is 200 okay
+       http.get(pmidURL, function (res) {
+               if (res && res.statusCode === 200) {
+                       citoidService.requestFromURL(urlOpts, callback);
                } else {
-                       var doi = obj.records[0].doi;
-                       citoidService.logger.log('debug/pubmed', "Got DOI " + 
doi);
-                       opts.search = doi;
-                       citoidService.requestFromDOI(opts, callback);
+                       var message = 'Unable to locate resource with PMID '
+                               + pmidOpts.search;
+                       var error = new Error(message);
+                       citoidService.logger.log('debug/PMID', message);
+                       callback(error, 404, {Error: message});
                }
        });
 };
+
+/**
+ * Request citation metadata from a PMCID identifier.
+ * @param  {Object}   pmcidOpts   options object containing PMCID
+ * @param  {Function} callback   callback (error, statusCode, body)
+ */
+CitoidService.prototype.requestFromPMCID = function(pmcidOpts, callback){
+       var citoidService = this;
+       var baseURL = 'http://www.ncbi.nlm.nih.gov/pmc/articles/';
+       var urlOpts =  Object.assign({}, pmcidOpts); // Shallow clone doiOpts
+       var pmcidURL = baseURL + pmcidOpts.search + '/';
+
+       urlOpts.search = pmcidURL;
+
+       citoidService.logger.log('debug/pmcid', "Converting PMCID "
+               + pmcidOpts.search + 'to URL ' + urlOpts.search);
+       // Check if url is 200 okay
+       http.get(pmcidURL, function (res) {
+               if (res && res.statusCode === 200) {
+                       citoidService.requestFromURL(urlOpts, callback);
+               } else {
+                       var message = 'Unable to locate resource with PMCID '
+                               + pmcidOpts.search;
+                       var error = new Error(message);
+                       citoidService.logger.log('debug/PMCID', message);
+                       callback(error, 404, {Error: message});
+               }
+       });
+};
+
 
 /**
  * Determine type of string (doi, url) and callback on correct handler
@@ -164,24 +205,20 @@
  * @param  {Function} callback          callback(extractedValue, 
correctFunction)
  */
 CitoidService.prototype.distinguish = function(rawSearchInput, callback){
-       var reDOI, rePMID, rePMCID, rePMCID2, reHTTP, reWWW,
-               parsedURL,
-               matchDOI, matchPMID, matchPMCID, matchHTTP, matchWWW,
-               search = rawSearchInput.trim();
+       var search = rawSearchInput.trim();
 
-       reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all strings 
with http/s protocol are URLs
-       reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings with 
www substring are URLs
-       reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
-       rePMID = new RegExp('^\\d{8}\\b');
-       rePMCID = new RegExp('\\bPMC\\d{7}\\b');
-       rePMCID2 = new RegExp('^\\d{7}\\b');
+       var reHTTP = new RegExp('^((https?)://.+\\..+)'); // Assumes all 
strings with http/s protocol are URLs
+       var reWWW = new RegExp('^((www)\\..+\\..+)'); // Assumes all strings 
with www substring are URLs
+       var reDOI = new RegExp('\\b10\\.?[0-9]{3,4}(?:[.][0-9]+)*/.*');
+       var rePMID = new RegExp('^\\d{8}\\b');
+       var rePMCID = new RegExp('\\bPMC\\d{7}\\b');
+       var rePMCID2 = new RegExp('^\\d{7}\\b');
 
-       matchHTTP = search.match(reHTTP);
-       matchDOI = search.match(reDOI);
-       matchPMID = search.match(rePMID);
-       matchPMCID = search.match(rePMCID);
-       matchWWW = search.match(reWWW);
-
+       var matchHTTP = search.match(reHTTP);
+       var matchDOI = search.match(reDOI);
+       var matchPMID = search.match(rePMID);
+       var matchPMCID = search.match(rePMCID);
+       var matchWWW = search.match(reWWW);
 
        if (matchHTTP || matchWWW){
                this.stats.increment('input.url');
@@ -191,18 +228,18 @@
                callback(matchDOI[0], this.requestFromDOI.bind(this));
        } else if (matchPMID) {
                this.stats.increment('input.pmid');
-               callback(matchPMID[0], this.requestFromPubMedID.bind(this));
+               callback(matchPMID[0], this.requestFromPMID.bind(this));
        } else if (matchPMCID) {
                this.stats.increment('input.pmcid');
-               callback(matchPMCID[0], this.requestFromPubMedID.bind(this));
+               callback(matchPMCID[0], this.requestFromPMCID.bind(this));
        } else {
-               matchPMCID = search.match(rePMCID2);
+               matchPMCID = search.match(rePMCID2); // Detects PMCIDs with no 
PMC prefix
                if (matchPMCID) {
                        this.stats.increment('input.pmcid');
-                       callback('PMC' + matchPMCID[0], 
this.requestFromPubMedID.bind(this));
+                       callback('PMC' + matchPMCID[0], 
this.requestFromPMCID.bind(this));
                } else {
                        this.stats.increment('input.url');
-                       parsedURL = urlParse.parse(search);
+                       var parsedURL = urlParse.parse(search);
                        if (!parsedURL.protocol){
                                search = 'http://'+ search;
                        }
diff --git a/package.json b/package.json
index 4bdfddc..1707ab2 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "citoid",
-  "version": "0.2.3",
+  "version": "0.2.4",
   "description": "Converts search terms such as URL or DOI into citations.",
   "scripts": {
     "start": "service-runner",
diff --git a/test/features/errors/index.js b/test/features/errors/index.js
index b997ed9..b56e120 100644
--- a/test/features/errors/index.js
+++ b/test/features/errors/index.js
@@ -77,7 +77,32 @@
                        assert.status(res, 404);
                }, function(err) {
                        assert.status(err, 404);
-                       assert.deepEqual(err.body.Error, 'Unable to resolve 
DOI',
+                       assert.deepEqual(err.body.Error, 'Unable to resolve DOI 
' + doi,
+                               'Unexpected error message ' + err.body.Error);
+               });
+       });
+
+       it('bad pmid', function() {
+               var pmid = '99999999';
+               return server.query(pmid, 'mediawiki', 'en')
+               .then(function(res) {
+                       assert.status(res, 404);
+               }, function(err) {
+                       assert.status(err, 404);
+                       assert.deepEqual(err.body.Error,
+                               'Unable to locate resource with PMID ' + pmid,
+                               'Unexpected error message ' + err.body.Error);
+               });
+       });
+
+       it('bad pmcid', function() {
+               var pmcid = 'PMC9999999';
+               return server.query(pmcid, 'mediawiki', 'en')
+               .then(function(res) {
+                       assert.status(res, 404);
+               }, function(err) {
+                       assert.status(err, 404);
+                       assert.deepEqual(err.body.Error, 'Unable to locate 
resource with PMCID ' + pmcid,
                                'Unexpected error message ' + err.body.Error);
                });
        });
diff --git a/test/features/scraping/index.js b/test/features/scraping/index.js
index ff2e333..5538399 100644
--- a/test/features/scraping/index.js
+++ b/test/features/scraping/index.js
@@ -12,8 +12,23 @@
 
        before(function () { return server.start(); });
 
-       it('pmid', function() {
-               return server.query('23555203').then(function(res) {
+       //PMID on NIH website that is not found in the id converter api
+       it('pmid (not in id converter)', function() {
+               return server.query('14656957').then(function(res) {
+                       assert.status(res, 200);
+                       assert.checkCitation(res, 'Seventh report of the Joint 
National Committee on Prevention, Detection, Evaluation, and Treatment of High 
Blood Pressure');
+               });
+       });
+
+       it('pmcid with prefix', function() {
+               return server.query('PMC3605911').then(function(res) {
+                       assert.status(res, 200);
+                       assert.checkCitation(res, 'Viral Phylodynamics');
+               });
+       });
+
+       it('pmcid without prefix', function() {
+               return server.query('3605911').then(function(res) {
                        assert.status(res, 200);
                        assert.checkCitation(res, 'Viral Phylodynamics');
                });

-- 
To view, visit https://gerrit.wikimedia.org/r/200314
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie2830d42fa63fecd30db72f723e6d5d9979f51b5
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <mv...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to