Mvolz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/211718

Change subject: [WIP] Promisify Scraper.js
......................................................................

[WIP] Promisify Scraper.js

Work in progress; the only method promisified
so far is parseHTML.

Uses the promisifymerged branch of html-metadata
until the changes are merged into master there.

Bug: T75993
Change-Id: I4c2c3638f4c6cc580f1ed45f82874da8ed3db06c
---
M lib/Scraper.js
M package.json
2 files changed, 69 insertions(+), 58 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid 
refs/changes/18/211718/1

diff --git a/lib/Scraper.js b/lib/Scraper.js
index 0df0d52..2022760 100644
--- a/lib/Scraper.js
+++ b/lib/Scraper.js
@@ -11,11 +11,12 @@
 var request = require('request');
 var urlParse = require('url');
 var cheerio = require('cheerio');
-var parseMetaData = require('html-metadata').parseAll;
+var parseOpenGraph = require('html-metadata').parseOpenGraph;
 var og = require('./translators/openGraph.js');
 var gen = require('./translators/general.js');
 var iconv = require('iconv-lite');
 var contentType = require('content-type');
+var BBPromise = require('bluebird');
 
 var Scraper = function(citoidConfig, logger){
        this.logger = logger;
@@ -103,7 +104,8 @@
                                // If the html has been successfully loaded 
into cheerio, proceed.
                                if (chtml){
                                        citation.title = null;
-                                       scraper.parseHTML(url, chtml, citation, 
function(citation){
+                                       scraper.parseHTML(url, chtml, citation)
+                                       .then(function(citation){
                                                logger.log('debug/scraper', 
"Sucessfully scraped resource at " + url);
                                                callback(null, 200, [citation]);
                                        });
@@ -196,68 +198,77 @@
 };
 
 /**
- * Adds html metadata to a given citation object given
- * the html loaded into cheerio
- * @param  {String}   url      url being scraped
- * @param  {Objct}   chtml     Cheerio object with html loaded
- * @param  {Object}   citation a citation object contain default parameters
- * @param  {Function} callback callback(citation)
+ * Promise for citation object with html metadata added to default
+ * citation object
+ *
+ * @param  {String} url             url being scraped
+ * @param  {Object} chtml           Cheerio object with html loaded
+ * @param  {Object} citation        a citation object contain default 
parameters
+ * @return {Object}                 Bluebird promise for citation object
  */
-Scraper.prototype.parseHTML = function(url, chtml, citation, callback){
-       var metaData, typeTranslator, parsedUrl;
+Scraper.prototype.parseHTML = BBPromise.method(function(url, chtml, citation){
+       var metaData = {};
+       var typeTranslator;
+       var parsedUrl;
 
-       parseMetaData(chtml, function(err, results){
-               metaData = results; //only use open graph here
-       });
+       return parseOpenGraph(chtml)
+       .catch(function(e){
+               console.log('error');
+               return citation;
+       })
+       .then(function(results){
+               console.log('results');
+               metaData.openGraph = results; //only currently using open graph
 
-       // translator/openGraph.js properties
+               // translator/openGraph.js properties
 
-       // Set zotero type from OpenGraph type
-       if (metaData.openGraph['type'] && 
og.types[metaData.openGraph['type']]){ // if there is a type in the results and 
that type is defined in openGraph.js
-               citation.itemType = og.types[metaData.openGraph['type']];
-       }
-       else {
-               citation.itemType = 'webpage'; //default itemType
-       }
-
-       // Add universal (non type specific) OpenGraph properties
-       citation = translate(citation, metaData.openGraph, og.general);
-
-       // Add type specific Open Graph properties
-       typeTranslator = og[citation.itemType];
-       if (typeTranslator){
-               citation = translate(citation, metaData.openGraph, 
typeTranslator);
-       }
-
-       // Fall back on general metadata
-       citation  = translate(citation, metaData.general, gen.general);
-
-       // Fall back methods
-
-       // Title
-       if (!citation.title){
-               citation.title = getTitle(url, chtml);
-       }
-
-       // Access date - universal - format YYYY-MM-DD
-       citation.accessDate = (new Date()).toISOString().substring(0, 10);
-
-       // Fall back websiteTitle - webpage only
-       if (citation.itemType === 'webpage' && !citation.websiteTitle){
-               parsedUrl = urlParse.parse(url);
-               if (citation.title && parsedUrl && parsedUrl.hostname) {
-                       citation.websiteTitle = parsedUrl.hostname;
+               // Set zotero type from OpenGraph type
+               if (metaData.openGraph['type'] && 
og.types[metaData.openGraph['type']]){ // if there is a type in the results and 
that type is defined in openGraph.js
+                       citation.itemType = 
og.types[metaData.openGraph['type']];
                }
-       }
+               else {
+                       citation.itemType = 'webpage'; //default itemType
+               }
 
-       // Fall back publicationTitle - webpage only
-       // TODO: REMOVE BLOCK - temporarily kept in for backwards compatibility
-       if (citation.itemType === 'webpage' && citation.websiteTitle){
-               citation.publicationTitle = citation.websiteTitle;
-       }
+               // Add universal (non type specific) OpenGraph properties
+               citation = translate(citation, metaData.openGraph, og.general);
 
-       callback(citation);
-};
+               // Add type specific Open Graph properties
+               typeTranslator = og[citation.itemType];
+               if (typeTranslator){
+                       citation = translate(citation, metaData.openGraph, 
typeTranslator);
+               }
+
+               // Fall back on general metadata
+               citation  = translate(citation, metaData.general, gen.general);
+
+               // Fall back methods
+
+               // Title
+               if (!citation.title){
+                       citation.title = getTitle(url, chtml);
+               }
+
+               // Access date - universal - format YYYY-MM-DD
+               citation.accessDate = (new Date()).toISOString().substring(0, 
10);
+
+               // Fall back websiteTitle - webpage only
+               if (citation.itemType === 'webpage' && !citation.websiteTitle){
+                       parsedUrl = urlParse.parse(url);
+                       if (citation.title && parsedUrl && parsedUrl.hostname) {
+                               citation.websiteTitle = parsedUrl.hostname;
+                       }
+               }
+
+               // Fall back publicationTitle - webpage only
+               // TODO: REMOVE BLOCK - temporarily kept in for backwards 
compatibility
+               if (citation.itemType === 'webpage' && citation.websiteTitle){
+                       citation.publicationTitle = citation.websiteTitle;
+               }
+
+               return citation;
+       });
+});
 
 /**
  * Gets title in other ways if not metadata is available
diff --git a/package.json b/package.json
index 9727bf7..e06867c 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,7 @@
     "compression": "1.4.3",
     "content-type": "1.0.1",
     "express": "4.12.3",
-    "html-metadata": "0.1.3",
+    "html-metadata": 
"git://github.com/wikimedia/html-metadata.git#promisifymerged",
     "iconv-lite": "0.4.8",
     "js-yaml": "3.2.7",
     "node-uuid": "1.4.3",

-- 
To view, visit https://gerrit.wikimedia.org/r/211718
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4c2c3638f4c6cc580f1ed45f82874da8ed3db06c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/citoid
Gerrit-Branch: master
Gerrit-Owner: Mvolz <mv...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to