Mvolz has uploaded a new change for review. https://gerrit.wikimedia.org/r/211718
Change subject: [WIP] Promisify Scraper.js ...................................................................... [WIP] Promisify Scraper.js Currently WIP; only method currently promisified is parseHTML. Uses promisifymerged branch of html-metadata until changes are merged into master there. Bug: T75993 Change-Id: I4c2c3638f4c6cc580f1ed45f82874da8ed3db06c --- M lib/Scraper.js M package.json 2 files changed, 69 insertions(+), 58 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/citoid refs/changes/18/211718/1 diff --git a/lib/Scraper.js b/lib/Scraper.js index 0df0d52..2022760 100644 --- a/lib/Scraper.js +++ b/lib/Scraper.js @@ -11,11 +11,12 @@ var request = require('request'); var urlParse = require('url'); var cheerio = require('cheerio'); -var parseMetaData = require('html-metadata').parseAll; +var parseOpenGraph = require('html-metadata').parseOpenGraph; var og = require('./translators/openGraph.js'); var gen = require('./translators/general.js'); var iconv = require('iconv-lite'); var contentType = require('content-type'); +var BBPromise = require('bluebird'); var Scraper = function(citoidConfig, logger){ this.logger = logger; @@ -103,7 +104,8 @@ // If the html has been successfully loaded into cheerio, proceed. if (chtml){ citation.title = null; - scraper.parseHTML(url, chtml, citation, function(citation){ + scraper.parseHTML(url, chtml, citation) + .then(function(citation){ logger.log('debug/scraper', "Sucessfully scraped resource at " + url); callback(null, 200, [citation]); }); @@ -196,68 +198,77 @@ }; /** - * Adds html metadata to a given citation object given - * the html loaded into cheerio - * @param {String} url url being scraped - * @param {Objct} chtml Cheerio object with html loaded - * @param {Object} citation a citation object contain default parameters - * @param {Function} callback callback(citation) + * Promise for citation object with html metadata added to default + * citation object + * + * @param {String} url url being scraped + * @param {Object} chtml Cheerio object with html loaded + * @param {Object} citation a citation object contain default parameters + * @return {Object} Bluebird promise for citation object */ -Scraper.prototype.parseHTML = function(url, chtml, citation, callback){ - var metaData, typeTranslator, parsedUrl; +Scraper.prototype.parseHTML = BBPromise.method(function(url, chtml, citation){ + var metaData = {}; + var typeTranslator; + var parsedUrl; - parseMetaData(chtml, function(err, results){ - metaData = results; //only use open graph here - }); + return parseOpenGraph(chtml) + .catch(function(e){ + console.log('error'); + return citation; + }) + .then(function(results){ + console.log('results'); + metaData.openGraph = results; //only currently using open graph - // translator/openGraph.js properties + // translator/openGraph.js properties - // Set zotero type from OpenGraph type - if (metaData.openGraph['type'] && og.types[metaData.openGraph['type']]){ // if there is a type in the results and that type is defined in openGraph.js - citation.itemType = og.types[metaData.openGraph['type']]; - } - else { - citation.itemType = 'webpage'; //default itemType - } - - // Add universal (non type specific) OpenGraph properties - citation = translate(citation, metaData.openGraph, og.general); - - // Add type specific Open Graph properties - typeTranslator = og[citation.itemType]; - if (typeTranslator){ - citation = translate(citation, metaData.openGraph, typeTranslator); - } - - // Fall back on general metadata - citation = translate(citation, metaData.general, gen.general); - - // Fall back methods - - // Title - if (!citation.title){ - citation.title = getTitle(url, chtml); - } - - // Access date - universal - format YYYY-MM-DD - citation.accessDate = (new Date()).toISOString().substring(0, 10); - - // Fall back websiteTitle - webpage only - if (citation.itemType === 'webpage' && !citation.websiteTitle){ - parsedUrl = urlParse.parse(url); - if (citation.title && parsedUrl && parsedUrl.hostname) { - citation.websiteTitle = parsedUrl.hostname; + // Set zotero type from OpenGraph type + if (metaData.openGraph['type'] && og.types[metaData.openGraph['type']]){ // if there is a type in the results and that type is defined in openGraph.js + citation.itemType = og.types[metaData.openGraph['type']]; } - } + else { + citation.itemType = 'webpage'; //default itemType + } - // Fall back publicationTitle - webpage only - // TODO: REMOVE BLOCK - temporarily kept in for backwards compatibility - if (citation.itemType === 'webpage' && citation.websiteTitle){ - citation.publicationTitle = citation.websiteTitle; - } + // Add universal (non type specific) OpenGraph properties + citation = translate(citation, metaData.openGraph, og.general); - callback(citation); -}; + // Add type specific Open Graph properties + typeTranslator = og[citation.itemType]; + if (typeTranslator){ + citation = translate(citation, metaData.openGraph, typeTranslator); + } + + // Fall back on general metadata + citation = translate(citation, metaData.general, gen.general); + + // Fall back methods + + // Title + if (!citation.title){ + citation.title = getTitle(url, chtml); + } + + // Access date - universal - format YYYY-MM-DD + citation.accessDate = (new Date()).toISOString().substring(0, 10); + + // Fall back websiteTitle - webpage only + if (citation.itemType === 'webpage' && !citation.websiteTitle){ + parsedUrl = urlParse.parse(url); + if (citation.title && parsedUrl && parsedUrl.hostname) { + citation.websiteTitle = parsedUrl.hostname; + } + } + + // Fall back publicationTitle - webpage only + // TODO: REMOVE BLOCK - temporarily kept in for backwards compatibility + if (citation.itemType === 'webpage' && citation.websiteTitle){ + citation.publicationTitle = citation.websiteTitle; + } + + return citation; + }); +}); /** * Gets title in other ways if not metadata is available diff --git a/package.json b/package.json index 9727bf7..e06867c 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,7 @@ "compression": "1.4.3", "content-type": "1.0.1", "express": "4.12.3", - "html-metadata": "0.1.3", + "html-metadata": "git://github.com/wikimedia/html-metadata.git#promisifymerged", "iconv-lite": "0.4.8", "js-yaml": "3.2.7", "node-uuid": "1.4.3", -- To view, visit https://gerrit.wikimedia.org/r/211718 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I4c2c3638f4c6cc580f1ed45f82874da8ed3db06c Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/citoid Gerrit-Branch: master Gerrit-Owner: Mvolz <mv...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits