Jon a écrit : > Hi, I'm new to Rhino and would like to make a scraper using xpath > queries. Specifically, I wanted to utilize firefox's xpath evaluation > (document.evaluate), which seems to handle lots of "dirty" html pages > quite well. Normally, I use/write greasy scripts, but I wanted to run > the scraper on a server machine. > > I've found some example scrapers, but they all use regular expressions. > I would like to use xpath as firebug makes it easy to scrape from its > xpath links. > > Any thoughts? > -Jon
You can build on John Resig work: http://ejohn.org/blog/bringing-the-browser-to-the-server/ and on this: // XPath stuff var xpathFactory = new javax.xml.xpath.XPathFactory.newInstance(); var xpathTypeNodeSet = javax.xml.xpath.XPathConstants.NODESET; var xpathTypeNode = javax.xml.xpath.XPathConstants.NODE; var xpathTypeBoolean = javax.xml.xpath.XPathConstants.BOOLEAN; var xpathTypeNumber = javax.xml.xpath.XPathConstants.NUMBER; var XPathResult = function(_dom, xpath) { this._dom = _dom; this._xpath = xpath; } XPathResult.prototype = { toString: function() { return this._xpath.evaluate(this._dom); }, toBoolean: function() { return this._xpath.evaluate(this._dom, xpathTypeNodeBoolean); }, toNumber: function() { return this._xpath.evaluate(this._dom, xpathTypeNodeNumber); }, get nodes() { return new DOMNodeList(this._xpath.evaluate(this._dom, xpathTypeNodeSet)); }, set nodes(n) { if (n instanceof DOMDocument) n = n.documentElement; var _dom = this._dom; var doc = _dom.ownerDocument || makeNode(_dom); if (!(n instanceof DOMNode)) n = doc.createTextNode(String(n)); this.nodes.forEach(function(node) { node.parentNode.replaceChild(doc.importNode(n, true), node)}); }, get node() { return makeNode(this._xpath.evaluate(this._dom, xpathTypeNode)); }, set node(n) { if (n instanceof DOMDocument) n = n.documentElement; var _dom = this._dom; var doc = _dom.ownerDocument || makeNode(_dom); if (!(n instanceof DOMNode)) n = doc.createTextNode(String(n)); var oldNode = this.node; oldNode.parentNode.replaceChild(doc.importNode(n, true), oldNode); } } var xpathMixin = { xpath: function (sxpath) { var xpath = xpathFactory.newXPath(); return new XPathResult(this._dom, xpath.compile(sxpath)); }, select: function (sxpath) { return this.xpath(sxpath).nodes; } }; [DOMNode, DOMNodeList, DOMDocument].forEach(function(x){ extend(x.prototype, xpathMixin); }); _______________________________________________ dev-tech-js-engine-rhino mailing list [email protected] 
https://lists.mozilla.org/listinfo/dev-tech-js-engine-rhino
