Santhosh has uploaded a new change for review. https://gerrit.wikimedia.org/r/129653
Change subject: Simple clean up of article before parsing
......................................................................

Simple clean up of article before parsing

Just get body of the content from Parsoid

Change-Id: I8f4127f4d21f1c00652ef1e18baac0c9d1ff0612
---
M segmentation/CXSegmenter.js
1 file changed, 11 insertions(+), 1 deletion(-)

  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/53/129653/1

diff --git a/segmentation/CXSegmenter.js b/segmentation/CXSegmenter.js
index 7591432..9ea81fb 100644
--- a/segmentation/CXSegmenter.js
+++ b/segmentation/CXSegmenter.js
@@ -27,6 +27,7 @@
 };
 
 CXSegmenter.prototype.parse = function () {
+	this.content = this.getBody( this.content );
 	this.parser.parse( this.content );
 	this.links = this.parser.links;
 	this.segmentedContent = this.parser.segmentedContent;
@@ -34,6 +35,15 @@
 
 CXSegmenter.prototype.getLinks = function () {
 	return this.links;
+};
+
+/**
+ * Get the body part alone for the content.
+ */
+CXSegmenter.prototype.getBody = function () {
+	var $container = $( '<div>' ).html( this.content );
+	$container = $container.find( 'body' );
+	return $container.length ? $container[ 0 ].outerHTML : this.content;
 };
 
 CXSegmenter.prototype.extractSegments = function () {
@@ -44,7 +54,7 @@
 		var $section = $( section ),
 			segmentId = $section.data( 'segmentid' );
 
-		segmenter.segments[segmentId] = {
+		segmenter.segments[ segmentId ] = {
 			source: $section.html()
 		};
 	} );

-- 
To view, visit https://gerrit.wikimedia.org/r/129653
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8f4127f4d21f1c00652ef1e18baac0c9d1ff0612
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits