Santhosh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/129653

Change subject: Simple clean up of article before parsing
......................................................................

Simple clean up of article before parsing

Just get the body of the content from Parsoid

Change-Id: I8f4127f4d21f1c00652ef1e18baac0c9d1ff0612
---
M segmentation/CXSegmenter.js
1 file changed, 11 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/53/129653/1

diff --git a/segmentation/CXSegmenter.js b/segmentation/CXSegmenter.js
index 7591432..9ea81fb 100644
--- a/segmentation/CXSegmenter.js
+++ b/segmentation/CXSegmenter.js
@@ -27,6 +27,7 @@
 };
 
 CXSegmenter.prototype.parse = function () {
+       this.content = this.getBody( this.content );
        this.parser.parse( this.content );
        this.links = this.parser.links;
        this.segmentedContent = this.parser.segmentedContent;
@@ -34,6 +35,15 @@
 
 CXSegmenter.prototype.getLinks = function () {
        return this.links;
+};
+
+/**
+ * Get the body part alone for the content.
+ */
+CXSegmenter.prototype.getBody = function () {
+       var $container = $( '<div>' ).html( this.content );
+       $container = $container.find( 'body' );
+       return $container.length ? $container[ 0 ].outerHTML : this.content;
 };
 
 CXSegmenter.prototype.extractSegments = function () {
@@ -44,7 +54,7 @@
                var $section = $( section ),
                        segmentId = $section.data( 'segmentid' );
 
-               segmenter.segments[segmentId] = {
+               segmenter.segments[ segmentId ] = {
                        source: $section.html()
                };
        } );

-- 
To view, visit https://gerrit.wikimedia.org/r/129653
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8f4127f4d21f1c00652ef1e18baac0c9d1ff0612
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to