jenkins-bot has submitted this change and it was merged.

Change subject: Functional segmentation
......................................................................


Functional segmentation

Change-Id: I0aa31a0e5035f124910b41f4a4f8a72cab969c73
---
M server/models/dataModelManager.js
M server/public/index.html
M server/public/js/main.js
A server/segmentation/CXSegmenter.js
A server/segmentation/linkSegmenter.js
A server/segmentation/paragraphSegmenter.js
A server/segmentation/segmenter.js
A server/segmentation/sentenceSegmenter.js
8 files changed, 279 insertions(+), 16 deletions(-)

Approvals:
  KartikMistry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/server/models/dataModelManager.js 
b/server/models/dataModelManager.js
index 9c60840..346bd62 100644
--- a/server/models/dataModelManager.js
+++ b/server/models/dataModelManager.js
@@ -1,13 +1,15 @@
 /**
  * ContentTranslation Server
-
-* @file
+ *
+ * @file
  * @ingroup Extensions
  * @copyright See AUTHORS.txt
  * @license GPL-2.0+
  */
 
 'use strict';
+
+var CXSegmenter = require( __dirname + '/../segmentation/CXSegmenter.js' 
).CXSegmenter;
 
 /**
  * CXDataModelManager
@@ -23,19 +25,18 @@
  * Initialize
  */
 CXDataModelManager.prototype.init = function () {
-       var dataModelManager = this;
+       var dataModelManager = this, segmenter;
 
+       segmenter = new CXSegmenter( this.context.sourceText );
+       segmenter.segment();
        this.dataModel = {
                version: 0,
-               sourceLang: this.context.sourceLanguage,
-               targetLang: this.context.targetLanguage,
+               sourceLanguage: this.context.sourceLanguage,
+               targetLanguage: this.context.targetLanguage,
                sourceLocation: this.context.sourceTitle,
-               segments: [],
-               segmentedContent: [],
-               segmentCount: 0,
-               dictionary: null,
-               glossary: null,
-               links: null
+               segments: segmenter.getSegments(),
+               segmentedContent: segmenter.getSegmentedContent(),
+               links: segmenter.getLinks()
        };
        dataModelManager.refresh();
 };
diff --git a/server/public/index.html b/server/public/index.html
index c38253c..da8c352 100644
--- a/server/public/index.html
+++ b/server/public/index.html
@@ -33,9 +33,14 @@
                        color: green;
                        font-size: 0.8em;
                }
-               .segment:hover {
+               .cx-segment:hover {
                        background-color: #ccc;
                }
+
+               .cx-link:hover {
+                       background-color: #aaa;
+               }
+
                .sourceText {
                        clear: both;
                }
@@ -52,7 +57,8 @@
                <input name="targetLanguage" value="cy" />
                <br/>
                <div contenteditable class="sourceText">
-                       Hydrogen is a chemical element with chemical symbol H 
and atomic number 1. With an atomic weight of 1.00794 u, hydrogen is the 
lightest element on the periodic table. Its monatomic form (H) is the most 
abundant chemical substance in the universe, constituting roughly 75% of all 
baryonic mass.Non-remnant stars are mainly composed of hydrogen in its plasma 
state. The most common isotope of hydrogen, termed protium (name rarely used, 
symbol 1H), has a single proton and zero neutrons.
+                       <p><b>Hydrogen</b> is a <a 
href="/wiki/Chemical_element" title="Chemical element">chemical element</a> 
with <a href="/wiki/Chemical_symbol" title="Chemical symbol" 
class="mw-redirect">chemical symbol</a>&nbsp;<b>H</b> and <a 
href="/wiki/Atomic_number" title="Atomic number">atomic number</a>&nbsp;1. With 
an <a href="/wiki/Atomic_weight" title="Atomic weight" 
class="mw-redirect">atomic weight</a> of <span 
style="white-space:nowrap">1.00794&nbsp;<a href="/wiki/Atomic_mass_unit" 
title="Atomic mass unit">u</a></span>, hydrogen is the lightest element on the 
<a href="/wiki/Periodic_table" title="Periodic table">periodic table</a>. Its 
<a href="/wiki/Monatomic" title="Monatomic" class="mw-redirect">monatomic</a> 
form (H) is the <a href="/wiki/Abundance_of_the_chemical_elements" 
title="Abundance of the chemical elements">most abundant</a> chemical substance 
in the universe, constituting roughly 75% of all <a href="/wiki/Baryon" 
title="Baryon">baryonic</a> mass.<sup id="cite_ref-7" class="reference"><a 
href="#cite_note-7"><span>[</span>7<span>]</span></a></sup><sup id="cite_ref-8" 
class="reference"><a href="#cite_note-8"><span>[</span>note 
1<span>]</span></a></sup> Non-<a href="/wiki/Stellar_remnant" title="Stellar 
remnant" class="mw-redirect">remnant</a> <a href="/wiki/Star" 
title="Star">stars</a> are mainly composed of hydrogen in its <a 
href="/wiki/Plasma_(physics)" title="Plasma (physics)">plasma</a> state. The 
most common <a href="/wiki/Isotope" title="Isotope">isotope</a> of hydrogen, 
termed <i>protium</i> (name rarely used, symbol <sup>1</sup>H), has a single 
proton and zero <a href="/wiki/Neutron" title="Neutron">neutrons</a>.</p>
+                       <p>The universal emergence of atomic hydrogen first 
occurred during the <a href="/wiki/Recombination_(cosmology)" 
title="Recombination (cosmology)">recombination epoch</a>. At <a 
href="/wiki/Standard_temperature_and_pressure" title="Standard temperature and 
pressure" class="mw-redirect">standard temperature and pressure</a>, hydrogen 
is a <a href="/wiki/Transparency_(optics)" title="Transparency (optics)" 
class="mw-redirect">colorless</a>, <a href="/wiki/Odorless" title="Odorless" 
class="mw-redirect">odorless</a>, <a href="/wiki/Taste" 
title="Taste">tasteless</a>, non-toxic, <a href="/wiki/Nonmetal" 
title="Nonmetal">nonmetallic</a>, highly <a href="/wiki/Combustion" 
title="Combustion">combustible</a> <a href="/wiki/Diatomic_molecule" 
title="Diatomic molecule">diatomic</a> <a href="/wiki/Gas" title="Gas">gas</a> 
with the <a href="/wiki/Molecular_formula" title="Molecular formula" 
class="mw-redirect">molecular formula</a> H<sub>2</sub>. Since hydrogen readily 
forms <a href="/wiki/Covalent_bond" title="Covalent bond">covalent</a> 
compounds with most <a href="/wiki/Nonmetal" title="Nonmetal">non-metallic</a> 
elements, most of the hydrogen on Earth exists in <a href="/wiki/Molecule" 
title="Molecule">molecular forms</a> such as in the form of <a 
href="/wiki/Water" title="Water">water</a> or <a href="/wiki/Organic_compound" 
title="Organic compound">organic compounds</a>. Hydrogen plays a particularly 
important role in <a href="/wiki/Acid%E2%80%93base_reaction" title="Acid–base 
reaction">acid–base reactions</a>. In <a href="/wiki/Ionic_compound" 
title="Ionic compound">ionic compounds</a>, hydrogen can take the form of a 
negative charge (i.e., <a href="/wiki/Anion" title="Anion" 
class="mw-redirect">anion</a>) known as a <a href="/wiki/Hydride" 
title="Hydride">hydride</a>, or as a positively charged (i.e., <a 
href="/wiki/Cation" title="Cation" class="mw-redirect">cation</a>) <a 
href="/wiki/Chemical_species" title="Chemical species">species</a> denoted by 
the symbol H<sup>+</sup>. The hydrogen <a href="/wiki/Cation" title="Cation" 
class="mw-redirect">cation</a> is written as though composed of a bare proton, 
but in reality, hydrogen cations in <a href="/wiki/Ionic_compound" title="Ionic 
compound">ionic compounds</a> are always more complex species than that would 
suggest.</p>
                </div>
        </form>
        <button>Submit</button>
diff --git a/server/public/js/main.js b/server/public/js/main.js
index 349bb31..82e17ed 100644
--- a/server/public/js/main.js
+++ b/server/public/js/main.js
@@ -1,10 +1,21 @@
 ( function ( $ ) {
        'use strict';
+       var cxdata;
+       $( '.sourceText' ).on( 'click', '.cx-segment', function () {
+               var segment = cxdata.segments[$( this ).data( 'segment' )];
+               console.log( segment );
+       } );
+
+       $( '.sourceText' ).on( 'click', '.cx-link', function () {
+               var linkid = cxdata.links[$( this ).data( 'linkid' )];
+               console.log( linkid );
+       } );
+
        /* global io */
        $( document ).ready( function () {
                var socket = io.connect( '/', { port: 8000 } );
 
-               $( 'button' ).click( function() {
+               $( 'button' ).click( function () {
                        $( 'progress' ).show();
                        socket.emit( 'cx.init', {
                                sourceText: $('.sourceText').html(),
@@ -12,8 +23,10 @@
                                targetLanguage: 
$('input[name=targetLanguage').val()
                        } );
                        socket.on( 'cx.data.update', function ( data ) {
-                               $( '.status' ).text( 'Recieved version ' + 
data.version );
-                               console.log( data );
+                               cxdata = data;
+                               $( '.status' ).text( 'Recieved version ' + 
cxdata.version );
+                               $( '.sourceText' ).html( 
cxdata.segmentedContent );
+                               console.log( cxdata );
                        } );
                } );
        } );
diff --git a/server/segmentation/CXSegmenter.js 
b/server/segmentation/CXSegmenter.js
new file mode 100644
index 0000000..e37aa4a
--- /dev/null
+++ b/server/segmentation/CXSegmenter.js
@@ -0,0 +1,36 @@
+/**
+ * ContentTranslation Server
+ *
+ * @file
+ * @ingroup Extensions
+ * @copyright See AUTHORS.txt
+ * @license GPL-2.0+
+ */
+
+'use strict';
+
+var util = require( 'util' ),
+       Segmenter = require( __dirname + '/segmenter.js' ).Segmenter,
+       ParagraphSegmenter = require( __dirname + '/paragraphSegmenter.js' 
).ParagraphSegmenter;
+
+function CXSegmenter( content ) {
+       Segmenter.call( this, content );
+       this.links = {};
+}
+
+// Extend Segmenter
+util.inherits( CXSegmenter, Segmenter );
+
+CXSegmenter.prototype.segment = function () {
+       var paragraphSegmenter = new ParagraphSegmenter( this.content );
+       paragraphSegmenter.segment();
+       this.segments = paragraphSegmenter.getSegments();
+       this.segmentedContent = paragraphSegmenter.toHTML();
+       this.links = paragraphSegmenter.getLinks();
+};
+
+CXSegmenter.prototype.getLinks = function () {
+       return this.links;
+};
+
+module.exports.CXSegmenter = CXSegmenter;
diff --git a/server/segmentation/linkSegmenter.js 
b/server/segmentation/linkSegmenter.js
new file mode 100644
index 0000000..5b08b73
--- /dev/null
+++ b/server/segmentation/linkSegmenter.js
@@ -0,0 +1,47 @@
+/**
+ * ContentTranslation Server
+ *
+ * @file
+ * @ingroup Extensions
+ * @copyright See AUTHORS.txt
+ * @license GPL-2.0+
+ */
+
+
+'use strict';
+
+var util = require( 'util' ),
+       crypto = require( 'crypto' ),
+       Segmenter = require( __dirname + '/segmenter.js' ).Segmenter,
+       $ = require( 'jquery' );
+
+function LinkSegmenter( content ) {
+       Segmenter.call( this, content );
+}
+
+// Extend Segmenter
+util.inherits( LinkSegmenter, Segmenter );
+
+LinkSegmenter.prototype.segment = function () {
+       var segmenter = this, $container = $( '<div>' ).html( this.content );
+
+       $container.find( 'a' ).each( function( index, link ) {
+               var $link = $( link ), hash;
+
+               hash = crypto.createHash( 'md5' ).update( $link.prop( 'href' ) )
+                       .digest( 'hex' ).substr( 0, 5 );
+               $link
+                       .attr( 'data-linkid', hash )
+                       .addClass( 'cx-link' );
+               segmenter.segments[hash] = {
+                       href: $link.prop( 'href' )
+               };
+       } );
+       this.segmentedContent = $container.html();
+};
+
+LinkSegmenter.prototype.toHTML = function () {
+       return this.segmentedContent;
+};
+
+module.exports.LinkSegmenter = LinkSegmenter;
diff --git a/server/segmentation/paragraphSegmenter.js 
b/server/segmentation/paragraphSegmenter.js
new file mode 100644
index 0000000..dcae4f5
--- /dev/null
+++ b/server/segmentation/paragraphSegmenter.js
@@ -0,0 +1,60 @@
+/**
+ * ContentTranslation Server
+ *
+ * @file
+ * @ingroup Extensions
+ * @copyright See AUTHORS.txt
+ * @license GPL-2.0+
+ */
+
+
+'use strict';
+
+var util = require( 'util' ),
+       Segmenter = require( __dirname + '/segmenter.js' ).Segmenter,
+       SentenceSegmenter = require( __dirname + '/sentenceSegmenter.js' 
).SentenceSegmenter,
+       $ = require( 'jquery' );
+
+function ParagraphSegmenter( content ) {
+       Segmenter.call( this, content );
+       this.paragraphs = [];
+       this.links = {};
+}
+
+// Extend Segmenter
+util.inherits( ParagraphSegmenter, Segmenter );
+
+ParagraphSegmenter.prototype.segment = function () {
+       var segmenter = this,
+               $container = $( '<div>' ).html( this.content );
+
+       $container.find( 'p' ).each( function ( index, paragraph ) {
+               var $paragraph = $( paragraph ),
+                       sentenceSegments,
+                       sentenceSegmenter = new SentenceSegmenter( 
$paragraph.html() );
+
+               sentenceSegmenter.segment();
+               sentenceSegments = sentenceSegmenter.getSegments();
+               segmenter.segments = $.extend( segmenter.segments, 
sentenceSegments );
+               segmenter.segmentCount += sentenceSegmenter.getSegmentCount();
+               segmenter.links = $.extend( segmenter.links, 
sentenceSegmenter.getLinks() );
+               segmenter.paragraphs.push( sentenceSegmenter.toHTML() );
+       } );
+};
+
+ParagraphSegmenter.prototype.getLinks = function () {
+       return this.links;
+};
+
+ParagraphSegmenter.prototype.toHTML = function () {
+       var i, paragraph, paragraphs = '';
+
+       for ( i = 0; i< this.paragraphs.length; i++ ) {
+               paragraph = $( this.paragraphs[i] );
+               paragraphs += $( '<p>' ).append( paragraph ).prop( 'outerHTML' 
);
+       }
+
+       return paragraphs;
+};
+
+module.exports.ParagraphSegmenter = ParagraphSegmenter;
diff --git a/server/segmentation/segmenter.js b/server/segmentation/segmenter.js
new file mode 100644
index 0000000..ff2ceb3
--- /dev/null
+++ b/server/segmentation/segmenter.js
@@ -0,0 +1,32 @@
+/*
+ * Content Translation
+ *
+ */
+
+'use strict';
+
+/**
+ * Segmenter
+ *
+ * Base class holding the state shared by the segmenters: the content given
+ * to the constructor, a segment counter, a map of segments and the segmented
+ * form of the content. It only stores and exposes this state through
+ * getSegmentCount(), getSegments() and getSegmentedContent(); filling it in
+ * is left to the code that uses or extends this class.
+ *
+ * @class
+ * @param {Mixed} content Content to be segmented; stored as given
+ */
+function Segmenter( content ) {
+       this.content = content;
+       this.segmentCount = 0; // returned by getSegmentCount()
+       this.segmentIndex = 0; // NOTE(review): not read anywhere in this file - confirm it is needed
+       this.segments = {}; // returned by getSegments()
+       this.segmentedContent = null; // stays null until something assigns it
+}
+
+Segmenter.prototype.getSegmentCount = function () {
+       return this.segmentCount;
+};
+
+Segmenter.prototype.getSegments = function () {
+       return this.segments;
+};
+
+Segmenter.prototype.getSegmentedContent = function () {
+       return this.segmentedContent;
+};
+
+module.exports.Segmenter = Segmenter;
diff --git a/server/segmentation/sentenceSegmenter.js 
b/server/segmentation/sentenceSegmenter.js
new file mode 100644
index 0000000..2500104
--- /dev/null
+++ b/server/segmentation/sentenceSegmenter.js
@@ -0,0 +1,68 @@
+/**
+ * ContentTranslation Server
+ *
+ * @file
+ * @ingroup Extensions
+ * @copyright See AUTHORS.txt
+ * @license GPL-2.0+
+ */
+
+
+'use strict';
+
+var util = require( 'util' ),
+       crypto = require( 'crypto' ),
+       Segmenter = require( __dirname + '/segmenter.js' ).Segmenter,
+       LinkSegmenter = require( __dirname + '/linkSegmenter.js' 
).LinkSegmenter,
+       $ = require( 'jquery' );
+
+function SentenceSegmenter( content ) {
+       Segmenter.call( this, content );
+       this.lookup = [];
+       this.links = {};
+}
+
+// Extend Segmenter
+util.inherits( SentenceSegmenter, Segmenter );
+
+SentenceSegmenter.prototype.segment = function () {
+       var i, segmentId, linkSegmenter,
+               sentences = this.content.split( '.' );
+
+       this.segmentCount += sentences.length;
+       for ( i = 0; i< this.segmentCount; i++ ) {
+               segmentId = crypto.createHash( 'md5' ).update( sentences[i] )
+                       .digest( 'hex' ).substr( 0, 5 );
+               linkSegmenter = new LinkSegmenter(sentences[i]);
+               linkSegmenter.segment();
+               this.links = $.extend( this.links, linkSegmenter.getSegments() 
);
+               this.segments[segmentId] = {
+                       source: linkSegmenter.toHTML() + '.'
+               };
+               // We need this lookup to keep the order of segments
+               // while constructing the segmented content using
+               // toHTML method.
+               this.lookup.push( segmentId );
+       }
+};
+
+SentenceSegmenter.prototype.getLinks = function () {
+       return this.links;
+};
+
+SentenceSegmenter.prototype.toHTML = function () {
+       var i, segmentId, $sentence, $sentences = '';
+
+       for ( i = 0; i< this.segmentCount; i++ ) {
+               segmentId =  this.lookup[i];
+               $sentence = $( '<span>' )
+                       .addClass( 'cx-segment' )
+                       .attr( 'data-segment', segmentId )
+                       .html( this.segments[segmentId].source );
+               $sentences += $sentence.prop( 'outerHTML' );
+       }
+
+       return $sentences;
+};
+
+module.exports.SentenceSegmenter = SentenceSegmenter;

-- 
To view, visit https://gerrit.wikimedia.org/r/114150
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I0aa31a0e5035f124910b41f4a4f8a72cab969c73
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/extensions/ContentTranslation
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>
Gerrit-Reviewer: Divec <da...@sheetmusic.org.uk>
Gerrit-Reviewer: KartikMistry <kartik.mis...@gmail.com>
Gerrit-Reviewer: Nikerabbit <niklas.laxst...@gmail.com>
Gerrit-Reviewer: Santhosh <santhosh.thottin...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to