jenkins-bot has submitted this change and it was merged. Change subject: Functional segmentation ......................................................................
Functional segmentation Change-Id: I0aa31a0e5035f124910b41f4a4f8a72cab969c73 --- M server/models/dataModelManager.js M server/public/index.html M server/public/js/main.js A server/segmentation/CXSegmenter.js A server/segmentation/linkSegmenter.js A server/segmentation/paragraphSegmenter.js A server/segmentation/segmenter.js A server/segmentation/sentenceSegmenter.js 8 files changed, 279 insertions(+), 16 deletions(-) Approvals: KartikMistry: Looks good to me, approved jenkins-bot: Verified diff --git a/server/models/dataModelManager.js b/server/models/dataModelManager.js index 9c60840..346bd62 100644 --- a/server/models/dataModelManager.js +++ b/server/models/dataModelManager.js @@ -1,13 +1,15 @@ /** * ContentTranslation Server - -* @file + * + * @file * @ingroup Extensions * @copyright See AUTHORS.txt * @license GPL-2.0+ */ 'use strict'; + +var CXSegmenter = require( __dirname + '/../segmentation/CXSegmenter.js' ).CXSegmenter; /** * CXDataModelManager @@ -23,19 +25,18 @@ * Initialize */ CXDataModelManager.prototype.init = function () { - var dataModelManager = this; + var dataModelManager = this, segmenter; + segmenter = new CXSegmenter( this.context.sourceText ); + segmenter.segment(); this.dataModel = { version: 0, - sourceLang: this.context.sourceLanguage, - targetLang: this.context.targetLanguage, + sourceLanguage: this.context.sourceLanguage, + targetLanguage: this.context.targetLanguage, sourceLocation: this.context.sourceTitle, - segments: [], - segmentedContent: [], - segmentCount: 0, - dictionary: null, - glossary: null, - links: null + segments: segmenter.getSegments(), + segmentedContent: segmenter.getSegmentedContent(), + links: segmenter.getLinks() }; dataModelManager.refresh(); }; diff --git a/server/public/index.html b/server/public/index.html index c38253c..da8c352 100644 --- a/server/public/index.html +++ b/server/public/index.html @@ -33,9 +33,14 @@ color: green; font-size: 0.8em; } - .segment:hover { + .cx-segment:hover { background-color: #ccc; } + + .cx-link:hover { + background-color: #aaa; + } + .sourceText { clear: both; } @@ -52,7 +57,8 @@ <input name="targetLanguage" value="cy" /> <br/> <div contenteditable class="sourceText"> - Hydrogen is a chemical element with chemical symbol H and atomic number 1. With an atomic weight of 1.00794 u, hydrogen is the lightest element on the periodic table. Its monatomic form (H) is the most abundant chemical substance in the universe, constituting roughly 75% of all baryonic mass.Non-remnant stars are mainly composed of hydrogen in its plasma state. The most common isotope of hydrogen, termed protium (name rarely used, symbol 1H), has a single proton and zero neutrons. + <p><b>Hydrogen</b> is a <a href="/wiki/Chemical_element" title="Chemical element">chemical element</a> with <a href="/wiki/Chemical_symbol" title="Chemical symbol" class="mw-redirect">chemical symbol</a> <b>H</b> and <a href="/wiki/Atomic_number" title="Atomic number">atomic number</a> 1. With an <a href="/wiki/Atomic_weight" title="Atomic weight" class="mw-redirect">atomic weight</a> of <span style="white-space:nowrap">1.00794 <a href="/wiki/Atomic_mass_unit" title="Atomic mass unit">u</a></span>, hydrogen is the lightest element on the <a href="/wiki/Periodic_table" title="Periodic table">periodic table</a>. Its <a href="/wiki/Monatomic" title="Monatomic" class="mw-redirect">monatomic</a> form (H) is the <a href="/wiki/Abundance_of_the_chemical_elements" title="Abundance of the chemical elements">most abundant</a> chemical substance in the universe, constituting roughly 75% of all <a href="/wiki/Baryon" title="Baryon">baryonic</a> mass.<sup id="cite_ref-7" class="reference"><a href="#cite_note-7"><span>[</span>7<span>]</span></a></sup><sup id="cite_ref-8" class="reference"><a href="#cite_note-8"><span>[</span>note 1<span>]</span></a></sup> Non-<a href="/wiki/Stellar_remnant" title="Stellar remnant" class="mw-redirect">remnant</a> <a href="/wiki/Star" title="Star">stars</a> are mainly composed of hydrogen in its <a href="/wiki/Plasma_(physics)" title="Plasma (physics)">plasma</a> state. The most common <a href="/wiki/Isotope" title="Isotope">isotope</a> of hydrogen, termed <i>protium</i> (name rarely used, symbol <sup>1</sup>H), has a single proton and zero <a href="/wiki/Neutron" title="Neutron">neutrons</a>.</p> + <p>The universal emergence of atomic hydrogen first occurred during the <a href="/wiki/Recombination_(cosmology)" title="Recombination (cosmology)">recombination epoch</a>. At <a href="/wiki/Standard_temperature_and_pressure" title="Standard temperature and pressure" class="mw-redirect">standard temperature and pressure</a>, hydrogen is a <a href="/wiki/Transparency_(optics)" title="Transparency (optics)" class="mw-redirect">colorless</a>, <a href="/wiki/Odorless" title="Odorless" class="mw-redirect">odorless</a>, <a href="/wiki/Taste" title="Taste">tasteless</a>, non-toxic, <a href="/wiki/Nonmetal" title="Nonmetal">nonmetallic</a>, highly <a href="/wiki/Combustion" title="Combustion">combustible</a> <a href="/wiki/Diatomic_molecule" title="Diatomic molecule">diatomic</a> <a href="/wiki/Gas" title="Gas">gas</a> with the <a href="/wiki/Molecular_formula" title="Molecular formula" class="mw-redirect">molecular formula</a> H<sub>2</sub>. Since hydrogen readily forms <a href="/wiki/Covalent_bond" title="Covalent bond">covalent</a> compounds with most <a href="/wiki/Nonmetal" title="Nonmetal">non-metallic</a> elements, most of the hydrogen on Earth exists in <a href="/wiki/Molecule" title="Molecule">molecular forms</a> such as in the form of <a href="/wiki/Water" title="Water">water</a> or <a href="/wiki/Organic_compound" title="Organic compound">organic compounds</a>. Hydrogen plays a particularly important role in <a href="/wiki/Acid%E2%80%93base_reaction" title="Acid–base reaction">acid–base reactions</a>. In <a href="/wiki/Ionic_compound" title="Ionic compound">ionic compounds</a>, hydrogen can take the form of a negative charge (i.e., <a href="/wiki/Anion" title="Anion" class="mw-redirect">anion</a>) known as a <a href="/wiki/Hydride" title="Hydride">hydride</a>, or as a positively charged (i.e., <a href="/wiki/Cation" title="Cation" class="mw-redirect">cation</a>) <a href="/wiki/Chemical_species" title="Chemical species">species</a> denoted by the symbol H<sup>+</sup>. The hydrogen <a href="/wiki/Cation" title="Cation" class="mw-redirect">cation</a> is written as though composed of a bare proton, but in reality, hydrogen cations in <a href="/wiki/Ionic_compound" title="Ionic compound">ionic compounds</a> are always more complex species than that would suggest.</p> </div> </form> <button>Submit</button> diff --git a/server/public/js/main.js b/server/public/js/main.js index 349bb31..82e17ed 100644 --- a/server/public/js/main.js +++ b/server/public/js/main.js @@ -1,10 +1,21 @@ ( function ( $ ) { 'use strict'; + var cxdata; + $( '.sourceText' ).on( 'click', '.cx-segment', function () { + var segment = cxdata.segments[$( this ).data( 'segment' )]; + console.log( segment ); + } ); + + $( '.sourceText' ).on( 'click', '.cx-link', function () { + var linkid = cxdata.links[$( this ).data( 'linkid' )]; + console.log( linkid ); + } ); + /* global io */ $( document ).ready( function () { var socket = io.connect( '/', { port: 8000 } ); - $( 'button' ).click( function() { + $( 'button' ).click( function () { $( 'progress' ).show(); socket.emit( 'cx.init', { sourceText: $('.sourceText').html(), @@ -12,8 +23,10 @@ targetLanguage: $('input[name=targetLanguage').val() } ); socket.on( 'cx.data.update', function ( data ) { - $( '.status' ).text( 'Recieved version ' + data.version ); - console.log( data ); + cxdata = data; + $( '.status' ).text( 'Recieved version ' + cxdata.version ); + $( '.sourceText' ).html( cxdata.segmentedContent ); + console.log( cxdata ); } ); } ); } ); diff --git a/server/segmentation/CXSegmenter.js b/server/segmentation/CXSegmenter.js new file mode 100644 index 0000000..e37aa4a --- /dev/null +++ b/server/segmentation/CXSegmenter.js @@ -0,0 +1,36 @@ +/** + * ContentTranslation Server + * + * @file + * @ingroup Extensions + * @copyright See AUTHORS.txt + * @license GPL-2.0+ + */ + +'use strict'; + +var util = require( 'util' ), + Segmenter = require( __dirname + '/segmenter.js' ).Segmenter, + ParagraphSegmenter = require( __dirname + '/paragraphSegmenter.js' ).ParagraphSegmenter; + +function CXSegmenter( content ) { + Segmenter.call( this, content ); + this.links = {}; +} + +// Extend Segmenter +util.inherits( CXSegmenter, Segmenter ); + +CXSegmenter.prototype.segment = function () { + var paragraphSegmenter = new ParagraphSegmenter( this.content ); + paragraphSegmenter.segment(); + this.segments = paragraphSegmenter.getSegments(); + this.segmentedContent = paragraphSegmenter.toHTML(); + this.links = paragraphSegmenter.getLinks(); +}; + +CXSegmenter.prototype.getLinks = function () { + return this.links; +}; + +module.exports.CXSegmenter = CXSegmenter; diff --git a/server/segmentation/linkSegmenter.js b/server/segmentation/linkSegmenter.js new file mode 100644 index 0000000..5b08b73 --- /dev/null +++ b/server/segmentation/linkSegmenter.js @@ -0,0 +1,47 @@ +/** + * ContentTranslation Server + * + * @file + * @ingroup Extensions + * @copyright See AUTHORS.txt + * @license GPL-2.0+ + */ + + +'use strict'; + +var util = require( 'util' ), + crypto = require( 'crypto' ), + Segmenter = require( __dirname + '/segmenter.js' ).Segmenter, + $ = require( 'jquery' ); + +function LinkSegmenter( content ) { + Segmenter.call( this, content ); +} + +// Extend Segmenter +util.inherits( LinkSegmenter, Segmenter ); + +LinkSegmenter.prototype.segment = function () { + var segmenter = this, $container = $( '<div>' ).html( this.content ); + + $container.find( 'a' ).each( function( index, link ) { + var $link = $( link ), hash; + + hash = crypto.createHash( 'md5' ).update( $link.prop( 'href' ) ) + .digest( 'hex' ).substr( 0, 5 ); + $link + .attr( 'data-linkid', hash ) + .addClass( 'cx-link' ); + segmenter.segments[hash] = { + href: $link.prop( 'href' ) + }; + } ); + this.segmentedContent = $container.html(); +}; + +LinkSegmenter.prototype.toHTML = function () { + return this.segmentedContent; +}; + +module.exports.LinkSegmenter = LinkSegmenter; diff --git a/server/segmentation/paragraphSegmenter.js b/server/segmentation/paragraphSegmenter.js new file mode 100644 index 0000000..dcae4f5 --- /dev/null +++ b/server/segmentation/paragraphSegmenter.js @@ -0,0 +1,60 @@ +/** + * ContentTranslation Server + * + * @file + * @ingroup Extensions + * @copyright See AUTHORS.txt + * @license GPL-2.0+ + */ + + +'use strict'; + +var util = require( 'util' ), + Segmenter = require( __dirname + '/segmenter.js' ).Segmenter, + SentenceSegmenter = require( __dirname + '/sentenceSegmenter.js' ).SentenceSegmenter, + $ = require( 'jquery' ); + +function ParagraphSegmenter( content ) { + Segmenter.call( this, content ); + this.paragraphs = []; + this.links = {}; +} + +// Extend Segmenter +util.inherits( ParagraphSegmenter, Segmenter ); + +ParagraphSegmenter.prototype.segment = function () { + var segmenter = this, + $container = $( '<div>' ).html( this.content ); + + $container.find( 'p' ).each( function ( index, paragraph ) { + var $paragraph = $( paragraph ), + sentenceSegments, + sentenceSegmenter = new SentenceSegmenter( $paragraph.html() ); + + sentenceSegmenter.segment(); + sentenceSegments = sentenceSegmenter.getSegments(); + segmenter.segments = $.extend( segmenter.segments, sentenceSegments ); + segmenter.segmentCount += sentenceSegmenter.getSegmentCount(); + segmenter.links = $.extend( segmenter.links, sentenceSegmenter.getLinks() ); + segmenter.paragraphs.push( sentenceSegmenter.toHTML() ); + } ); +}; + +ParagraphSegmenter.prototype.getLinks = function () { + return this.links; +}; + +ParagraphSegmenter.prototype.toHTML = function () { + var i, paragraph, paragraphs = ''; + + for ( i = 0; i< this.paragraphs.length; i++ ) { + paragraph = $( this.paragraphs[i] ); + paragraphs += $( '<p>' ).append( paragraph ).prop( 'outerHTML' ); + } + + return paragraphs; +}; + +module.exports.ParagraphSegmenter = ParagraphSegmenter; diff --git a/server/segmentation/segmenter.js b/server/segmentation/segmenter.js new file mode 100644 index 0000000..ff2ceb3 --- /dev/null +++ b/server/segmentation/segmenter.js @@ -0,0 +1,32 @@ +/* + * Content Translation + * + */ + +'use strict'; + +/** + * Segmenter + * @class + */ +function Segmenter( content ) { + this.content = content; + this.segmentCount = 0; + this.segmentIndex = 0; + this.segments = {}; + this.segmentedContent = null; +} + +Segmenter.prototype.getSegmentCount = function () { + return this.segmentCount; +}; + +Segmenter.prototype.getSegments = function () { + return this.segments; +}; + +Segmenter.prototype.getSegmentedContent = function () { + return this.segmentedContent; +}; + +module.exports.Segmenter = Segmenter; diff --git a/server/segmentation/sentenceSegmenter.js b/server/segmentation/sentenceSegmenter.js new file mode 100644 index 0000000..2500104 --- /dev/null +++ b/server/segmentation/sentenceSegmenter.js @@ -0,0 +1,68 @@ +/** + * ContentTranslation Server + * + * @file + * @ingroup Extensions + * @copyright See AUTHORS.txt + * @license GPL-2.0+ + */ + + +'use strict'; + +var util = require( 'util' ), + crypto = require( 'crypto' ), + Segmenter = require( __dirname + '/segmenter.js' ).Segmenter, + LinkSegmenter = require( __dirname + '/linkSegmenter.js' ).LinkSegmenter, + $ = require( 'jquery' ); + +function SentenceSegmenter( content ) { + Segmenter.call( this, content ); + this.lookup = []; + this.links = {}; +} + +// Extend Segmenter +util.inherits( SentenceSegmenter, Segmenter ); + +SentenceSegmenter.prototype.segment = function () { + var i, segmentId, linkSegmenter, + sentences = this.content.split( '.' ); + + this.segmentCount += sentences.length; + for ( i = 0; i< this.segmentCount; i++ ) { + segmentId = crypto.createHash( 'md5' ).update( sentences[i] ) + .digest( 'hex' ).substr( 0, 5 ); + linkSegmenter = new LinkSegmenter(sentences[i]); + linkSegmenter.segment(); + this.links = $.extend( this.links, linkSegmenter.getSegments() ); + this.segments[segmentId] = { + source: linkSegmenter.toHTML() + '.' + }; + // We need this lookup to keep the order of segments + // while constructing the segmented content using + // toHTML method. + this.lookup.push( segmentId ); + } +}; + +SentenceSegmenter.prototype.getLinks = function () { + return this.links; +}; + +SentenceSegmenter.prototype.toHTML = function () { + var i, segmentId, $sentence, $sentences = ''; + + for ( i = 0; i< this.segmentCount; i++ ) { + segmentId = this.lookup[i]; + $sentence = $( '<span>' ) + .addClass( 'cx-segment' ) + .attr( 'data-segment', segmentId ) + .html( this.segments[segmentId].source ); + $sentences += $sentence.prop( 'outerHTML' ); + } + + return $sentences; +}; + +module.exports.SentenceSegmenter = SentenceSegmenter; -- To view, visit https://gerrit.wikimedia.org/r/114150 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I0aa31a0e5035f124910b41f4a4f8a72cab969c73 Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/extensions/ContentTranslation Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> Gerrit-Reviewer: Divec <da...@sheetmusic.org.uk> Gerrit-Reviewer: KartikMistry <kartik.mis...@gmail.com> Gerrit-Reviewer: Nikerabbit <niklas.laxst...@gmail.com> Gerrit-Reviewer: Santhosh <santhosh.thottin...@gmail.com> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits