[ 
https://issues.apache.org/jira/browse/FLINK-4964?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15652290#comment-15652290
 ] 

ASF GitHub Bot commented on FLINK-4964:
---------------------------------------

Github user tfournier314 commented on a diff in the pull request:

    https://github.com/apache/flink/pull/2740#discussion_r87295380
  
    --- Diff: 
flink-libraries/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StringIndexer.scala
 ---
    @@ -0,0 +1,108 @@
    +package org.apache.flink.ml.preprocessing
    +
    +import org.apache.flink.api.scala._
    +import org.apache.flink.api.scala.extensions.acceptPartialFunctions
    +import org.apache.flink.api.scala.utils._
    +import org.apache.flink.ml.common.{Parameter, ParameterMap}
    +import org.apache.flink.ml.pipeline.{FitOperation, 
TransformDataSetOperation, Transformer}
    +import org.apache.flink.ml.preprocessing.StringIndexer.HandleInvalid
    +
    +import scala.collection.immutable.Seq
    +
    +/**
    +  * String Indexer
    +  */
    +class StringIndexer extends Transformer[StringIndexer] {
    +
    +  private[preprocessing] var metricsOption: Option[DataSet[(String, 
Long)]] = None
    +
    +
    +  def setHandleInvalid(value: String): this.type ={
    +    parameters.add( HandleInvalid, value )
    +    this
    +  }
    +
    +}
    +
    +object StringIndexer {
    +
    +  case object HandleInvalid extends Parameter[String] {
    +    val defaultValue: Option[String] = Some( "skip" )
    +  }
    +
    +  // ==================================== Factory methods 
==========================================
    +
    +  def apply(): StringIndexer ={
    +    new StringIndexer( )
    +  }
    +
    +  // ====================================== Operations 
=============================================
    +
    +  /**
    +    * Trains [[StringIndexer]] by learning the count of each string in the 
input DataSet.
    +    */
    +
    +  implicit def fitStringIndexer ={
    +    new FitOperation[StringIndexer, String] {
    +      def fit(instance: StringIndexer, fitParameters: ParameterMap, input: 
DataSet[String]):Unit ={
    +        val metrics = extractIndices( input )
    +        instance.metricsOption = Some( metrics )
    +      }
    +    }
    +  }
    +
    +  private def extractIndices(input: DataSet[String]): DataSet[(String, 
Long)] = {
    +
    +    val mapping = input
    +      .mapWith( s => (s, 1) )
    +      .groupBy( 0 )
    +      .reduce( (a, b) => (a._1, a._2 + b._2) )
    +      .partitionByRange( 1 )
    --- End diff --
    
    Indeed, I need a global sort, because mapping must be a sorted 
DataSet[(String, Long)] of (label, index) pairs, where the most frequent label 
has index = 0.
    
    I need to sort a DataSet of (label, frequency) pairs, then apply 
zipWithIndex to get the index associated with each label.
    
    I've just realised that sortPartition() only sorts each partition 
locally, so how can I achieve a global sort this way?
    
    



> FlinkML - Add StringIndexer
> ---------------------------
>
>                 Key: FLINK-4964
>                 URL: https://issues.apache.org/jira/browse/FLINK-4964
>             Project: Flink
>          Issue Type: New Feature
>            Reporter: Thomas FOURNIER
>            Priority: Minor
>
> Add StringIndexer as described here:
> http://spark.apache.org/docs/latest/ml-features.html#stringindexer
> This will be added in package preprocessing of FlinkML



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to