[ 
https://issues.apache.org/jira/browse/SPARK-32060?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17148412#comment-17148412
 ] 

zhengruifeng edited comment on SPARK-32060 at 6/30/20, 8:17 AM:
----------------------------------------------------------------

I found that the optimization of Huber loss is unstable when the input dataset 
is shuffled:

 

spark: 2.4.5

cmd:spark-shell --driver-memory=96G --conf spark.driver.maxResultSize=10g

 
{code:java}
import org.apache.spark.ml.classification._
import org.apache.spark.ml.regression._
import org.apache.spark.storage.StorageLevel
val df = spark.read.option("numFeatures", 
"2000").format("libsvm").load("/data1/Datasets/epsilon/epsilon_normalized.t").withColumn("label",
 (col("label")+1)/2)
df.persist(StorageLevel.MEMORY_AND_DISK)
df.count

val svc = new LinearSVC().setMaxIter(100).setTol(0)
val svcmodel = svc.fit(df)
val svcmodels = Seq.range(0, 5).map { seed => val df2 = 
df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model 
= svc.fit(df2); df2.unpersist(); model }

val lir = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setTol(0)
val lirmodel = lir.fit(df)
val lirmodels = Seq.range(0, 5).map { seed => val df2 = 
df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model 
= lir.fit(df2); df2.unpersist(); model }

val lir = new 
LinearRegression().setMaxIter(100).setSolver("l-bfgs").setLoss("huber").setTol(0)
val hubermodel = lir.fit(df)
val hubermodels = Seq.range(0, 5).map { seed => val df2 = 
df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model 
= lir.fit(df2); df2.unpersist(); model }

 {code}
 

results:
{code:java}
scala> svcmodel.coefficients
res4: org.apache.spark.ml.linalg.Vector = 
[-2.0998984926889244,-0.3252719796828287,1.6224962635038596,-0.09459144575027117,-0.024713074721534507,-0.04893864248356599,4.521280777017717,-0.3920439314738444,0.027631053567458274,-0.010013241182040592,0.3259469228241217,-1.1125182474604842,-0.173266660114704,0.03365461088305983,0.02162518688538647,0.07204641375676599,-0.07429479630422156,0.005457557625321678,-2.9532140605652275,2.7240907567070676,-0.3066203528914533,-0.11475917863808731,-0.20761462370516978,-0.4066885419952761,-0.08185889069309363,-0.6318876493014741,0.06405628348073204,-0.32732378261855793,0.01462176019045602,0.8492238295542848,-0.20854294380974547,0.008039275953692854,0.05597077397428801,-0.06302333216930013,0.005602373131582006,-0.05995911252186677,0.3381639630496303,0.63...

scala> 
svcmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-2.099898492688956,-0.32527197968284316,1.622496263503862,-0.09459144575027553,-0.02471307472153921,-0.048938642483572034,4.521280777017707,-0.39204393147385175,0.027631053567457448,-0.010013241182041081
-2.0998984926889364,-0.32527197968284405,1.6224962635038631,-0.09459144575027562,-0.024713074721539482,-0.0489386424835727,4.521280777017712,-0.3920439314738487,0.027631053567458107,-0.010013241182040594
-2.0998984926889346,-0.3252719796828256,1.6224962635038638,-0.09459144575027183,-0.02471307472153609,-0.04893864248356505,4.52128077701772,-0.392043931473848,0.027631053567458715,-0.010013241182040738
-2.0998984926889612,-0.3252719796828379,1.6224962635038718,-0.09459144575027294,-0.024713074721542546,-0.04893864248357126,4.521280777017708,-0.3920439314738543,0.027631053567458354,-0.01001324118203996
-2.099898492688976,-0.32527197968284066,1.6224962635038813,-0.09459144575027299,-0.024713074721538494,-0.04893864248357121,4.521280777017711,-0.39204393147382943,0.027631053567457917,-0.01001324118203988



scala> lirmodel.coefficients
res6: org.apache.spark.ml.linalg.Vector = 
[-0.15694030457077052,-0.03297314855191394,0.21896060695714925,0.043191022987982185,0.029914098626947626,0.037812647639103455,0.3777274539423792,-0.004353078286124242,-0.15575156684399277,-5.036838920393178E-4,0.058833881325688855,-0.08017543724230564,-0.0492390194915689,-0.038321196923869975,-0.130894554829739,-0.042355695456345384,-0.009015454450718942,-0.04370761619435822,-0.29579850779021977,0.004520096449137435,-0.006967975798308324,-0.06845005746315802,0.266918095634905,-0.02940614765439654,-0.026540163642704145,-0.06595165721239701,0.014980713825836588,-0.04093065488345465,0.01959430385382978,0.050848208119031076,-0.05637861639180545,0.14531387392683578,-0.010366403339646989,-0.06789567679629482,-0.01138250848452352,0.012813168874534375,0...

scala> 
lirmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-0.15782290165350424,-0.032888518926744874,0.21830737648275617,0.042901864085398485,0.030379578212786106,0.03726297774165828,0.37590079059854775,-0.005096797676258801,-0.16294717929387345,-3.4683642628292866E-4
-0.158134002701788,-0.03297488977298756,0.223484223933237,0.04362644996012737,0.02955839240619015,0.037886954492652274,0.3781987525027171,-0.0042601468957566635,-0.15377036026585994,-7.808247198886335E-4
-0.15725060355390777,-0.03274293711657846,0.2255425335042434,0.04414557619894905,0.029592347607608316,0.038140084783380566,0.38014500533792417,-0.004041812003758675,-0.1532244524431212,-6.139071642605764E-4
-0.15808251904715098,-0.03277497701747291,0.22415793808220286,0.04381212676769461,0.029645628307462257,0.03827272865297394,0.3779602823322227,-0.004579990558798104,-0.1619060581635579,-2.924684072438361E-4
-0.1584244946556921,-0.03308679728952589,0.22109423848026052,0.04314487897259751,0.02974043814809942,0.03767583465126916,0.37660774731442753,-0.004666143631917849,-0.15908500093611755,-6.032795854870288E-4



scala> hubermodel.coefficients
res8: org.apache.spark.ml.linalg.Vector = 
[-0.5734110263378509,0.14651463113107283,0.31695293497905314,0.09766723609660509,0.9751922937594425,-0.45078361519199234,0.1796011011914836,0.1307590238399803,29.4483208484545,0.0010971438711490464,-0.3774494299116957,-0.0972074860792813,0.19308365683399073,0.7342453168750395,-1.5272939597106356,0.26236258910811083,-0.2625004448211019,0.9162955516808651,0.770900888136527,2.02312824266805,-0.2122323515204783,-0.7335830805152952,-0.6830804460739696,-0.2683927532715843,-0.009356259933091637,0.010101144450626055,0.04419139197964755,-0.1537149993025953,-0.057369897837715504,-0.21594494553457308,-0.11921794315942034,21.49391633332987,-0.03273344441395022,0.15185884919764855,-0.41260352897506936,0.2051431942644116,0.038619787153916126,0.059096910694226...

scala> 
hubermodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-1.0447443587500074,0.36317099225890404,-0.037510945673721416,-0.00499241712258674,0.7166274375452902,0.2883567264867333,0.8955748447650599,-0.09183466803327618,28.38969112834551,-0.1732703380224992
-0.9612521831063224,0.1330548989982154,-0.40321780540593827,0.07021956471272095,1.1672691564804965,0.39020978586678995,0.6565458868026381,-0.007652674389259566,25.870475377334614,-0.14530934468533313
-0.6203964039612594,0.13852113595924576,-0.05944676566281877,0.06282848629910238,1.3543656514896878,0.13935144813830452,0.8196295581480677,-0.04905550542415592,28.332175980950254,-0.020571474994685857
-0.8515959580662419,0.23175038852141236,-0.013615846688186758,0.03050371704421431,0.7718618660731084,-0.20282296703766906,0.3065121638304942,0.009840552518133401,28.873627374557977,-0.050971722924554645
-0.5366805901626359,0.12048487553084265,0.17808067158945728,0.1747893757985052,1.1205458676278635,-0.506018568748868,0.16694592848749892,0.20585232042284707,29.260697861543417,0.1112312158197619
 {code}
 

The solvers {{BreezeOWLQN}} and {{BreezeLBFGS}} seem stable, while the solver 
{{BreezeLBFGSB}} for Huber seems quite unstable.

 

I also tested scikit-learn's {{HuberRegressor}}, which also uses L-BFGS-B 
({{scipy.optimize.minimize(method="L-BFGS-B")}}) as the solver:
{code:python}
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import HuberRegressor
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
X, y = load_svmlight_file('/data1/Datasets/epsilon/epsilon_normalized.t')
X = X.toarray()
y = (y + 1) / 2
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X1, y1 = shuffle(X, y, random_state=1)
huber1 = HuberRegressor(max_iter=200).fit(X1, y1)
del X1, y1
X2, y2 = shuffle(X, y, random_state=2)
huber2 = HuberRegressor(max_iter=200).fit(X2, y2)
del X2, y2
X3, y3 = shuffle(X, y, random_state=3)
huber3 = HuberRegressor(max_iter=200).fit(X3, y3)
del X3, y3
{code}
 

its solutions are relatively stable:
{code:java}
huber1.coef_
Out[3]: 
array([-0.00323102, -0.00116995,  0.00561726, ..., -0.00079748,
        0.00158621,  0.00189728])

huber2.coef_
Out[4]: 
array([-0.00323499, -0.00117458,  0.00562527, ..., -0.00080695,
        0.00160452,  0.0019021 ])

huber3.coef_
Out[5]: 
array([-0.00327077, -0.00116289,  0.00568099, ..., -0.00078838,
        0.00158942,  0.0019131 ]) {code}
 

So I think this instability lies in the implementation of {{BreezeLBFGSB}}.

 

 

 


was (Author: podongfeng):
I found that the optimization of Huber Loss is unstable, if the input dataset 
is shuffled:

 

spark: 2.4.5

cmd:spark-shell --driver-memory=96G --conf spark.driver.maxResultSize=10g

 
{code:java}
import org.apache.spark.ml.classification._
import org.apache.spark.ml.regression._
import org.apache.spark.storage.StorageLevel
val df = spark.read.option("numFeatures", 
"2000").format("libsvm").load("/data1/Datasets/epsilon/epsilon_normalized.t").withColumn("label",
 (col("label")+1)/2)df.persist(StorageLevel.MEMORY_AND_DISK)
df.count

val svc = new LinearSVC().setMaxIter(100).setTol(0)
val svcmodel = svc.fit(df)
val svcmodels = Seq.range(0, 5).map { seed => val df2 = 
df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model 
= svc.fit(df2); df2.unpersist(); model }

val lir = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setTol(0)
val lirmodel = lir.fit(df)
val lirmodels = Seq.range(0, 5).map { seed => val df2 = 
df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model 
= lir.fit(df2); df2.unpersist(); model }

val lir = new 
LinearRegression().setMaxIter(100).setSolver("l-bfgs").setLoss("huber").setTol(0)
val hubermodel = lir.fit(df)
val hubermodels = Seq.range(0, 5).map { seed => val df2 = 
df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model 
= lir.fit(df2); df2.unpersist(); model }

 {code}
 

results:
{code:java}
scala> svcmodel.coefficients
res4: org.apache.spark.ml.linalg.Vector = 
[-2.0998984926889244,-0.3252719796828287,1.6224962635038596,-0.09459144575027117,-0.024713074721534507,-0.04893864248356599,4.521280777017717,-0.3920439314738444,0.027631053567458274,-0.010013241182040592,0.3259469228241217,-1.1125182474604842,-0.173266660114704,0.03365461088305983,0.02162518688538647,0.07204641375676599,-0.07429479630422156,0.005457557625321678,-2.9532140605652275,2.7240907567070676,-0.3066203528914533,-0.11475917863808731,-0.20761462370516978,-0.4066885419952761,-0.08185889069309363,-0.6318876493014741,0.06405628348073204,-0.32732378261855793,0.01462176019045602,0.8492238295542848,-0.20854294380974547,0.008039275953692854,0.05597077397428801,-0.06302333216930013,0.005602373131582006,-0.05995911252186677,0.3381639630496303,0.63...

scala> 
svcmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-2.099898492688956,-0.32527197968284316,1.622496263503862,-0.09459144575027553,-0.02471307472153921,-0.048938642483572034,4.521280777017707,-0.39204393147385175,0.027631053567457448,-0.010013241182041081
-2.0998984926889364,-0.32527197968284405,1.6224962635038631,-0.09459144575027562,-0.024713074721539482,-0.0489386424835727,4.521280777017712,-0.3920439314738487,0.027631053567458107,-0.010013241182040594
-2.0998984926889346,-0.3252719796828256,1.6224962635038638,-0.09459144575027183,-0.02471307472153609,-0.04893864248356505,4.52128077701772,-0.392043931473848,0.027631053567458715,-0.010013241182040738
-2.0998984926889612,-0.3252719796828379,1.6224962635038718,-0.09459144575027294,-0.024713074721542546,-0.04893864248357126,4.521280777017708,-0.3920439314738543,0.027631053567458354,-0.01001324118203996
-2.099898492688976,-0.32527197968284066,1.6224962635038813,-0.09459144575027299,-0.024713074721538494,-0.04893864248357121,4.521280777017711,-0.39204393147382943,0.027631053567457917,-0.01001324118203988



scala> lirmodel.coefficients
res6: org.apache.spark.ml.linalg.Vector = 
[-0.15694030457077052,-0.03297314855191394,0.21896060695714925,0.043191022987982185,0.029914098626947626,0.037812647639103455,0.3777274539423792,-0.004353078286124242,-0.15575156684399277,-5.036838920393178E-4,0.058833881325688855,-0.08017543724230564,-0.0492390194915689,-0.038321196923869975,-0.130894554829739,-0.042355695456345384,-0.009015454450718942,-0.04370761619435822,-0.29579850779021977,0.004520096449137435,-0.006967975798308324,-0.06845005746315802,0.266918095634905,-0.02940614765439654,-0.026540163642704145,-0.06595165721239701,0.014980713825836588,-0.04093065488345465,0.01959430385382978,0.050848208119031076,-0.05637861639180545,0.14531387392683578,-0.010366403339646989,-0.06789567679629482,-0.01138250848452352,0.012813168874534375,0...

scala> 
lirmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-0.15782290165350424,-0.032888518926744874,0.21830737648275617,0.042901864085398485,0.030379578212786106,0.03726297774165828,0.37590079059854775,-0.005096797676258801,-0.16294717929387345,-3.4683642628292866E-4
-0.158134002701788,-0.03297488977298756,0.223484223933237,0.04362644996012737,0.02955839240619015,0.037886954492652274,0.3781987525027171,-0.0042601468957566635,-0.15377036026585994,-7.808247198886335E-4
-0.15725060355390777,-0.03274293711657846,0.2255425335042434,0.04414557619894905,0.029592347607608316,0.038140084783380566,0.38014500533792417,-0.004041812003758675,-0.1532244524431212,-6.139071642605764E-4
-0.15808251904715098,-0.03277497701747291,0.22415793808220286,0.04381212676769461,0.029645628307462257,0.03827272865297394,0.3779602823322227,-0.004579990558798104,-0.1619060581635579,-2.924684072438361E-4
-0.1584244946556921,-0.03308679728952589,0.22109423848026052,0.04314487897259751,0.02974043814809942,0.03767583465126916,0.37660774731442753,-0.004666143631917849,-0.15908500093611755,-6.032795854870288E-4



scala> hubermodel.coefficients
res8: org.apache.spark.ml.linalg.Vector = 
[-0.5734110263378509,0.14651463113107283,0.31695293497905314,0.09766723609660509,0.9751922937594425,-0.45078361519199234,0.1796011011914836,0.1307590238399803,29.4483208484545,0.0010971438711490464,-0.3774494299116957,-0.0972074860792813,0.19308365683399073,0.7342453168750395,-1.5272939597106356,0.26236258910811083,-0.2625004448211019,0.9162955516808651,0.770900888136527,2.02312824266805,-0.2122323515204783,-0.7335830805152952,-0.6830804460739696,-0.2683927532715843,-0.009356259933091637,0.010101144450626055,0.04419139197964755,-0.1537149993025953,-0.057369897837715504,-0.21594494553457308,-0.11921794315942034,21.49391633332987,-0.03273344441395022,0.15185884919764855,-0.41260352897506936,0.2051431942644116,0.038619787153916126,0.059096910694226...

scala> 
hubermodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println)
-1.0447443587500074,0.36317099225890404,-0.037510945673721416,-0.00499241712258674,0.7166274375452902,0.2883567264867333,0.8955748447650599,-0.09183466803327618,28.38969112834551,-0.1732703380224992
-0.9612521831063224,0.1330548989982154,-0.40321780540593827,0.07021956471272095,1.1672691564804965,0.39020978586678995,0.6565458868026381,-0.007652674389259566,25.870475377334614,-0.14530934468533313
-0.6203964039612594,0.13852113595924576,-0.05944676566281877,0.06282848629910238,1.3543656514896878,0.13935144813830452,0.8196295581480677,-0.04905550542415592,28.332175980950254,-0.020571474994685857
-0.8515959580662419,0.23175038852141236,-0.013615846688186758,0.03050371704421431,0.7718618660731084,-0.20282296703766906,0.3065121638304942,0.009840552518133401,28.873627374557977,-0.050971722924554645
-0.5366805901626359,0.12048487553084265,0.17808067158945728,0.1747893757985052,1.1205458676278635,-0.506018568748868,0.16694592848749892,0.20585232042284707,29.260697861543417,0.1112312158197619
 {code}
 

the solvers {{BreezeOWLQN}} and {{BreezeLBFGS seems stable, while solver 
}}{{BreezeLBFGSB}} for Huber seems quite unstable.{{}}

 

I also test scikit-learn's {{HuberRegressor which also use 
LBFGSB({{scipy.optimize.minimize(method="L-BFGS-B")}} ) as the solover}}:
{code:java}
 import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import HuberRegressor
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
X, y = load_svmlight_file('/data1/Datasets/epsilon/epsilon_normalized.t')
X = X.toarray()
y = (y + 1) / 2scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X1, y1 = shuffle(X, y, random_state=1)
huber1 = HuberRegressor(max_iter=200).fit(X1, y1)
del X1, y1X2, y2 = shuffle(X, y, random_state=2)
huber2 = HuberRegressor(max_iter=200).fit(X2, y2)
del X2, y2X3, y3 = shuffle(X, y, random_state=3)
huber3 = HuberRegressor(max_iter=200).fit(X3, y3)
del X3, y3
{code}
 

its solutions are relatively stable:
{code:java}
huber1.coef_
Out[3]: 
array([-0.00323102, -0.00116995,  0.00561726, ..., -0.00079748,
        0.00158621,  0.00189728])

huber2.coef_
Out[4]: 
array([-0.00323499, -0.00117458,  0.00562527, ..., -0.00080695,
        0.00160452,  0.0019021 ])

huber3.coef_
Out[5]: 
array([-0.00327077, -0.00116289,  0.00568099, ..., -0.00078838,
        0.00158942,  0.0019131 ]) {code}
 

So I think this unstability lies in the impl of {{BreezeLBFGSB}}

 

 

 

> Huber loss Convergence
> ----------------------
>
>                 Key: SPARK-32060
>                 URL: https://issues.apache.org/jira/browse/SPARK-32060
>             Project: Spark
>          Issue Type: Sub-task
>          Components: ML
>    Affects Versions: 3.1.0
>            Reporter: zhengruifeng
>            Priority: Minor
>         Attachments: huber.xlsx, image-2020-06-28-18-05-28-867.png
>
>
> |performace test in https://issues.apache.org/jira/browse/SPARK-31783,
>  Huber loss seems start to diverge since 70 iters.
>   {code:scala}
>  for (size <- Seq(1, 4, 16, 64); iter <- Seq(10, 50, 100)) {
>     Thread.sleep(10000)
>     val hlir = new 
> LinearRegression().setLoss("huber").setSolver("l-bfgs").setMaxIter(iter).setTol(0)
>     val start = System.currentTimeMillis
>     val model = hlir.setBlockSize(size).fit(df)
>     val end = System.currentTimeMillis
>     println((model.uid, size, iter, end - start, 
> model.summary.objectiveHistory.last, model.summary.totalIterations, 
> model.coefficients.toString.take(100)))
> }{code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to