[ https://issues.apache.org/jira/browse/SPARK-32060?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17148412#comment-17148412 ]
zhengruifeng edited comment on SPARK-32060 at 6/30/20, 8:17 AM: ---------------------------------------------------------------- I found that the optimization of Huber Loss is unstable, if the input dataset is shuffled: spark: 2.4.5 cmd:spark-shell --driver-memory=96G --conf spark.driver.maxResultSize=10g {code:java} import org.apache.spark.ml.classification._ import org.apache.spark.ml.regression._ import org.apache.spark.storage.StorageLevel val df = spark.read.option("numFeatures", "2000").format("libsvm").load("/data1/Datasets/epsilon/epsilon_normalized.t").withColumn("label", (col("label")+1)/2)df.persist(StorageLevel.MEMORY_AND_DISK) df.count val svc = new LinearSVC().setMaxIter(100).setTol(0) val svcmodel = svc.fit(df) val svcmodels = Seq.range(0, 5).map { seed => val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model = svc.fit(df2); df2.unpersist(); model } val lir = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setTol(0) val lirmodel = lir.fit(df) val lirmodels = Seq.range(0, 5).map { seed => val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model = lir.fit(df2); df2.unpersist(); model } val lir = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setLoss("huber").setTol(0) val hubermodel = lir.fit(df) val hubermodels = Seq.range(0, 5).map { seed => val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model = lir.fit(df2); df2.unpersist(); model } {code} results: {code:java} scala> svcmodel.coefficients res4: org.apache.spark.ml.linalg.Vector = 
[-2.0998984926889244,-0.3252719796828287,1.6224962635038596,-0.09459144575027117,-0.024713074721534507,-0.04893864248356599,4.521280777017717,-0.3920439314738444,0.027631053567458274,-0.010013241182040592,0.3259469228241217,-1.1125182474604842,-0.173266660114704,0.03365461088305983,0.02162518688538647,0.07204641375676599,-0.07429479630422156,0.005457557625321678,-2.9532140605652275,2.7240907567070676,-0.3066203528914533,-0.11475917863808731,-0.20761462370516978,-0.4066885419952761,-0.08185889069309363,-0.6318876493014741,0.06405628348073204,-0.32732378261855793,0.01462176019045602,0.8492238295542848,-0.20854294380974547,0.008039275953692854,0.05597077397428801,-0.06302333216930013,0.005602373131582006,-0.05995911252186677,0.3381639630496303,0.63... scala> svcmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println) -2.099898492688956,-0.32527197968284316,1.622496263503862,-0.09459144575027553,-0.02471307472153921,-0.048938642483572034,4.521280777017707,-0.39204393147385175,0.027631053567457448,-0.010013241182041081 -2.0998984926889364,-0.32527197968284405,1.6224962635038631,-0.09459144575027562,-0.024713074721539482,-0.0489386424835727,4.521280777017712,-0.3920439314738487,0.027631053567458107,-0.010013241182040594 -2.0998984926889346,-0.3252719796828256,1.6224962635038638,-0.09459144575027183,-0.02471307472153609,-0.04893864248356505,4.52128077701772,-0.392043931473848,0.027631053567458715,-0.010013241182040738 -2.0998984926889612,-0.3252719796828379,1.6224962635038718,-0.09459144575027294,-0.024713074721542546,-0.04893864248357126,4.521280777017708,-0.3920439314738543,0.027631053567458354,-0.01001324118203996 -2.099898492688976,-0.32527197968284066,1.6224962635038813,-0.09459144575027299,-0.024713074721538494,-0.04893864248357121,4.521280777017711,-0.39204393147382943,0.027631053567457917,-0.01001324118203988 scala> lirmodel.coefficients res6: org.apache.spark.ml.linalg.Vector = 
[-0.15694030457077052,-0.03297314855191394,0.21896060695714925,0.043191022987982185,0.029914098626947626,0.037812647639103455,0.3777274539423792,-0.004353078286124242,-0.15575156684399277,-5.036838920393178E-4,0.058833881325688855,-0.08017543724230564,-0.0492390194915689,-0.038321196923869975,-0.130894554829739,-0.042355695456345384,-0.009015454450718942,-0.04370761619435822,-0.29579850779021977,0.004520096449137435,-0.006967975798308324,-0.06845005746315802,0.266918095634905,-0.02940614765439654,-0.026540163642704145,-0.06595165721239701,0.014980713825836588,-0.04093065488345465,0.01959430385382978,0.050848208119031076,-0.05637861639180545,0.14531387392683578,-0.010366403339646989,-0.06789567679629482,-0.01138250848452352,0.012813168874534375,0... scala> lirmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println) -0.15782290165350424,-0.032888518926744874,0.21830737648275617,0.042901864085398485,0.030379578212786106,0.03726297774165828,0.37590079059854775,-0.005096797676258801,-0.16294717929387345,-3.4683642628292866E-4 -0.158134002701788,-0.03297488977298756,0.223484223933237,0.04362644996012737,0.02955839240619015,0.037886954492652274,0.3781987525027171,-0.0042601468957566635,-0.15377036026585994,-7.808247198886335E-4 -0.15725060355390777,-0.03274293711657846,0.2255425335042434,0.04414557619894905,0.029592347607608316,0.038140084783380566,0.38014500533792417,-0.004041812003758675,-0.1532244524431212,-6.139071642605764E-4 -0.15808251904715098,-0.03277497701747291,0.22415793808220286,0.04381212676769461,0.029645628307462257,0.03827272865297394,0.3779602823322227,-0.004579990558798104,-0.1619060581635579,-2.924684072438361E-4 -0.1584244946556921,-0.03308679728952589,0.22109423848026052,0.04314487897259751,0.02974043814809942,0.03767583465126916,0.37660774731442753,-0.004666143631917849,-0.15908500093611755,-6.032795854870288E-4 scala> hubermodel.coefficients res8: org.apache.spark.ml.linalg.Vector = 
[-0.5734110263378509,0.14651463113107283,0.31695293497905314,0.09766723609660509,0.9751922937594425,-0.45078361519199234,0.1796011011914836,0.1307590238399803,29.4483208484545,0.0010971438711490464,-0.3774494299116957,-0.0972074860792813,0.19308365683399073,0.7342453168750395,-1.5272939597106356,0.26236258910811083,-0.2625004448211019,0.9162955516808651,0.770900888136527,2.02312824266805,-0.2122323515204783,-0.7335830805152952,-0.6830804460739696,-0.2683927532715843,-0.009356259933091637,0.010101144450626055,0.04419139197964755,-0.1537149993025953,-0.057369897837715504,-0.21594494553457308,-0.11921794315942034,21.49391633332987,-0.03273344441395022,0.15185884919764855,-0.41260352897506936,0.2051431942644116,0.038619787153916126,0.059096910694226... scala> hubermodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println) -1.0447443587500074,0.36317099225890404,-0.037510945673721416,-0.00499241712258674,0.7166274375452902,0.2883567264867333,0.8955748447650599,-0.09183466803327618,28.38969112834551,-0.1732703380224992 -0.9612521831063224,0.1330548989982154,-0.40321780540593827,0.07021956471272095,1.1672691564804965,0.39020978586678995,0.6565458868026381,-0.007652674389259566,25.870475377334614,-0.14530934468533313 -0.6203964039612594,0.13852113595924576,-0.05944676566281877,0.06282848629910238,1.3543656514896878,0.13935144813830452,0.8196295581480677,-0.04905550542415592,28.332175980950254,-0.020571474994685857 -0.8515959580662419,0.23175038852141236,-0.013615846688186758,0.03050371704421431,0.7718618660731084,-0.20282296703766906,0.3065121638304942,0.009840552518133401,28.873627374557977,-0.050971722924554645 -0.5366805901626359,0.12048487553084265,0.17808067158945728,0.1747893757985052,1.1205458676278635,-0.506018568748868,0.16694592848749892,0.20585232042284707,29.260697861543417,0.1112312158197619 {code} the solvers {{BreezeOWLQN}} and {{BreezeLBFGS}} seem stable, while the solver {{BreezeLBFGSB}} for Huber seems quite unstable. 
I also tested scikit-learn's {{HuberRegressor}}, which also uses L-BFGS-B ({{scipy.optimize.minimize(method="L-BFGS-B")}}) as the solver: {code:java} import matplotlib.pyplot as plt import pandas as pd import numpy as np from sklearn.linear_model import HuberRegressor from sklearn.datasets import load_svmlight_file from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle X, y = load_svmlight_file('/data1/Datasets/epsilon/epsilon_normalized.t') X = X.toarray() y = (y + 1) / 2
scaler = StandardScaler().fit(X) X = scaler.transform(X) X1, y1 = shuffle(X, y, random_state=1) huber1 = HuberRegressor(max_iter=200).fit(X1, y1) del X1, y1
X2, y2 = shuffle(X, y, random_state=2) huber2 = HuberRegressor(max_iter=200).fit(X2, y2) del X2, y2
X3, y3 = shuffle(X, y, random_state=3) huber3 = HuberRegressor(max_iter=200).fit(X3, y3) del X3, y3 {code} its solutions are relatively stable: {code:java} huber1.coef_ Out[3]: array([-0.00323102, -0.00116995, 0.00561726, ..., -0.00079748, 0.00158621, 0.00189728]) huber2.coef_ Out[4]: array([-0.00323499, -0.00117458, 0.00562527, ..., -0.00080695, 0.00160452, 0.0019021 ]) huber3.coef_ Out[5]: array([-0.00327077, -0.00116289, 0.00568099, ..., -0.00078838, 0.00158942, 0.0019131 ]) {code} So I think this instability lies in the impl of {{BreezeLBFGSB}} was (Author: podongfeng): I found that the optimization of Huber Loss is unstable, if the input dataset is shuffled: spark: 2.4.5 cmd:spark-shell --driver-memory=96G --conf spark.driver.maxResultSize=10g {code:java} import org.apache.spark.ml.classification._ import org.apache.spark.ml.regression._ import org.apache.spark.storage.StorageLevel val df = spark.read.option("numFeatures", "2000").format("libsvm").load("/data1/Datasets/epsilon/epsilon_normalized.t").withColumn("label", (col("label")+1)/2)
df.persist(StorageLevel.MEMORY_AND_DISK) df.count val svc = new LinearSVC().setMaxIter(100).setTol(0) val svcmodel = svc.fit(df) val svcmodels = Seq.range(0, 5).map { seed => val df2 = 
df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model = svc.fit(df2); df2.unpersist(); model } val lir = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setTol(0) val lirmodel = lir.fit(df) val lirmodels = Seq.range(0, 5).map { seed => val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model = lir.fit(df2); df2.unpersist(); model } val lir = new LinearRegression().setMaxIter(100).setSolver("l-bfgs").setLoss("huber").setTol(0) val hubermodel = lir.fit(df) val hubermodels = Seq.range(0, 5).map { seed => val df2 = df.sort(rand(seed)).persist(StorageLevel.MEMORY_AND_DISK); df2.count; val model = lir.fit(df2); df2.unpersist(); model } {code} results: {code:java} scala> svcmodel.coefficients res4: org.apache.spark.ml.linalg.Vector = [-2.0998984926889244,-0.3252719796828287,1.6224962635038596,-0.09459144575027117,-0.024713074721534507,-0.04893864248356599,4.521280777017717,-0.3920439314738444,0.027631053567458274,-0.010013241182040592,0.3259469228241217,-1.1125182474604842,-0.173266660114704,0.03365461088305983,0.02162518688538647,0.07204641375676599,-0.07429479630422156,0.005457557625321678,-2.9532140605652275,2.7240907567070676,-0.3066203528914533,-0.11475917863808731,-0.20761462370516978,-0.4066885419952761,-0.08185889069309363,-0.6318876493014741,0.06405628348073204,-0.32732378261855793,0.01462176019045602,0.8492238295542848,-0.20854294380974547,0.008039275953692854,0.05597077397428801,-0.06302333216930013,0.005602373131582006,-0.05995911252186677,0.3381639630496303,0.63... 
scala> svcmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println) -2.099898492688956,-0.32527197968284316,1.622496263503862,-0.09459144575027553,-0.02471307472153921,-0.048938642483572034,4.521280777017707,-0.39204393147385175,0.027631053567457448,-0.010013241182041081 -2.0998984926889364,-0.32527197968284405,1.6224962635038631,-0.09459144575027562,-0.024713074721539482,-0.0489386424835727,4.521280777017712,-0.3920439314738487,0.027631053567458107,-0.010013241182040594 -2.0998984926889346,-0.3252719796828256,1.6224962635038638,-0.09459144575027183,-0.02471307472153609,-0.04893864248356505,4.52128077701772,-0.392043931473848,0.027631053567458715,-0.010013241182040738 -2.0998984926889612,-0.3252719796828379,1.6224962635038718,-0.09459144575027294,-0.024713074721542546,-0.04893864248357126,4.521280777017708,-0.3920439314738543,0.027631053567458354,-0.01001324118203996 -2.099898492688976,-0.32527197968284066,1.6224962635038813,-0.09459144575027299,-0.024713074721538494,-0.04893864248357121,4.521280777017711,-0.39204393147382943,0.027631053567457917,-0.01001324118203988 scala> lirmodel.coefficients res6: org.apache.spark.ml.linalg.Vector = [-0.15694030457077052,-0.03297314855191394,0.21896060695714925,0.043191022987982185,0.029914098626947626,0.037812647639103455,0.3777274539423792,-0.004353078286124242,-0.15575156684399277,-5.036838920393178E-4,0.058833881325688855,-0.08017543724230564,-0.0492390194915689,-0.038321196923869975,-0.130894554829739,-0.042355695456345384,-0.009015454450718942,-0.04370761619435822,-0.29579850779021977,0.004520096449137435,-0.006967975798308324,-0.06845005746315802,0.266918095634905,-0.02940614765439654,-0.026540163642704145,-0.06595165721239701,0.014980713825836588,-0.04093065488345465,0.01959430385382978,0.050848208119031076,-0.05637861639180545,0.14531387392683578,-0.010366403339646989,-0.06789567679629482,-0.01138250848452352,0.012813168874534375,0... 
scala> lirmodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println) -0.15782290165350424,-0.032888518926744874,0.21830737648275617,0.042901864085398485,0.030379578212786106,0.03726297774165828,0.37590079059854775,-0.005096797676258801,-0.16294717929387345,-3.4683642628292866E-4 -0.158134002701788,-0.03297488977298756,0.223484223933237,0.04362644996012737,0.02955839240619015,0.037886954492652274,0.3781987525027171,-0.0042601468957566635,-0.15377036026585994,-7.808247198886335E-4 -0.15725060355390777,-0.03274293711657846,0.2255425335042434,0.04414557619894905,0.029592347607608316,0.038140084783380566,0.38014500533792417,-0.004041812003758675,-0.1532244524431212,-6.139071642605764E-4 -0.15808251904715098,-0.03277497701747291,0.22415793808220286,0.04381212676769461,0.029645628307462257,0.03827272865297394,0.3779602823322227,-0.004579990558798104,-0.1619060581635579,-2.924684072438361E-4 -0.1584244946556921,-0.03308679728952589,0.22109423848026052,0.04314487897259751,0.02974043814809942,0.03767583465126916,0.37660774731442753,-0.004666143631917849,-0.15908500093611755,-6.032795854870288E-4 scala> hubermodel.coefficients res8: org.apache.spark.ml.linalg.Vector = [-0.5734110263378509,0.14651463113107283,0.31695293497905314,0.09766723609660509,0.9751922937594425,-0.45078361519199234,0.1796011011914836,0.1307590238399803,29.4483208484545,0.0010971438711490464,-0.3774494299116957,-0.0972074860792813,0.19308365683399073,0.7342453168750395,-1.5272939597106356,0.26236258910811083,-0.2625004448211019,0.9162955516808651,0.770900888136527,2.02312824266805,-0.2122323515204783,-0.7335830805152952,-0.6830804460739696,-0.2683927532715843,-0.009356259933091637,0.010101144450626055,0.04419139197964755,-0.1537149993025953,-0.057369897837715504,-0.21594494553457308,-0.11921794315942034,21.49391633332987,-0.03273344441395022,0.15185884919764855,-0.41260352897506936,0.2051431942644116,0.038619787153916126,0.059096910694226... 
scala> hubermodels.map(_.coefficients.toArray.take(10).mkString(",")).foreach(println) -1.0447443587500074,0.36317099225890404,-0.037510945673721416,-0.00499241712258674,0.7166274375452902,0.2883567264867333,0.8955748447650599,-0.09183466803327618,28.38969112834551,-0.1732703380224992 -0.9612521831063224,0.1330548989982154,-0.40321780540593827,0.07021956471272095,1.1672691564804965,0.39020978586678995,0.6565458868026381,-0.007652674389259566,25.870475377334614,-0.14530934468533313 -0.6203964039612594,0.13852113595924576,-0.05944676566281877,0.06282848629910238,1.3543656514896878,0.13935144813830452,0.8196295581480677,-0.04905550542415592,28.332175980950254,-0.020571474994685857 -0.8515959580662419,0.23175038852141236,-0.013615846688186758,0.03050371704421431,0.7718618660731084,-0.20282296703766906,0.3065121638304942,0.009840552518133401,28.873627374557977,-0.050971722924554645 -0.5366805901626359,0.12048487553084265,0.17808067158945728,0.1747893757985052,1.1205458676278635,-0.506018568748868,0.16694592848749892,0.20585232042284707,29.260697861543417,0.1112312158197619 {code} the solvers {{BreezeOWLQN}} and {{BreezeLBFGS seems stable, while solver }}{{BreezeLBFGSB}} for Huber seems quite unstable.{{}} I also test scikit-learn's {{HuberRegressor which also use LBFGSB({{scipy.optimize.minimize(method="L-BFGS-B")}} ) as the solover}}: {code:java} import matplotlib.pyplot as plt import pandas as pd import numpy as np from sklearn.linear_model import HuberRegressor from sklearn.datasets import load_svmlight_file from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle X, y = load_svmlight_file('/data1/Datasets/epsilon/epsilon_normalized.t') X = X.toarray() y = (y + 1) / 2scaler = StandardScaler().fit(X) X = scaler.transform(X) X1, y1 = shuffle(X, y, random_state=1) huber1 = HuberRegressor(max_iter=200).fit(X1, y1) del X1, y1X2, y2 = shuffle(X, y, random_state=2) huber2 = HuberRegressor(max_iter=200).fit(X2, y2) del X2, y2X3, y3 = shuffle(X, y, 
random_state=3) huber3 = HuberRegressor(max_iter=200).fit(X3, y3) del X3, y3 {code} its solutions are relatively stable: {code:java} huber1.coef_ Out[3]: array([-0.00323102, -0.00116995, 0.00561726, ..., -0.00079748, 0.00158621, 0.00189728]) huber2.coef_ Out[4]: array([-0.00323499, -0.00117458, 0.00562527, ..., -0.00080695, 0.00160452, 0.0019021 ]) huber3.coef_ Out[5]: array([-0.00327077, -0.00116289, 0.00568099, ..., -0.00078838, 0.00158942, 0.0019131 ]) {code} So I think this unstability lies in the impl of {{BreezeLBFGSB}} > Huber loss Convergence > ---------------------- > > Key: SPARK-32060 > URL: https://issues.apache.org/jira/browse/SPARK-32060 > Project: Spark > Issue Type: Sub-task > Components: ML > Affects Versions: 3.1.0 > Reporter: zhengruifeng > Priority: Minor > Attachments: huber.xlsx, image-2020-06-28-18-05-28-867.png > > > |performace test in https://issues.apache.org/jira/browse/SPARK-31783, > Huber loss seems start to diverge since 70 iters. > {code:scala} > for (size <- Seq(1, 4, 16, 64); iter <- Seq(10, 50, 100)) { > Thread.sleep(10000) > val hlir = new > LinearRegression().setLoss("huber").setSolver("l-bfgs").setMaxIter(iter).setTol(0) > val start = System.currentTimeMillis > val model = hlir.setBlockSize(size).fit(df) > val end = System.currentTimeMillis > println((model.uid, size, iter, end - start, > model.summary.objectiveHistory.last, model.summary.totalIterations, > model.coefficients.toString.take(100))) > }{code} -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org