aglinxinyuan commented on code in PR #4827: URL: https://github.com/apache/texera/pull/4827#discussion_r3180318269
########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala: ########## @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.sklearn + +import org.apache.texera.amber.operator.sklearn.training._ +import org.scalatest.flatspec.AnyFlatSpec + +/** + * Pins the wiring (Python import statement + user-friendly model name) for + * every concrete `SklearnClassifierOpDesc` and `SklearnTrainingOpDesc`. A + * typo in either string would silently misroute downstream UI labels and + * breakage of the generated Python pipeline. + */ +class SklearnOpDescRegistrySpec extends AnyFlatSpec { + + // --------------------------------------------------------------------------- + // Classifier registry (25 concrete SklearnClassifierOpDesc subclasses) + // --------------------------------------------------------------------------- + + private val classifierEntries: List[(SklearnClassifierOpDesc, String, String)] = List( + ( + new SklearnAdaptiveBoostingOpDesc(), + "from sklearn.ensemble import AdaBoostClassifier", + "Adaptive Boosting" Review Comment: Done in 95a382b177 — added two completeness assertions, one for classifierEntries and one for trainingEntries. Both reuse `PythonReflectionUtils.scanCandidates` (the same scanner PythonCodeRawInvalidTextSpec uses) so the two suites agree on what counts as 'concrete'. The new tests fail with a 'drift' diff if a concrete subclass appears on the classpath that isn't in the manual list, or vice versa. `SklearnTrainingOpDesc` itself is concrete (used as a default fallback), so it's excluded from the training-side diff. ########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala: ########## @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.sklearn + +import org.apache.texera.amber.operator.sklearn.training._ +import org.scalatest.flatspec.AnyFlatSpec + +/** + * Pins the wiring (Python import statement + user-friendly model name) for + * every concrete `SklearnClassifierOpDesc` and `SklearnTrainingOpDesc`. A + * typo in either string would silently misroute downstream UI labels and + * breakage of the generated Python pipeline. + */ +class SklearnOpDescRegistrySpec extends AnyFlatSpec { + + // --------------------------------------------------------------------------- + // Classifier registry (25 concrete SklearnClassifierOpDesc subclasses) + // --------------------------------------------------------------------------- + + private val classifierEntries: List[(SklearnClassifierOpDesc, String, String)] = List( + ( + new SklearnAdaptiveBoostingOpDesc(), + "from sklearn.ensemble import AdaBoostClassifier", + "Adaptive Boosting" + ), + (new SklearnBaggingOpDesc(), "from sklearn.ensemble import BaggingClassifier", "Bagging"), + ( + new SklearnBernoulliNaiveBayesOpDesc(), + "from sklearn.naive_bayes import BernoulliNB", + "Bernoulli Naive Bayes" + ), + ( + new SklearnComplementNaiveBayesOpDesc(), + "from sklearn.naive_bayes import ComplementNB", + "Complement Naive Bayes" + ), + ( + new SklearnDummyClassifierOpDesc(), + "from sklearn.dummy import DummyClassifier", + "Dummy Classifier" + ), + ( + new SklearnDecisionTreeOpDesc(), + "from sklearn.tree import DecisionTreeClassifier", + "Decision Tree" + ), + (new SklearnExtraTreeOpDesc(), "from sklearn.tree import ExtraTreeClassifier", "Extra Tree"), + ( + new SklearnExtraTreesOpDesc(), + "from sklearn.ensemble import ExtraTreesClassifier", + "Extra Trees" + ), + ( + new SklearnGaussianNaiveBayesOpDesc(), + "from sklearn.naive_bayes import GaussianNB", + "Gaussian Naive Bayes" + ), + ( + new SklearnGradientBoostingOpDesc(), + "from sklearn.ensemble import GradientBoostingClassifier", + "Gradient Boosting" + ), + ( + new SklearnKNNOpDesc(), + "from sklearn.neighbors import KNeighborsClassifier", + "K-nearest Neighbors" + ), + ( + new SklearnLinearSVMOpDesc(), + "from sklearn.svm import LinearSVC", + "Linear Support Vector Machine" + ), + ( + new SklearnLogisticRegressionCVOpDesc(), + "from sklearn.linear_model import LogisticRegressionCV", + "Logistic Regression Cross Validation" + ), + ( + new SklearnLogisticRegressionOpDesc(), + "from sklearn.linear_model import LogisticRegression", + "Logistic Regression" + ), + ( + new SklearnMultiLayerPerceptronOpDesc(), + "from sklearn.neural_network import MLPClassifier", + "Multi-layer Perceptron" + ), + ( + new SklearnMultinomialNaiveBayesOpDesc(), + "from sklearn.naive_bayes import MultinomialNB", + "Multinomial Naive Bayes" + ), + ( + new SklearnNearestCentroidOpDesc(), + "from sklearn.neighbors import NearestCentroid", + "Nearest Centroid" + ), + ( + new SklearnPassiveAggressiveOpDesc(), + "from sklearn.linear_model import PassiveAggressiveClassifier", + "Passive Aggressive" + ), + ( + new SklearnPerceptronOpDesc(), + "from sklearn.linear_model import Perceptron", + "Linear Perceptron" + ), + ( + new SklearnProbabilityCalibrationOpDesc(), + "from sklearn.calibration import CalibratedClassifierCV", + "Probability Calibration" + ), + ( + new SklearnRandomForestOpDesc(), + "from sklearn.ensemble import RandomForestClassifier", + "Random Forest" + ), + ( + new SklearnRidgeCVOpDesc(), + "from sklearn.linear_model import RidgeClassifierCV", + "Ridge Regression Cross Validation" + ), + ( + new SklearnRidgeOpDesc(), + "from sklearn.linear_model import RidgeClassifier", + "Ridge Regression" + ), + ( + new SklearnSDGOpDesc(), + "from sklearn.linear_model import SGDClassifier", + "Stochastic Gradient Descent" + ), + (new SklearnSVMOpDesc(), "from sklearn.svm import SVC", "Support Vector Machine") + ) + + classifierEntries.foreach { + case (desc, expectedImport, expectedName) => + val cls = desc.getClass.getSimpleName + cls should s"return import statement '$expectedImport'" in { + assert(desc.getImportStatements == expectedImport) + } + it should s"return user-friendly model name '$expectedName'" in { + assert(desc.getUserFriendlyModelName == expectedName) + } + } + + "SklearnClassifierOpDesc" should "embed the import statement into generatePythonCode for a concrete subclass" in { + val desc = new SklearnLogisticRegressionOpDesc() + desc.target = "y" + desc.countVectorizer = false + // `tfidfTransformer` is a val on the base class, defaults to false. + val code = desc.generatePythonCode() + assert(code.contains("from sklearn.linear_model import LogisticRegression")) + // Classifier OpDescs emit a UDFTableOperator pipeline. + assert(code.contains("ProcessTableOperator")) + } + // NOTE: the abstract base class's empty-string defaults are NOT tested here. + // Instantiating `SklearnClassifierOpDesc` from this spec (e.g. via + // `new SklearnClassifierOpDesc {}`) creates an anonymous test-package class + // under `org.apache.texera.amber.operator.sklearn`, which the + // PythonCodeRawInvalidTextSpec classpath scan then picks up as a descriptor + // candidate and fails on (anonymous classes have no accessible no-arg + // constructor). Every concrete subclass below overrides both methods, so + // the base default is never observable in production anyway. Review Comment: Took option 2 — updated the [PR description](https://github.com/apache/texera/pull/4827) and [linked issue](https://github.com/apache/texera/issues/4826) to remove the base-defaults claim and document why it's not tested (anonymous-subclass instantiation breaks the PythonCodeRawInvalidTextSpec classpath scan; base default is never observable in production because every concrete subclass overrides both methods). The earlier 95a382b177 commit's spec-level NOTE comment also documents this so a future contributor won't reflexively re-add it. ########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnOpDescRegistrySpec.scala: ########## @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.sklearn + +import org.apache.texera.amber.operator.sklearn.training._ +import org.scalatest.flatspec.AnyFlatSpec + +/** + * Pins the wiring (Python import statement + user-friendly model name) for + * every concrete `SklearnClassifierOpDesc` and `SklearnTrainingOpDesc`. A + * typo in either string would silently misroute downstream UI labels and + * breakage of the generated Python pipeline. Review Comment: Done in 95a382b177 — rephrased to '...labels and cause breakage in the generated Python pipeline'. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
