GitHub user sarutak opened a pull request: https://github.com/apache/spark/pull/12979
[SPARK-15205][SQL][WIP] Codegen can compile more than twice for the same source code ## What changes were proposed in this pull request? Sometimes we generate pieces of code that are identical except for their comments. Here is an example. {code} val df = sc.parallelize(1 to 10).toDF df.selectExpr("value + 1").show // query1 df.selectExpr("value + 2").show // query2 {code} The following is one of the pieces of code generated when query1 above is executed. {code} /* 001 */ /* 002 */ public java.lang.Object generate(Object[] references) { /* 003 */ return new SpecificSafeProjection(references); /* 004 */ } /* 005 */ /* 006 */ class SpecificSafeProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection { /* 007 */ /* 008 */ private Object[] references; /* 009 */ private MutableRow mutableRow; /* 010 */ private Object[] values; /* 011 */ private org.apache.spark.sql.types.StructType schema; /* 012 */ /* 013 */ /* 014 */ public SpecificSafeProjection(Object[] references) { /* 015 */ this.references = references; /* 016 */ mutableRow = (MutableRow) references[references.length - 1]; /* 017 */ /* 018 */ this.schema = (org.apache.spark.sql.types.StructType) references[0]; /* 019 */ } /* 020 */ /* 021 */ public java.lang.Object apply(java.lang.Object _i) { /* 022 */ InternalRow i = (InternalRow) _i; /* 023 */ /* createexternalrow(if (isnull(input[0, int])) null else input[0, int], StructField((value + 1),IntegerType,false)) */ /* 024 */ values = new Object[1]; /* 025 */ /* if (isnull(input[0, int])) null else input[0, int] */ /* 026 */ /* isnull(input[0, int]) */ /* 027 */ /* input[0, int] */ /* 028 */ int value3 = i.getInt(0); /* 029 */ boolean isNull1 = false; /* 030 */ int value1 = -1; /* 031 */ if (!false && false) { /* 032 */ /* null */ /* 033 */ final int value4 = -1; /* 034 */ isNull1 = true; /* 035 */ value1 = value4; /* 036 */ } else { /* 037 */ /* input[0, int] */ /* 038 */ int value5 = i.getInt(0); /* 039 */ isNull1 = false; /* 040 */ value1 = value5; /* 041 
*/ } /* 042 */ if (isNull1) { /* 043 */ values[0] = null; /* 044 */ } else { /* 045 */ values[0] = value1; /* 046 */ } /* 047 */ /* 048 */ final org.apache.spark.sql.Row value = new org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema(values, this.schema); /* 049 */ if (false) { /* 050 */ mutableRow.setNullAt(0); /* 051 */ } else { /* 052 */ /* 053 */ mutableRow.update(0, value); /* 054 */ } /* 055 */ /* 056 */ return mutableRow; /* 057 */ } /* 058 */ } /* 059 */ {code} On the other hand, the following code is for query2. {code} /* 001 */ /* 002 */ public java.lang.Object generate(Object[] references) { /* 003 */ return new SpecificSafeProjection(references); /* 004 */ } /* 005 */ /* 006 */ class SpecificSafeProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseProjection { /* 007 */ /* 008 */ private Object[] references; /* 009 */ private MutableRow mutableRow; /* 010 */ private Object[] values; /* 011 */ private org.apache.spark.sql.types.StructType schema; /* 012 */ /* 013 */ /* 014 */ public SpecificSafeProjection(Object[] references) { /* 015 */ this.references = references; /* 016 */ mutableRow = (MutableRow) references[references.length - 1]; /* 017 */ /* 018 */ this.schema = (org.apache.spark.sql.types.StructType) references[0]; /* 019 */ } /* 020 */ /* 021 */ public java.lang.Object apply(java.lang.Object _i) { /* 022 */ InternalRow i = (InternalRow) _i; /* 023 */ /* createexternalrow(if (isnull(input[0, int])) null else input[0, int], StructField((value + 2),IntegerType,false)) */ /* 024 */ values = new Object[1]; /* 025 */ /* if (isnull(input[0, int])) null else input[0, int] */ /* 026 */ /* isnull(input[0, int]) */ /* 027 */ /* input[0, int] */ /* 028 */ int value3 = i.getInt(0); /* 029 */ boolean isNull1 = false; /* 030 */ int value1 = -1; /* 031 */ if (!false && false) { /* 032 */ /* null */ /* 033 */ final int value4 = -1; /* 034 */ isNull1 = true; /* 035 */ value1 = value4; /* 036 */ } else { /* 037 */ /* input[0, int] 
*/ /* 038 */ int value5 = i.getInt(0); /* 039 */ isNull1 = false; /* 040 */ value1 = value5; /* 041 */ } /* 042 */ if (isNull1) { /* 043 */ values[0] = null; /* 044 */ } else { /* 045 */ values[0] = value1; /* 046 */ } /* 047 */ /* 048 */ final org.apache.spark.sql.Row value = new org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema(values, this.schema); /* 049 */ if (false) { /* 050 */ mutableRow.setNullAt(0); /* 051 */ } else { /* 052 */ /* 053 */ mutableRow.update(0, value); /* 054 */ } /* 055 */ /* 056 */ return mutableRow; /* 057 */ } /* 058 */ } /* 059 */ {code} As you can see, the two pieces of generated code are essentially the same (they differ only in the comment on line 023), but they are not equal as String objects, so each of them is compiled separately. In this PR, I introduce placeholders for comments, such as /*{comment_placeholder1}*/. The code that gets compiled contains only these comment-style placeholders. When the generated code is logged, the placeholders are replaced with the corresponding actual comments. ## How was this patch tested? I have marked this PR as WIP for now, and I will add test cases to it. You can merge this pull request into a Git repository by running: $ git pull https://github.com/sarutak/spark SPARK-15205 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/12979.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #12979 ---- commit 766f5c64f93163ee1b8b7ea8b1ef986f9a731733 Author: Kousuke Saruta <saru...@oss.nttdata.co.jp> Date: 2016-05-08T00:27:41Z Fixed duplicated generated code issue ---- --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. 
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org