Repository: spark
Updated Branches:
  refs/heads/master 64c314e22 -> f5817d8bb


[SPARK-25108][SQL] Fix the show method to display the wide character alignment problem

This is not a perfect solution; it is designed to fix the problem while keeping 
the added complexity to a minimum.

It works for English, Chinese, Japanese, Korean, and similar scripts.

```scala
before:
+---+---------------------------+-------------+
|id |中国                         |s2           |
+---+---------------------------+-------------+
|1  |ab                         |[a]          |
|2  |null                       |[中国, abc]    |
|3  |ab1                        |[hello world]|
|4  |か行 きゃ(kya) きゅ(kyu) きょ(kyo) |[“中国]        |
|5  |中国(你好)a                    |[“中(国), 312] |
|6  |中国山(东)服务区                  |[“中(国)]      |
|7  |中国山东服务区                    |[中(国)]       |
|8  |                           |[中国]         |
+---+---------------------------+-------------+

after:
+---+-----------------------------------+----------------+
|id |中国                               |s2              |
+---+-----------------------------------+----------------+
|1  |ab                                 |[a]             |
|2  |null                               |[中国, abc]     |
|3  |ab1                                |[hello world]   |
|4  |か行 きゃ(kya) きゅ(kyu) きょ(kyo) |[“中国]         |
|5  |中国(你好)a                      |[“中(国), 312]|
|6  |中国山(东)服务区                   |[“中(国)]      |
|7  |中国山东服务区                     |[中(国)]        |
|8  |                                   |[中国]          |
+---+-----------------------------------+----------------+
```

## What changes were proposed in this pull request?

When the data contains wide characters such as Chinese or Japanese characters, 
the show method has an alignment problem: column widths are computed from the 
character count, while a wide character actually occupies two display columns. 
This patch computes widths in half-width units instead, so the columns line up.
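
The core idea can be sketched as follows (illustrative names, not the exact Spark 
code; it relies on Apache commons-lang3's `StringUtils`, which `Dataset.showString` 
already uses for padding, and on a simplified subset of the full-width character 
ranges added to `Utils`):

```scala
import org.apache.commons.lang3.StringUtils

object WidePadding {
  // Simplified subset of the ranges matched by Utils.fullWidthRegex:
  // Hangul Jamo, the CJK blocks, Hangul syllables, and the full-width forms.
  private val fullWidthRegex =
    "[\u1100-\u115F\u2E80-\uA4CF\uAC00-\uD7A3\uFF00-\uFF60]".r

  // Display width in "half widths": every full-width character counts twice.
  def stringHalfWidth(str: String): Int =
    if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size

  // rightPad works on a *character* count, while the column width is measured
  // in half widths, so shrink the pad target by the extra display width the
  // full-width characters already occupy.
  def padCell(cell: String, colWidth: Int): String =
    StringUtils.rightPad(cell, colWidth - stringHalfWidth(cell) + cell.length)
}
```

For example, with a column width of 8 half widths, `padCell("中国", 8)` appends four 
spaces: the two characters already occupy four display columns, so only four more 
are needed.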

## How was this patch tested?

New unit tests in UtilsSuite and DatasetSuite, plus a manual check of the output 
(screenshot below):

![image](https://user-images.githubusercontent.com/13044869/44250564-69f6b400-a227-11e8-88b2-6cf6960377ff.png)


Closes #22048 from xuejianbest/master.

Authored-by: xuejianbest <384329...@qq.com>
Signed-off-by: Sean Owen <sean.o...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f5817d8b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f5817d8b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f5817d8b

Branch: refs/heads/master
Commit: f5817d8bb33b733eeca0154d1ed207c8d1e8513f
Parents: 64c314e
Author: xuejianbest <384329...@qq.com>
Authored: Thu Sep 6 07:17:37 2018 -0700
Committer: Sean Owen <sean.o...@databricks.com>
Committed: Thu Sep 6 07:17:37 2018 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/util/Utils.scala     | 30 ++++++++++++
 .../org/apache/spark/util/UtilsSuite.scala      | 21 +++++++++
 .../scala/org/apache/spark/sql/Dataset.scala    | 18 +++----
 .../org/apache/spark/sql/DatasetSuite.scala     | 49 ++++++++++++++++++++
 4 files changed, 109 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f5817d8b/core/src/main/scala/org/apache/spark/util/Utils.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 15c958d..4593b05 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2795,6 +2795,36 @@ private[spark] object Utils extends Logging {
       }
     }
   }
+
+  /**
+   * Regular expression matching full width characters.
+   *
+   * Looked at all the 0x0000-0xFFFF characters (unicode) and showed them under Xshell.
+   * Found all the full width characters, then get the regular expression.
+   */
+  private val fullWidthRegex = ("""[""" +
+    // scalastyle:off nonascii
+    """\u1100-\u115F""" +
+    """\u2E80-\uA4CF""" +
+    """\uAC00-\uD7A3""" +
+    """\uF900-\uFAFF""" +
+    """\uFE10-\uFE19""" +
+    """\uFE30-\uFE6F""" +
+    """\uFF00-\uFF60""" +
+    """\uFFE0-\uFFE6""" +
+    // scalastyle:on nonascii
+    """]""").r
+
+  /**
+   * Return the number of half widths in a given string. Note that a full width character
+   * occupies two half widths.
+   *
+   * For a string consisting of 1 million characters, the execution of this method requires
+   * about 50ms.
+   */
+  def stringHalfWidth(str: String): Int = {
+    if (str == null) 0 else str.length + fullWidthRegex.findAllIn(str).size
+  }
 }
 
 private[util] object CallerContext extends Logging {

http://git-wip-us.apache.org/repos/asf/spark/blob/f5817d8b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
index 418d2f9..943b535 100644
--- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -1184,6 +1184,27 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
     assert(Utils.getSimpleName(classOf[MalformedClassObject.MalformedClass]) ===
       "UtilsSuite$MalformedClassObject$MalformedClass")
   }
+
+  test("stringHalfWidth") {
+    // scalastyle:off nonascii
+    assert(Utils.stringHalfWidth(null) == 0)
+    assert(Utils.stringHalfWidth("") == 0)
+    assert(Utils.stringHalfWidth("ab c") == 4)
+    assert(Utils.stringHalfWidth("1098") == 4)
+    assert(Utils.stringHalfWidth("mø") == 2)
+    assert(Utils.stringHalfWidth("γύρ") == 3)
+    assert(Utils.stringHalfWidth("pê") == 2)
+    assert(Utils.stringHalfWidth("ー") == 2)
+    assert(Utils.stringHalfWidth("测") == 2)
+    assert(Utils.stringHalfWidth("か") == 2)
+    assert(Utils.stringHalfWidth("걸") == 2)
+    assert(Utils.stringHalfWidth("à") == 1)
+    assert(Utils.stringHalfWidth("焼") == 2)
+    assert(Utils.stringHalfWidth("羍む") == 4)
+    assert(Utils.stringHalfWidth("뺭ᾘ") == 3)
+    assert(Utils.stringHalfWidth("\u0967\u0968\u0969") == 3)
+    // scalastyle:on nonascii
+  }
 }
 
 private class SimpleExtension

http://git-wip-us.apache.org/repos/asf/spark/blob/f5817d8b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index db439b1..fa14aa1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -306,16 +306,16 @@ class Dataset[T] private[sql](
       // Compute the width of each column
       for (row <- rows) {
         for ((cell, i) <- row.zipWithIndex) {
-          colWidths(i) = math.max(colWidths(i), cell.length)
+          colWidths(i) = math.max(colWidths(i), Utils.stringHalfWidth(cell))
         }
       }
 
       val paddedRows = rows.map { row =>
         row.zipWithIndex.map { case (cell, i) =>
           if (truncate > 0) {
-            StringUtils.leftPad(cell, colWidths(i))
+            StringUtils.leftPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
           } else {
-            StringUtils.rightPad(cell, colWidths(i))
+            StringUtils.rightPad(cell, colWidths(i) - Utils.stringHalfWidth(cell) + cell.length)
           }
         }
       }
@@ -337,12 +337,10 @@ class Dataset[T] private[sql](
 
       // Compute the width of field name and data columns
       val fieldNameColWidth = fieldNames.foldLeft(minimumColWidth) { case (curMax, fieldName) =>
-        math.max(curMax, fieldName.length)
+        math.max(curMax, Utils.stringHalfWidth(fieldName))
       }
       val dataColWidth = dataRows.foldLeft(minimumColWidth) { case (curMax, row) =>
-        math.max(curMax, row.map(_.length).reduceLeftOption[Int] { case (cellMax, cell) =>
-          math.max(cellMax, cell)
-        }.getOrElse(0))
+        math.max(curMax, row.map(cell => Utils.stringHalfWidth(cell)).max)
       }
 
       dataRows.zipWithIndex.foreach { case (row, i) =>
@@ -351,8 +349,10 @@ class Dataset[T] private[sql](
           s"-RECORD $i", fieldNameColWidth + dataColWidth + 5, "-")
         sb.append(rowHeader).append("\n")
         row.zipWithIndex.map { case (cell, j) =>
-          val fieldName = StringUtils.rightPad(fieldNames(j), fieldNameColWidth)
-          val data = StringUtils.rightPad(cell, dataColWidth)
+          val fieldName = StringUtils.rightPad(fieldNames(j),
+            fieldNameColWidth - Utils.stringHalfWidth(fieldNames(j)) + fieldNames(j).length)
+          val data = StringUtils.rightPad(cell,
+            dataColWidth - Utils.stringHalfWidth(cell) + cell.length)
           s" $fieldName | $data "
         }.addString(sb, "", "\n", "\n")
       }

http://git-wip-us.apache.org/repos/asf/spark/blob/f5817d8b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index cf24eba..ca8fbc9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -969,6 +969,55 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
     checkShowString(ds, expected)
   }
 
+  test("SPARK-25108 Fix the show method to display the full width character 
alignment problem") {
+    // scalastyle:off nonascii
+    val df = Seq(
+      (0, null, 1),
+      (0, "", 1),
+      (0, "ab c", 1),
+      (0, "1098", 1),
+      (0, "mø", 1),
+      (0, "γύρ", 1),
+      (0, "pê", 1),
+      (0, "ー", 1),
+      (0, "测", 1),
+      (0, "か", 1),
+      (0, "걸", 1),
+      (0, "à", 1),
+      (0, "焼", 1),
+      (0, "羍む", 1),
+      (0, "뺭ᾘ", 1),
+      (0, "\u0967\u0968\u0969", 1)
+    ).toDF("b", "a", "c")
+    // scalastyle:on nonascii
+    val ds = df.as[ClassData]
+    val expected =
+      // scalastyle:off nonascii
+      """+---+----+---+
+        ||  b|   a|  c|
+        |+---+----+---+
+        ||  0|null|  1|
+        ||  0|    |  1|
+        ||  0|ab c|  1|
+        ||  0|1098|  1|
+        ||  0|  mø|  1|
+        ||  0| γύρ|  1|
+        ||  0|  pê|  1|
+        ||  0|  ー|  1|
+        ||  0|  测|  1|
+        ||  0|  か|  1|
+        ||  0|  걸|  1|
+        ||  0|   à|  1|
+        ||  0|  焼|  1|
+        ||  0|羍む|  1|
+        ||  0| 뺭ᾘ|  1|
+        ||  0| १२३|  1|
+        |+---+----+---+
+        |""".stripMargin
+    // scalastyle:on nonascii
+    checkShowString(ds, expected)
+  }
+
   test(
     "SPARK-15112: EmbedDeserializerInFilter should not optimize plan fragment 
that changes schema"
   ) {

