This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 71c83119 [DOCS] Create ApacheSedonaImageFilter (#644)
71c83119 is described below
commit 71c83119da945df5c8e8d732796486d0d49e8c15
Author: Ana Caroline Ferreira <[email protected]>
AuthorDate: Tue Jul 12 21:45:29 2022 -0300
[DOCS] Create ApacheSedonaImageFilter (#644)
---
binder/ApacheSedonaImageFilter | 556 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 556 insertions(+)
diff --git a/binder/ApacheSedonaImageFilter b/binder/ApacheSedonaImageFilter
new file mode 100644
index 00000000..ed248092
--- /dev/null
+++ b/binder/ApacheSedonaImageFilter
@@ -0,0 +1,556 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f00c8864",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "a90d47f8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# RUN IN THE TERMINAL\n",
+ "# pip install pandas\n",
+ "# pip install apache-sedona\n",
+ "# COPY THE TIF INTO THE RASTER/BIG FOLDER\n",
+ "# RUN FORBIGRASTER to split the image into smaller images\n",
+ "\n",
+ "## TODO - FIND A WAY FOR THE USER TO COPY DIRECTLY INTO HADOOP (TO DO IT FROM JUPYTER, SEE THE NOTE AT THE END OF THE ForBigRaster FILE)\n",
+ "# sudo docker exec -it hadoop bash\n",
+ "# hadoop fs -copyFromLocal /opt/workspace/raster/* /"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "1184ba86",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.display import display, HTML\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark import StorageLevel\n",
+ "import pandas as pd\n",
+ "from pyspark.sql.types import StructType, StructField,StringType,
LongType, IntegerType, DoubleType, ArrayType\n",
+ "from pyspark.sql.functions import regexp_replace\n",
+ "from sedona.register import SedonaRegistrator\n",
+ "from sedona.utils import SedonaKryoRegistrator, KryoSerializer\n",
+ "from pyspark.sql.functions import col, split, expr\n",
+ "from pyspark.sql.functions import udf, lit\n",
+ "from sedona.utils import SedonaKryoRegistrator, KryoSerializer\n",
+ "from pyspark.sql.functions import col, split, expr\n",
+ "from pyspark.sql.functions import udf, lit"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "661a3fc3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Ivy Default Cache set to: /root/.ivy2/cache\n",
+ "The jars for the packages stored in: /root/.ivy2/jars\n",
+ ":: loading settings :: url =
jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n",
+ "org.apache.sedona#sedona-python-adapter-3.0_2.12 added as a
dependency\n",
+ "org.datasyslab#geotools-wrapper added as a dependency\n",
+ ":: resolving dependencies ::
org.apache.spark#spark-submit-parent-158e8878-0532-4bc3-b6b9-016f34becad3;1.0\n",
+ "\tconfs: [default]\n",
+ "\tfound
org.apache.sedona#sedona-python-adapter-3.0_2.12;1.1.0-incubating in central\n",
+ "\tfound org.locationtech.jts#jts-core;1.18.0 in central\n",
+ "\tfound org.wololo#jts2geojson;0.16.1 in central\n",
+ "\tfound com.fasterxml.jackson.core#jackson-databind;2.12.2 in
central\n",
+ "\tfound com.fasterxml.jackson.core#jackson-annotations;2.12.2 in
central\n",
+ "\tfound com.fasterxml.jackson.core#jackson-core;2.12.2 in central\n",
+ "\tfound org.apache.sedona#sedona-core-3.0_2.12;1.1.0-incubating in
central\n",
+ "\tfound org.apache.sedona#sedona-sql-3.0_2.12;1.1.0-incubating in
central\n",
+ "\tfound org.datasyslab#geotools-wrapper;1.1.0-25.2 in central\n",
+ ":: resolution report :: resolve 594ms :: artifacts dl 5ms\n",
+ "\t:: modules in use:\n",
+ "\tcom.fasterxml.jackson.core#jackson-annotations;2.12.2 from central in
[default]\n",
+ "\tcom.fasterxml.jackson.core#jackson-core;2.12.2 from central in
[default]\n",
+ "\tcom.fasterxml.jackson.core#jackson-databind;2.12.2 from central in
[default]\n",
+ "\torg.apache.sedona#sedona-core-3.0_2.12;1.1.0-incubating from central
in [default]\n",
+ "\torg.apache.sedona#sedona-python-adapter-3.0_2.12;1.1.0-incubating
from central in [default]\n",
+ "\torg.apache.sedona#sedona-sql-3.0_2.12;1.1.0-incubating from central
in [default]\n",
+ "\torg.datasyslab#geotools-wrapper;1.1.0-25.2 from central in
[default]\n",
+ "\torg.locationtech.jts#jts-core;1.18.0 from central in [default]\n",
+ "\torg.wololo#jts2geojson;0.16.1 from central in [default]\n",
+ "\t:: evicted modules:\n",
+ "\torg.locationtech.jts#jts-core;1.18.1 by
[org.locationtech.jts#jts-core;1.18.0] in [default]\n",
+
"\t---------------------------------------------------------------------\n",
+ "\t| | modules || artifacts
|\n",
+ "\t| conf | number| search|dwnlded|evicted||
number|dwnlded|\n",
+
"\t---------------------------------------------------------------------\n",
+ "\t| default | 10 | 0 | 0 | 1 || 9 | 0
|\n",
+
"\t---------------------------------------------------------------------\n",
+ ":: retrieving ::
org.apache.spark#spark-submit-parent-158e8878-0532-4bc3-b6b9-016f34becad3\n",
+ "\tconfs: [default]\n",
+ "\t0 artifacts copied, 9 already retrieved (0kB/5ms)\n",
+ "21/12/29 16:13:37 WARN NativeCodeLoader: Unable to load native-hadoop
library for your platform... using builtin-java classes where applicable\n",
+ "Using Spark's default log4j profile:
org/apache/spark/log4j-defaults.properties\n",
+ "Setting default log level to \"WARN\".\n",
+ "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use
setLogLevel(newLevel).\n",
+ "
\r"
+ ]
+ }
+ ],
+ "source": [
+ "# Build the Spark session shared by every cell below.\n",
+ "spark = (\n",
+ "    SparkSession.builder\n",
+ "    .appName(\"Demo-app\")\n",
+ "    .enableHiveSupport()\n",
+ "    # Set the master exactly once: a later master() call silently overrides an\n",
+ "    # earlier one. Use \"local[*]\" here instead to run without a cluster.\n",
+ "    .master(\"spark://spark-master:7077\")\n",
+ "    .config(\"spark.executor.memory\", \"15G\")\n",
+ "    .config(\"spark.driver.maxResultSize\", \"15G\")\n",
+ "    # Sedona geometry types are serialized through Kryo.\n",
+ "    .config(\"spark.serializer\", KryoSerializer.getName)\n",
+ "    .config(\"spark.kryo.registrator\", SedonaKryoRegistrator.getName)\n",
+ "    .config(\"spark.jars.packages\", \"org.apache.sedona:sedona-python-adapter-3.0_2.12:1.1.0-incubating,org.datasyslab:geotools-wrapper:1.1.0-25.2\")\n",
+ "    .getOrCreate()\n",
+ ")\n",
+ "# config(\"spark.rpc.message.maxSize\", 2047).\\\n",
+ "# rdd = spark.sparkContext.parallelize(range(1000))\n",
+ "# rdd.takeSample(False, 5)\n",
+ "\n",
+ "# Register Sedona's ST_* / RS_* SQL functions on this session.\n",
+ "SedonaRegistrator.registerAll(spark)\n",
+ "sc = spark.sparkContext"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "f1eb9469",
+ "metadata": {},
+ "outputs": [],
+ "source": [
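+ "# Path to the directory of GeoTIFF images. NOTE: the HDFS host below looks like a container-specific hostname; replace it with your own namenode address.\n",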
+ "DATA_DIR = \"hdfs://776faf4d6a1e:8020/tmp/\"\n",
+ "df =
spark.read.format(\"geotiff\").option(\"dropInvalid\",True).load(DATA_DIR)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "f88896a0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "root\n",
+ " |-- image: struct (nullable = true)\n",
+ " | |-- origin: string (nullable = true)\n",
+ " | |-- wkt: string (nullable = true)\n",
+ " | |-- height: integer (nullable = true)\n",
+ " | |-- width: integer (nullable = true)\n",
+ " | |-- nBands: integer (nullable = true)\n",
+ " | |-- data: array (nullable = true)\n",
+ " | | |-- element: double (containsNull = true)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.cache()\n",
+ "df.printSchema()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8024ef1e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.is_cached"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c567af25",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[Stage 2:> (0
+ 1) / 1]\r"
+ ]
+ }
+ ],
+ "source": [
+ "# Java Heap Out Of Memory => on each machine, increase the heap via: export _JAVA_OPTIONS=\"-Xmx15g\"\n",
+ "# Java lang Assertion Error image is too large =>\n",
+ "df = df.selectExpr(\"image.origin as origin\",\"ST_GeomFromWkt(image.wkt)
as Geom\", \"image.height as height\", \"image.width as width\", \"image.data
as data\", \"image.nBands as bands\").cache()\n",
+ "df.show(5)\n",
+ "# df.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d16bdcc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ,\"RS_GetBand(data, 2,bands) as Band2\",\"RS_GetBand(data, 3,bands) as
Band3\", \"RS_GetBand(data, 4,bands) as Band4\"\n",
+ "df = df.selectExpr(\"Geom\",\"RS_GetBand(data, 1,bands) as
Band1\",\"RS_GetBand(data, 2,bands) as Band2\",\"RS_GetBand(data, 3,bands) as
Band3\", \"RS_GetBand(data, 4,bands) as Band4\").cache()\n",
+ "df.createOrReplaceTempView(\"allbands\")\n",
+ "df.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ad7d109",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# spark.catalog.cacheTable('df')\n",
+ "# spark.catalog.isCached(tableName='df')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "19ef669a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "NomalizedDifference = df.selectExpr(\"RS_NormalizedDifference(Band1,
Band2) as normDiff\").cache()\n",
+ "NomalizedDifference.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f26ef1a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "meanDF = df.selectExpr(\"RS_Mean(Band1) as mean\").cache()\n",
+ "meanDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "98e8350a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "modeDF = df.selectExpr(\"RS_Mode(Band1) as mode\").cache()\n",
+ "modeDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5cf6c21b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "greaterthanDF = spark.sql(\"Select RS_GreaterThan(Band1,1000.0) as
greaterthan from allbands\").cache()\n",
+ "greaterthanDF.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c2360bad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "greaterthanEqualDF = spark.sql(\"Select RS_GreaterThanEqual(Band1,360.0)
as greaterthanEqual from allbands\").cache()\n",
+ "greaterthanEqualDF.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bde3a7a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lessthanDF = spark.sql(\"Select RS_LessThan(Band1,1000.0) as lessthan
from allbands\").cache()\n",
+ "lessthanDF.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7315e340",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lessthanEqualDF = spark.sql(\"Select RS_LessThanEqual(Band1,2890.0) as
lessthanequal from allbands\").cache()\n",
+ "lessthanEqualDF.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "373920b7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sumDF = df.selectExpr(\"RS_AddBands(Band1, Band2) as
sumOfBand\").cache()\n",
+ "sumDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "92be3eba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "subtractDF = df.selectExpr(\"RS_SubtractBands(Band1, Band2) as
diffOfBand\").cache()\n",
+ "subtractDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c52af8bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "multiplyDF = df.selectExpr(\"RS_MultiplyBands(Band1, Band2) as
productOfBand\").cache()\n",
+ "multiplyDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b7e186ca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "divideDF = df.selectExpr(\"RS_DivideBands(Band1, Band2) as
divisionOfBand\").cache()\n",
+ "divideDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "68b56ea9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mulfacDF = df.selectExpr(\"RS_MultiplyFactor(Band2, 2) as
target\").cache()\n",
+ "mulfacDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8ca30b96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bitwiseAND = df.selectExpr(\"RS_BitwiseAND(Band1, Band2) as
AND\").cache()\n",
+ "bitwiseAND.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "67b08806",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bitwiseOR = df.selectExpr(\"RS_BitwiseOR(Band1, Band2) as
OR\").cache()\n",
+ "bitwiseOR.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ead2d2a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "countDF = df.selectExpr(\"RS_Count(RS_GreaterThan(Band1,1000.0), 1.0) as
count\").cache()\n",
+ "countDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8174502f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "moduloDF = df.selectExpr(\"RS_Modulo(Band1, 21.0) as modulo
\").cache()\n",
+ "moduloDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4f7cb317",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rootDF = df.selectExpr(\"RS_SquareRoot(Band1) as root\").cache()\n",
+ "rootDF.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "23835487",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "logDiff = df.selectExpr(\"RS_LogicalDifference(Band1, Band2) as
loggDifference\").cache()\n",
+ "logDiff.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0bd25e2e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "logOver = df.selectExpr(\"RS_LogicalOver(Band3, Band2) as
logicalOver\").cache()\n",
+ "logOver.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b0418cb5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df =
spark.read.format(\"geotiff\").option(\"dropInvalid\",True).load(DATA_DIR)\n",
+ "df = df.selectExpr(\"image.origin as origin\",\"ST_GeomFromWkt(image.wkt)
as Geom\", \"image.height as height\", \"image.width as width\", \"image.data
as data\", \"image.nBands as bands\").cache()\n",
+ "\n",
+ "df = df.selectExpr(\"RS_GetBand(data,1,bands) as targetband\",
\"height\", \"width\", \"bands\", \"Geom\")\n",
+ "df_base64 = df.selectExpr(\"Geom\",
\"RS_Base64(height,width,RS_Normalize(targetBand), RS_Array(height*width,0.0),
RS_Array(height*width, 0.0)) as
red\",\"RS_Base64(height,width,RS_Array(height*width, 0.0),
RS_Normalize(targetBand), RS_Array(height*width, 0.0)) as green\",
\"RS_Base64(height,width,RS_Array(height*width, 0.0), RS_Array(height*width,
0.0), RS_Normalize(targetBand)) as
blue\",\"RS_Base64(height,width,RS_Normalize(targetBand),
RS_Normalize(targetBand),RS_Normalize(targ [...]
+ "df_HTML = df_base64.selectExpr(\"Geom\",\"RS_HTML(red) as
RedBand\",\"RS_HTML(blue) as BlueBand\",\"RS_HTML(green) as GreenBand\",
\"RS_HTML(RGB) as CombinedBand\").cache()\n",
+ "df_HTML.show(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63c86f79",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(HTML(df_HTML.limit(2).toPandas().to_html(escape=False)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aca271df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "def SumOfValues(band):\n",
+ " total = 0.0\n",
+ " for num in band:\n",
+ " if num>1000.0:\n",
+ " total+=1\n",
+ " return total\n",
+ " \n",
+ "calculateSum = udf(SumOfValues, DoubleType())\n",
+ "spark.udf.register(\"RS_Sum\", calculateSum)\n",
+ "\n",
+ "sumDF = df.selectExpr(\"RS_Sum(targetband) as sum\").cache()\n",
+ "sumDF.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "211e89c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generatemask(band, width,height):\n",
+ " for (i,val) in enumerate(band):\n",
+ " if (i%width>=12 and i%width<26) and (i%height>=12 and
i%height<26):\n",
+ " band[i] = 255.0\n",
+ " else:\n",
+ " band[i] = 0.0\n",
+ " return band\n",
+ "\n",
+ "maskValues = udf(generatemask, ArrayType(DoubleType()))\n",
+ "spark.udf.register(\"RS_MaskValues\", maskValues)\n",
+ "\n",
+ "\n",
+ "df_base64 = df.selectExpr(\"Geom\",
\"RS_Base64(height,width,RS_Normalize(targetband), RS_Array(height*width,0.0),
RS_Array(height*width, 0.0), RS_MaskValues(targetband,width,height)) as
region\" ).cache()\n",
+ "df_HTML = df_base64.selectExpr(\"Geom\",\"RS_HTML(region) as
selectedregion\").cache()\n",
+ "display(HTML(df_HTML.limit(2).toPandas().to_html(escape=False)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "209005e7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.stop()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da5fb78b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}