stym06 commented on issue #4318: URL: https://github.com/apache/hudi/issues/4318#issuecomment-997334777
@nsivabalan I tried querying through both Presto and Hive and got duplicate records. Yet to query through the Spark datasource (see the sketch after the spec below). Will post the `.hoodie` folder in some time. Posting the spark-submit below:

```yaml
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: hudi-lpe-ds-{{ ti.job_id }}
  namespace: dataplatform
  annotations:
    spark.platform/type: streaming
  labels:
    spark_name: hudi-lpe-ds-{{ ti.job_id }}
    dag_name: hudi-lpe
    task_name: ds
    environment: "prod"
    cloud: "azure"
    tier: "t1"
    team: "dataplatform"
    service_type: "airflow"
    k8s_cluster_name: "kai"
spec:
  type: Java
  mode: cluster
  image: "hudi-ds-azure-0.2"
  imagePullPolicy: Always
  mainClass: org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer
  mainApplicationFile: "local:///opt/spark/hudi/hudi-utilities-bundle_2.11-0.9.0-SNAPSHOT.jar"
  sparkConf:
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer"
  arguments:
    - "--table-type"
    - "COPY_ON_WRITE"
    - "--props"
    - "/opt/spark/hudi/config/source.properties"
    - "--schemaprovider-class"
    - "org.apache.hudi.utilities.schema.SchemaRegistryProvider"
    - "--source-class"
    - "org.apache.hudi.utilities.sources.JsonKafkaSource"
    - "--target-base-path"
    - "wasb://container...@account.blob.core.windows.net/data/pipelines/hudi/kafka/telemetrics_v2/dp.hmi.quectel.event.lpe.packet.v2"
    - "--target-table"
    - "dp_hmi_quectel_event_lpe_packet_v2"
    - "--op"
    - "INSERT"
    - "--source-ordering-field"
    - "timestamp"
    - "--continuous"
    - "--min-sync-interval-seconds"
    - "60"
  sparkVersion: "2.4.4"
  restartPolicy:
    type: Always
    onFailureRetries: 100000
    onFailureRetryInterval: 60
    onSubmissionFailureRetries: 100000
    onSubmissionFailureRetryInterval: 60
  timeToLiveSeconds: 3600
  volumes:
    - name: hudi-lpe-ds
      configMap:
        name: hudi-lpe-ds
  driver:
    env:
      - name: HOODIE_ENV_fs_DOT_azure_DOT_wasb_DOT_account_DOT_name
        value: {{ var.value.HOODIE_ENV_fs_DOT_azure_DOT_wasb_DOT_account_DOT_name }}
      - name: HOODIE_ENV_fs_DOT_azure_DOT_account_DOT_key_DOT_{{ var.value.DP_DPV3_BLOB_STORAGE }}_DOT_blob_DOT_core_DOT_windows_DOT_net
        value: {{ var.value.HOODIE_ENV_fs_DOT_azure_DOT_account_DOT_key_DOT_account_DOT_blob_DOT_core_DOT_windows_DOT_net }}
    cores: 1
    coreLimit: "1200m"
    memory: "4G"
    serviceAccount: "dataplatform"
    volumeMounts:
      - name: hudi-lpe-ds
        mountPath: /opt/spark/hudi/config
        subPath: config.yaml
    memoryOverhead: "1024"
    javaOptions: "-Dnetworkaddress.cache.ttl=60 -Duser.timezone=IST -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_driver.hprof"
    # affinity:
    #   nodeAffinity:
    #     requiredDuringSchedulingIgnoredDuringExecution:
    #       nodeSelectorTerms:
    #         - matchExpressions:
    #             - key: service
    #               operator: In
    #               values:
    #                 - airflow-spark
    #             - key: "node-lifecycle"
    #               operator: In
    #               values:
    #                 - "ondemand"
  executor:
    env:
      - name: HOODIE_ENV_fs_DOT_azure_DOT_wasb_DOT_account_DOT_name
        value: {{ var.value.HOODIE_ENV_fs_DOT_azure_DOT_wasb_DOT_account_DOT_name }}
      - name: HOODIE_ENV_fs_DOT_azure_DOT_account_DOT_key_DOT_{{ var.value.DP_DPV3_BLOB_STORAGE }}_DOT_blob_DOT_core_DOT_windows_DOT_net
        value: {{ var.value.HOODIE_ENV_fs_DOT_azure_DOT_account_DOT_key_DOT_account_DOT_blob_DOT_core_DOT_windows_DOT_net }}
    cores: 1
    instances: 3
    memory: "6G"
    volumeMounts:
      - name: hudi-lpe-ds
        mountPath: /opt/spark/hudi/config
        subPath: config.yaml
    memoryOverhead: "3072"
    javaOptions: "-Dnetworkaddress.cache.ttl=60 -Duser.timezone=IST -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintGCTimeStamps -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/varadarb_ds_driver.hprof"
    # affinity:
    #   nodeAffinity:
    #     requiredDuringSchedulingIgnoredDuringExecution:
    #       nodeSelectorTerms:
    #         - matchExpressions:
    #             - key: service
    #               operator: In
    #               values:
    #                 - airflow-spark
    #             - key: "node-lifecycle"
    #               operator: In
    #               values:
    #                 - "ondemand"
  sparkUIOptions:
    ingressAnnotations:
      kubernetes.io/ingress.class: nginx
  # Prometheus Monitoring
  # Comment out if not supported
  # monitoring:
  #   exposeDriverMetrics: true
  #   exposeExecutorMetrics: true
  #   prometheus:
  #     jmxExporterJar: "/prometheus/jmx_prometheus_javaagent-0.11.0.jar"
  #     port: 8090
```
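For the Spark-datasource check I still owe, this is a minimal sketch of what I plan to run, assuming a spark-shell with the same `hudi-utilities-bundle_2.11-0.9.0-SNAPSHOT.jar` on the classpath and the Azure storage credentials configured as in the spec above. The base path is the (redacted) one from the spec, and `_hoodie_record_key` is Hudi's built-in metadata column, so any key with count > 1 is a duplicate:

```scala
// Sketch of a duplicate check via the Spark datasource (not run yet).
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder()
  .appName("hudi-lpe-duplicate-check")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .getOrCreate()

// Table base path, redacted the same way as in the spec above.
val basePath = "wasb://container...@account.blob.core.windows.net/data/pipelines/hudi/kafka/telemetrics_v2/dp.hmi.quectel.event.lpe.packet.v2"

// Snapshot read; the "/*" glob depth depends on the table's partitioning.
val df = spark.read.format("org.apache.hudi").load(basePath + "/*")

// Any _hoodie_record_key appearing more than once in a COPY_ON_WRITE
// snapshot read indicates duplicate records in the data files.
df.groupBy("_hoodie_record_key")
  .count()
  .filter(col("count") > 1)
  .show(20, false)
```

If this also returns rows, the duplicates are in the data files themselves rather than an artifact of the Presto/Hive query side.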