Anna Szonyi created SQOOP-3146:
----------------------------------
Summary: Sqoop (import + --as-parquetfile) with Oracle (CLOB vs.
BLOB) is inconsistent
Key: SQOOP-3146
URL: https://issues.apache.org/jira/browse/SQOOP-3146
Project: Sqoop
Issue Type: Bug
Reporter: Anna Szonyi
########################################################################
# Owner: Sqoopinators
# Component: Sqoop1
# Purpose: Escalation Test Case
# SFDC Case ID:127558
# SFDC EscalationID: CDH-50699
# File: SupportTest_Case_127558_JIRA_CDH-50699.txt
#
# Description
# 1. Sqoop import + --as-parquetfile + CLOB Data Types (Gives Error)
# 2. Sqoop import + --as-parquetfile + BLOB Data Types (Works Correctly)
########################################################################
######################################################
USE CASE [1] . Sqoop import + --as-parquetfile + CLOB Data Types (Gives Error)
######################################################
#######################
# STEP 01 - CREATE DATA
#######################
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query
"create table t1_clob (c1 int,c2 clob)"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query
"insert into t1_clob values(1,'qwqewewqrerew121212121212')"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query
"select * from t1_clob"
#########
OUTPUT
#########
-----------------------------------------------
| C1 | C2 |
-----------------------------------------------
| 1 | qwqewewqrerew121212121212 |
-----------------------------------------------
########################################################
STEP 02 - IMPORT AS PARQUET FILE (Without --map-column-java) [REPRODUCING
THE ERROR]
########################################################
sqoop import --connect $MYCONN --username $MYUSER --password $MYPSWD
--as-parquetfile --table T1_CLOB --delete-target-dir --target-dir
'/projects/t1_clob' -m 1
OUTPUT
------
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
17/02/21 10:07:08 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.8.3
17/02/21 10:07:08 WARN tool.BaseSqoopTool: Setting your password on the
command-line is insecure. Consider using -P instead.
17/02/21 10:07:08 INFO oracle.OraOopManagerFactory: Data Connector for Oracle
and Hadoop is disabled.
17/02/21 10:07:08 INFO manager.SqlManager: Using default fetchSize of 1000
17/02/21 10:07:08 INFO tool.CodeGenTool: Beginning code generation
17/02/21 10:07:08 INFO tool.CodeGenTool: Will generate java class as
codegen_T1_CLOB
17/02/21 10:07:09 INFO manager.OracleManager: Time zone has been set to GMT
17/02/21 10:07:09 INFO manager.SqlManager: Executing SQL statement: SELECT t.*
FROM "T1_CLOB" t WHERE 1=0
17/02/21 10:07:09 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is
/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce
Note:
/tmp/sqoop-root/compile/cbaf5013e6bc9dad7283090f9d761289/codegen_T1_CLOB.java
uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
17/02/21 10:07:11 INFO orm.CompilationManager: Writing jar file:
/tmp/sqoop-root/compile/cbaf5013e6bc9dad7283090f9d761289/codegen_T1_CLOB.jar
17/02/21 10:07:13 INFO tool.ImportTool: Destination directory /projects/t1_clob
is not present, hence not deleting.
17/02/21 10:07:13 INFO manager.OracleManager: Time zone has been set to GMT
17/02/21 10:07:13 INFO manager.OracleManager: Time zone has been set to GMT
17/02/21 10:07:13 INFO mapreduce.ImportJobBase: Beginning import of T1_CLOB
17/02/21 10:07:13 INFO Configuration.deprecation: mapred.jar is deprecated.
Instead, use mapreduce.job.jar
17/02/21 10:07:13 INFO manager.OracleManager: Time zone has been set to GMT
17/02/21 10:07:13 INFO manager.OracleManager: Time zone has been set to GMT
17/02/21 10:07:13 INFO manager.SqlManager: Executing SQL statement: SELECT t.*
FROM "T1_CLOB" t WHERE 1=0
17/02/21 10:07:13 INFO manager.SqlManager: Executing SQL statement: SELECT t.*
FROM "T1_CLOB" t WHERE 1=0
17/02/21 10:07:13 ERROR tool.ImportTool: Imported Failed: Cannot convert SQL
type 2005
#####################################################################
STEP 02.1 - IMPORT AS PARQUET FILE + --map-column-java (For CLOB data type)
#####################################################################
sqoop import --connect $MYCONN --username $MYUSER --password $MYPSWD
--as-parquetfile --table T1_CLOB --map-column-java C2=String
--delete-target-dir --target-dir '/projects/t1_clob' -m 1
OUTPUT
------
17/02/21 10:09:50 INFO tool.ImportTool: Destination directory /projects/t1_clob
is not present, hence not deleting.
17/02/21 10:09:50 INFO mapreduce.ImportJobBase: Beginning import of T1_CLOB
17/02/21 10:10:09 INFO mapreduce.Job: map 0% reduce 0%
17/02/21 10:10:34 INFO mapreduce.Job: map 100% reduce 0%
17/02/21 10:10:35 INFO mapreduce.Job: Job job_1487339765830_0009 completed
successfully
17/02/21 10:10:35 INFO mapreduce.ImportJobBase: Transferred 1.7402 KB in
43.4127 seconds (41.0479 bytes/sec)
17/02/21 10:10:35 INFO mapreduce.ImportJobBase: Retrieved 1 records.
[root@host-10-17-81-247 ~]# hadoop fs -ls /projects/t1_clob
Found 3 items
drwxr-xr-x - root supergroup 0 2017-02-21 10:09
/projects/t1_clob/.metadata
drwxr-xr-x - root supergroup 0 2017-02-21 10:10
/projects/t1_clob/.signals
-rw-r--r-- 3 root supergroup 698 2017-02-21 10:10
/projects/t1_clob/20aeeff1-68e1-4c9c-9a74-51a5f2ccb369.parquet
###################################################################
USE CASE [2] . Sqoop import + --as-parquetfile + BLOB Data Types (Works Correctly)
###################################################################
#######################
# STEP 01 - CREATE DATA
#######################
export
MYCONN=jdbc:oracle:thin:@host-10-17-101-252.coe.cloudera.com:1521/orcl12c;
export MYUSER=sqoop
export MYPSWD=welcome1
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query
"create table t1_blob_parq_new (c1 int,c2 blob)"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query
"insert into t1_blob_parq_new values(1,utl_raw.cast_to_raw('testblobtype'))"
sqoop eval --connect $MYCONN --username $MYUSER --password $MYPSWD --query
"select C1,utl_raw.cast_to_varchar2(C2) as C2 from t1_blob_parq_new"
#########
OUTPUT
#########
-----------------------------------------------
| C1 | C2 |
-----------------------------------------------
| 1 | testblobtype |
-----------------------------------------------
[root@host-10-17-81-247 ~]#
########################################################
STEP 02 - IMPORT AS PARQUET FILE
########################################################
sqoop import --connect $MYCONN --username $MYUSER --password $MYPSWD --table
T1_BLOB_PARQ_NEW --as-parquetfile --target-dir '/projects/t1_bob_parq_new' -m 1
#########
OUTPUT
#########
17/02/22 08:03:51 INFO mapreduce.ImportJobBase: Beginning import of
T1_BLOB_PARQ_NEW
17/02/22 08:03:52 INFO manager.SqlManager: Executing SQL statement: SELECT t.*
FROM "T1_BLOB_PARQ_NEW" t WHERE 1=0
17/02/22 08:03:52 INFO manager.SqlManager: Executing SQL statement: SELECT t.*
FROM "T1_BLOB_PARQ_NEW" t WHERE 1=0
17/02/22 08:04:00 INFO mapreduce.Job: Running job: job_1487339765830_0010
17/02/22 08:04:13 INFO mapreduce.Job: Job job_1487339765830_0010 running in
uber mode : false
17/02/22 08:04:13 INFO mapreduce.Job: map 0% reduce 0%
17/02/22 08:04:32 INFO mapreduce.Job: map 100% reduce 0%
17/02/22 08:04:32 INFO mapreduce.Job: Job job_1487339765830_0010 completed
successfully
17/02/22 08:04:32 INFO mapreduce.ImportJobBase: Transferred 1.7598 KB in
38.6604 seconds (46.611 bytes/sec)
17/02/22 08:04:32 INFO mapreduce.ImportJobBase: Retrieved 1 records.
[root@host-10-17-81-247 ~]# hadoop fs -ls /projects/t1_bob_parq_new
Found 3 items
drwxr-xr-x - root supergroup 0 2017-02-22 08:03
/projects/t1_bob_parq_new/.metadata
drwxr-xr-x - root supergroup 0 2017-02-22 08:04
/projects/t1_bob_parq_new/.signals
-rw-r--r-- 3 root supergroup 666 2017-02-22 08:04
/projects/t1_bob_parq_new/94e7de9a-88fc-4b6b-be4d-2547698fe599.parquet
[root@host-10-17-81-247 ~]# hadoop fs -cat
/projects/t1_bob_parq_new/.metadata/schemas/1.avsc
{
"type" : "record",
"name" : "T1_BLOB_PARQ_NEW",
"doc" : "Sqoop import of T1_BLOB_PARQ_NEW",
"fields" : [ {
"name" : "C1",
"type" : [ "null", "string" ],
"default" : null,
"columnName" : "C1",
"sqlType" : "2"
}, {
"name" : "C2",
"type" : [ "null", "bytes" ],
"default" : null,
"columnName" : "C2",
"sqlType" : "2004"
} ],
"tableName" : "T1_BLOB_PARQ_NEW"
}
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)