Spark 1.5 is the latest that I have access to and where this problem happens.
I don't see it's fixed in master but I might be wrong. diff attached. https://raw.githubusercontent.com/apache/spark/branch-1.5/python/pyspark/sql/types.py https://raw.githubusercontent.com/apache/spark/d57daf1f7732a7ac54a91fe112deeda0a254f9ef/python/pyspark/sql/types.py -- Ruslan Dautkhanov On Wed, Mar 16, 2016 at 4:44 PM, Reynold Xin <r...@databricks.com> wrote: > We probably should have the alias. Is this still a problem on master > branch? > > On Wed, Mar 16, 2016 at 9:40 AM, Ruslan Dautkhanov <dautkha...@gmail.com> > wrote: > >> Running following: >> >> #fix schema for gaid which should not be Double >>> from pyspark.sql.types import * >>> customSchema = StructType() >>> for (col,typ) in tsp_orig.dtypes: >>> if col=='Agility_GAID': >>> typ='string' >>> customSchema.add(col,typ,True) >> >> >> Getting >> >> ValueError: Could not parse datatype: bigint >> >> >> Looks like pyspark.sql.types doesn't know anything about bigint.. >> Should it be aliased to LongType in pyspark.sql.types? >> >> Thanks >> >> >> On Wed, Mar 16, 2016 at 10:18 AM, Ruslan Dautkhanov <dautkha...@gmail.com >> > wrote: >> >>> Hello, >>> >>> Looking at >>> >>> https://spark.apache.org/docs/1.5.1/api/python/_modules/pyspark/sql/types.html >>> >>> and can't wrap my head around how to convert string data type names to >>> actual >>> pyspark.sql.types data types? >>> >>> Does pyspark.sql.types have an interface to return StringType() for >>> "string", >>> IntegerType() for "integer" etc? If it doesn't exist it would be great >>> to have such a >>> mapping function. >>> >>> Thank you. >>> >>> >>> ps. I have a data frame, and use its dtypes to loop through all columns >>> to fix a few >>> columns' data types as a workaround for SPARK-13866. >>> >>> >>> -- >>> Ruslan Dautkhanov >>> >> >> >
683a684,806 > _FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)") > > > _BRACKETS = {'(': ')', '[': ']', '{': '}'} > > > def _parse_basic_datatype_string(s): > if s in _all_atomic_types.keys(): > return _all_atomic_types[s]() > elif s == "int": > return IntegerType() > elif _FIXED_DECIMAL.match(s): > m = _FIXED_DECIMAL.match(s) > return DecimalType(int(m.group(1)), int(m.group(2))) > else: > raise ValueError("Could not parse datatype: %s" % s) > > > def _ignore_brackets_split(s, separator): > """ > Splits the given string by given separator, but ignore separators inside > brackets pairs, e.g. > given "a,b" and separator ",", it will return ["a", "b"], but given > "a<b,c>, d", it will return > ["a<b,c>", "d"]. > """ > parts = [] > buf = "" > level = 0 > for c in s: > if c in _BRACKETS.keys(): > level += 1 > buf += c > elif c in _BRACKETS.values(): > if level == 0: > raise ValueError("Brackets are not correctly paired: %s" % s) > level -= 1 > buf += c > elif c == separator and level > 0: > buf += c > elif c == separator: > parts.append(buf) > buf = "" > else: > buf += c > > if len(buf) == 0: > raise ValueError("The %s cannot be the last char: %s" % (separator, > s)) > parts.append(buf) > return parts > > > def _parse_struct_fields_string(s): > parts = _ignore_brackets_split(s, ",") > fields = [] > for part in parts: > name_and_type = _ignore_brackets_split(part, ":") > if len(name_and_type) != 2: > raise ValueError("The strcut field string format is: > 'field_name:field_type', " + > "but got: %s" % part) > field_name = name_and_type[0].strip() > field_type = _parse_datatype_string(name_and_type[1]) > fields.append(StructField(field_name, field_type)) > return StructType(fields) > > > def _parse_datatype_string(s): > """ > Parses the given data type string to a :class:`DataType`. 
The data type > string format equals > to `DataType.simpleString`, except that top level struct type can omit > the `struct<>` and > atomic types use `typeName()` as their format, e.g. use `byte` instead of > `tinyint` for > ByteType. We can also use `int` as a short name for IntegerType. > > >>> _parse_datatype_string("int ") > IntegerType > >>> _parse_datatype_string("a: byte, b: decimal( 16 , 8 ) ") > > StructType(List(StructField(a,ByteType,true),StructField(b,DecimalType(16,8),true))) > >>> _parse_datatype_string("a: array< short>") > StructType(List(StructField(a,ArrayType(ShortType,true),true))) > >>> _parse_datatype_string(" map<string , string > ") > MapType(StringType,StringType,true) > > >>> # Error cases > >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... > >>> _parse_datatype_string("a: int,") # doctest: +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... > >>> _parse_datatype_string("array<int") # doctest: > +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... > >>> _parse_datatype_string("map<int, boolean>>") # doctest: > +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... 
> """ > s = s.strip() > if s.startswith("array<"): > if s[-1] != ">": > raise ValueError("'>' should be the last char, but got: %s" % s) > return ArrayType(_parse_datatype_string(s[6:-1])) > elif s.startswith("map<"): > if s[-1] != ">": > raise ValueError("'>' should be the last char, but got: %s" % s) > parts = _ignore_brackets_split(s[4:-1], ",") > if len(parts) != 2: > raise ValueError("The map type string format is: > 'map<key_type,value_type>', " + > "but got: %s" % s) > kt = _parse_datatype_string(parts[0]) > vt = _parse_datatype_string(parts[1]) > return MapType(kt, vt) > elif s.startswith("struct<"): > if s[-1] != ">": > raise ValueError("'>' should be the last char, but got: %s" % s) > return _parse_struct_fields_string(s[7:-1]) > elif ":" in s: > return _parse_struct_fields_string(s) > else: > return _parse_basic_datatype_string(s) > > 733,735d855 < _FIXED_DECIMAL = re.compile("decimal\\((\\d+),(\\d+)\\)") < < 943,945d1062 < _BRACKETS = {'(': ')', '[': ']', '{': '}'} < < 1094c1211 < def _verify_type(obj, dataType): --- > def _verify_type(obj, dataType, nullable=True): 1096,1097c1213,1217 < Verify the type of obj against dataType, raise an exception if < they do not match. --- > Verify the type of obj against dataType, raise a TypeError if they do not > match. > > Also verify the value of obj against datatype, raise a ValueError if it's > not within the allowed > range, e.g. using 128 as ByteType will overflow. Note that, Python float > is not checked, so it > will become infinity when cast to Java float if it overflows. 1113a1234,1256 > >>> # Check if numeric values are within the allowed range. > >>> _verify_type(12, ByteType()) > >>> _verify_type(1234, ByteType()) # doctest: +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... > >>> _verify_type(None, ByteType(), False) # doctest: > +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... 
> >>> _verify_type([1, None], ArrayType(ShortType(), False)) # doctest: > +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... > >>> _verify_type({None: 1}, MapType(StringType(), IntegerType())) > Traceback (most recent call last): > ... > ValueError:... > >>> schema = StructType().add("a", IntegerType()).add("b", StringType(), > False) > >>> _verify_type((1, None), schema) # doctest: +IGNORE_EXCEPTION_DETAIL > Traceback (most recent call last): > ... > ValueError:... 1115d1257 < # all objects are nullable 1117c1259,1262 < return --- > if nullable: > return > else: > raise ValueError("This field is not nullable, but got None") 1130c1275 < assert _type in _acceptable_types, "unknown datatype: %s" % dataType --- > assert _type in _acceptable_types, "unknown datatype: %s for object %r" % > (dataType, obj) 1134c1279 < raise TypeError("StructType can not accept object in type %s" % type(obj)) --- > raise TypeError("StructType can not accept object %r in type %s" > % (obj, type(obj))) 1138c1283 < raise TypeError("%s can not accept object in type %s" % (dataType, type(obj))) --- > raise TypeError("%s can not accept object %r in type %s" % > (dataType, obj, type(obj))) 1140c1285,1297 < if isinstance(dataType, ArrayType): --- > if isinstance(dataType, ByteType): > if obj < -128 or obj > 127: > raise ValueError("object of ByteType out of range, got: %s" % obj) > > elif isinstance(dataType, ShortType): > if obj < -32768 or obj > 32767: > raise ValueError("object of ShortType out of range, got: %s" % > obj) > > elif isinstance(dataType, IntegerType): > if obj < -2147483648 or obj > 2147483647: > raise ValueError("object of IntegerType out of range, got: %s" % > obj) > > elif isinstance(dataType, ArrayType): 1142c1299 < _verify_type(i, dataType.elementType) --- > _verify_type(i, dataType.elementType, dataType.containsNull) 1146,1147c1303,1304 < _verify_type(k, dataType.keyType) < _verify_type(v, dataType.valueType) --- > _verify_type(k, 
dataType.keyType, False) > _verify_type(v, dataType.valueType, dataType.valueContainsNull) 1154c1311 < _verify_type(v, f.dataType) --- > _verify_type(v, f.dataType, f.nullable) 1178a1336,1337 > >>> row['name'], row['age'] > ('Alice', 11) 1245a1405,1417 > def __getitem__(self, item): > if isinstance(item, (int, slice)): > return super(Row, self).__getitem__(item) > try: > # it will be slow when it has many fields, > # but this will not be used in normal cases > idx = self.__fields__.index(item) > return super(Row, self).__getitem__(idx) > except IndexError: > raise KeyError(item) > except ValueError: > raise ValueError(item) > 1295,1296c1467,1471 < return Timestamp(int(time.mktime(obj.timetuple())) * 1000 + obj.microsecond // 1000) < --- > seconds = (calendar.timegm(obj.utctimetuple()) if obj.tzinfo > else time.mktime(obj.timetuple())) > t = Timestamp(int(seconds) * 1000) > t.setNanos(obj.microsecond * 1000) > return t
--------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org