Hi experts,

When I try to use string "slicing" on a DataFrame column in a Spark
program (PySpark), I get this error:

**** Code ****

import pandas as pd
from pyspark.sql import SQLContext
hc = SQLContext(sc)
A = pd.DataFrame({'Firstname': ['James', 'Ali', 'Daniel'], 'Lastname':
['Jones', 'Bajwa', 'Day']})
a = hc.createDataFrame(A)
print A

b = a.select(a.Firstname[:2])
print b.toPandas()
c = a.select(a.Lastname[2:])
print c.toPandas()

Output:

 Firstname Lastname
0     James    Jones
1       Ali    Bajwa
2    Daniel      Day
  SUBSTR(Firstname, 0, 2)
0                      Ja
1                      Al
2                      Da

---------------------------------------------------------------------------
Py4JError                                 Traceback (most recent call last)
<ipython-input-17-6ee5d7d069ce> in <module>()
     10 b = a.select(a.Firstname[:2])
     11 print b.toPandas()
---> 12 c = a.select(a.Lastname[2:])
     13 print c.toPandas()

/home/jupyter/spark-1.3.1/python/pyspark/sql/dataframe.pyc in substr(self,
startPos, length)
   1089             raise TypeError("Can not mix the type")
   1090         if isinstance(startPos, (int, long)):
-> 1091             jc = self._jc.substr(startPos, length)
   1092         elif isinstance(startPos, Column):
   1093             jc = self._jc.substr(startPos._jc, length._jc)

/home/jupyter/spark-1.3.1/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py
in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538                 self.target_id, self.name)
    539
    540         for temp_arg in temp_args:

/home/jupyter/spark-1.3.1/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)
    302                 raise Py4JError(
    303                     'An error occurred while calling {0}{1}{2}.
Trace:\n{3}\n'.
--> 304                     format(target_id, '.', name, value))
    305         else:
    306             raise Py4JError(

Py4JError: An error occurred while calling o1887.substr. Trace:
py4j.Py4JException: Method substr([class java.lang.Integer, class
java.lang.Long]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:333)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:342)
at py4j.Gateway.invoke(Gateway.java:252)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)

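It looks like X[:2] works but X[2:] fails with the error above: according to
the trace, the call reaches the JVM as substr(Integer, Long), and no such
method exists.
Has anyone else run into this issue?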

Clearly I can use substr() to work around this, but if this is a confirmed
bug we should open a JIRA.

Thanks,
Ali

Reply via email to